MindsDB 25.3.4.2__py3-none-any.whl → 25.4.2.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +21 -4
- mindsdb/api/executor/command_executor.py +62 -61
- mindsdb/api/executor/data_types/answer.py +9 -12
- mindsdb/api/executor/datahub/classes/response.py +11 -0
- mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/steps.py +2 -1
- mindsdb/api/executor/sql_query/result_set.py +10 -7
- mindsdb/api/executor/sql_query/sql_query.py +36 -82
- mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
- mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
- mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
- mindsdb/api/http/namespaces/sql.py +4 -1
- mindsdb/api/mcp/__init__.py +0 -0
- mindsdb/api/mcp/start.py +152 -0
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
- mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
- mindsdb/integrations/libs/response.py +9 -4
- mindsdb/integrations/libs/vectordatabase_handler.py +37 -25
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +35 -15
- mindsdb/interfaces/database/log.py +8 -9
- mindsdb/interfaces/database/projects.py +16 -5
- mindsdb/interfaces/functions/controller.py +59 -17
- mindsdb/interfaces/functions/to_markdown.py +194 -0
- mindsdb/interfaces/jobs/jobs_controller.py +3 -3
- mindsdb/interfaces/knowledge_base/controller.py +143 -26
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
- mindsdb/interfaces/query_context/context_controller.py +3 -1
- mindsdb/utilities/config.py +8 -0
- mindsdb/utilities/starters.py +7 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +233 -231
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +53 -49
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0
|
@@ -4,6 +4,7 @@ from typing import Dict, List, Optional
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import hashlib
|
|
7
|
+
import numpy as np
|
|
7
8
|
|
|
8
9
|
from mindsdb_sql_parser.ast import (
|
|
9
10
|
BinaryOperation,
|
|
@@ -34,12 +35,20 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import Preprocessing
|
|
|
34
35
|
from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
|
|
35
36
|
from mindsdb.interfaces.model.functions import PredictorRecordNotFound
|
|
36
37
|
from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
|
|
38
|
+
from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
|
|
37
39
|
|
|
38
40
|
from mindsdb.api.executor.command_executor import ExecuteCommands
|
|
39
41
|
from mindsdb.utilities import log
|
|
42
|
+
from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker
|
|
40
43
|
|
|
41
44
|
logger = log.getLogger(__name__)
|
|
42
45
|
|
|
46
|
+
KB_TO_VECTORDB_COLUMNS = {
|
|
47
|
+
'id': 'original_row_id',
|
|
48
|
+
'chunk_id': 'id',
|
|
49
|
+
'chunk_content': 'content'
|
|
50
|
+
}
|
|
51
|
+
|
|
43
52
|
|
|
44
53
|
class KnowledgeBaseTable:
|
|
45
54
|
"""
|
|
@@ -77,46 +86,150 @@ class KnowledgeBaseTable:
|
|
|
77
86
|
"""
|
|
78
87
|
logger.debug(f"Processing select query: {query}")
|
|
79
88
|
|
|
80
|
-
#
|
|
81
|
-
|
|
82
|
-
|
|
89
|
+
# Extract the content query text for potential reranking
|
|
90
|
+
|
|
91
|
+
db_handler = self.get_vector_db()
|
|
83
92
|
|
|
93
|
+
logger.debug("Replaced content with embeddings in where clause")
|
|
84
94
|
# set table name
|
|
85
95
|
query.from_table = Identifier(parts=[self._kb.vector_database_table])
|
|
86
96
|
logger.debug(f"Set table name to: {self._kb.vector_database_table}")
|
|
87
97
|
|
|
88
|
-
|
|
89
|
-
targets = []
|
|
98
|
+
requested_kb_columns = []
|
|
90
99
|
for target in query.targets:
|
|
91
100
|
if isinstance(target, Star):
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
+
requested_kb_columns = None
|
|
102
|
+
break
|
|
103
|
+
else:
|
|
104
|
+
requested_kb_columns.append(target.parts[-1].lower())
|
|
105
|
+
|
|
106
|
+
query.targets = [
|
|
107
|
+
Identifier(TableField.ID.value),
|
|
108
|
+
Identifier(TableField.CONTENT.value),
|
|
109
|
+
Identifier(TableField.METADATA.value),
|
|
110
|
+
Identifier(TableField.DISTANCE.value),
|
|
111
|
+
]
|
|
101
112
|
|
|
102
113
|
# Get response from vector db
|
|
103
|
-
db_handler = self.get_vector_db()
|
|
104
114
|
logger.debug(f"Using vector db handler: {type(db_handler)}")
|
|
105
115
|
|
|
106
|
-
|
|
116
|
+
# extract values from conditions and prepare for vectordb
|
|
117
|
+
conditions = []
|
|
118
|
+
query_text = None
|
|
119
|
+
reranking_threshold = None
|
|
120
|
+
query_conditions = db_handler.extract_conditions(query.where)
|
|
121
|
+
if query_conditions is not None:
|
|
122
|
+
for item in query_conditions:
|
|
123
|
+
if item.column == "reranking_threshold" and item.op.value == "=":
|
|
124
|
+
try:
|
|
125
|
+
reranking_threshold = float(item.value)
|
|
126
|
+
# Validate range: must be between 0 and 1
|
|
127
|
+
if not (0 <= reranking_threshold <= 1):
|
|
128
|
+
raise ValueError(f"reranking_threshold must be between 0 and 1, got: {reranking_threshold}")
|
|
129
|
+
logger.debug(f"Found reranking_threshold in query: {reranking_threshold}")
|
|
130
|
+
except (ValueError, TypeError) as e:
|
|
131
|
+
error_msg = f"Invalid reranking_threshold value: {item.value}. {str(e)}"
|
|
132
|
+
logger.error(error_msg)
|
|
133
|
+
raise ValueError(error_msg)
|
|
134
|
+
elif item.column == TableField.CONTENT.value:
|
|
135
|
+
query_text = item.value
|
|
136
|
+
|
|
137
|
+
# replace content with embeddings
|
|
138
|
+
conditions.append(FilterCondition(
|
|
139
|
+
column=TableField.EMBEDDINGS.value,
|
|
140
|
+
value=self._content_to_embeddings(item.value),
|
|
141
|
+
op=FilterOperator.EQUAL,
|
|
142
|
+
))
|
|
143
|
+
else:
|
|
144
|
+
conditions.append(item)
|
|
107
145
|
|
|
108
|
-
|
|
146
|
+
logger.debug(f"Extracted query text: {query_text}")
|
|
109
147
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
148
|
+
self.addapt_conditions_columns(conditions)
|
|
149
|
+
df = db_handler.dispatch_select(query, conditions)
|
|
150
|
+
df = self.addapt_result_columns(df)
|
|
151
|
+
|
|
152
|
+
logger.debug(f"Query returned {len(df)} rows")
|
|
153
|
+
logger.debug(f"Columns in response: {df.columns.tolist()}")
|
|
154
|
+
# Check if we have a rerank_model configured in KB params
|
|
155
|
+
|
|
156
|
+
df = self.add_relevance(df, query_text, reranking_threshold)
|
|
157
|
+
|
|
158
|
+
# filter by targets
|
|
159
|
+
if requested_kb_columns is not None:
|
|
160
|
+
df = df[requested_kb_columns]
|
|
161
|
+
return df
|
|
162
|
+
|
|
163
|
+
def add_relevance(self, df, query_text, reranking_threshold=None):
|
|
164
|
+
relevance_column = TableField.RELEVANCE.value
|
|
165
|
+
|
|
166
|
+
rerank_model = self._kb.params.get("rerank_model")
|
|
167
|
+
if rerank_model and query_text and len(df) > 0:
|
|
168
|
+
# Use reranker for relevance score
|
|
169
|
+
try:
|
|
170
|
+
logger.info(f"Using reranker model {rerank_model} for relevance calculation")
|
|
171
|
+
reranker_params = {"model": rerank_model}
|
|
172
|
+
# Apply custom filtering threshold if provided
|
|
173
|
+
if reranking_threshold is not None:
|
|
174
|
+
reranker_params["filtering_threshold"] = reranking_threshold
|
|
175
|
+
logger.info(f"Using custom filtering threshold: {reranking_threshold}")
|
|
176
|
+
|
|
177
|
+
reranker = LLMReranker(**reranker_params)
|
|
178
|
+
# Get documents to rerank
|
|
179
|
+
documents = df['chunk_content'].tolist()
|
|
180
|
+
# Use the get_scores method with disable_events=True
|
|
181
|
+
scores = reranker.get_scores(query_text, documents)
|
|
182
|
+
# Add scores as the relevance column
|
|
183
|
+
df[relevance_column] = scores
|
|
184
|
+
|
|
185
|
+
# Filter by threshold
|
|
186
|
+
scores_array = np.array(scores)
|
|
187
|
+
df = df[scores_array > reranker.filtering_threshold]
|
|
188
|
+
logger.debug(f"Applied reranking with model {rerank_model}, threshold: {reranker.filtering_threshold}")
|
|
189
|
+
except Exception as e:
|
|
190
|
+
logger.error(f"Error during reranking: {str(e)}")
|
|
191
|
+
# Fallback to distance-based relevance
|
|
192
|
+
if 'distance' in df.columns:
|
|
193
|
+
df[relevance_column] = 1 / (1 + df['distance'])
|
|
194
|
+
else:
|
|
195
|
+
logger.info("No distance or reranker available")
|
|
196
|
+
|
|
197
|
+
elif 'distance' in df.columns:
|
|
198
|
+
# Calculate relevance from distance
|
|
199
|
+
logger.info("Calculating relevance from vector distance")
|
|
200
|
+
df[relevance_column] = 1 / (1 + df['distance'])
|
|
117
201
|
|
|
202
|
+
else:
|
|
203
|
+
df[relevance_column] = None
|
|
204
|
+
df['distance'] = None
|
|
205
|
+
# Sort by relevance
|
|
206
|
+
df = df.sort_values(by=relevance_column, ascending=False)
|
|
118
207
|
return df
|
|
119
208
|
|
|
209
|
+
def addapt_conditions_columns(self, conditions):
|
|
210
|
+
if conditions is None:
|
|
211
|
+
return
|
|
212
|
+
for condition in conditions:
|
|
213
|
+
if condition.column in KB_TO_VECTORDB_COLUMNS:
|
|
214
|
+
condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
|
|
215
|
+
|
|
216
|
+
def addapt_result_columns(self, df):
|
|
217
|
+
col_update = {}
|
|
218
|
+
for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
|
|
219
|
+
if vec_col in df.columns:
|
|
220
|
+
col_update[vec_col] = kb_col
|
|
221
|
+
|
|
222
|
+
df = df.rename(columns=col_update)
|
|
223
|
+
|
|
224
|
+
columns = list(df.columns)
|
|
225
|
+
# update id, get from metadata
|
|
226
|
+
df[TableField.ID.value] = df[TableField.METADATA.value].apply(
|
|
227
|
+
lambda m: None if m is None else m.get('original_row_id')
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
# id on first place
|
|
231
|
+
return df[[TableField.ID.value] + columns]
|
|
232
|
+
|
|
120
233
|
def insert_files(self, file_names: List[str]):
|
|
121
234
|
"""Process and insert files"""
|
|
122
235
|
if not self.document_loader:
|
|
@@ -202,7 +315,9 @@ class KnowledgeBaseTable:
|
|
|
202
315
|
|
|
203
316
|
# send to vectordb
|
|
204
317
|
db_handler = self.get_vector_db()
|
|
205
|
-
db_handler.
|
|
318
|
+
conditions = db_handler.extract_conditions(query.where)
|
|
319
|
+
self.addapt_conditions_columns(conditions)
|
|
320
|
+
db_handler.dispatch_update(query, conditions)
|
|
206
321
|
|
|
207
322
|
def delete_query(self, query: Delete):
|
|
208
323
|
"""
|
|
@@ -217,7 +332,9 @@ class KnowledgeBaseTable:
|
|
|
217
332
|
|
|
218
333
|
# send to vectordb
|
|
219
334
|
db_handler = self.get_vector_db()
|
|
220
|
-
db_handler.
|
|
335
|
+
conditions = db_handler.extract_conditions(query.where)
|
|
336
|
+
self.addapt_conditions_columns(conditions)
|
|
337
|
+
db_handler.dispatch_delete(query, conditions)
|
|
221
338
|
|
|
222
339
|
def hybrid_search(
|
|
223
340
|
self,
|
|
@@ -92,9 +92,7 @@ class DocumentPreprocessor:
|
|
|
92
92
|
|
|
93
93
|
def _generate_chunk_id(
|
|
94
94
|
self,
|
|
95
|
-
content: str,
|
|
96
95
|
chunk_index: Optional[int] = None,
|
|
97
|
-
content_column: str = None,
|
|
98
96
|
provided_id: str = None,
|
|
99
97
|
) -> str:
|
|
100
98
|
"""Generate deterministic ID for a chunk"""
|
|
@@ -262,15 +260,8 @@ Please give a short succinct context to situate this chunk within the overall do
|
|
|
262
260
|
if doc.metadata:
|
|
263
261
|
metadata.update(doc.metadata)
|
|
264
262
|
|
|
265
|
-
# Pass through doc.id and content_column
|
|
266
|
-
content_column = (
|
|
267
|
-
doc.metadata.get("content_column") if doc.metadata else None
|
|
268
|
-
)
|
|
269
263
|
chunk_id = self._generate_chunk_id(
|
|
270
|
-
|
|
271
|
-
chunk_index,
|
|
272
|
-
content_column=content_column,
|
|
273
|
-
provided_id=doc.id,
|
|
264
|
+
chunk_index=chunk_index, provided_id=doc.id
|
|
274
265
|
)
|
|
275
266
|
processed_chunks.append(
|
|
276
267
|
ProcessedChunk(
|
|
@@ -335,7 +326,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
|
|
|
335
326
|
|
|
336
327
|
# Pass through doc.id and content_column
|
|
337
328
|
id = self._generate_chunk_id(
|
|
338
|
-
|
|
329
|
+
chunk_index=0, provided_id=doc.id
|
|
339
330
|
)
|
|
340
331
|
processed_chunks.append(
|
|
341
332
|
ProcessedChunk(
|
|
@@ -358,9 +349,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
|
|
|
358
349
|
|
|
359
350
|
# Pass through doc.id and content_column
|
|
360
351
|
chunk_id = self._generate_chunk_id(
|
|
361
|
-
|
|
362
|
-
i,
|
|
363
|
-
content_column=content_column,
|
|
352
|
+
chunk_index=i,
|
|
364
353
|
provided_id=doc.id,
|
|
365
354
|
)
|
|
366
355
|
processed_chunks.append(
|
|
@@ -156,10 +156,12 @@ class QueryContextController:
|
|
|
156
156
|
last_values = {}
|
|
157
157
|
for query, info in l_query.get_init_queries():
|
|
158
158
|
|
|
159
|
-
|
|
159
|
+
response = dn.query(
|
|
160
160
|
query=query,
|
|
161
161
|
session=session
|
|
162
162
|
)
|
|
163
|
+
data = response.data_frame
|
|
164
|
+
columns_info = response.columns
|
|
163
165
|
|
|
164
166
|
if len(data) == 0:
|
|
165
167
|
value = None
|
mindsdb/utilities/config.py
CHANGED
|
@@ -201,6 +201,14 @@ class Config:
|
|
|
201
201
|
"host": api_host,
|
|
202
202
|
"port": "55432",
|
|
203
203
|
"database": "mindsdb"
|
|
204
|
+
},
|
|
205
|
+
"mcp": {
|
|
206
|
+
"host": api_host,
|
|
207
|
+
"port": "47337",
|
|
208
|
+
"enabled": True,
|
|
209
|
+
"restart_on_failure": True,
|
|
210
|
+
"max_restart_count": 1,
|
|
211
|
+
"max_restart_interval_seconds": 60
|
|
204
212
|
}
|
|
205
213
|
},
|
|
206
214
|
"cache": {
|
mindsdb/utilities/starters.py
CHANGED
|
@@ -31,3 +31,10 @@ def start_ml_task_queue(*args, **kwargs):
|
|
|
31
31
|
def start_scheduler(*args, **kwargs):
|
|
32
32
|
from mindsdb.interfaces.jobs.scheduler import start
|
|
33
33
|
start(*args, **kwargs)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def start_mcp(*args, **kwargs):
|
|
37
|
+
"""Start the MCP server"""
|
|
38
|
+
from mindsdb.api.mcp.start import start
|
|
39
|
+
|
|
40
|
+
start(*args, **kwargs)
|