MindsDB 25.3.4.2__py3-none-any.whl → 25.4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (53)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +21 -4
  3. mindsdb/api/executor/command_executor.py +62 -61
  4. mindsdb/api/executor/data_types/answer.py +9 -12
  5. mindsdb/api/executor/datahub/classes/response.py +11 -0
  6. mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
  7. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
  8. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
  9. mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
  10. mindsdb/api/executor/planner/plan_join.py +1 -1
  11. mindsdb/api/executor/planner/steps.py +2 -1
  12. mindsdb/api/executor/sql_query/result_set.py +10 -7
  13. mindsdb/api/executor/sql_query/sql_query.py +36 -82
  14. mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
  15. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
  16. mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
  17. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
  18. mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
  19. mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
  20. mindsdb/api/http/namespaces/sql.py +4 -1
  21. mindsdb/api/mcp/__init__.py +0 -0
  22. mindsdb/api/mcp/start.py +152 -0
  23. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
  24. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
  25. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
  26. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
  27. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
  28. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
  29. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
  30. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
  31. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
  32. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
  33. mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
  34. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
  35. mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
  36. mindsdb/integrations/libs/response.py +9 -4
  37. mindsdb/integrations/libs/vectordatabase_handler.py +37 -25
  38. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +35 -15
  39. mindsdb/interfaces/database/log.py +8 -9
  40. mindsdb/interfaces/database/projects.py +16 -5
  41. mindsdb/interfaces/functions/controller.py +59 -17
  42. mindsdb/interfaces/functions/to_markdown.py +194 -0
  43. mindsdb/interfaces/jobs/jobs_controller.py +3 -3
  44. mindsdb/interfaces/knowledge_base/controller.py +143 -26
  45. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
  46. mindsdb/interfaces/query_context/context_controller.py +3 -1
  47. mindsdb/utilities/config.py +8 -0
  48. mindsdb/utilities/starters.py +7 -0
  49. {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +233 -231
  50. {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +53 -49
  51. {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
  52. {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
  53. {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/controller.py
@@ -4,6 +4,7 @@ from typing import Dict, List, Optional
4
4
 
5
5
  import pandas as pd
6
6
  import hashlib
7
+ import numpy as np
7
8
 
8
9
  from mindsdb_sql_parser.ast import (
9
10
  BinaryOperation,
@@ -34,12 +35,20 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import Preprocessing
34
35
  from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
35
36
  from mindsdb.interfaces.model.functions import PredictorRecordNotFound
36
37
  from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
38
+ from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
37
39
 
38
40
  from mindsdb.api.executor.command_executor import ExecuteCommands
39
41
  from mindsdb.utilities import log
42
+ from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker
40
43
 
41
44
  logger = log.getLogger(__name__)
42
45
 
46
+ KB_TO_VECTORDB_COLUMNS = {
47
+ 'id': 'original_row_id',
48
+ 'chunk_id': 'id',
49
+ 'chunk_content': 'content'
50
+ }
51
+
43
52
 
44
53
  class KnowledgeBaseTable:
45
54
  """
@@ -77,46 +86,150 @@ class KnowledgeBaseTable:
77
86
  """
78
87
  logger.debug(f"Processing select query: {query}")
79
88
 
80
- # replace content with embeddings
81
- query_traversal(query.where, self._replace_query_content)
82
- logger.debug("Replaced content with embeddings in where clause")
89
+ # Extract the content query text for potential reranking
90
+
91
+ db_handler = self.get_vector_db()
83
92
 
93
+ logger.debug("Replaced content with embeddings in where clause")
84
94
  # set table name
85
95
  query.from_table = Identifier(parts=[self._kb.vector_database_table])
86
96
  logger.debug(f"Set table name to: {self._kb.vector_database_table}")
87
97
 
88
- # remove embeddings from result
89
- targets = []
98
+ requested_kb_columns = []
90
99
  for target in query.targets:
91
100
  if isinstance(target, Star):
92
- targets.extend([
93
- Identifier(TableField.ID.value),
94
- Identifier(TableField.CONTENT.value),
95
- Identifier(TableField.METADATA.value),
96
- ])
97
- elif isinstance(target, Identifier) and target.parts[-1].lower() != TableField.EMBEDDINGS.value:
98
- targets.append(target)
99
- query.targets = targets
100
- logger.debug(f"Modified query targets: {targets}")
101
+ requested_kb_columns = None
102
+ break
103
+ else:
104
+ requested_kb_columns.append(target.parts[-1].lower())
105
+
106
+ query.targets = [
107
+ Identifier(TableField.ID.value),
108
+ Identifier(TableField.CONTENT.value),
109
+ Identifier(TableField.METADATA.value),
110
+ Identifier(TableField.DISTANCE.value),
111
+ ]
101
112
 
102
113
  # Get response from vector db
103
- db_handler = self.get_vector_db()
104
114
  logger.debug(f"Using vector db handler: {type(db_handler)}")
105
115
 
106
- df = db_handler.dispatch_select(query)
116
+ # extract values from conditions and prepare for vectordb
117
+ conditions = []
118
+ query_text = None
119
+ reranking_threshold = None
120
+ query_conditions = db_handler.extract_conditions(query.where)
121
+ if query_conditions is not None:
122
+ for item in query_conditions:
123
+ if item.column == "reranking_threshold" and item.op.value == "=":
124
+ try:
125
+ reranking_threshold = float(item.value)
126
+ # Validate range: must be between 0 and 1
127
+ if not (0 <= reranking_threshold <= 1):
128
+ raise ValueError(f"reranking_threshold must be between 0 and 1, got: {reranking_threshold}")
129
+ logger.debug(f"Found reranking_threshold in query: {reranking_threshold}")
130
+ except (ValueError, TypeError) as e:
131
+ error_msg = f"Invalid reranking_threshold value: {item.value}. {str(e)}"
132
+ logger.error(error_msg)
133
+ raise ValueError(error_msg)
134
+ elif item.column == TableField.CONTENT.value:
135
+ query_text = item.value
136
+
137
+ # replace content with embeddings
138
+ conditions.append(FilterCondition(
139
+ column=TableField.EMBEDDINGS.value,
140
+ value=self._content_to_embeddings(item.value),
141
+ op=FilterOperator.EQUAL,
142
+ ))
143
+ else:
144
+ conditions.append(item)
107
145
 
108
- if df is not None:
146
+ logger.debug(f"Extracted query text: {query_text}")
109
147
 
110
- logger.debug(f"Query returned {len(df)} rows")
111
- logger.debug(f"Columns in response: {df.columns.tolist()}")
112
- # Log a sample of IDs to help diagnose issues
113
- if not df.empty:
114
- logger.debug(f"Sample of IDs in response: {df['id'].head().tolist()}")
115
- else:
116
- logger.warning("Query returned no data")
148
+ self.addapt_conditions_columns(conditions)
149
+ df = db_handler.dispatch_select(query, conditions)
150
+ df = self.addapt_result_columns(df)
151
+
152
+ logger.debug(f"Query returned {len(df)} rows")
153
+ logger.debug(f"Columns in response: {df.columns.tolist()}")
154
+ # Check if we have a rerank_model configured in KB params
155
+
156
+ df = self.add_relevance(df, query_text, reranking_threshold)
157
+
158
+ # filter by targets
159
+ if requested_kb_columns is not None:
160
+ df = df[requested_kb_columns]
161
+ return df
162
+
163
+ def add_relevance(self, df, query_text, reranking_threshold=None):
164
+ relevance_column = TableField.RELEVANCE.value
165
+
166
+ rerank_model = self._kb.params.get("rerank_model")
167
+ if rerank_model and query_text and len(df) > 0:
168
+ # Use reranker for relevance score
169
+ try:
170
+ logger.info(f"Using reranker model {rerank_model} for relevance calculation")
171
+ reranker_params = {"model": rerank_model}
172
+ # Apply custom filtering threshold if provided
173
+ if reranking_threshold is not None:
174
+ reranker_params["filtering_threshold"] = reranking_threshold
175
+ logger.info(f"Using custom filtering threshold: {reranking_threshold}")
176
+
177
+ reranker = LLMReranker(**reranker_params)
178
+ # Get documents to rerank
179
+ documents = df['chunk_content'].tolist()
180
+ # Use the get_scores method with disable_events=True
181
+ scores = reranker.get_scores(query_text, documents)
182
+ # Add scores as the relevance column
183
+ df[relevance_column] = scores
184
+
185
+ # Filter by threshold
186
+ scores_array = np.array(scores)
187
+ df = df[scores_array > reranker.filtering_threshold]
188
+ logger.debug(f"Applied reranking with model {rerank_model}, threshold: {reranker.filtering_threshold}")
189
+ except Exception as e:
190
+ logger.error(f"Error during reranking: {str(e)}")
191
+ # Fallback to distance-based relevance
192
+ if 'distance' in df.columns:
193
+ df[relevance_column] = 1 / (1 + df['distance'])
194
+ else:
195
+ logger.info("No distance or reranker available")
196
+
197
+ elif 'distance' in df.columns:
198
+ # Calculate relevance from distance
199
+ logger.info("Calculating relevance from vector distance")
200
+ df[relevance_column] = 1 / (1 + df['distance'])
117
201
 
202
+ else:
203
+ df[relevance_column] = None
204
+ df['distance'] = None
205
+ # Sort by relevance
206
+ df = df.sort_values(by=relevance_column, ascending=False)
118
207
  return df
119
208
 
209
+ def addapt_conditions_columns(self, conditions):
210
+ if conditions is None:
211
+ return
212
+ for condition in conditions:
213
+ if condition.column in KB_TO_VECTORDB_COLUMNS:
214
+ condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
215
+
216
+ def addapt_result_columns(self, df):
217
+ col_update = {}
218
+ for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
219
+ if vec_col in df.columns:
220
+ col_update[vec_col] = kb_col
221
+
222
+ df = df.rename(columns=col_update)
223
+
224
+ columns = list(df.columns)
225
+ # update id, get from metadata
226
+ df[TableField.ID.value] = df[TableField.METADATA.value].apply(
227
+ lambda m: None if m is None else m.get('original_row_id')
228
+ )
229
+
230
+ # id on first place
231
+ return df[[TableField.ID.value] + columns]
232
+
120
233
  def insert_files(self, file_names: List[str]):
121
234
  """Process and insert files"""
122
235
  if not self.document_loader:
@@ -202,7 +315,9 @@ class KnowledgeBaseTable:
202
315
 
203
316
  # send to vectordb
204
317
  db_handler = self.get_vector_db()
205
- db_handler.query(query)
318
+ conditions = db_handler.extract_conditions(query.where)
319
+ self.addapt_conditions_columns(conditions)
320
+ db_handler.dispatch_update(query, conditions)
206
321
 
207
322
  def delete_query(self, query: Delete):
208
323
  """
@@ -217,7 +332,9 @@ class KnowledgeBaseTable:
217
332
 
218
333
  # send to vectordb
219
334
  db_handler = self.get_vector_db()
220
- db_handler.dispatch_delete(query)
335
+ conditions = db_handler.extract_conditions(query.where)
336
+ self.addapt_conditions_columns(conditions)
337
+ db_handler.dispatch_delete(query, conditions)
221
338
 
222
339
  def hybrid_search(
223
340
  self,
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py
@@ -92,9 +92,7 @@ class DocumentPreprocessor:
92
92
 
93
93
  def _generate_chunk_id(
94
94
  self,
95
- content: str,
96
95
  chunk_index: Optional[int] = None,
97
- content_column: str = None,
98
96
  provided_id: str = None,
99
97
  ) -> str:
100
98
  """Generate deterministic ID for a chunk"""
@@ -262,15 +260,8 @@ Please give a short succinct context to situate this chunk within the overall do
262
260
  if doc.metadata:
263
261
  metadata.update(doc.metadata)
264
262
 
265
- # Pass through doc.id and content_column
266
- content_column = (
267
- doc.metadata.get("content_column") if doc.metadata else None
268
- )
269
263
  chunk_id = self._generate_chunk_id(
270
- processed_content,
271
- chunk_index,
272
- content_column=content_column,
273
- provided_id=doc.id,
264
+ chunk_index=chunk_index, provided_id=doc.id
274
265
  )
275
266
  processed_chunks.append(
276
267
  ProcessedChunk(
@@ -335,7 +326,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
335
326
 
336
327
  # Pass through doc.id and content_column
337
328
  id = self._generate_chunk_id(
338
- chunk_doc.content, content_column=content_column, provided_id=doc.id
329
+ chunk_index=0, provided_id=doc.id
339
330
  )
340
331
  processed_chunks.append(
341
332
  ProcessedChunk(
@@ -358,9 +349,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
358
349
 
359
350
  # Pass through doc.id and content_column
360
351
  chunk_id = self._generate_chunk_id(
361
- chunk_doc.content,
362
- i,
363
- content_column=content_column,
352
+ chunk_index=i,
364
353
  provided_id=doc.id,
365
354
  )
366
355
  processed_chunks.append(
mindsdb/interfaces/query_context/context_controller.py
@@ -156,10 +156,12 @@ class QueryContextController:
156
156
  last_values = {}
157
157
  for query, info in l_query.get_init_queries():
158
158
 
159
- data, columns_info = dn.query(
159
+ response = dn.query(
160
160
  query=query,
161
161
  session=session
162
162
  )
163
+ data = response.data_frame
164
+ columns_info = response.columns
163
165
 
164
166
  if len(data) == 0:
165
167
  value = None
mindsdb/utilities/config.py
@@ -201,6 +201,14 @@ class Config:
201
201
  "host": api_host,
202
202
  "port": "55432",
203
203
  "database": "mindsdb"
204
+ },
205
+ "mcp": {
206
+ "host": api_host,
207
+ "port": "47337",
208
+ "enabled": True,
209
+ "restart_on_failure": True,
210
+ "max_restart_count": 1,
211
+ "max_restart_interval_seconds": 60
204
212
  }
205
213
  },
206
214
  "cache": {
mindsdb/utilities/starters.py
@@ -31,3 +31,10 @@ def start_ml_task_queue(*args, **kwargs):
31
31
  def start_scheduler(*args, **kwargs):
32
32
  from mindsdb.interfaces.jobs.scheduler import start
33
33
  start(*args, **kwargs)
34
+
35
+
36
+ def start_mcp(*args, **kwargs):
37
+ """Start the MCP server"""
38
+ from mindsdb.api.mcp.start import start
39
+
40
+ start(*args, **kwargs)