MindsDB 25.6.3.1__py3-none-any.whl → 25.7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +8 -6
  3. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +72 -44
  4. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +14 -1
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
  6. mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
  7. mindsdb/api/executor/planner/plan_join.py +1 -1
  8. mindsdb/api/executor/planner/query_planner.py +7 -1
  9. mindsdb/api/executor/planner/query_prepare.py +68 -87
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  11. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  12. mindsdb/api/http/namespaces/file.py +49 -24
  13. mindsdb/api/mcp/start.py +45 -31
  14. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  15. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  16. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  17. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  18. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  19. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  20. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  21. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  22. mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
  23. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
  24. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  25. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
  26. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  27. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  28. mindsdb/integrations/libs/api_handler.py +6 -7
  29. mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
  30. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  31. mindsdb/interfaces/agents/agents_controller.py +29 -9
  32. mindsdb/interfaces/agents/constants.py +44 -0
  33. mindsdb/interfaces/agents/langchain_agent.py +15 -6
  34. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  35. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  36. mindsdb/interfaces/data_catalog/data_catalog_reader.py +22 -3
  37. mindsdb/interfaces/knowledge_base/controller.py +121 -102
  38. mindsdb/interfaces/knowledge_base/evaluate.py +19 -7
  39. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  40. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  41. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  42. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  43. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +26 -22
  44. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
  45. mindsdb/interfaces/skills/skill_tool.py +91 -88
  46. mindsdb/interfaces/skills/sql_agent.py +181 -130
  47. mindsdb/interfaces/storage/db.py +9 -7
  48. mindsdb/utilities/config.py +12 -1
  49. mindsdb/utilities/exception.py +47 -7
  50. mindsdb/utilities/security.py +54 -11
  51. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +239 -251
  52. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +55 -54
  53. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
  54. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
  55. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/agents/mcp_client_agent.py

@@ -71,11 +71,11 @@ class MCPLangchainAgent(LangchainAgent):
         self,
         agent: db.Agents,
         model: dict = None,
-        params: dict = None,
+        llm_params: dict = None,
         mcp_host: str = "127.0.0.1",
         mcp_port: int = 47337,
     ):
-        super().__init__(agent, model, params)
+        super().__init__(agent, model, llm_params)
         self.mcp_host = mcp_host
         self.mcp_port = mcp_port
         self.exit_stack = AsyncExitStack()
@@ -251,10 +251,10 @@ def create_mcp_agent(
         raise ValueError(f"Agent {agent_name} not found in project {project_name}")

     # Get merged parameters (defaults + agent params)
-    merged_params = agent_controller.get_agent_llm_params(agent_db.params)
+    llm_params = agent_controller.get_agent_llm_params(agent_db.params)

     # Create MCP agent with merged parameters
-    mcp_agent = MCPLangchainAgent(agent_db, params=merged_params, mcp_host=mcp_host, mcp_port=mcp_port)
+    mcp_agent = MCPLangchainAgent(agent_db, llm_params=llm_params, mcp_host=mcp_host, mcp_port=mcp_port)

     # Wrap for LiteLLM compatibility
     return LiteLLMAgentWrapper(mcp_agent)
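
The rename is a breaking keyword change for any code that constructs MCPLangchainAgent directly. A minimal before/after sketch (agent_db is whatever db.Agents record the caller already holds):

    # before 25.7.1.0:
    #   mcp_agent = MCPLangchainAgent(agent_db, params=merged_params)
    # from 25.7.1.0 on:
    mcp_agent = MCPLangchainAgent(agent_db, llm_params=llm_params, mcp_host="127.0.0.1", mcp_port=47337)
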
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -96,27 +96,7 @@ class MindsDBSQL(SQLDatabase):
             # Log the query for debugging
             logger.info(f"Executing SQL query: {command}")

-            # Removing backticks causes in query execution.
-            # remove backticks
-            # command = command.replace('`', '')
-
-            # Parse the SQL string to an AST object first
-            from mindsdb_sql_parser import parse_sql
-
-            ast_query = parse_sql(command)
-
-            # Now execute the parsed query
-            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
-                ast_query, database_name="mindsdb"
-            )
-
-            # Convert ExecuteAnswer to a DataFrame for easier manipulation
-            if result.data is not None:
-                df = result.data.to_df()
-                return df.to_string(index=False)
-
-            else:
-                return "Query executed successfully, but returned no data."
+            return self._sql_agent.query(command)

         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
@@ -127,28 +107,6 @@ class MindsDBSQL(SQLDatabase):
                 return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
             return f"Error: {str(e)}"

-    # def run_no_throw(self, command: str, fetch: str = "all") -> str:
-    #     """Execute a SQL command and return the result as a string.
-    #
-    #     This method catches any exceptions and returns an error message instead of raising an exception.
-    #
-    #     Args:
-    #         command: The SQL command to execute
-    #         fetch: Whether to fetch 'all' results or just 'one'
-    #
-    #     Returns:
-    #         A string representation of the result or an error message
-    #     """
-    #     command = extract_essential(command)
-    #     try:
-    #         return self._sql_agent.query_safe(command)
-    #     except Exception as e:
-    #         logger.error(f"Error executing SQL command: {str(e)}")
-    #         # If this is a knowledge base query, provide a more helpful error message
-    #         if "knowledge_base" in command.lower() or any(kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()):
-    #             return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
-    #         return f"Error: {str(e)}"
-
     def get_usable_knowledge_base_names(self) -> List[str]:
         """Get a list of usable knowledge base names.
@@ -160,3 +118,12 @@ class MindsDBSQL(SQLDatabase):
         except Exception as e:
             logger.error(f"Error getting usable knowledge base names: {str(e)}")
             return []
+
+    def check_knowledge_base_permission(self, name):
+        """Get a list of usable knowledge base names.
+
+        Returns:
+            A list of knowledge base names that can be used in queries
+        """
+
+        return self._sql_agent.check_knowledge_base_permission(name)
mindsdb/interfaces/data_catalog/data_catalog_reader.py

@@ -11,8 +11,6 @@ class DataCatalogReader(BaseDataCatalog):
         """
         Read the metadata from the data catalog and return it as a string.
        """
-        if not self.is_data_catalog_supported():
-            return f"Data catalog is not supported for database '{self.database_name}'."
         tables = self._read_metadata()
         if not tables:
             self.logger.warning(f"No metadata found for database '{self.database_name}'")
@@ -20,16 +18,37 @@ class DataCatalogReader(BaseDataCatalog):

         metadata_str = "Data Catalog: \n"
         if hasattr(self.data_handler, "meta_get_handler_info"):
-            metadata_str += self.data_handler.meta_get_handler_info() + "\n\n"
+            info = self.data_handler.meta_get_handler_info()
+            if info:
+                metadata_str += info + "\n\n"

         for table in tables:
             metadata_str += table.as_string() + "\n\n"
         return metadata_str

+    def read_metadata_as_records(self) -> list:
+        """
+        Read the metadata from the data catalog and return it as a list of database records.
+        """
+        tables = self._read_metadata()
+        if not tables:
+            self.logger.warning(f"No metadata found for database '{self.database_name}'")
+            return []
+        return tables
+
+    def get_handler_info(self) -> str:
+        """
+        Get the handler info for the database.
+        """
+        return self.data_handler.meta_get_handler_info()
+
     def _read_metadata(self) -> list:
         """
         Read the metadata from the data catalog and return it in a structured format.
         """
+        if not self.is_data_catalog_supported():
+            return f"Data catalog is not supported for database '{self.database_name}'."
+
         query = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id)
         if self.table_names:
             cleaned_table_names = [name.strip("`").split(".")[-1] for name in self.table_names]
mindsdb/interfaces/knowledge_base/controller.py

@@ -6,6 +6,7 @@ import decimal

 import pandas as pd
 import numpy as np
+from sqlalchemy.orm.attributes import flag_modified

 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
@@ -33,6 +34,7 @@ from mindsdb.interfaces.variables.variables_controller import variables_controller
 from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
+from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
@@ -46,25 +48,21 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker

 logger = log.getLogger(__name__)

-KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
-

 def get_model_params(model_params: dict, default_config_key: str):
     """
     Get model parameters by combining default config with user provided parameters.
     """
-    # If the default config key is for reranking and the switch to use the default LLM is enabled,
-    # switch to the default LLM model.
-    if default_config_key == "default_reranking_model" and config.get("default_reranking_model").get(
-        "use_default_llm", False
-    ):
-        default_config_key = "default_llm_model"
-
     combined_model_params = copy.deepcopy(config.get(default_config_key, {}))

     if model_params:
+        if not isinstance(model_params, dict):
+            raise ValueError("Model parameters must be passed as a JSON object")
+
         combined_model_params.update(model_params)

+    combined_model_params.pop("use_default_llm", None)
+
     return combined_model_params
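
The merge order matters here: defaults from config seed the dict, user parameters overwrite key by key, and the internal use_default_llm switch is always dropped so it never reaches a model constructor. A standalone sketch of the same semantics (the config values are illustrative, not MindsDB defaults):

    import copy

    defaults = {"provider": "openai", "model_name": "gpt-4o", "use_default_llm": True}  # illustrative
    user_params = {"model_name": "gpt-4o-mini"}

    combined = copy.deepcopy(defaults)
    if user_params:
        if not isinstance(user_params, dict):
            raise ValueError("Model parameters must be passed as a JSON object")
        combined.update(user_params)          # user values win
    combined.pop("use_default_llm", None)     # internal switch never leaks out

    assert combined == {"provider": "openai", "model_name": "gpt-4o-mini"}
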
@@ -105,8 +103,6 @@ def get_reranking_model_from_params(reranking_model_params: dict):
     params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
     params_copy["model"] = params_copy.pop("model_name", None)

-    params_copy.pop("use_default_llm", None)
-
     return BaseLLMReranker(**params_copy)
@@ -144,23 +140,29 @@ class KnowledgeBaseTable:
         self.document_loader = None
         self.model_params = None

+        self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+        if self._kb.params.get("version", 0) < 2:
+            self.kb_to_vector_columns["id"] = "original_doc_id"
+
     def configure_preprocessing(self, config: Optional[dict] = None):
         """Configure preprocessing for the knowledge base table"""
         logger.debug(f"Configuring preprocessing with config: {config}")
         self.document_preprocessor = None  # Reset existing preprocessor
-        if config is not None:
-            # Ensure content_column is set for JSON chunking if not already specified
-            if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
-                if "content_column" not in config["json_chunking_config"]:
-                    config["json_chunking_config"]["content_column"] = "content"
-
-            preprocessing_config = PreprocessingConfig(**config)
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
-            logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")
-        else:
-            # Always create a default preprocessor if none specified
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor()
-            logger.debug("Created default preprocessor")
+        if config is None:
+            config = {}
+
+        # Ensure content_column is set for JSON chunking if not already specified
+        if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
+            if "content_column" not in config["json_chunking_config"]:
+                config["json_chunking_config"]["content_column"] = "content"
+
+        preprocessing_config = PreprocessingConfig(**config)
+        self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
+
+        # set doc_id column name
+        self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
+
+        logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")

     def select_query(self, query: Select) -> pd.DataFrame:
         """
@@ -169,6 +171,30 @@ class KnowledgeBaseTable:
         :param query: query to KB table
         :return: dataframe with the result table
         """
+
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
+
+        executor = KnowledgeBaseQueryExecutor(self)
+        df = executor.run(query)
+
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            if "metadata" in df.columns:
+                df["metadata"] = df["metadata"].apply(to_json)
+
+            df = query_df(df, query_copy, session=self.session)
+
+        return df
+
+    def select(self, query, disable_reranking=False):
         logger.debug(f"Processing select query: {query}")

         # Extract the content query text for potential reranking
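
select_query is now a thin entry point: the new KnowledgeBaseQueryExecutor (added in mindsdb/interfaces/knowledge_base/executor.py in this release) performs the vector-store fetch, and only queries needing SQL semantics the store cannot supply (GROUP BY, ORDER BY, HAVING, DISTINCT, or non-star targets) take a second pass through query_df over the returned dataframe. For instance, a grouped query parsed like this would trigger the post-pass (the KB name is illustrative):

    from mindsdb_sql_parser import parse_sql

    # Grouped, non-star targets -> executor.run() fetches, then query_df() applies the GROUP BY
    ast_query = parse_sql("SELECT metadata, COUNT(*) FROM my_kb WHERE content = 'pricing' GROUP BY metadata")
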
@@ -180,9 +206,6 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
-        query_copy = copy.deepcopy(query)
-
         query.targets = [
             Identifier(TableField.ID.value),
             Identifier(TableField.CONTENT.value),
@@ -197,7 +220,6 @@ class KnowledgeBaseTable:
         conditions = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:
@@ -213,10 +235,9 @@ class KnowledgeBaseTable:
                         logger.error(error_msg)
                         raise ValueError(error_msg)
                 elif item.column == "reranking":
-                    reranking_enabled_flag = item.value
-                    # cast to boolean
-                    if isinstance(reranking_enabled_flag, str):
-                        reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
+
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -248,66 +269,59 @@ class KnowledgeBaseTable:
             limit = 100
             query.limit = Constant(limit)

-        df = db_handler.dispatch_select(query, conditions)
+        allowed_metadata_columns = self._get_allowed_metadata_columns()
+        df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
         df = self.addapt_result_columns(df)

         logger.debug(f"Query returned {len(df)} rows")
         logger.debug(f"Columns in response: {df.columns.tolist()}")
         # Check if we have a rerank_model configured in KB params
-        df = self.add_relevance(df, query_text, relevance_threshold, reranking_enabled_flag)
+        df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)

-        if (
-            query.group_by is not None
-            or query.order_by is not None
-            or query.having is not None
-            or query.distinct is True
-            or len(query.targets) != 1
-            or not isinstance(query.targets[0], Star)
-        ):
-            query_copy.where = None
-            if "metadata" in df.columns:
-                df["metadata"] = df["metadata"].apply(to_json)
+        return df

-        df = query_df(df, query_copy, session=self.session)
+    def _get_allowed_metadata_columns(self) -> List[str] | None:
+        # Return list of KB columns to restrict querying, if None: no restrictions

-        return df
+        if self._kb.params.get("version", 0) < 2:
+            # disable for old version KBs
+            return None
+
+        user_columns = self._kb.params.get("metadata_columns", [])
+        dynamic_columns = self._kb.params.get("inserted_metadata", [])
+
+        columns = set(user_columns) | set(dynamic_columns)
+        return [col.lower() for col in columns]

     def score_documents(self, query_text, documents, reranking_model_params):
         reranker = get_reranking_model_from_params(reranking_model_params)
         return reranker.get_scores(query_text, documents)

-    def add_relevance(self, df, query_text, relevance_threshold=None, reranking_enabled_flag=True):
+    def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
         relevance_column = TableField.RELEVANCE.value

         reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
-        if reranking_model_params and query_text and len(df) > 0 and reranking_enabled_flag:
+        if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
-            try:
-                logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
-                # Apply custom filtering threshold if provided
-                if relevance_threshold is not None:
-                    reranking_model_params["filtering_threshold"] = relevance_threshold
-                    logger.info(f"Using custom filtering threshold: {relevance_threshold}")
-
-                reranker = get_reranking_model_from_params(reranking_model_params)
-                # Get documents to rerank
-                documents = df["chunk_content"].tolist()
-                # Use the get_scores method with disable_events=True
-                scores = reranker.get_scores(query_text, documents)
-                # Add scores as the relevance column
-                df[relevance_column] = scores
-
-                # Filter by threshold
-                scores_array = np.array(scores)
-                df = df[scores_array > reranker.filtering_threshold]
-                logger.debug(f"Applied reranking with params: {reranking_model_params}")
-            except Exception as e:
-                logger.error(f"Error during reranking: {str(e)}")
-                # Fallback to distance-based relevance
-                if "distance" in df.columns:
-                    df[relevance_column] = 1 / (1 + df["distance"])
-                else:
-                    logger.info("No distance or reranker available")
+
+            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
+            # Apply custom filtering threshold if provided
+            if relevance_threshold is not None:
+                reranking_model_params["filtering_threshold"] = relevance_threshold
+                logger.info(f"Using custom filtering threshold: {relevance_threshold}")
+
+            reranker = get_reranking_model_from_params(reranking_model_params)
+            # Get documents to rerank
+            documents = df["chunk_content"].tolist()
+            # Use the get_scores method with disable_events=True
+            scores = reranker.get_scores(query_text, documents)
+            # Add scores as the relevance column
+            df[relevance_column] = scores
+
+            # Filter by threshold
+            scores_array = np.array(scores)
+            df = df[scores_array > reranker.filtering_threshold]
+            logger.debug(f"Applied reranking with params: {reranking_model_params}")

         elif "distance" in df.columns:
             # Calculate relevance from distance
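
The new allow-list is the union of the metadata columns declared at creation (metadata_columns) and every column actually seen during inserts (inserted_metadata), lowercased; None, returned for pre-version-2 KBs, means unrestricted. The set arithmetic in isolation:

    def allowed_metadata_columns(kb_params: dict):
        if kb_params.get("version", 0) < 2:
            return None  # legacy KBs: no restriction
        columns = set(kb_params.get("metadata_columns", [])) | set(kb_params.get("inserted_metadata", []))
        return [col.lower() for col in columns]

    cols = allowed_metadata_columns({"version": 2, "metadata_columns": ["Author"], "inserted_metadata": ["year"]})
    assert sorted(cols) == ["author", "year"]
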
@@ -327,12 +341,12 @@ class KnowledgeBaseTable:
         if conditions is None:
             return
         for condition in conditions:
-            if condition.column in KB_TO_VECTORDB_COLUMNS:
-                condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
+            if condition.column in self.kb_to_vector_columns:
+                condition.column = self.kb_to_vector_columns[condition.column]

     def addapt_result_columns(self, df):
         col_update = {}
-        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
+        for kb_col, vec_col in self.kb_to_vector_columns.items():
             if vec_col in df.columns:
                 col_update[vec_col] = kb_col
@@ -341,7 +355,7 @@ class KnowledgeBaseTable:
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get("original_doc_id")
+            lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
         )

         # id on first place
@@ -528,8 +542,8 @@ class KnowledgeBaseTable:

                 metadata = {
                     **base_metadata,
-                    "original_row_index": str(idx),  # provide link to original row index
-                    "content_column": col,
+                    "_original_row_index": str(idx),  # provide link to original row index
+                    "_content_column": col,
                 }

                 raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
@@ -624,16 +638,22 @@ class KnowledgeBaseTable:
             metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
             logger.debug(f"Mapped metadata columns: {metadata_columns}")

-        if content_columns is not None:
-            content_columns = list(set(content_columns).intersection(columns))
-            if len(content_columns) == 0:
-                raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
+        content_columns = list(set(content_columns).intersection(columns))
+        if len(content_columns) == 0:
+            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")

-            if metadata_columns is not None:
-                metadata_columns = list(set(metadata_columns).intersection(columns))
-            else:
-                # all the rest columns
-                metadata_columns = list(set(columns).difference(content_columns))
+        if metadata_columns is not None:
+            metadata_columns = list(set(metadata_columns).intersection(columns))
+        else:
+            # all the rest columns
+            metadata_columns = list(set(columns).difference(content_columns))
+
+        # update list of used columns
+        inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
+        inserted_metadata.update(metadata_columns)
+        self._kb.params["inserted_metadata"] = list(inserted_metadata)
+        flag_modified(self._kb, "params")
+        db.session.commit()

         # Add content columns directly (don't combine them)
         for col in content_columns:
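
The flag_modified call (imported above from sqlalchemy.orm.attributes) is what makes this persist: SQLAlchemy does not detect in-place mutation of a JSON column's dict, so without it the commit would write nothing back. The pattern in isolation (the query is a hypothetical way to obtain a record):

    from sqlalchemy.orm.attributes import flag_modified

    kb = db.session.query(db.KnowledgeBase).first()      # hypothetical: any persistent record
    kb.params["inserted_metadata"] = ["author", "year"]  # in-place mutation, invisible to the ORM
    flag_modified(kb, "params")                          # mark the attribute dirty explicitly
    db.session.commit()
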
@@ -659,7 +679,7 @@ class KnowledgeBaseTable:
             elif isinstance(value, dict):
                 metadata.update(value)
                 continue
-            else:
+            elif value is not None:
                 value = str(value)
             metadata[col] = value
         return metadata
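
A small but user-visible fix: previously a missing value was stringified, so NULLs landed in metadata as the literal string "None"; now None passes through unchanged. Distilled:

    def coerce(value):
        if value is not None:
            value = str(value)  # stringify real scalars only
        return value            # None stays None instead of becoming "None"

    assert coerce(3) == "3"
    assert coerce(None) is None
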
@@ -766,15 +786,10 @@ class KnowledgeBaseTable:
         llm_model = args.pop("model_name")
         engine = args.pop("provider")

-        llm_model = f"{engine}/{llm_model}"
-
-        if "base_url" in args:
-            args["api_base"] = args.pop("base_url")
-
         module = session.integration_controller.get_handler_module("litellm")
         if module is None or module.Handler is None:
             raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
-        return module.Handler.embeddings(llm_model, messages, args)
+        return module.Handler.embeddings(engine, llm_model, messages, args)

     def build_rag_pipeline(self, retrieval_config: dict):
         """
@@ -896,6 +911,8 @@ class KnowledgeBaseController:
     manages knowledge bases
     """

+    KB_VERSION = 2
+
     def __init__(self, session) -> None:
         self.session = session
@@ -907,6 +924,7 @@ class KnowledgeBaseController:
         params: dict,
         preprocessing_config: Optional[dict] = None,
         if_not_exists: bool = False,
+        keyword_search_enabled: bool = False,
         # embedding_model: Identifier = None,  # Legacy: Allow MindsDB models to be passed as embedding_model.
     ) -> db.KnowledgeBase:
         """
@@ -961,10 +979,7 @@ class KnowledgeBaseController:
         # # it is params for model
         # embedding_params.update(params["embedding_model"])

-        if "embedding_model" in params:
-            if not isinstance(params["embedding_model"], dict):
-                raise ValueError("embedding_model should be JSON object with model parameters.")
-            embedding_params.update(params["embedding_model"])
+        embedding_params = get_model_params(params.get("embedding_model", {}), "default_embedding_model")

         # if model_name is None:  # Legacy
         model_name = self._create_embedding_model(
@@ -1023,7 +1038,10 @@ class KnowledgeBaseController:
         vector_db_name, vector_table_name = storage.parts

         # create table in vectordb before creating KB
-        self.session.datahub.get(vector_db_name).integration_handler.create_table(vector_table_name)
+        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
+        vector_store_handler.create_table(vector_table_name)
+        if keyword_search_enabled:
+            vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
         vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]

         # Store sparse vector settings in params if specified
@@ -1033,6 +1051,7 @@ class KnowledgeBaseController:
         if vector_size is not None:
             params["vector_config"]["vector_size"] = vector_size

+        params["version"] = self.KB_VERSION
         kb = db.KnowledgeBase(
             name=name,
             project_id=project_id,
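
Taken together with KB_VERSION = 2 above, every knowledge base created by this release is stamped with params["version"] = 2, the flag the table-side code uses to pick the underscore-prefixed column mapping and the metadata allow-list, while keyword_search_enabled optionally adds a full-text index over the content column. A compact, illustrative view of the creation-time wiring (database and table names are made up):

    handler = session.datahub.get("my_vector_db").integration_handler  # hypothetical store
    handler.create_table("kb_storage")
    if keyword_search_enabled:
        # new in this release: optional full-text index for keyword search
        handler.add_full_text_index("kb_storage", TableField.CONTENT.value)
    params["version"] = KnowledgeBaseController.KB_VERSION  # stamps the KB as version 2
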
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -7,7 +7,7 @@ import pandas as pd
 import datetime as dt

 from mindsdb.api.executor.sql_query.result_set import ResultSet
-from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log

 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
@@ -130,6 +130,8 @@ class EvaluateBase:
         integration_name = table_name.parts[0]
         table_name = Identifier(parts=table_name.parts[1:])
         dn = self.session.datahub.get(integration_name)
+        if dn is None:
+            raise ValueError(f"Can't find database: {integration_name}")
         return dn, table_name

     def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
@@ -168,13 +170,13 @@ class EvaluateBase:
             test_data = self.generate_test_data(gen_params)

             self.save_to_table(test_table, test_data, is_replace=True)
-        else:
-            test_data = self.read_from_table(test_table)

         if params.get("evaluate", True) is False:
             # no evaluate is required
             return pd.DataFrame()

+        test_data = self.read_from_table(test_table)
+
         scores = self.evaluate(test_data)
         scores["name"] = self.name
         scores["created_at"] = dt.datetime.now()
@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):

             start_time = time.time()
             logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
-            df_answers = self.kb.select_query(Select(targets=[Identifier("chunk_content")], limit=Constant(self.TOP_K)))
+            df_answers = self.kb.select_query(
+                Select(
+                    targets=[Identifier("chunk_content")],
+                    where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                    limit=Constant(self.TOP_K),
+                )
+            )
             query_time = time.time() - start_time

             proposed_responses = list(df_answers["chunk_content"])
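
This fixes what the evaluators actually measure: the old unfiltered SELECT ... LIMIT k returned arbitrary chunks, while the new WHERE content = <question> predicate performs a real semantic search per question. The hand-built AST corresponds roughly to parsing this query (KB name illustrative):

    from mindsdb_sql_parser import parse_sql

    # Approximately the AST the evaluator now constructs by hand:
    ast_query = parse_sql("SELECT chunk_content FROM my_kb WHERE content = 'What does the policy cover?' LIMIT 10")
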
@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
     Checks if ID in response from KB is matched with doc ID in test dataset
     """

-    TOP_K = 100
+    TOP_K = 20

     def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
         if "id" not in sampled_df.columns:
@@ -462,7 +470,11 @@ class EvaluateDocID(EvaluateBase):
             start_time = time.time()
             logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
             df_answers = self.kb.select_query(
-                Select(targets=[Identifier("chunk_content"), Identifier("id")], limit=Constant(self.TOP_K))
+                Select(
+                    targets=[Identifier("chunk_content"), Identifier("id")],
+                    where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                    limit=Constant(self.TOP_K),
+                )
             )
             query_time = time.time() - start_time
@@ -511,6 +523,6 @@ class EvaluateDocID(EvaluateBase):
                 "total": total_questions,
                 "total_found": total_found,
                 "retrieved_in_top_10": accurate_in_top_10,
-                "cumulative_recall": cumulative_recall,
+                "cumulative_recall": json.dumps(cumulative_recall),
                 "avg_query_time": avg_query_time,
             }
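
Serializing cumulative_recall keeps the scores row flat: a nested Python object in a dataframe cell would otherwise be written back to the results table as a repr, or fail outright depending on the target store. Illustration (values made up):

    import json

    cumulative_recall = {"1": 0.42, "5": 0.71, "10": 0.83}  # illustrative per-rank recall
    row = {"total": 100, "cumulative_recall": json.dumps(cumulative_recall)}
    assert isinstance(row["cumulative_recall"], str)  # plain string, safe for any table cell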