MindsDB 25.6.4.0__py3-none-any.whl → 25.7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic.

Files changed (46)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +8 -6
  3. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  4. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  5. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  6. mindsdb/api/executor/planner/query_prepare.py +68 -87
  7. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  8. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  9. mindsdb/api/http/namespaces/file.py +49 -24
  10. mindsdb/api/mcp/start.py +45 -31
  11. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  12. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  13. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  14. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  15. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  16. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  17. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  18. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  19. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
  20. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  21. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  22. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  23. mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
  24. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  25. mindsdb/interfaces/agents/agents_controller.py +29 -9
  26. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  27. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  28. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  29. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  30. mindsdb/interfaces/knowledge_base/controller.py +115 -89
  31. mindsdb/interfaces/knowledge_base/evaluate.py +16 -4
  32. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  33. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  34. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  35. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  36. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  37. mindsdb/interfaces/skills/sql_agent.py +181 -130
  38. mindsdb/interfaces/storage/db.py +9 -7
  39. mindsdb/utilities/config.py +12 -1
  40. mindsdb/utilities/exception.py +47 -7
  41. mindsdb/utilities/security.py +54 -11
  42. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +248 -262
  43. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +46 -45
  44. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
  45. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
  46. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -96,27 +96,7 @@ class MindsDBSQL(SQLDatabase):
             # Log the query for debugging
             logger.info(f"Executing SQL query: {command}")

-            # Removing backticks causes in query execution.
-            # remove backticks
-            # command = command.replace('`', '')
-
-            # Parse the SQL string to an AST object first
-            from mindsdb_sql_parser import parse_sql
-
-            ast_query = parse_sql(command)
-
-            # Now execute the parsed query
-            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
-                ast_query, database_name="mindsdb"
-            )
-
-            # Convert ExecuteAnswer to a DataFrame for easier manipulation
-            if result.data is not None:
-                df = result.data.to_df()
-                return df.to_string(index=False)
-
-            else:
-                return "Query executed successfully, but returned no data."
+            return self._sql_agent.query(command)

         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
@@ -127,28 +107,6 @@ class MindsDBSQL(SQLDatabase):
                 return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
             return f"Error: {str(e)}"

-    # def run_no_throw(self, command: str, fetch: str = "all") -> str:
-    #     """Execute a SQL command and return the result as a string.
-    #
-    #     This method catches any exceptions and returns an error message instead of raising an exception.
-    #
-    #     Args:
-    #         command: The SQL command to execute
-    #         fetch: Whether to fetch 'all' results or just 'one'
-    #
-    #     Returns:
-    #         A string representation of the result or an error message
-    #     """
-    #     command = extract_essential(command)
-    #     try:
-    #         return self._sql_agent.query_safe(command)
-    #     except Exception as e:
-    #         logger.error(f"Error executing SQL command: {str(e)}")
-    #         # If this is a knowledge base query, provide a more helpful error message
-    #         if "knowledge_base" in command.lower() or any(kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()):
-    #             return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
-    #         return f"Error: {str(e)}"
-
     def get_usable_knowledge_base_names(self) -> List[str]:
         """Get a list of usable knowledge base names.

@@ -160,3 +118,12 @@ class MindsDBSQL(SQLDatabase):
         except Exception as e:
             logger.error(f"Error getting usable knowledge base names: {str(e)}")
             return []
+
+    def check_knowledge_base_permission(self, name):
+        """Get a list of usable knowledge base names.
+
+        Returns:
+            A list of knowledge base names that can be used in queries
+        """
+
+        return self._sql_agent.check_knowledge_base_permission(name)
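Note on the three hunks above: query parsing, execution, and result formatting move out of MindsDBSQL and into the wrapped SQLAgent, and knowledge-base permission checks are forwarded the same way. A minimal sketch of the resulting wrapper shape, assuming only that the agent object exposes query() and check_knowledge_base_permission() as the diff shows (the class below is illustrative, not MindsDB code):

    class DelegatingSQLDatabase:
        """Illustrative wrapper mirroring MindsDBSQL's new delegation pattern."""

        def __init__(self, agent):
            self._sql_agent = agent  # stands in for the real SQLAgent

        def run(self, command: str) -> str:
            # Parsing, execution and formatting now all live in the agent.
            return self._sql_agent.query(command)

        def check_knowledge_base_permission(self, name: str):
            # Permission checks are forwarded unchanged.
            return self._sql_agent.check_knowledge_base_permission(name)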
mindsdb/interfaces/data_catalog/data_catalog_reader.py

@@ -18,7 +18,9 @@ class DataCatalogReader(BaseDataCatalog):

         metadata_str = "Data Catalog: \n"
         if hasattr(self.data_handler, "meta_get_handler_info"):
-            metadata_str += self.data_handler.meta_get_handler_info() + "\n\n"
+            info = self.data_handler.meta_get_handler_info()
+            if info:
+                metadata_str += info + "\n\n"

         for table in tables:
             metadata_str += table.as_string() + "\n\n"
mindsdb/interfaces/knowledge_base/controller.py

@@ -6,6 +6,7 @@ import decimal

 import pandas as pd
 import numpy as np
+from sqlalchemy.orm.attributes import flag_modified

 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
@@ -33,6 +34,7 @@ from mindsdb.interfaces.variables.variables_controller import variables_controller
 from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
+from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
@@ -46,8 +48,6 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker

 logger = log.getLogger(__name__)

-KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
-

 def get_model_params(model_params: dict, default_config_key: str):
     """
@@ -140,23 +140,29 @@ class KnowledgeBaseTable:
         self.document_loader = None
         self.model_params = None

+        self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+        if self._kb.params.get("version", 0) < 2:
+            self.kb_to_vector_columns["id"] = "original_doc_id"
+
     def configure_preprocessing(self, config: Optional[dict] = None):
         """Configure preprocessing for the knowledge base table"""
         logger.debug(f"Configuring preprocessing with config: {config}")
         self.document_preprocessor = None  # Reset existing preprocessor
-        if config is not None:
-            # Ensure content_column is set for JSON chunking if not already specified
-            if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
-                if "content_column" not in config["json_chunking_config"]:
-                    config["json_chunking_config"]["content_column"] = "content"
-
-            preprocessing_config = PreprocessingConfig(**config)
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
-            logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")
-        else:
-            # Always create a default preprocessor if none specified
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor()
-            logger.debug("Created default preprocessor")
+        if config is None:
+            config = {}
+
+        # Ensure content_column is set for JSON chunking if not already specified
+        if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
+            if "content_column" not in config["json_chunking_config"]:
+                config["json_chunking_config"]["content_column"] = "content"
+
+        preprocessing_config = PreprocessingConfig(**config)
+        self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
+
+        # set doc_id column name
+        self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
+
+        logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")

     def select_query(self, query: Select) -> pd.DataFrame:
         """
@@ -165,6 +171,30 @@ class KnowledgeBaseTable:
         :param query: query to KB table
         :return: dataframe with the result table
         """
+
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
+
+        executor = KnowledgeBaseQueryExecutor(self)
+        df = executor.run(query)
+
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            if "metadata" in df.columns:
+                df["metadata"] = df["metadata"].apply(to_json)
+
+            df = query_df(df, query_copy, session=self.session)
+
+        return df
+
+    def select(self, query, disable_reranking=False):
         logger.debug(f"Processing select query: {query}")

         # Extract the content query text for potential reranking
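Note on the new select_query: the vector store serves the raw similarity search, and results only take a second pass through DuckDB (query_df) when the query carries clauses the store cannot handle directly. The branch condition from the hunk, restated as a standalone predicate for clarity (Select and Star are the real mindsdb_sql_parser AST nodes):

    from mindsdb_sql_parser.ast import Select, Star

    def needs_duckdb_pass(query: Select) -> bool:
        # Anything beyond a plain `SELECT * ... WHERE ... LIMIT n`
        # is post-processed over the vector-store result.
        return (
            query.group_by is not None
            or query.order_by is not None
            or query.having is not None
            or query.distinct is True
            or len(query.targets) != 1
            or not isinstance(query.targets[0], Star)
        )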
@@ -176,9 +206,6 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
-        query_copy = copy.deepcopy(query)
-
         query.targets = [
             Identifier(TableField.ID.value),
             Identifier(TableField.CONTENT.value),
@@ -193,7 +220,6 @@ class KnowledgeBaseTable:
         conditions = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:
@@ -209,10 +235,9 @@ class KnowledgeBaseTable:
                     logger.error(error_msg)
                     raise ValueError(error_msg)
                 elif item.column == "reranking":
-                    reranking_enabled_flag = item.value
-                    # cast to boolean
-                    if isinstance(reranking_enabled_flag, str):
-                        reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
+
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -244,66 +269,59 @@ class KnowledgeBaseTable:
                 limit = 100
             query.limit = Constant(limit)

-        df = db_handler.dispatch_select(query, conditions)
+        allowed_metadata_columns = self._get_allowed_metadata_columns()
+        df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
         df = self.addapt_result_columns(df)

         logger.debug(f"Query returned {len(df)} rows")
         logger.debug(f"Columns in response: {df.columns.tolist()}")
         # Check if we have a rerank_model configured in KB params
-        df = self.add_relevance(df, query_text, relevance_threshold, reranking_enabled_flag)
+        df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)

-        if (
-            query.group_by is not None
-            or query.order_by is not None
-            or query.having is not None
-            or query.distinct is True
-            or len(query.targets) != 1
-            or not isinstance(query.targets[0], Star)
-        ):
-            query_copy.where = None
-            if "metadata" in df.columns:
-                df["metadata"] = df["metadata"].apply(to_json)
+        return df

-            df = query_df(df, query_copy, session=self.session)
+    def _get_allowed_metadata_columns(self) -> List[str] | None:
+        # Return list of KB columns to restrict querying, if None: no restrictions

-        return df
+        if self._kb.params.get("version", 0) < 2:
+            # disable for old version KBs
+            return None
+
+        user_columns = self._kb.params.get("metadata_columns", [])
+        dynamic_columns = self._kb.params.get("inserted_metadata", [])
+
+        columns = set(user_columns) | set(dynamic_columns)
+        return [col.lower() for col in columns]

     def score_documents(self, query_text, documents, reranking_model_params):
         reranker = get_reranking_model_from_params(reranking_model_params)
         return reranker.get_scores(query_text, documents)

-    def add_relevance(self, df, query_text, relevance_threshold=None, reranking_enabled_flag=True):
+    def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
         relevance_column = TableField.RELEVANCE.value

         reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
-        if reranking_model_params and query_text and len(df) > 0 and reranking_enabled_flag:
+        if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
-            try:
-                logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
-                # Apply custom filtering threshold if provided
-                if relevance_threshold is not None:
-                    reranking_model_params["filtering_threshold"] = relevance_threshold
-                    logger.info(f"Using custom filtering threshold: {relevance_threshold}")
-
-                reranker = get_reranking_model_from_params(reranking_model_params)
-                # Get documents to rerank
-                documents = df["chunk_content"].tolist()
-                # Use the get_scores method with disable_events=True
-                scores = reranker.get_scores(query_text, documents)
-                # Add scores as the relevance column
-                df[relevance_column] = scores
-
-                # Filter by threshold
-                scores_array = np.array(scores)
-                df = df[scores_array > reranker.filtering_threshold]
-                logger.debug(f"Applied reranking with params: {reranking_model_params}")
-            except Exception as e:
-                logger.error(f"Error during reranking: {str(e)}")
-                # Fallback to distance-based relevance
-                if "distance" in df.columns:
-                    df[relevance_column] = 1 / (1 + df["distance"])
-                else:
-                    logger.info("No distance or reranker available")
+
+            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
+            # Apply custom filtering threshold if provided
+            if relevance_threshold is not None:
+                reranking_model_params["filtering_threshold"] = relevance_threshold
+                logger.info(f"Using custom filtering threshold: {relevance_threshold}")
+
+            reranker = get_reranking_model_from_params(reranking_model_params)
+            # Get documents to rerank
+            documents = df["chunk_content"].tolist()
+            # Use the get_scores method with disable_events=True
+            scores = reranker.get_scores(query_text, documents)
+            # Add scores as the relevance column
+            df[relevance_column] = scores
+
+            # Filter by threshold
+            scores_array = np.array(scores)
+            df = df[scores_array > reranker.filtering_threshold]
+            logger.debug(f"Applied reranking with params: {reranking_model_params}")

         elif "distance" in df.columns:
             # Calculate relevance from distance
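Note on add_relevance: the try/except with a silent distance fallback is gone, so reranker failures now surface to the caller instead of being masked. When no reranking model is configured at all, relevance still falls back to 1 / (1 + distance): distance 0.0 maps to 1.0, distance 1.0 to 0.5, and larger distances decay toward 0. A worked example of that fallback with pandas:

    import pandas as pd

    df = pd.DataFrame({"chunk_content": ["a", "b", "c"], "distance": [0.0, 1.0, 3.0]})
    df["relevance"] = 1 / (1 + df["distance"])  # distance-based fallback
    print(df["relevance"].tolist())  # [1.0, 0.5, 0.25]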
@@ -323,12 +341,12 @@ class KnowledgeBaseTable:
         if conditions is None:
             return
         for condition in conditions:
-            if condition.column in KB_TO_VECTORDB_COLUMNS:
-                condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
+            if condition.column in self.kb_to_vector_columns:
+                condition.column = self.kb_to_vector_columns[condition.column]

     def addapt_result_columns(self, df):
         col_update = {}
-        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
+        for kb_col, vec_col in self.kb_to_vector_columns.items():
             if vec_col in df.columns:
                 col_update[vec_col] = kb_col

@@ -337,7 +355,7 @@ class KnowledgeBaseTable:
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get("original_doc_id")
+            lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
         )

         # id on first place
@@ -524,8 +542,8 @@ class KnowledgeBaseTable:

                 metadata = {
                     **base_metadata,
-                    "original_row_index": str(idx),  # provide link to original row index
-                    "content_column": col,
+                    "_original_row_index": str(idx),  # provide link to original row index
+                    "_content_column": col,
                 }

                 raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
@@ -620,16 +638,22 @@ class KnowledgeBaseTable:
             metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
             logger.debug(f"Mapped metadata columns: {metadata_columns}")

-        if content_columns is not None:
-            content_columns = list(set(content_columns).intersection(columns))
-            if len(content_columns) == 0:
-                raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
+        content_columns = list(set(content_columns).intersection(columns))
+        if len(content_columns) == 0:
+            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")

-        if metadata_columns is not None:
-            metadata_columns = list(set(metadata_columns).intersection(columns))
-        else:
-            # all the rest columns
-            metadata_columns = list(set(columns).difference(content_columns))
+        if metadata_columns is not None:
+            metadata_columns = list(set(metadata_columns).intersection(columns))
+        else:
+            # all the rest columns
+            metadata_columns = list(set(columns).difference(content_columns))
+
+        # update list of used columns
+        inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
+        inserted_metadata.update(metadata_columns)
+        self._kb.params["inserted_metadata"] = list(inserted_metadata)
+        flag_modified(self._kb, "params")
+        db.session.commit()

         # Add content columns directly (don't combine them)
         for col in content_columns:
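Note on the flag_modified/commit lines: params is a JSON column on the KnowledgeBase record, and SQLAlchemy does not track in-place mutation of a plain JSON value, so without flag_modified the commit would persist nothing. A self-contained illustration (the KB model here is a hypothetical stand-in; flag_modified is the real SQLAlchemy API):

    from sqlalchemy import JSON, Column, Integer, create_engine
    from sqlalchemy.orm import Session, declarative_base
    from sqlalchemy.orm.attributes import flag_modified

    Base = declarative_base()

    class KB(Base):  # hypothetical stand-in for the KnowledgeBase record
        __tablename__ = "kb"
        id = Column(Integer, primary_key=True)
        params = Column(JSON, default=dict)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        kb = KB(params={})
        session.add(kb)
        session.commit()

        kb.params["inserted_metadata"] = ["category"]  # in-place mutation: not tracked
        flag_modified(kb, "params")                    # mark the attribute as dirty
        session.commit()                               # now the change is persisted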
@@ -655,7 +679,7 @@ class KnowledgeBaseTable:
            elif isinstance(value, dict):
                metadata.update(value)
                continue
-           else:
+           elif value is not None:
                value = str(value)
            metadata[col] = value
        return metadata
@@ -762,15 +786,10 @@ class KnowledgeBaseTable:

        llm_model = args.pop("model_name")
        engine = args.pop("provider")

-       llm_model = f"{engine}/{llm_model}"
-
-       if "base_url" in args:
-           args["api_base"] = args.pop("base_url")
-
        module = session.integration_controller.get_handler_module("litellm")
        if module is None or module.Handler is None:
            raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
-       return module.Handler.embeddings(llm_model, messages, args)
+       return module.Handler.embeddings(engine, llm_model, messages, args)

    def build_rag_pipeline(self, retrieval_config: dict):
        """
@@ -892,6 +911,8 @@ class KnowledgeBaseController:
    manages knowledge bases
    """

+   KB_VERSION = 2
+
    def __init__(self, session) -> None:
        self.session = session

@@ -903,6 +924,7 @@
        params: dict,
        preprocessing_config: Optional[dict] = None,
        if_not_exists: bool = False,
+       keyword_search_enabled: bool = False,
        # embedding_model: Identifier = None,  # Legacy: Allow MindsDB models to be passed as embedding_model.
    ) -> db.KnowledgeBase:
        """
@@ -1016,7 +1038,10 @@
        vector_db_name, vector_table_name = storage.parts

        # create table in vectordb before creating KB
-       self.session.datahub.get(vector_db_name).integration_handler.create_table(vector_table_name)
+       vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
+       vector_store_handler.create_table(vector_table_name)
+       if keyword_search_enabled:
+           vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
        vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]

        # Store sparse vector settings in params if specified
@@ -1026,6 +1051,7 @@
        if vector_size is not None:
            params["vector_config"]["vector_size"] = vector_size

+       params["version"] = self.KB_VERSION
        kb = db.KnowledgeBase(
            name=name,
            project_id=project_id,
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -7,7 +7,7 @@ import pandas as pd
 import datetime as dt

 from mindsdb.api.executor.sql_query.result_set import ResultSet
-from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log

 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
@@ -130,6 +130,8 @@ class EvaluateBase:
        integration_name = table_name.parts[0]
        table_name = Identifier(parts=table_name.parts[1:])
        dn = self.session.datahub.get(integration_name)
+       if dn is None:
+           raise ValueError(f"Can't find database: {integration_name}")
        return dn, table_name

    def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):

            start_time = time.time()
            logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
-           df_answers = self.kb.select_query(Select(targets=[Identifier("chunk_content")], limit=Constant(self.TOP_K)))
+           df_answers = self.kb.select_query(
+               Select(
+                   targets=[Identifier("chunk_content")],
+                   where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                   limit=Constant(self.TOP_K),
+               )
+           )
            query_time = time.time() - start_time

            proposed_responses = list(df_answers["chunk_content"])
@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
    Checks if ID in response from KB is matched with doc ID in test dataset
    """

-   TOP_K = 100
+   TOP_K = 20

    def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
        if "id" not in sampled_df.columns:
@@ -462,7 +470,11 @@ class EvaluateDocID(EvaluateBase):
            start_time = time.time()
            logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
            df_answers = self.kb.select_query(
-               Select(targets=[Identifier("chunk_content"), Identifier("id")], limit=Constant(self.TOP_K))
+               Select(
+                   targets=[Identifier("chunk_content"), Identifier("id")],
+                   where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                   limit=Constant(self.TOP_K),
+               )
            )
            query_time = time.time() - start_time
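Note on the two evaluation hunks: the benchmark queries now carry an explicit content = <question> condition, so the KB runs an actual semantic search for each question instead of reading back an arbitrary top-N of rows, and EvaluateDocID's TOP_K drops from 100 to 20 accordingly. The hand-built Select is equivalent to parsing the SQL form directly (parse_sql is already imported in evaluate.py; my_kb is a placeholder name, and select_query overrides the FROM table anyway):

    from mindsdb_sql_parser import parse_sql

    question, top_k = "What is MindsDB?", 20  # placeholders
    query = parse_sql(
        f"SELECT chunk_content, id FROM my_kb WHERE content = '{question}' LIMIT {top_k}"
    )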