MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/controller.py

@@ -1,16 +1,19 @@
 import os
 import copy
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any, Text
 import json
 import decimal

 import pandas as pd
 import numpy as np
+from pydantic import BaseModel, ValidationError
+from sqlalchemy.orm.attributes import flag_modified

 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
 from mindsdb_sql_parser import parse_sql

+from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
 from mindsdb.integrations.utilities.query_traversal import query_traversal

 import mindsdb.interfaces.storage.db as db
@@ -33,9 +36,10 @@ from mindsdb.interfaces.variables.variables_controller import variables_controller
 from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
+from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
-from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator, KeywordSearchArgs
 from mindsdb.utilities.config import config
 from mindsdb.utilities.context import context as ctx

@@ -46,7 +50,19 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker

 logger = log.getLogger(__name__)

-KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+
+class KnowledgeBaseInputParams(BaseModel):
+    metadata_columns: List[str] | None = None
+    content_columns: List[str] | None = None
+    id_column: str | None = None
+    kb_no_upsert: bool = False
+    embedding_model: Dict[Text, Any] | None = None
+    is_sparse: bool = False
+    vector_size: int | None = None
+    reranking_model: Dict[Text, Any] | None = None
+
+    class Config:
+        extra = "forbid"


 def get_model_params(model_params: dict, default_config_key: str):
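The new KnowledgeBaseInputParams model means knowledge base parameters are now validated up front: any key outside the declared set is rejected instead of being silently ignored. A minimal standalone sketch of the same pydantic pattern (class and field names here are illustrative, not the MindsDB API):

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, ValidationError


class KBParams(BaseModel):
    # any key not declared below is rejected, mirroring extra = "forbid"
    model_config = ConfigDict(extra="forbid")

    metadata_columns: Optional[List[str]] = None
    content_columns: Optional[List[str]] = None
    id_column: Optional[str] = None
    embedding_model: Optional[Dict[str, Any]] = None


try:
    KBParams.model_validate({"id_colum": "order_id"})  # typo in the key
except ValidationError as e:
    print(e.errors()[0]["type"])  # -> extra_forbidden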
@@ -101,7 +117,10 @@ def get_reranking_model_from_params(reranking_model_params: dict):

     if "api_key" not in params_copy:
         params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
-    params_copy["model"] = params_copy.pop("model_name", None)
+
+    if "model_name" not in params_copy:
+        raise ValueError("'model_name' must be provided for reranking model")
+    params_copy["model"] = params_copy.pop("model_name")

     return BaseLLMReranker(**params_copy)

@@ -140,23 +159,29 @@ class KnowledgeBaseTable:
         self.document_loader = None
         self.model_params = None

+        self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+        if self._kb.params.get("version", 0) < 2:
+            self.kb_to_vector_columns["id"] = "original_doc_id"
+
     def configure_preprocessing(self, config: Optional[dict] = None):
         """Configure preprocessing for the knowledge base table"""
         logger.debug(f"Configuring preprocessing with config: {config}")
         self.document_preprocessor = None  # Reset existing preprocessor
-        if config is not None:
-            # Ensure content_column is set for JSON chunking if not already specified
-            if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
-                if "content_column" not in config["json_chunking_config"]:
-                    config["json_chunking_config"]["content_column"] = "content"
-
-            preprocessing_config = PreprocessingConfig(**config)
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
-            logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")
-        else:
-            # Always create a default preprocessor if none specified
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor()
-            logger.debug("Created default preprocessor")
+        if config is None:
+            config = {}
+
+        # Ensure content_column is set for JSON chunking if not already specified
+        if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
+            if "content_column" not in config["json_chunking_config"]:
+                config["json_chunking_config"]["content_column"] = "content"
+
+        preprocessing_config = PreprocessingConfig(**config)
+        self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
+
+        # set doc_id column name
+        self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
+
+        logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")

     def select_query(self, query: Select) -> pd.DataFrame:
         """
@@ -165,6 +190,33 @@ class KnowledgeBaseTable:
         :param query: query to KB table
         :return: dataframe with the result table
         """
+
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
+
+        executor = KnowledgeBaseQueryExecutor(self)
+        df = executor.run(query)
+
+        if (
+            query_copy.group_by is not None
+            or query_copy.order_by is not None
+            or query_copy.having is not None
+            or query_copy.distinct is True
+            or len(query_copy.targets) != 1
+            or not isinstance(query_copy.targets[0], Star)
+        ):
+            query_copy.where = None
+            if "metadata" in df.columns:
+                df["metadata"] = df["metadata"].apply(to_json)
+
+            if query_copy.from_table is None:
+                query_copy.from_table = Identifier(parts=[self._kb.name])
+
+            df = query_df(df, query_copy, session=self.session)
+
+        return df
+
+    def select(self, query, disable_reranking=False):
         logger.debug(f"Processing select query: {query}")

         # Extract the content query text for potential reranking
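select_query now hands retrieval to KnowledgeBaseQueryExecutor and only re-runs the result through query_df (the DuckDB path) when the query carries clauses the vector store cannot evaluate itself. A sketch of that gate, assuming a query object with the same AST attributes as mindsdb_sql_parser's Select:

from mindsdb_sql_parser.ast import Star


def needs_duckdb_pass(query) -> bool:
    # plain `SELECT * ... WHERE ... LIMIT` is served straight from the
    # vector store; anything else gets a second pass over the dataframe
    return (
        query.group_by is not None
        or query.order_by is not None
        or query.having is not None
        or query.distinct is True
        or len(query.targets) != 1
        or not isinstance(query.targets[0], Star)
    )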
@@ -176,9 +228,6 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
-        query_copy = copy.deepcopy(query)
-
         query.targets = [
             Identifier(TableField.ID.value),
             Identifier(TableField.CONTENT.value),
@@ -191,9 +240,12 @@ class KnowledgeBaseTable:

         # extract values from conditions and prepare for vectordb
         conditions = []
+        keyword_search_conditions = []
+        keyword_search_cols_and_values = []
         query_text = None
         relevance_threshold = None
         reranking_enabled_flag = True
+        hybrid_search_enabled_flag = False
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:
@@ -213,6 +265,13 @@ class KnowledgeBaseTable:
                     # cast to boolean
                     if isinstance(reranking_enabled_flag, str):
                         reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                elif item.column == "hybrid_search":
+                    hybrid_search_enabled_flag = item.value
+                    # cast to boolean
+                    if isinstance(hybrid_search_enabled_flag, str):
+                        hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -228,8 +287,16 @@ class KnowledgeBaseTable:
                             op=FilterOperator.EQUAL,
                         )
                     )
+                    keyword_search_cols_and_values.append((TableField.CONTENT.value, item.value))
                 else:
                     conditions.append(item)
+                    keyword_search_conditions.append(item)  # keyword search conditions do not use embeddings
+
+            if len(keyword_search_cols_and_values) > 1:
+                raise ValueError(
+                    "Multiple content columns found in query conditions. "
+                    "Only one content column is allowed for keyword search."
+                )

         logger.debug(f"Extracted query text: {query_text}")

@@ -244,66 +311,92 @@ class KnowledgeBaseTable:
            limit = 100
            query.limit = Constant(limit)

-        df = db_handler.dispatch_select(query, conditions)
+        allowed_metadata_columns = self._get_allowed_metadata_columns()
+        df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
         df = self.addapt_result_columns(df)
-
         logger.debug(f"Query returned {len(df)} rows")
         logger.debug(f"Columns in response: {df.columns.tolist()}")
-        # Check if we have a rerank_model configured in KB params
-        df = self.add_relevance(df, query_text, relevance_threshold, reranking_enabled_flag)

-        if (
-            query.group_by is not None
-            or query.order_by is not None
-            or query.having is not None
-            or query.distinct is True
-            or len(query.targets) != 1
-            or not isinstance(query.targets[0], Star)
-        ):
-            query_copy.where = None
-            if "metadata" in df.columns:
-                df["metadata"] = df["metadata"].apply(to_json)
+        if hybrid_search_enabled_flag and not isinstance(db_handler, KeywordSearchBase):
+            raise ValueError(f"Hybrid search is enabled but the db_handler {type(db_handler)} does not support it. ")
+        # check if db_handler inherits from KeywordSearchBase
+        if hybrid_search_enabled_flag and isinstance(db_handler, KeywordSearchBase):
+            # If query_text is present, use it for keyword search
+            logger.debug(f"Performing keyword search with query text: {query_text}")
+            keyword_search_args = KeywordSearchArgs(query=query_text, column=TableField.CONTENT.value)
+            keyword_query_obj = copy.deepcopy(query)
+
+            keyword_query_obj.targets = [
+                Identifier(TableField.ID.value),
+                Identifier(TableField.CONTENT.value),
+                Identifier(TableField.METADATA.value),
+            ]

-            df = query_df(df, query_copy, session=self.session)
+            df_keyword_select = db_handler.dispatch_select(
+                keyword_query_obj, keyword_search_conditions, keyword_search_args=keyword_search_args
+            )
+            df_keyword_select = self.addapt_result_columns(df_keyword_select)
+            logger.debug(f"Keyword search returned {len(df_keyword_select)} rows")
+            logger.debug(f"Columns in keyword search response: {df_keyword_select.columns.tolist()}")
+            # ensure df and df_keyword_select have exactly the same columns
+            if not df_keyword_select.empty:
+                if set(df.columns) != set(df_keyword_select.columns):
+                    raise ValueError(
+                        f"Keyword search returned different columns: {df_keyword_select.columns} "
+                        f"than expected: {df.columns}"
+                    )
+                df = pd.concat([df, df_keyword_select], ignore_index=True)
+                # if chunk_id column exists remove duplicates based on chunk_id
+                if "chunk_id" in df.columns:
+                    df = df.drop_duplicates(subset=["chunk_id"])
+
+        # Check if we have a rerank_model configured in KB params
+        df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)

         return df

+    def _get_allowed_metadata_columns(self) -> List[str] | None:
+        # Return list of KB columns to restrict querying, if None: no restrictions
+
+        if self._kb.params.get("version", 0) < 2:
+            # disable for old version KBs
+            return None
+
+        user_columns = self._kb.params.get("metadata_columns", [])
+        dynamic_columns = self._kb.params.get("inserted_metadata", [])
+
+        columns = set(user_columns) | set(dynamic_columns)
+        return [col.lower() for col in columns]
+
     def score_documents(self, query_text, documents, reranking_model_params):
         reranker = get_reranking_model_from_params(reranking_model_params)
         return reranker.get_scores(query_text, documents)

-    def add_relevance(self, df, query_text, relevance_threshold=None, reranking_enabled_flag=True):
+    def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
         relevance_column = TableField.RELEVANCE.value

         reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
-        if reranking_model_params and query_text and len(df) > 0 and reranking_enabled_flag:
+        if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
-            try:
-                logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
-                # Apply custom filtering threshold if provided
-                if relevance_threshold is not None:
-                    reranking_model_params["filtering_threshold"] = relevance_threshold
-                    logger.info(f"Using custom filtering threshold: {relevance_threshold}")

-                reranker = get_reranking_model_from_params(reranking_model_params)
-                # Get documents to rerank
-                documents = df["chunk_content"].tolist()
-                # Use the get_scores method with disable_events=True
-                scores = reranker.get_scores(query_text, documents)
-                # Add scores as the relevance column
-                df[relevance_column] = scores
-
-                # Filter by threshold
-                scores_array = np.array(scores)
-                df = df[scores_array > reranker.filtering_threshold]
-                logger.debug(f"Applied reranking with params: {reranking_model_params}")
-            except Exception as e:
-                logger.error(f"Error during reranking: {str(e)}")
-                # Fallback to distance-based relevance
-                if "distance" in df.columns:
-                    df[relevance_column] = 1 / (1 + df["distance"])
-                else:
-                    logger.info("No distance or reranker available")
+            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
+            # Apply custom filtering threshold if provided
+            if relevance_threshold is not None:
+                reranking_model_params["filtering_threshold"] = relevance_threshold
+                logger.info(f"Using custom filtering threshold: {relevance_threshold}")
+
+            reranker = get_reranking_model_from_params(reranking_model_params)
+            # Get documents to rerank
+            documents = df["chunk_content"].tolist()
+            # Use the get_scores method with disable_events=True
+            scores = reranker.get_scores(query_text, documents)
+            # Add scores as the relevance column
+            df[relevance_column] = scores
+
+            # Filter by threshold
+            scores_array = np.array(scores)
+            df = df[scores_array > reranker.filtering_threshold]
+            logger.debug(f"Applied reranking with params: {reranking_model_params}")

         elif "distance" in df.columns:
             # Calculate relevance from distance
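With hybrid_search enabled the same query effectively runs twice: once as a vector similarity search and once as a keyword search against the full-text index, and the two frames are unioned with duplicates dropped on chunk_id so a chunk found by both paths is reranked only once. A condensed, self-contained sketch of that merge (toy data):

import pandas as pd

vector_hits = pd.DataFrame(
    {"chunk_id": ["d1:0", "d1:1"], "chunk_content": ["alpha", "beta"], "distance": [0.12, 0.34]}
)
keyword_hits = pd.DataFrame(
    {"chunk_id": ["d1:1", "d2:0"], "chunk_content": ["beta", "gamma"], "distance": [None, None]}
)

# both frames must expose identical columns before the union
assert set(vector_hits.columns) == set(keyword_hits.columns)
merged = pd.concat([vector_hits, keyword_hits], ignore_index=True)
# a chunk returned by both searches is kept once (first occurrence wins)
merged = merged.drop_duplicates(subset=["chunk_id"])
print(merged["chunk_id"].tolist())  # -> ['d1:0', 'd1:1', 'd2:0']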
@@ -323,12 +416,12 @@
         if conditions is None:
             return
         for condition in conditions:
-            if condition.column in KB_TO_VECTORDB_COLUMNS:
-                condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
+            if condition.column in self.kb_to_vector_columns:
+                condition.column = self.kb_to_vector_columns[condition.column]

     def addapt_result_columns(self, df):
         col_update = {}
-        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
+        for kb_col, vec_col in self.kb_to_vector_columns.items():
            if vec_col in df.columns:
                col_update[vec_col] = kb_col

@@ -337,7 +430,7 @@
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get("original_doc_id")
+            lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
         )

         # id on first place
@@ -524,8 +617,8 @@

         metadata = {
             **base_metadata,
-            "original_row_index": str(idx),  # provide link to original row index
-            "content_column": col,
+            "_original_row_index": str(idx),  # provide link to original row index
+            "_content_column": col,
         }

         raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
@@ -620,16 +713,22 @@
         metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
         logger.debug(f"Mapped metadata columns: {metadata_columns}")

-        if content_columns is not None:
-            content_columns = list(set(content_columns).intersection(columns))
-            if len(content_columns) == 0:
-                raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
+        content_columns = list(set(content_columns).intersection(columns))
+        if len(content_columns) == 0:
+            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")

-        if metadata_columns is not None:
-            metadata_columns = list(set(metadata_columns).intersection(columns))
-        else:
-            # all the rest columns
-            metadata_columns = list(set(columns).difference(content_columns))
+        if metadata_columns is not None:
+            metadata_columns = list(set(metadata_columns).intersection(columns))
+        else:
+            # all the rest columns
+            metadata_columns = list(set(columns).difference(content_columns))
+
+        # update list of used columns
+        inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
+        inserted_metadata.update(metadata_columns)
+        self._kb.params["inserted_metadata"] = list(inserted_metadata)
+        flag_modified(self._kb, "params")
+        db.session.commit()

         # Add content columns directly (don't combine them)
         for col in content_columns:
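The inserted_metadata bookkeeping mutates the params dict in place, and SQLAlchemy does not track in-place changes to a JSON column, hence the flag_modified call before the commit. A runnable sketch of the pattern against an in-memory SQLite model (generic schema, not MindsDB's):

from sqlalchemy import JSON, Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base
from sqlalchemy.orm.attributes import flag_modified

Base = declarative_base()


class KB(Base):
    __tablename__ = "kb"
    id = Column(Integer, primary_key=True)
    params = Column(JSON, default=dict)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    kb = KB(params={"inserted_metadata": []})
    session.add(kb)
    session.commit()

    # in-place mutation alone would not be flushed ...
    kb.params["inserted_metadata"] = ["author", "year"]
    flag_modified(kb, "params")  # ... so mark the column dirty explicitly
    session.commit()

    session.refresh(kb)
    print(kb.params["inserted_metadata"])  # -> ['author', 'year']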
@@ -655,7 +754,7 @@
             elif isinstance(value, dict):
                 metadata.update(value)
                 continue
-            else:
+            elif value is not None:
                 value = str(value)
             metadata[col] = value
         return metadata
@@ -712,8 +811,7 @@
         if model_id is None:
             # call litellm handler
             messages = list(df[TableField.CONTENT.value])
-            embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
-            embedding_params.update(self._kb.params["embedding_model"])
+            embedding_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
            results = self.call_litellm_embedding(self.session, embedding_params, messages)
            results = [[val] for val in results]
            return pd.DataFrame(results, columns=[TableField.EMBEDDINGS.value])
@@ -759,18 +857,16 @@
     def call_litellm_embedding(session, model_params, messages):
         args = copy.deepcopy(model_params)

+        if "model_name" not in args:
+            raise ValueError("'model_name' must be provided for embedding model")
+
         llm_model = args.pop("model_name")
         engine = args.pop("provider")

-        llm_model = f"{engine}/{llm_model}"
-
-        if "base_url" in args:
-            args["api_base"] = args.pop("base_url")
-
         module = session.integration_controller.get_handler_module("litellm")
         if module is None or module.Handler is None:
            raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
-        return module.Handler.embeddings(llm_model, messages, args)
+        return module.Handler.embeddings(engine, llm_model, messages, args)

     def build_rag_pipeline(self, retrieval_config: dict):
         """
@@ -892,6 +988,8 @@
     manages knowledge bases
     """

+    KB_VERSION = 2
+
     def __init__(self, session) -> None:
         self.session = session

@@ -903,6 +1001,7 @@
         params: dict,
         preprocessing_config: Optional[dict] = None,
         if_not_exists: bool = False,
+        keyword_search_enabled: bool = False,
         # embedding_model: Identifier = None,  # Legacy: Allow MindsDB models to be passed as embedding_model.
     ) -> db.KnowledgeBase:
         """
@@ -914,6 +1013,24 @@
         # fill variables
         params = variables_controller.fill_parameters(params)

+        try:
+            KnowledgeBaseInputParams.model_validate(params)
+        except ValidationError as e:
+            problems = []
+            for error in e.errors():
+                parameter = ".".join([str(i) for i in error["loc"]])
+                param_type = error["type"]
+                if param_type == "extra_forbidden":
+                    msg = f"Parameter '{parameter}' is not allowed"
+                else:
+                    msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
+                problems.append(msg)
+
+            msg = "\n".join(problems)
+            if len(problems) > 1:
+                msg = "\n" + msg
+            raise ValueError(f"Problem with knowledge base parameters: {msg}")
+
         # Validate preprocessing config first if provided
         if preprocessing_config is not None:
             PreprocessingConfig(**preprocessing_config)  # Validate before storing
@@ -939,24 +1056,6 @@
             return kb
         raise EntityExistsError("Knowledge base already exists", name)

-        embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
-
-        # Legacy
-        # model_name = None
-        # model_project = project
-        # if embedding_model:
-        #     model_name = embedding_model.parts[-1]
-        #     if len(embedding_model.parts) > 1:
-        #         model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
-
-        # elif "embedding_model" in params:
-        #     if isinstance(params["embedding_model"], str):
-        #         # it is model name
-        #         model_name = params["embedding_model"]
-        #     else:
-        #         # it is params for model
-        #         embedding_params.update(params["embedding_model"])
-
         embedding_params = get_model_params(params.get("embedding_model", {}), "default_embedding_model")

         # if model_name is None:  # Legacy
@@ -987,7 +1086,11 @@
         if reranking_model_params:
             # Get reranking model from params.
             # This is called here to check validaity of the parameters.
-            get_reranking_model_from_params(reranking_model_params)
+            try:
+                reranker = get_reranking_model_from_params(reranking_model_params)
+                reranker.get_scores("test", ["test"])
+            except (ValueError, RuntimeError) as e:
+                raise RuntimeError(f"Problem with reranker config: {e}")

         # search for the vector database table
         if storage is None:
@@ -1016,7 +1119,10 @@
         vector_db_name, vector_table_name = storage.parts

         # create table in vectordb before creating KB
-        self.session.datahub.get(vector_db_name).integration_handler.create_table(vector_table_name)
+        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
+        vector_store_handler.create_table(vector_table_name)
+        if keyword_search_enabled:
+            vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
         vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]

         # Store sparse vector settings in params if specified
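Keyword search support is wired in at creation time: when keyword_search_enabled is set, the vector store gets a full-text index on the content column, which is what later lets hybrid_search = true queries combine both retrieval paths. Illustrative SQL for the user-facing side, kept as plain Python strings (option values follow the parameters validated earlier in this diff; the keyword_search_enabled flag itself is a controller argument here, and exact SQL syntax may differ by deployment):

CREATE_KB = """
CREATE KNOWLEDGE BASE my_kb
USING
    embedding_model = {"provider": "openai", "model_name": "text-embedding-3-small"},
    reranking_model = {"provider": "openai", "model_name": "gpt-4o-mini"};
"""

HYBRID_QUERY = """
SELECT id, chunk_content, relevance
FROM my_kb
WHERE content = 'shipping delays' AND hybrid_search = true;
"""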
@@ -1026,6 +1132,7 @@
         if vector_size is not None:
             params["vector_config"]["vector_size"] = vector_size

+        params["version"] = self.KB_VERSION
         kb = db.KnowledgeBase(
             name=name,
             project_id=project_id,
@@ -1076,15 +1183,26 @@
         except PredictorRecordNotFound:
             pass

-        if params.get("provider", None) not in ("openai", "azure_openai"):
+        if "provider" not in params:
+            raise ValueError("'provider' parameter is required for embedding model")
+
+        if params["provider"] not in ("openai", "azure_openai"):
             # try use litellm
-            KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
+            try:
+                KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
+            except Exception as e:
+                raise RuntimeError(f"Problem with embedding model config: {e}")
             return

         if "provider" in params:
             engine = params.pop("provider").lower()

-            api_key = get_api_key(engine, params, strict=False) or params.pop("api_key")
+            api_key = get_api_key(engine, params, strict=False)
+            if api_key is None:
+                if "api_key" in params:
+                    params.pop("api_key")
+                else:
+                    raise ValueError("'api_key' parameter is required for embedding model")

             if engine == "azure_openai":
                 engine = "openai"
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -7,7 +7,7 @@ import pandas as pd
 import datetime as dt

 from mindsdb.api.executor.sql_query.result_set import ResultSet
-from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log

 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
@@ -90,7 +90,7 @@ class EvaluateBase:
             df = response.data_frame

             if "content" not in df.columns:
-                raise ValueError("`content` column isn't found in source data")
+                raise ValueError(f"`content` column isn't found in provided sql: {gen_params['from_sql']}")

             df.rename(columns={"content": "chunk_content"}, inplace=True)
         else:

@@ -130,6 +130,8 @@
         integration_name = table_name.parts[0]
         table_name = Identifier(parts=table_name.parts[1:])
         dn = self.session.datahub.get(integration_name)
+        if dn is None:
+            raise ValueError(f"Can't find database: {integration_name}")
         return dn, table_name

     def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
@@ -184,7 +186,7 @@
         to_table = params["save_to"]
         if isinstance(to_table, str):
             to_table = Identifier(to_table)
-        self.save_to_table(to_table, scores)
+        self.save_to_table(to_table, scores.copy())

         return scores

@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):

         start_time = time.time()
         logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
-        df_answers = self.kb.select_query(Select(targets=[Identifier("chunk_content")], limit=Constant(self.TOP_K)))
+        df_answers = self.kb.select_query(
+            Select(
+                targets=[Identifier("chunk_content")],
+                where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                limit=Constant(self.TOP_K),
+            )
+        )
         query_time = time.time() - start_time

         proposed_responses = list(df_answers["chunk_content"])
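The evaluation harness previously pulled TOP_K arbitrary rows; it now binds each test question into the WHERE clause, so the KB performs a real similarity search per question. Building that query as an AST, mirroring the evaluator's construction (runnable wherever mindsdb_sql_parser is installed):

from mindsdb_sql_parser import BinaryOperation, Constant, Identifier, Select

question = "What is the refund policy?"
query = Select(
    targets=[Identifier("chunk_content")],
    where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
    limit=Constant(10),
)
# kb.select_query(query) then retrieves the chunks most similar to `question`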
@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
     Checks if ID in response from KB is matched with doc ID in test dataset
     """

-    TOP_K = 100
+    TOP_K = 20

     def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
         if "id" not in sampled_df.columns:
@@ -462,7 +470,11 @@
         start_time = time.time()
         logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
         df_answers = self.kb.select_query(
-            Select(targets=[Identifier("chunk_content"), Identifier("id")], limit=Constant(self.TOP_K))
+            Select(
+                targets=[Identifier("chunk_content"), Identifier("id")],
+                where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                limit=Constant(self.TOP_K),
+            )
         )
         query_time = time.time() - start_time