MindsDB 25.9.2.0a1__py3-none-any.whl → 25.10.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (163)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +40 -29
  3. mindsdb/api/a2a/__init__.py +1 -1
  4. mindsdb/api/a2a/agent.py +16 -10
  5. mindsdb/api/a2a/common/server/server.py +7 -3
  6. mindsdb/api/a2a/common/server/task_manager.py +12 -5
  7. mindsdb/api/a2a/common/types.py +66 -0
  8. mindsdb/api/a2a/task_manager.py +65 -17
  9. mindsdb/api/common/middleware.py +10 -12
  10. mindsdb/api/executor/command_executor.py +51 -40
  11. mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
  12. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -13
  13. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +101 -49
  14. mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
  15. mindsdb/api/executor/datahub/datanodes/system_tables.py +3 -2
  16. mindsdb/api/executor/exceptions.py +29 -10
  17. mindsdb/api/executor/planner/plan_join.py +17 -3
  18. mindsdb/api/executor/planner/query_prepare.py +2 -20
  19. mindsdb/api/executor/sql_query/sql_query.py +74 -74
  20. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
  21. mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
  22. mindsdb/api/executor/utilities/functions.py +6 -6
  23. mindsdb/api/executor/utilities/sql.py +37 -20
  24. mindsdb/api/http/gui.py +5 -11
  25. mindsdb/api/http/initialize.py +75 -61
  26. mindsdb/api/http/namespaces/agents.py +10 -15
  27. mindsdb/api/http/namespaces/analysis.py +13 -20
  28. mindsdb/api/http/namespaces/auth.py +1 -1
  29. mindsdb/api/http/namespaces/chatbots.py +0 -5
  30. mindsdb/api/http/namespaces/config.py +15 -11
  31. mindsdb/api/http/namespaces/databases.py +140 -201
  32. mindsdb/api/http/namespaces/file.py +17 -4
  33. mindsdb/api/http/namespaces/handlers.py +17 -7
  34. mindsdb/api/http/namespaces/knowledge_bases.py +28 -7
  35. mindsdb/api/http/namespaces/models.py +94 -126
  36. mindsdb/api/http/namespaces/projects.py +13 -22
  37. mindsdb/api/http/namespaces/sql.py +33 -25
  38. mindsdb/api/http/namespaces/tab.py +27 -37
  39. mindsdb/api/http/namespaces/views.py +1 -1
  40. mindsdb/api/http/start.py +16 -10
  41. mindsdb/api/mcp/__init__.py +2 -1
  42. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
  43. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
  44. mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
  45. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +8 -2
  46. mindsdb/integrations/handlers/byom_handler/byom_handler.py +165 -190
  47. mindsdb/integrations/handlers/databricks_handler/databricks_handler.py +98 -46
  48. mindsdb/integrations/handlers/druid_handler/druid_handler.py +32 -40
  49. mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
  50. mindsdb/integrations/handlers/gitlab_handler/gitlab_handler.py +5 -2
  51. mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
  52. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +438 -100
  53. mindsdb/integrations/handlers/mssql_handler/requirements_odbc.txt +3 -0
  54. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +235 -3
  55. mindsdb/integrations/handlers/oracle_handler/__init__.py +2 -0
  56. mindsdb/integrations/handlers/oracle_handler/connection_args.py +7 -1
  57. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +321 -16
  58. mindsdb/integrations/handlers/oracle_handler/requirements.txt +1 -1
  59. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +14 -2
  60. mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
  61. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
  62. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  63. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  64. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
  65. mindsdb/integrations/handlers/zendesk_handler/zendesk_tables.py +144 -111
  66. mindsdb/integrations/libs/api_handler.py +10 -10
  67. mindsdb/integrations/libs/base.py +4 -4
  68. mindsdb/integrations/libs/llm/utils.py +2 -2
  69. mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
  70. mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
  71. mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
  72. mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
  73. mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
  74. mindsdb/integrations/libs/process_cache.py +132 -140
  75. mindsdb/integrations/libs/response.py +18 -12
  76. mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
  77. mindsdb/integrations/utilities/files/file_reader.py +6 -7
  78. mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/__init__.py +1 -0
  79. mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/snowflake_jwt_gen.py +151 -0
  80. mindsdb/integrations/utilities/rag/config_loader.py +37 -26
  81. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +83 -30
  82. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
  83. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
  84. mindsdb/integrations/utilities/rag/settings.py +58 -133
  85. mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
  86. mindsdb/interfaces/agents/agents_controller.py +2 -3
  87. mindsdb/interfaces/agents/constants.py +0 -2
  88. mindsdb/interfaces/agents/litellm_server.py +34 -58
  89. mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
  90. mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
  91. mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
  92. mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
  93. mindsdb/interfaces/chatbot/polling.py +30 -18
  94. mindsdb/interfaces/data_catalog/data_catalog_loader.py +16 -17
  95. mindsdb/interfaces/data_catalog/data_catalog_reader.py +15 -4
  96. mindsdb/interfaces/database/data_handlers_cache.py +190 -0
  97. mindsdb/interfaces/database/database.py +3 -3
  98. mindsdb/interfaces/database/integrations.py +7 -110
  99. mindsdb/interfaces/database/projects.py +2 -6
  100. mindsdb/interfaces/database/views.py +1 -4
  101. mindsdb/interfaces/file/file_controller.py +6 -6
  102. mindsdb/interfaces/functions/controller.py +1 -1
  103. mindsdb/interfaces/functions/to_markdown.py +2 -2
  104. mindsdb/interfaces/jobs/jobs_controller.py +5 -9
  105. mindsdb/interfaces/jobs/scheduler.py +3 -9
  106. mindsdb/interfaces/knowledge_base/controller.py +244 -128
  107. mindsdb/interfaces/knowledge_base/evaluate.py +36 -41
  108. mindsdb/interfaces/knowledge_base/executor.py +11 -0
  109. mindsdb/interfaces/knowledge_base/llm_client.py +51 -17
  110. mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
  111. mindsdb/interfaces/model/model_controller.py +172 -168
  112. mindsdb/interfaces/query_context/context_controller.py +14 -2
  113. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +10 -14
  114. mindsdb/interfaces/skills/retrieval_tool.py +43 -50
  115. mindsdb/interfaces/skills/skill_tool.py +2 -2
  116. mindsdb/interfaces/skills/skills_controller.py +1 -4
  117. mindsdb/interfaces/skills/sql_agent.py +25 -19
  118. mindsdb/interfaces/storage/db.py +16 -6
  119. mindsdb/interfaces/storage/fs.py +114 -169
  120. mindsdb/interfaces/storage/json.py +19 -18
  121. mindsdb/interfaces/tabs/tabs_controller.py +49 -72
  122. mindsdb/interfaces/tasks/task_monitor.py +3 -9
  123. mindsdb/interfaces/tasks/task_thread.py +7 -9
  124. mindsdb/interfaces/triggers/trigger_task.py +7 -13
  125. mindsdb/interfaces/triggers/triggers_controller.py +47 -52
  126. mindsdb/migrations/migrate.py +16 -16
  127. mindsdb/utilities/api_status.py +58 -0
  128. mindsdb/utilities/config.py +68 -2
  129. mindsdb/utilities/exception.py +40 -1
  130. mindsdb/utilities/fs.py +0 -1
  131. mindsdb/utilities/hooks/profiling.py +17 -14
  132. mindsdb/utilities/json_encoder.py +24 -10
  133. mindsdb/utilities/langfuse.py +40 -45
  134. mindsdb/utilities/log.py +272 -0
  135. mindsdb/utilities/ml_task_queue/consumer.py +52 -58
  136. mindsdb/utilities/ml_task_queue/producer.py +26 -30
  137. mindsdb/utilities/render/sqlalchemy_render.py +22 -20
  138. mindsdb/utilities/starters.py +0 -10
  139. mindsdb/utilities/utils.py +2 -2
  140. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/METADATA +293 -276
  141. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/RECORD +144 -158
  142. mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
  143. mindsdb/api/postgres/__init__.py +0 -0
  144. mindsdb/api/postgres/postgres_proxy/__init__.py +0 -0
  145. mindsdb/api/postgres/postgres_proxy/executor/__init__.py +0 -1
  146. mindsdb/api/postgres/postgres_proxy/executor/executor.py +0 -189
  147. mindsdb/api/postgres/postgres_proxy/postgres_packets/__init__.py +0 -0
  148. mindsdb/api/postgres/postgres_proxy/postgres_packets/errors.py +0 -322
  149. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_fields.py +0 -34
  150. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message.py +0 -31
  151. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +0 -1265
  152. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_identifiers.py +0 -31
  153. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +0 -253
  154. mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +0 -477
  155. mindsdb/api/postgres/postgres_proxy/utilities/__init__.py +0 -10
  156. mindsdb/api/postgres/start.py +0 -11
  157. mindsdb/integrations/handlers/mssql_handler/tests/__init__.py +0 -0
  158. mindsdb/integrations/handlers/mssql_handler/tests/test_mssql_handler.py +0 -169
  159. mindsdb/integrations/handlers/oracle_handler/tests/__init__.py +0 -0
  160. mindsdb/integrations/handlers/oracle_handler/tests/test_oracle_handler.py +0 -32
  161. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/WHEEL +0 -0
  162. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/licenses/LICENSE +0 -0
  163. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/top_level.txt +0 -0
@@ -10,7 +10,6 @@ from pydantic import BaseModel, ValidationError
10
10
  from sqlalchemy.orm.attributes import flag_modified
11
11
 
12
12
  from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
13
- from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
14
13
  from mindsdb_sql_parser import parse_sql
15
14
 
16
15
  from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
@@ -22,12 +21,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
22
21
  TableField,
23
22
  VectorStoreHandler,
24
23
  )
25
- from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
26
- from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
27
24
  from mindsdb.integrations.utilities.handler_utils import get_api_key
28
- from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
29
- construct_model_from_args,
30
- )
25
+ from mindsdb.integrations.utilities.handlers.auth_utilities.snowflake import get_validated_jwt
31
26
 
32
27
  from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS, MAX_INSERT_BATCH_SIZE
33
28
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -47,6 +42,7 @@ from mindsdb.api.executor.command_executor import ExecuteCommands
47
42
  from mindsdb.api.executor.utilities.sql import query_df
48
43
  from mindsdb.utilities import log
49
44
  from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker
45
+ from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
50
46
 
51
47
  logger = log.getLogger(__name__)
52
48
 
@@ -56,6 +52,7 @@ class KnowledgeBaseInputParams(BaseModel):
56
52
  content_columns: List[str] | None = None
57
53
  id_column: str | None = None
58
54
  kb_no_upsert: bool = False
55
+ kb_skip_existing: bool = False
59
56
  embedding_model: Dict[Text, Any] | None = None
60
57
  is_sparse: bool = False
61
58
  vector_size: int | None = None
@@ -76,6 +73,10 @@ def get_model_params(model_params: dict, default_config_key: str):
76
73
  if not isinstance(model_params, dict):
77
74
  raise ValueError("Model parameters must be passed as a JSON object")
78
75
 
76
+ # if provider mismatches - don't use default values
77
+ if "provider" in model_params and model_params["provider"] != combined_model_params.get("provider"):
78
+ return model_params
79
+
79
80
  combined_model_params.update(model_params)
80
81
 
81
82
  combined_model_params.pop("use_default_llm", None)
@@ -83,9 +84,9 @@ def get_model_params(model_params: dict, default_config_key: str):
83
84
  return combined_model_params
84
85
 
85
86
 
86
- def get_embedding_model_from_params(embedding_model_params: dict):
87
+ def adapt_embedding_model_params(embedding_model_params: dict):
87
88
  """
88
- Create embedding model from parameters.
89
+ Prepare parameters for embedding model.
89
90
  """
90
91
  params_copy = copy.deepcopy(embedding_model_params)
91
92
  provider = params_copy.pop("provider", None).lower()
@@ -106,7 +107,7 @@ def get_embedding_model_from_params(embedding_model_params: dict):
106
107
  params_copy.pop("api_key", None)
107
108
  params_copy["model"] = params_copy.pop("model_name", None)
108
109
 
109
- return construct_model_from_args(params_copy)
110
+ return params_copy
110
111
 
111
112
 
112
113
  def get_reranking_model_from_params(reranking_model_params: dict):
@@ -146,6 +147,28 @@ def to_json(obj):
146
147
  return obj
147
148
 
148
149
 
150
+ def rotate_provider_api_key(params):
151
+ """
152
+ Check api key for specific providers. At the moment it checks and updated jwt token of snowflake provider
153
+ :param params: input params, can be modified by this function
154
+ :return: a new api key if it is refreshed
155
+ """
156
+ provider = params.get("provider").lower()
157
+
158
+ if provider == "snowflake":
159
+ api_key = params.get("api_key")
160
+ api_key2 = get_validated_jwt(
161
+ api_key,
162
+ account=params.get("snowflake_account_id"),
163
+ user=params.get("user"),
164
+ private_key=params.get("private_key"),
165
+ )
166
+ if api_key2 != api_key:
167
+ # update keys
168
+ params["api_key"] = api_key2
169
+ return api_key2
170
+
171
+
149
172
  class KnowledgeBaseTable:
150
173
  """
151
174
  Knowledge base table interface
@@ -198,6 +221,22 @@ class KnowledgeBaseTable:
198
221
  executor = KnowledgeBaseQueryExecutor(self)
199
222
  df = executor.run(query)
200
223
 
224
+ # copy metadata to columns
225
+ if "metadata" in df.columns:
226
+ meta_columns = self._get_allowed_metadata_columns()
227
+ if meta_columns:
228
+ meta_data = pd.json_normalize(df["metadata"])
229
+ # exclude absent columns and used colunns
230
+ df_columns = list(df.columns)
231
+ meta_columns = list(set(meta_columns).intersection(meta_data.columns).difference(df_columns))
232
+
233
+ # add columns
234
+ df = df.join(meta_data[meta_columns])
235
+
236
+ # put metadata in the end
237
+ df_columns.remove("metadata")
238
+ df = df[df_columns + meta_columns + ["metadata"]]
239
+
201
240
  if (
202
241
  query_copy.group_by is not None
203
242
  or query_copy.order_by is not None
@@ -265,9 +304,9 @@ class KnowledgeBaseTable:
265
304
  gt_filtering = True
266
305
  logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
267
306
  except (ValueError, TypeError) as e:
268
- error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
307
+ error_msg = f"Invalid relevance_threshold value: {item.value}. {e}"
269
308
  logger.error(error_msg)
270
- raise ValueError(error_msg)
309
+ raise ValueError(error_msg) from e
271
310
  elif (item.column == "relevance") and (item.op.value not in relevance_threshold_allowed_operators):
272
311
  raise ValueError(
273
312
  f"Invalid operator for relevance: {item.op.value}. Only the following operators are allowed: "
@@ -318,13 +357,20 @@ class KnowledgeBaseTable:
318
357
  self.addapt_conditions_columns(conditions)
319
358
 
320
359
  # Set default limit if query is present
360
+ limit = query.limit.value if query.limit is not None else None
321
361
  if query_text is not None:
322
- limit = query.limit.value if query.limit is not None else None
323
362
  if limit is None:
324
363
  limit = 10
325
364
  elif limit > 100:
326
365
  limit = 100
327
- query.limit = Constant(limit)
366
+
367
+ if not disable_reranking:
368
+ # expand limit, get more records before reranking usage:
369
+ # get twice size of input but not greater than 30
370
+ query_limit = min(limit * 2, limit + 30)
371
+ else:
372
+ query_limit = limit
373
+ query.limit = Constant(query_limit)
328
374
 
329
375
  allowed_metadata_columns = self._get_allowed_metadata_columns()
330
376
  df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
@@ -375,11 +421,13 @@ class KnowledgeBaseTable:
375
421
 
376
422
  # Check if we have a rerank_model configured in KB params
377
423
  df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)
424
+ if limit is not None:
425
+ df = df[:limit]
378
426
 
379
427
  # if relevance filtering method is strictly GREATER THAN we filter the df
380
428
  if gt_filtering:
381
429
  relevance_scores = TableField.RELEVANCE.value
382
- df = df[relevance_scores > relevance_threshold]
430
+ df = df[df[relevance_scores] > relevance_threshold]
383
431
 
384
432
  return df
385
433
 
@@ -397,6 +445,7 @@ class KnowledgeBaseTable:
397
445
  return [col.lower() for col in columns]
398
446
 
399
447
  def score_documents(self, query_text, documents, reranking_model_params):
448
+ rotate_provider_api_key(reranking_model_params)
400
449
  reranker = get_reranking_model_from_params(reranking_model_params)
401
450
  return reranker.get_scores(query_text, documents)
402
451
 
@@ -407,7 +456,15 @@ class KnowledgeBaseTable:
407
456
  if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
408
457
  # Use reranker for relevance score
409
458
 
410
- logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
459
+ new_api_key = rotate_provider_api_key(reranking_model_params)
460
+ if new_api_key:
461
+ # update key
462
+ if "reranking_model" not in self._kb.params:
463
+ self._kb.params["reranking_model"] = {}
464
+ self._kb.params["reranking_model"]["api_key"] = new_api_key
465
+ flag_modified(self._kb, "params")
466
+ db.session.commit()
467
+
411
468
  # Apply custom filtering threshold if provided
412
469
  if relevance_threshold is not None:
413
470
  reranking_model_params["filtering_threshold"] = relevance_threshold
@@ -424,7 +481,6 @@ class KnowledgeBaseTable:
424
481
  # Filter by threshold
425
482
  scores_array = np.array(scores)
426
483
  df = df[scores_array >= reranker.filtering_threshold]
427
- logger.debug(f"Applied reranking with params: {reranking_model_params}")
428
484
 
429
485
  elif "distance" in df.columns:
430
486
  # Calculate relevance from distance
@@ -678,6 +734,25 @@ class KnowledgeBaseTable:
678
734
  logger.warning("No valid content found in any content columns")
679
735
  return
680
736
 
737
+ # Check if we should skip existing items (before calculating embeddings)
738
+ if params is not None and params.get("kb_skip_existing", False):
739
+ logger.debug(f"Checking for existing items to skip before processing {len(df)} items")
740
+ db_handler = self.get_vector_db()
741
+
742
+ # Get list of IDs from current batch
743
+ current_ids = df[TableField.ID.value].dropna().astype(str).tolist()
744
+ if current_ids:
745
+ # Check which IDs already exist
746
+ existing_ids = db_handler.check_existing_ids(self._kb.vector_database_table, current_ids)
747
+ if existing_ids:
748
+ # Filter out existing items
749
+ df = df[~df[TableField.ID.value].astype(str).isin(existing_ids)]
750
+ logger.info(f"Skipped {len(existing_ids)} existing items, processing {len(df)} new items")
751
+
752
+ if df.empty:
753
+ logger.info("All items already exist, nothing to insert")
754
+ return
755
+
681
756
  # add embeddings and send to vector db
682
757
  df_emb = self._df_to_embeddings(df)
683
758
  df = pd.concat([df, df_emb], axis=1)
@@ -842,10 +917,12 @@ class KnowledgeBaseTable:
842
917
  model_id = self._kb.embedding_model_id
843
918
 
844
919
  if model_id is None:
845
- # call litellm handler
846
920
  messages = list(df[TableField.CONTENT.value])
847
921
  embedding_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
848
- results = self.call_litellm_embedding(self.session, embedding_params, messages)
922
+
923
+ llm_client = LLMClient(embedding_params, session=self.session)
924
+ results = llm_client.embeddings(messages)
925
+
849
926
  results = [[val] for val in results]
850
927
  return pd.DataFrame(results, columns=[TableField.EMBEDDINGS.value])
851
928
 
@@ -915,7 +992,12 @@ class KnowledgeBaseTable:
915
992
  ValueError: If the configuration is invalid or required components are missing
916
993
  """
917
994
  # Get embedding model from knowledge base
918
- embeddings_model = None
995
+ from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
996
+ construct_model_from_args,
997
+ )
998
+ from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
999
+ from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
1000
+
919
1001
  embedding_model_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
920
1002
  if self._kb.embedding_model:
921
1003
  # Extract embedding model args from knowledge base table
@@ -924,7 +1006,7 @@ class KnowledgeBaseTable:
924
1006
  embeddings_model = construct_model_from_args(embedding_args)
925
1007
  logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
926
1008
  elif embedding_model_params:
927
- embeddings_model = get_embedding_model_from_params(embedding_model_params)
1009
+ embeddings_model = construct_model_from_args(adapt_embedding_model_params(embedding_model_params))
928
1010
  logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
929
1011
  else:
930
1012
  embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
@@ -952,8 +1034,8 @@ class KnowledgeBaseTable:
952
1034
  return rag
953
1035
 
954
1036
  except Exception as e:
955
- logger.error(f"Error building RAG pipeline: {str(e)}")
956
- raise ValueError(f"Failed to build RAG pipeline: {str(e)}")
1037
+ logger.exception("Error building RAG pipeline:")
1038
+ raise ValueError(f"Failed to build RAG pipeline: {str(e)}") from e
957
1039
 
958
1040
  def _parse_metadata(self, base_metadata):
959
1041
  """Helper function to robustly parse metadata string to dict"""
@@ -1026,6 +1108,26 @@ class KnowledgeBaseController:
1026
1108
  def __init__(self, session) -> None:
1027
1109
  self.session = session
1028
1110
 
1111
+ def _check_kb_input_params(self, params):
1112
+ # check names and types KB params
1113
+ try:
1114
+ KnowledgeBaseInputParams.model_validate(params)
1115
+ except ValidationError as e:
1116
+ problems = []
1117
+ for error in e.errors():
1118
+ parameter = ".".join([str(i) for i in error["loc"]])
1119
+ param_type = error["type"]
1120
+ if param_type == "extra_forbidden":
1121
+ msg = f"Parameter '{parameter}' is not allowed"
1122
+ else:
1123
+ msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
1124
+ problems.append(msg)
1125
+
1126
+ msg = "\n".join(problems)
1127
+ if len(problems) > 1:
1128
+ msg = "\n" + msg
1129
+ raise ValueError(f"Problem with knowledge base parameters: {msg}") from e
1130
+
1029
1131
  def add(
1030
1132
  self,
1031
1133
  name: str,
@@ -1043,36 +1145,18 @@ class KnowledgeBaseController:
1043
1145
  :param is_sparse: Whether to use sparse vectors for embeddings
1044
1146
  :param vector_size: Optional size specification for vectors, required when is_sparse=True
1045
1147
  """
1046
- if not name.islower():
1047
- raise ValueError(f"The name must be in lower case: {name}")
1048
1148
 
1049
1149
  # fill variables
1050
1150
  params = variables_controller.fill_parameters(params)
1051
1151
 
1052
- try:
1053
- KnowledgeBaseInputParams.model_validate(params)
1054
- except ValidationError as e:
1055
- problems = []
1056
- for error in e.errors():
1057
- parameter = ".".join([str(i) for i in error["loc"]])
1058
- param_type = error["type"]
1059
- if param_type == "extra_forbidden":
1060
- msg = f"Parameter '{parameter}' is not allowed"
1061
- else:
1062
- msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
1063
- problems.append(msg)
1064
-
1065
- msg = "\n".join(problems)
1066
- if len(problems) > 1:
1067
- msg = "\n" + msg
1068
- raise ValueError(f"Problem with knowledge base parameters: {msg}")
1069
-
1070
1152
  # Validate preprocessing config first if provided
1071
1153
  if preprocessing_config is not None:
1072
1154
  PreprocessingConfig(**preprocessing_config) # Validate before storing
1073
1155
  params = params or {}
1074
1156
  params["preprocessing"] = preprocessing_config
1075
1157
 
1158
+ self._check_kb_input_params(params)
1159
+
1076
1160
  # Check if vector_size is provided when using sparse vectors
1077
1161
  is_sparse = params.get("is_sparse")
1078
1162
  vector_size = params.get("vector_size")
@@ -1083,8 +1167,6 @@ class KnowledgeBaseController:
1083
1167
  project = self.session.database_controller.get_project(project_name)
1084
1168
  project_id = project.id
1085
1169
 
1086
- # not difference between cases in sql
1087
- name = name.lower()
1088
1170
  # check if knowledge base already exists
1089
1171
  kb = self.get(name, project_id)
1090
1172
  if kb is not None:
@@ -1096,42 +1178,25 @@ class KnowledgeBaseController:
1096
1178
  params["embedding_model"] = embedding_params
1097
1179
 
1098
1180
  # if model_name is None: # Legacy
1099
- model_name = self._create_embedding_model(
1181
+ self._check_embedding_model(
1100
1182
  project.name,
1101
1183
  params=embedding_params,
1102
1184
  kb_name=name,
1103
1185
  )
1104
- if model_name is not None:
1105
- params["created_embedding_model"] = model_name
1106
-
1107
- embedding_model_id = None
1108
- if model_name is not None:
1109
- model = self.session.model_controller.get_model(name=model_name, project_name=project.name)
1110
- model_record = db.Predictor.query.get(model["id"])
1111
- embedding_model_id = model_record.id
1112
-
1113
- if model_record.learn_args.get("using", {}).get("sparse"):
1114
- is_sparse = True
1115
1186
 
1116
1187
  # if params.get("reranking_model", {}) is bool and False we evaluate it to empty dictionary
1117
1188
  reranking_model_params = params.get("reranking_model", {})
1118
1189
 
1119
1190
  if isinstance(reranking_model_params, bool) and not reranking_model_params:
1120
1191
  params["reranking_model"] = {}
1121
- # if params.get("reranking_model", {}) is string and false in any case we evaluate it to empty dictionary
1122
- if isinstance(reranking_model_params, str) and reranking_model_params.lower() == "false":
1123
- params["reranking_model"] = {}
1124
1192
 
1125
1193
  reranking_model_params = get_model_params(reranking_model_params, "default_reranking_model")
1126
1194
  params["reranking_model"] = reranking_model_params
1127
1195
  if reranking_model_params:
1128
1196
  # Get reranking model from params.
1129
1197
  # This is called here to check validaity of the parameters.
1130
- try:
1131
- reranker = get_reranking_model_from_params(reranking_model_params)
1132
- reranker.get_scores("test", ["test"])
1133
- except (ValueError, RuntimeError) as e:
1134
- raise RuntimeError(f"Problem with reranker config: {e}")
1198
+ rotate_provider_api_key(reranking_model_params)
1199
+ self._test_reranking(reranking_model_params)
1135
1200
 
1136
1201
  # search for the vector database table
1137
1202
  if storage is None:
@@ -1184,13 +1249,115 @@ class KnowledgeBaseController:
1184
1249
  project_id=project_id,
1185
1250
  vector_database_id=vector_database_id,
1186
1251
  vector_database_table=vector_table_name,
1187
- embedding_model_id=embedding_model_id,
1252
+ embedding_model_id=None,
1188
1253
  params=params,
1189
1254
  )
1190
1255
  db.session.add(kb)
1191
1256
  db.session.commit()
1192
1257
  return kb
1193
1258
 
1259
+ def update(
1260
+ self,
1261
+ name: str,
1262
+ project_name: str,
1263
+ params: dict,
1264
+ preprocessing_config: Optional[dict] = None,
1265
+ ) -> db.KnowledgeBase:
1266
+ """
1267
+ Update the knowledge base
1268
+ :param name: The name of the knowledge base
1269
+ :param project_name: Current project name
1270
+ :param params: The parameters to update
1271
+ :param preprocessing_config: Optional preprocessing configuration to validate and store
1272
+ """
1273
+
1274
+ # fill variables
1275
+ params = variables_controller.fill_parameters(params)
1276
+
1277
+ # Validate preprocessing config first if provided
1278
+ if preprocessing_config is not None:
1279
+ PreprocessingConfig(**preprocessing_config) # Validate before storing
1280
+ params = params or {}
1281
+ params["preprocessing"] = preprocessing_config
1282
+
1283
+ self._check_kb_input_params(params)
1284
+
1285
+ # get project id
1286
+ project = self.session.database_controller.get_project(project_name)
1287
+ project_id = project.id
1288
+
1289
+ # get existed KB
1290
+ kb = self.get(name.lower(), project_id)
1291
+ if kb is None:
1292
+ raise EntityNotExistsError("Knowledge base doesn't exists", name)
1293
+
1294
+ if "embedding_model" in params:
1295
+ new_config = params["embedding_model"]
1296
+ # update embedding
1297
+ embed_params = kb.params.get("embedding_model", {})
1298
+ if not embed_params:
1299
+ # maybe old version of KB
1300
+ raise ValueError("No embedding config to update")
1301
+
1302
+ # some parameters are not allowed to update
1303
+ for key in ("provider", "model_name"):
1304
+ if key in new_config and new_config[key] != embed_params.get(key):
1305
+ raise ValueError(f"You can't update '{key}' setting")
1306
+
1307
+ embed_params.update(new_config)
1308
+
1309
+ self._check_embedding_model(
1310
+ project.name,
1311
+ params=embed_params,
1312
+ kb_name=name,
1313
+ )
1314
+ kb.params["embedding_model"] = embed_params
1315
+
1316
+ if "reranking_model" in params:
1317
+ new_config = params["reranking_model"]
1318
+ # update embedding
1319
+ rerank_params = kb.params.get("reranking_model", {})
1320
+
1321
+ if new_config is False:
1322
+ # disable reranking
1323
+ rerank_params = {}
1324
+ elif "provider" in new_config and new_config["provider"] != rerank_params.get("provider"):
1325
+ # use new config (and include default config)
1326
+ rerank_params = get_model_params(new_config, "default_reranking_model")
1327
+ else:
1328
+ # update current config
1329
+ rerank_params.update(new_config)
1330
+
1331
+ if rerank_params:
1332
+ self._test_reranking(rerank_params)
1333
+
1334
+ kb.params["reranking_model"] = rerank_params
1335
+
1336
+ # update other keys
1337
+ for key in ["id_column", "metadata_columns", "content_columns", "preprocessing"]:
1338
+ if key in params:
1339
+ kb.params[key] = params[key]
1340
+
1341
+ flag_modified(kb, "params")
1342
+ db.session.commit()
1343
+
1344
+ return self.get(name.lower(), project_id)
1345
+
1346
+ def _test_reranking(self, params):
1347
+ try:
1348
+ reranker = get_reranking_model_from_params(params)
1349
+ reranker.get_scores("test", ["test"])
1350
+ except (ValueError, RuntimeError) as e:
1351
+ if params["provider"] in ("azure_openai", "openai") and params.get("method") != "no-logprobs":
1352
+ # check with no-logprobs
1353
+ params["method"] = "no-logprobs"
1354
+ self._test_reranking(params)
1355
+ logger.warning(
1356
+ f"logprobs is not supported for this model: {params.get('model_name')}. using no-logprobs mode"
1357
+ )
1358
+ else:
1359
+ raise RuntimeError(f"Problem with reranker config: {e}") from e
1360
+
1194
1361
  def _create_persistent_pgvector(self, params=None):
1195
1362
  """Create default vector database for knowledge base, if not specified"""
1196
1363
  vector_store_name = "kb_pgvector_store"
@@ -1217,11 +1384,11 @@ class KnowledgeBaseController:
1217
1384
  self.session.integration_controller.add(vector_store_name, engine, connection_args)
1218
1385
  return vector_store_name
1219
1386
 
1220
- def _create_embedding_model(self, project_name, engine="openai", params: dict = None, kb_name=""):
1221
- """create a default embedding model for knowledge base, if not specified"""
1222
- model_name = f"kb_embedding_{kb_name}"
1387
+ def _check_embedding_model(self, project_name, params: dict = None, kb_name=""):
1388
+ """check embedding model for knowledge base"""
1223
1389
 
1224
- # drop if exists - parameters can be different
1390
+ # if mindsdb model from old KB exists - drop it
1391
+ model_name = f"kb_embedding_{kb_name}"
1225
1392
  try:
1226
1393
  model = self.session.model_controller.get_model(model_name, project_name=project_name)
1227
1394
  if model is not None:
@@ -1233,63 +1400,18 @@ class KnowledgeBaseController:
1233
1400
  raise ValueError("'provider' parameter is required for embedding model")
1234
1401
 
1235
1402
  # check available providers
1236
- avail_providers = ("openai", "azure_openai", "bedrock", "gemini", "google")
1403
+ avail_providers = ("openai", "azure_openai", "bedrock", "gemini", "google", "ollama")
1237
1404
  if params["provider"] not in avail_providers:
1238
1405
  raise ValueError(
1239
1406
  f"Wrong embedding provider: {params['provider']}. Available providers: {', '.join(avail_providers)}"
1240
1407
  )
1241
1408
 
1242
- if params["provider"] not in ("openai", "azure_openai"):
1243
- # try use litellm
1244
- try:
1245
- KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
1246
- except Exception as e:
1247
- raise RuntimeError(f"Problem with embedding model config: {e}")
1248
- return
1249
-
1250
- params = copy.deepcopy(params)
1251
- if "provider" in params:
1252
- engine = params.pop("provider").lower()
1253
-
1254
- api_key = get_api_key(engine, params, strict=False)
1255
- if api_key is None:
1256
- if "api_key" in params:
1257
- params.pop("api_key")
1258
- else:
1259
- raise ValueError("'api_key' parameter is required for embedding model")
1260
-
1261
- if engine == "azure_openai":
1262
- engine = "openai"
1263
- params["provider"] = "azure"
1264
-
1265
- if engine == "openai":
1266
- if "question_column" not in params:
1267
- params["question_column"] = "content"
1268
- if api_key:
1269
- params[f"{engine}_api_key"] = api_key
1270
- if "api_key" in params:
1271
- params.pop("api_key")
1272
- if "base_url" in params:
1273
- params["api_base"] = params.pop("base_url")
1274
-
1275
- params["engine"] = engine
1276
- params["join_learn_process"] = True
1277
- params["mode"] = "embedding"
1278
-
1279
- # Include API key if provided.
1280
- statement = CreatePredictor(
1281
- name=Identifier(parts=[project_name, model_name]),
1282
- using=params,
1283
- targets=[Identifier(parts=[TableField.EMBEDDINGS.value])],
1284
- )
1409
+ llm_client = LLMClient(params, session=self.session)
1285
1410
 
1286
- command_executor = ExecuteCommands(self.session)
1287
- resp = command_executor.answer_create_predictor(statement, project_name)
1288
- # check model status
1289
- record = resp.data.records[0]
1290
- if record["STATUS"] == "error":
1291
- raise ValueError("Embedding model error:" + record["ERROR"])
1292
- return model_name
1411
+ try:
1412
+ llm_client.embeddings(["test"])
1413
+ except Exception as e:
1414
+ raise RuntimeError(f"Problem with embedding model config: {e}") from e
1293
1415
 
1294
1416
  def delete(self, name: str, project_name: int, if_exists: bool = False) -> None:
1295
1417
  """
@@ -1297,8 +1419,8 @@ class KnowledgeBaseController:
1297
1419
  """
1298
1420
  try:
1299
1421
  project = self.session.database_controller.get_project(project_name)
1300
- except ValueError:
1301
- raise ValueError(f"Project not found: {project_name}")
1422
+ except ValueError as e:
1423
+ raise ValueError(f"Project not found: {project_name}") from e
1302
1424
  project_id = project.id
1303
1425
 
1304
1426
  # check if knowledge base exists
@@ -1395,12 +1517,6 @@ class KnowledgeBaseController:
1395
1517
  kb_table = self.get_table(table_name, project_id)
1396
1518
  kb_table.create_index()
1397
1519
 
1398
- def update(self, name: str, project_id: int, **kwargs) -> db.KnowledgeBase:
1399
- """
1400
- Update a knowledge base record
1401
- """
1402
- raise NotImplementedError()
1403
-
1404
1520
  def evaluate(self, table_name: str, project_name: str, params: dict = None) -> pd.DataFrame:
1405
1521
  """
1406
1522
  Run evaluate and/or create test data for evaluation