MindsDB 25.9.2.0a1__py3-none-any.whl → 25.10.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +40 -29
- mindsdb/api/a2a/__init__.py +1 -1
- mindsdb/api/a2a/agent.py +16 -10
- mindsdb/api/a2a/common/server/server.py +7 -3
- mindsdb/api/a2a/common/server/task_manager.py +12 -5
- mindsdb/api/a2a/common/types.py +66 -0
- mindsdb/api/a2a/task_manager.py +65 -17
- mindsdb/api/common/middleware.py +10 -12
- mindsdb/api/executor/command_executor.py +51 -40
- mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -13
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +101 -49
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
- mindsdb/api/executor/datahub/datanodes/system_tables.py +3 -2
- mindsdb/api/executor/exceptions.py +29 -10
- mindsdb/api/executor/planner/plan_join.py +17 -3
- mindsdb/api/executor/planner/query_prepare.py +2 -20
- mindsdb/api/executor/sql_query/sql_query.py +74 -74
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
- mindsdb/api/executor/utilities/functions.py +6 -6
- mindsdb/api/executor/utilities/sql.py +37 -20
- mindsdb/api/http/gui.py +5 -11
- mindsdb/api/http/initialize.py +75 -61
- mindsdb/api/http/namespaces/agents.py +10 -15
- mindsdb/api/http/namespaces/analysis.py +13 -20
- mindsdb/api/http/namespaces/auth.py +1 -1
- mindsdb/api/http/namespaces/chatbots.py +0 -5
- mindsdb/api/http/namespaces/config.py +15 -11
- mindsdb/api/http/namespaces/databases.py +140 -201
- mindsdb/api/http/namespaces/file.py +17 -4
- mindsdb/api/http/namespaces/handlers.py +17 -7
- mindsdb/api/http/namespaces/knowledge_bases.py +28 -7
- mindsdb/api/http/namespaces/models.py +94 -126
- mindsdb/api/http/namespaces/projects.py +13 -22
- mindsdb/api/http/namespaces/sql.py +33 -25
- mindsdb/api/http/namespaces/tab.py +27 -37
- mindsdb/api/http/namespaces/views.py +1 -1
- mindsdb/api/http/start.py +16 -10
- mindsdb/api/mcp/__init__.py +2 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
- mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +8 -2
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +165 -190
- mindsdb/integrations/handlers/databricks_handler/databricks_handler.py +98 -46
- mindsdb/integrations/handlers/druid_handler/druid_handler.py +32 -40
- mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
- mindsdb/integrations/handlers/gitlab_handler/gitlab_handler.py +5 -2
- mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +438 -100
- mindsdb/integrations/handlers/mssql_handler/requirements_odbc.txt +3 -0
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +235 -3
- mindsdb/integrations/handlers/oracle_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/oracle_handler/connection_args.py +7 -1
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +321 -16
- mindsdb/integrations/handlers/oracle_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +14 -2
- mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
- mindsdb/integrations/handlers/zendesk_handler/zendesk_tables.py +144 -111
- mindsdb/integrations/libs/api_handler.py +10 -10
- mindsdb/integrations/libs/base.py +4 -4
- mindsdb/integrations/libs/llm/utils.py +2 -2
- mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
- mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
- mindsdb/integrations/libs/process_cache.py +132 -140
- mindsdb/integrations/libs/response.py +18 -12
- mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
- mindsdb/integrations/utilities/files/file_reader.py +6 -7
- mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/__init__.py +1 -0
- mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/snowflake_jwt_gen.py +151 -0
- mindsdb/integrations/utilities/rag/config_loader.py +37 -26
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +83 -30
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
- mindsdb/integrations/utilities/rag/settings.py +58 -133
- mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
- mindsdb/interfaces/agents/agents_controller.py +2 -3
- mindsdb/interfaces/agents/constants.py +0 -2
- mindsdb/interfaces/agents/litellm_server.py +34 -58
- mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
- mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
- mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
- mindsdb/interfaces/chatbot/polling.py +30 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +16 -17
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +15 -4
- mindsdb/interfaces/database/data_handlers_cache.py +190 -0
- mindsdb/interfaces/database/database.py +3 -3
- mindsdb/interfaces/database/integrations.py +7 -110
- mindsdb/interfaces/database/projects.py +2 -6
- mindsdb/interfaces/database/views.py +1 -4
- mindsdb/interfaces/file/file_controller.py +6 -6
- mindsdb/interfaces/functions/controller.py +1 -1
- mindsdb/interfaces/functions/to_markdown.py +2 -2
- mindsdb/interfaces/jobs/jobs_controller.py +5 -9
- mindsdb/interfaces/jobs/scheduler.py +3 -9
- mindsdb/interfaces/knowledge_base/controller.py +244 -128
- mindsdb/interfaces/knowledge_base/evaluate.py +36 -41
- mindsdb/interfaces/knowledge_base/executor.py +11 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +51 -17
- mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
- mindsdb/interfaces/model/model_controller.py +172 -168
- mindsdb/interfaces/query_context/context_controller.py +14 -2
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +10 -14
- mindsdb/interfaces/skills/retrieval_tool.py +43 -50
- mindsdb/interfaces/skills/skill_tool.py +2 -2
- mindsdb/interfaces/skills/skills_controller.py +1 -4
- mindsdb/interfaces/skills/sql_agent.py +25 -19
- mindsdb/interfaces/storage/db.py +16 -6
- mindsdb/interfaces/storage/fs.py +114 -169
- mindsdb/interfaces/storage/json.py +19 -18
- mindsdb/interfaces/tabs/tabs_controller.py +49 -72
- mindsdb/interfaces/tasks/task_monitor.py +3 -9
- mindsdb/interfaces/tasks/task_thread.py +7 -9
- mindsdb/interfaces/triggers/trigger_task.py +7 -13
- mindsdb/interfaces/triggers/triggers_controller.py +47 -52
- mindsdb/migrations/migrate.py +16 -16
- mindsdb/utilities/api_status.py +58 -0
- mindsdb/utilities/config.py +68 -2
- mindsdb/utilities/exception.py +40 -1
- mindsdb/utilities/fs.py +0 -1
- mindsdb/utilities/hooks/profiling.py +17 -14
- mindsdb/utilities/json_encoder.py +24 -10
- mindsdb/utilities/langfuse.py +40 -45
- mindsdb/utilities/log.py +272 -0
- mindsdb/utilities/ml_task_queue/consumer.py +52 -58
- mindsdb/utilities/ml_task_queue/producer.py +26 -30
- mindsdb/utilities/render/sqlalchemy_render.py +22 -20
- mindsdb/utilities/starters.py +0 -10
- mindsdb/utilities/utils.py +2 -2
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/METADATA +293 -276
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/RECORD +144 -158
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
- mindsdb/api/postgres/__init__.py +0 -0
- mindsdb/api/postgres/postgres_proxy/__init__.py +0 -0
- mindsdb/api/postgres/postgres_proxy/executor/__init__.py +0 -1
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +0 -189
- mindsdb/api/postgres/postgres_proxy/postgres_packets/__init__.py +0 -0
- mindsdb/api/postgres/postgres_proxy/postgres_packets/errors.py +0 -322
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_fields.py +0 -34
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message.py +0 -31
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +0 -1265
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_identifiers.py +0 -31
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +0 -253
- mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +0 -477
- mindsdb/api/postgres/postgres_proxy/utilities/__init__.py +0 -10
- mindsdb/api/postgres/start.py +0 -11
- mindsdb/integrations/handlers/mssql_handler/tests/__init__.py +0 -0
- mindsdb/integrations/handlers/mssql_handler/tests/test_mssql_handler.py +0 -169
- mindsdb/integrations/handlers/oracle_handler/tests/__init__.py +0 -0
- mindsdb/integrations/handlers/oracle_handler/tests/test_oracle_handler.py +0 -32
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/WHEEL +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -10,7 +10,6 @@ from pydantic import BaseModel, ValidationError
|
|
|
10
10
|
from sqlalchemy.orm.attributes import flag_modified
|
|
11
11
|
|
|
12
12
|
from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
|
|
13
|
-
from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
|
|
14
13
|
from mindsdb_sql_parser import parse_sql
|
|
15
14
|
|
|
16
15
|
from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
|
|
@@ -22,12 +21,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
|
|
|
22
21
|
TableField,
|
|
23
22
|
VectorStoreHandler,
|
|
24
23
|
)
|
|
25
|
-
from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
|
|
26
|
-
from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
|
|
27
24
|
from mindsdb.integrations.utilities.handler_utils import get_api_key
|
|
28
|
-
from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
|
|
29
|
-
construct_model_from_args,
|
|
30
|
-
)
|
|
25
|
+
from mindsdb.integrations.utilities.handlers.auth_utilities.snowflake import get_validated_jwt
|
|
31
26
|
|
|
32
27
|
from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS, MAX_INSERT_BATCH_SIZE
|
|
33
28
|
from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
|
|
@@ -47,6 +42,7 @@ from mindsdb.api.executor.command_executor import ExecuteCommands
|
|
|
47
42
|
from mindsdb.api.executor.utilities.sql import query_df
|
|
48
43
|
from mindsdb.utilities import log
|
|
49
44
|
from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker
|
|
45
|
+
from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
|
|
50
46
|
|
|
51
47
|
logger = log.getLogger(__name__)
|
|
52
48
|
|
|
@@ -56,6 +52,7 @@ class KnowledgeBaseInputParams(BaseModel):
|
|
|
56
52
|
content_columns: List[str] | None = None
|
|
57
53
|
id_column: str | None = None
|
|
58
54
|
kb_no_upsert: bool = False
|
|
55
|
+
kb_skip_existing: bool = False
|
|
59
56
|
embedding_model: Dict[Text, Any] | None = None
|
|
60
57
|
is_sparse: bool = False
|
|
61
58
|
vector_size: int | None = None
|
|
@@ -76,6 +73,10 @@ def get_model_params(model_params: dict, default_config_key: str):
|
|
|
76
73
|
if not isinstance(model_params, dict):
|
|
77
74
|
raise ValueError("Model parameters must be passed as a JSON object")
|
|
78
75
|
|
|
76
|
+
# if provider mismatches - don't use default values
|
|
77
|
+
if "provider" in model_params and model_params["provider"] != combined_model_params.get("provider"):
|
|
78
|
+
return model_params
|
|
79
|
+
|
|
79
80
|
combined_model_params.update(model_params)
|
|
80
81
|
|
|
81
82
|
combined_model_params.pop("use_default_llm", None)
|
|
@@ -83,9 +84,9 @@ def get_model_params(model_params: dict, default_config_key: str):
|
|
|
83
84
|
return combined_model_params
|
|
84
85
|
|
|
85
86
|
|
|
86
|
-
def get_embedding_model_from_params(embedding_model_params: dict):
|
|
87
|
+
def adapt_embedding_model_params(embedding_model_params: dict):
|
|
87
88
|
"""
|
|
88
|
-
|
|
89
|
+
Prepare parameters for embedding model.
|
|
89
90
|
"""
|
|
90
91
|
params_copy = copy.deepcopy(embedding_model_params)
|
|
91
92
|
provider = params_copy.pop("provider", None).lower()
|
|
@@ -106,7 +107,7 @@ def get_embedding_model_from_params(embedding_model_params: dict):
|
|
|
106
107
|
params_copy.pop("api_key", None)
|
|
107
108
|
params_copy["model"] = params_copy.pop("model_name", None)
|
|
108
109
|
|
|
109
|
-
return construct_model_from_args(params_copy)
|
|
110
|
+
return params_copy
|
|
110
111
|
|
|
111
112
|
|
|
112
113
|
def get_reranking_model_from_params(reranking_model_params: dict):
|
|
@@ -146,6 +147,28 @@ def to_json(obj):
|
|
|
146
147
|
return obj
|
|
147
148
|
|
|
148
149
|
|
|
150
|
+
def rotate_provider_api_key(params):
|
|
151
|
+
"""
|
|
152
|
+
Check api key for specific providers. At the moment it checks and updated jwt token of snowflake provider
|
|
153
|
+
:param params: input params, can be modified by this function
|
|
154
|
+
:return: a new api key if it is refreshed
|
|
155
|
+
"""
|
|
156
|
+
provider = params.get("provider").lower()
|
|
157
|
+
|
|
158
|
+
if provider == "snowflake":
|
|
159
|
+
api_key = params.get("api_key")
|
|
160
|
+
api_key2 = get_validated_jwt(
|
|
161
|
+
api_key,
|
|
162
|
+
account=params.get("snowflake_account_id"),
|
|
163
|
+
user=params.get("user"),
|
|
164
|
+
private_key=params.get("private_key"),
|
|
165
|
+
)
|
|
166
|
+
if api_key2 != api_key:
|
|
167
|
+
# update keys
|
|
168
|
+
params["api_key"] = api_key2
|
|
169
|
+
return api_key2
|
|
170
|
+
|
|
171
|
+
|
|
149
172
|
class KnowledgeBaseTable:
|
|
150
173
|
"""
|
|
151
174
|
Knowledge base table interface
|
|
@@ -198,6 +221,22 @@ class KnowledgeBaseTable:
|
|
|
198
221
|
executor = KnowledgeBaseQueryExecutor(self)
|
|
199
222
|
df = executor.run(query)
|
|
200
223
|
|
|
224
|
+
# copy metadata to columns
|
|
225
|
+
if "metadata" in df.columns:
|
|
226
|
+
meta_columns = self._get_allowed_metadata_columns()
|
|
227
|
+
if meta_columns:
|
|
228
|
+
meta_data = pd.json_normalize(df["metadata"])
|
|
229
|
+
# exclude absent columns and used colunns
|
|
230
|
+
df_columns = list(df.columns)
|
|
231
|
+
meta_columns = list(set(meta_columns).intersection(meta_data.columns).difference(df_columns))
|
|
232
|
+
|
|
233
|
+
# add columns
|
|
234
|
+
df = df.join(meta_data[meta_columns])
|
|
235
|
+
|
|
236
|
+
# put metadata in the end
|
|
237
|
+
df_columns.remove("metadata")
|
|
238
|
+
df = df[df_columns + meta_columns + ["metadata"]]
|
|
239
|
+
|
|
201
240
|
if (
|
|
202
241
|
query_copy.group_by is not None
|
|
203
242
|
or query_copy.order_by is not None
|
|
@@ -265,9 +304,9 @@ class KnowledgeBaseTable:
|
|
|
265
304
|
gt_filtering = True
|
|
266
305
|
logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
|
|
267
306
|
except (ValueError, TypeError) as e:
|
|
268
|
-
error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
|
|
307
|
+
error_msg = f"Invalid relevance_threshold value: {item.value}. {e}"
|
|
269
308
|
logger.error(error_msg)
|
|
270
|
-
raise ValueError(error_msg)
|
|
309
|
+
raise ValueError(error_msg) from e
|
|
271
310
|
elif (item.column == "relevance") and (item.op.value not in relevance_threshold_allowed_operators):
|
|
272
311
|
raise ValueError(
|
|
273
312
|
f"Invalid operator for relevance: {item.op.value}. Only the following operators are allowed: "
|
|
@@ -318,13 +357,20 @@ class KnowledgeBaseTable:
|
|
|
318
357
|
self.addapt_conditions_columns(conditions)
|
|
319
358
|
|
|
320
359
|
# Set default limit if query is present
|
|
360
|
+
limit = query.limit.value if query.limit is not None else None
|
|
321
361
|
if query_text is not None:
|
|
322
|
-
limit = query.limit.value if query.limit is not None else None
|
|
323
362
|
if limit is None:
|
|
324
363
|
limit = 10
|
|
325
364
|
elif limit > 100:
|
|
326
365
|
limit = 100
|
|
327
|
-
|
|
366
|
+
|
|
367
|
+
if not disable_reranking:
|
|
368
|
+
# expand limit, get more records before reranking usage:
|
|
369
|
+
# get twice size of input but not greater than 30
|
|
370
|
+
query_limit = min(limit * 2, limit + 30)
|
|
371
|
+
else:
|
|
372
|
+
query_limit = limit
|
|
373
|
+
query.limit = Constant(query_limit)
|
|
328
374
|
|
|
329
375
|
allowed_metadata_columns = self._get_allowed_metadata_columns()
|
|
330
376
|
df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
|
|
@@ -375,11 +421,13 @@ class KnowledgeBaseTable:
|
|
|
375
421
|
|
|
376
422
|
# Check if we have a rerank_model configured in KB params
|
|
377
423
|
df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)
|
|
424
|
+
if limit is not None:
|
|
425
|
+
df = df[:limit]
|
|
378
426
|
|
|
379
427
|
# if relevance filtering method is strictly GREATER THAN we filter the df
|
|
380
428
|
if gt_filtering:
|
|
381
429
|
relevance_scores = TableField.RELEVANCE.value
|
|
382
|
-
df = df[relevance_scores > relevance_threshold]
|
|
430
|
+
df = df[df[relevance_scores] > relevance_threshold]
|
|
383
431
|
|
|
384
432
|
return df
|
|
385
433
|
|
|
@@ -397,6 +445,7 @@ class KnowledgeBaseTable:
|
|
|
397
445
|
return [col.lower() for col in columns]
|
|
398
446
|
|
|
399
447
|
def score_documents(self, query_text, documents, reranking_model_params):
|
|
448
|
+
rotate_provider_api_key(reranking_model_params)
|
|
400
449
|
reranker = get_reranking_model_from_params(reranking_model_params)
|
|
401
450
|
return reranker.get_scores(query_text, documents)
|
|
402
451
|
|
|
@@ -407,7 +456,15 @@ class KnowledgeBaseTable:
|
|
|
407
456
|
if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
|
|
408
457
|
# Use reranker for relevance score
|
|
409
458
|
|
|
410
|
-
|
|
459
|
+
new_api_key = rotate_provider_api_key(reranking_model_params)
|
|
460
|
+
if new_api_key:
|
|
461
|
+
# update key
|
|
462
|
+
if "reranking_model" not in self._kb.params:
|
|
463
|
+
self._kb.params["reranking_model"] = {}
|
|
464
|
+
self._kb.params["reranking_model"]["api_key"] = new_api_key
|
|
465
|
+
flag_modified(self._kb, "params")
|
|
466
|
+
db.session.commit()
|
|
467
|
+
|
|
411
468
|
# Apply custom filtering threshold if provided
|
|
412
469
|
if relevance_threshold is not None:
|
|
413
470
|
reranking_model_params["filtering_threshold"] = relevance_threshold
|
|
@@ -424,7 +481,6 @@ class KnowledgeBaseTable:
|
|
|
424
481
|
# Filter by threshold
|
|
425
482
|
scores_array = np.array(scores)
|
|
426
483
|
df = df[scores_array >= reranker.filtering_threshold]
|
|
427
|
-
logger.debug(f"Applied reranking with params: {reranking_model_params}")
|
|
428
484
|
|
|
429
485
|
elif "distance" in df.columns:
|
|
430
486
|
# Calculate relevance from distance
|
|
@@ -678,6 +734,25 @@ class KnowledgeBaseTable:
|
|
|
678
734
|
logger.warning("No valid content found in any content columns")
|
|
679
735
|
return
|
|
680
736
|
|
|
737
|
+
# Check if we should skip existing items (before calculating embeddings)
|
|
738
|
+
if params is not None and params.get("kb_skip_existing", False):
|
|
739
|
+
logger.debug(f"Checking for existing items to skip before processing {len(df)} items")
|
|
740
|
+
db_handler = self.get_vector_db()
|
|
741
|
+
|
|
742
|
+
# Get list of IDs from current batch
|
|
743
|
+
current_ids = df[TableField.ID.value].dropna().astype(str).tolist()
|
|
744
|
+
if current_ids:
|
|
745
|
+
# Check which IDs already exist
|
|
746
|
+
existing_ids = db_handler.check_existing_ids(self._kb.vector_database_table, current_ids)
|
|
747
|
+
if existing_ids:
|
|
748
|
+
# Filter out existing items
|
|
749
|
+
df = df[~df[TableField.ID.value].astype(str).isin(existing_ids)]
|
|
750
|
+
logger.info(f"Skipped {len(existing_ids)} existing items, processing {len(df)} new items")
|
|
751
|
+
|
|
752
|
+
if df.empty:
|
|
753
|
+
logger.info("All items already exist, nothing to insert")
|
|
754
|
+
return
|
|
755
|
+
|
|
681
756
|
# add embeddings and send to vector db
|
|
682
757
|
df_emb = self._df_to_embeddings(df)
|
|
683
758
|
df = pd.concat([df, df_emb], axis=1)
|
|
@@ -842,10 +917,12 @@ class KnowledgeBaseTable:
|
|
|
842
917
|
model_id = self._kb.embedding_model_id
|
|
843
918
|
|
|
844
919
|
if model_id is None:
|
|
845
|
-
# call litellm handler
|
|
846
920
|
messages = list(df[TableField.CONTENT.value])
|
|
847
921
|
embedding_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
|
|
848
|
-
|
|
922
|
+
|
|
923
|
+
llm_client = LLMClient(embedding_params, session=self.session)
|
|
924
|
+
results = llm_client.embeddings(messages)
|
|
925
|
+
|
|
849
926
|
results = [[val] for val in results]
|
|
850
927
|
return pd.DataFrame(results, columns=[TableField.EMBEDDINGS.value])
|
|
851
928
|
|
|
@@ -915,7 +992,12 @@ class KnowledgeBaseTable:
|
|
|
915
992
|
ValueError: If the configuration is invalid or required components are missing
|
|
916
993
|
"""
|
|
917
994
|
# Get embedding model from knowledge base
|
|
918
|
-
|
|
995
|
+
from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
|
|
996
|
+
construct_model_from_args,
|
|
997
|
+
)
|
|
998
|
+
from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
|
|
999
|
+
from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
|
|
1000
|
+
|
|
919
1001
|
embedding_model_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
|
|
920
1002
|
if self._kb.embedding_model:
|
|
921
1003
|
# Extract embedding model args from knowledge base table
|
|
@@ -924,7 +1006,7 @@ class KnowledgeBaseTable:
|
|
|
924
1006
|
embeddings_model = construct_model_from_args(embedding_args)
|
|
925
1007
|
logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
|
|
926
1008
|
elif embedding_model_params:
|
|
927
|
-
embeddings_model = get_embedding_model_from_params(embedding_model_params)
|
|
1009
|
+
embeddings_model = construct_model_from_args(adapt_embedding_model_params(embedding_model_params))
|
|
928
1010
|
logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
|
|
929
1011
|
else:
|
|
930
1012
|
embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
|
|
@@ -952,8 +1034,8 @@ class KnowledgeBaseTable:
|
|
|
952
1034
|
return rag
|
|
953
1035
|
|
|
954
1036
|
except Exception as e:
|
|
955
|
-
logger.
|
|
956
|
-
raise ValueError(f"Failed to build RAG pipeline: {str(e)}")
|
|
1037
|
+
logger.exception("Error building RAG pipeline:")
|
|
1038
|
+
raise ValueError(f"Failed to build RAG pipeline: {str(e)}") from e
|
|
957
1039
|
|
|
958
1040
|
def _parse_metadata(self, base_metadata):
|
|
959
1041
|
"""Helper function to robustly parse metadata string to dict"""
|
|
@@ -1026,6 +1108,26 @@ class KnowledgeBaseController:
|
|
|
1026
1108
|
def __init__(self, session) -> None:
|
|
1027
1109
|
self.session = session
|
|
1028
1110
|
|
|
1111
|
+
def _check_kb_input_params(self, params):
|
|
1112
|
+
# check names and types KB params
|
|
1113
|
+
try:
|
|
1114
|
+
KnowledgeBaseInputParams.model_validate(params)
|
|
1115
|
+
except ValidationError as e:
|
|
1116
|
+
problems = []
|
|
1117
|
+
for error in e.errors():
|
|
1118
|
+
parameter = ".".join([str(i) for i in error["loc"]])
|
|
1119
|
+
param_type = error["type"]
|
|
1120
|
+
if param_type == "extra_forbidden":
|
|
1121
|
+
msg = f"Parameter '{parameter}' is not allowed"
|
|
1122
|
+
else:
|
|
1123
|
+
msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
|
|
1124
|
+
problems.append(msg)
|
|
1125
|
+
|
|
1126
|
+
msg = "\n".join(problems)
|
|
1127
|
+
if len(problems) > 1:
|
|
1128
|
+
msg = "\n" + msg
|
|
1129
|
+
raise ValueError(f"Problem with knowledge base parameters: {msg}") from e
|
|
1130
|
+
|
|
1029
1131
|
def add(
|
|
1030
1132
|
self,
|
|
1031
1133
|
name: str,
|
|
@@ -1043,36 +1145,18 @@ class KnowledgeBaseController:
|
|
|
1043
1145
|
:param is_sparse: Whether to use sparse vectors for embeddings
|
|
1044
1146
|
:param vector_size: Optional size specification for vectors, required when is_sparse=True
|
|
1045
1147
|
"""
|
|
1046
|
-
if not name.islower():
|
|
1047
|
-
raise ValueError(f"The name must be in lower case: {name}")
|
|
1048
1148
|
|
|
1049
1149
|
# fill variables
|
|
1050
1150
|
params = variables_controller.fill_parameters(params)
|
|
1051
1151
|
|
|
1052
|
-
try:
|
|
1053
|
-
KnowledgeBaseInputParams.model_validate(params)
|
|
1054
|
-
except ValidationError as e:
|
|
1055
|
-
problems = []
|
|
1056
|
-
for error in e.errors():
|
|
1057
|
-
parameter = ".".join([str(i) for i in error["loc"]])
|
|
1058
|
-
param_type = error["type"]
|
|
1059
|
-
if param_type == "extra_forbidden":
|
|
1060
|
-
msg = f"Parameter '{parameter}' is not allowed"
|
|
1061
|
-
else:
|
|
1062
|
-
msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
|
|
1063
|
-
problems.append(msg)
|
|
1064
|
-
|
|
1065
|
-
msg = "\n".join(problems)
|
|
1066
|
-
if len(problems) > 1:
|
|
1067
|
-
msg = "\n" + msg
|
|
1068
|
-
raise ValueError(f"Problem with knowledge base parameters: {msg}")
|
|
1069
|
-
|
|
1070
1152
|
# Validate preprocessing config first if provided
|
|
1071
1153
|
if preprocessing_config is not None:
|
|
1072
1154
|
PreprocessingConfig(**preprocessing_config) # Validate before storing
|
|
1073
1155
|
params = params or {}
|
|
1074
1156
|
params["preprocessing"] = preprocessing_config
|
|
1075
1157
|
|
|
1158
|
+
self._check_kb_input_params(params)
|
|
1159
|
+
|
|
1076
1160
|
# Check if vector_size is provided when using sparse vectors
|
|
1077
1161
|
is_sparse = params.get("is_sparse")
|
|
1078
1162
|
vector_size = params.get("vector_size")
|
|
@@ -1083,8 +1167,6 @@ class KnowledgeBaseController:
|
|
|
1083
1167
|
project = self.session.database_controller.get_project(project_name)
|
|
1084
1168
|
project_id = project.id
|
|
1085
1169
|
|
|
1086
|
-
# not difference between cases in sql
|
|
1087
|
-
name = name.lower()
|
|
1088
1170
|
# check if knowledge base already exists
|
|
1089
1171
|
kb = self.get(name, project_id)
|
|
1090
1172
|
if kb is not None:
|
|
@@ -1096,42 +1178,25 @@ class KnowledgeBaseController:
|
|
|
1096
1178
|
params["embedding_model"] = embedding_params
|
|
1097
1179
|
|
|
1098
1180
|
# if model_name is None: # Legacy
|
|
1099
|
-
|
|
1181
|
+
self._check_embedding_model(
|
|
1100
1182
|
project.name,
|
|
1101
1183
|
params=embedding_params,
|
|
1102
1184
|
kb_name=name,
|
|
1103
1185
|
)
|
|
1104
|
-
if model_name is not None:
|
|
1105
|
-
params["created_embedding_model"] = model_name
|
|
1106
|
-
|
|
1107
|
-
embedding_model_id = None
|
|
1108
|
-
if model_name is not None:
|
|
1109
|
-
model = self.session.model_controller.get_model(name=model_name, project_name=project.name)
|
|
1110
|
-
model_record = db.Predictor.query.get(model["id"])
|
|
1111
|
-
embedding_model_id = model_record.id
|
|
1112
|
-
|
|
1113
|
-
if model_record.learn_args.get("using", {}).get("sparse"):
|
|
1114
|
-
is_sparse = True
|
|
1115
1186
|
|
|
1116
1187
|
# if params.get("reranking_model", {}) is bool and False we evaluate it to empty dictionary
|
|
1117
1188
|
reranking_model_params = params.get("reranking_model", {})
|
|
1118
1189
|
|
|
1119
1190
|
if isinstance(reranking_model_params, bool) and not reranking_model_params:
|
|
1120
1191
|
params["reranking_model"] = {}
|
|
1121
|
-
# if params.get("reranking_model", {}) is string and false in any case we evaluate it to empty dictionary
|
|
1122
|
-
if isinstance(reranking_model_params, str) and reranking_model_params.lower() == "false":
|
|
1123
|
-
params["reranking_model"] = {}
|
|
1124
1192
|
|
|
1125
1193
|
reranking_model_params = get_model_params(reranking_model_params, "default_reranking_model")
|
|
1126
1194
|
params["reranking_model"] = reranking_model_params
|
|
1127
1195
|
if reranking_model_params:
|
|
1128
1196
|
# Get reranking model from params.
|
|
1129
1197
|
# This is called here to check validaity of the parameters.
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
reranker.get_scores("test", ["test"])
|
|
1133
|
-
except (ValueError, RuntimeError) as e:
|
|
1134
|
-
raise RuntimeError(f"Problem with reranker config: {e}")
|
|
1198
|
+
rotate_provider_api_key(reranking_model_params)
|
|
1199
|
+
self._test_reranking(reranking_model_params)
|
|
1135
1200
|
|
|
1136
1201
|
# search for the vector database table
|
|
1137
1202
|
if storage is None:
|
|
@@ -1184,13 +1249,115 @@ class KnowledgeBaseController:
|
|
|
1184
1249
|
project_id=project_id,
|
|
1185
1250
|
vector_database_id=vector_database_id,
|
|
1186
1251
|
vector_database_table=vector_table_name,
|
|
1187
|
-
embedding_model_id=embedding_model_id,
|
|
1252
|
+
embedding_model_id=None,
|
|
1188
1253
|
params=params,
|
|
1189
1254
|
)
|
|
1190
1255
|
db.session.add(kb)
|
|
1191
1256
|
db.session.commit()
|
|
1192
1257
|
return kb
|
|
1193
1258
|
|
|
1259
|
+
def update(
|
|
1260
|
+
self,
|
|
1261
|
+
name: str,
|
|
1262
|
+
project_name: str,
|
|
1263
|
+
params: dict,
|
|
1264
|
+
preprocessing_config: Optional[dict] = None,
|
|
1265
|
+
) -> db.KnowledgeBase:
|
|
1266
|
+
"""
|
|
1267
|
+
Update the knowledge base
|
|
1268
|
+
:param name: The name of the knowledge base
|
|
1269
|
+
:param project_name: Current project name
|
|
1270
|
+
:param params: The parameters to update
|
|
1271
|
+
:param preprocessing_config: Optional preprocessing configuration to validate and store
|
|
1272
|
+
"""
|
|
1273
|
+
|
|
1274
|
+
# fill variables
|
|
1275
|
+
params = variables_controller.fill_parameters(params)
|
|
1276
|
+
|
|
1277
|
+
# Validate preprocessing config first if provided
|
|
1278
|
+
if preprocessing_config is not None:
|
|
1279
|
+
PreprocessingConfig(**preprocessing_config) # Validate before storing
|
|
1280
|
+
params = params or {}
|
|
1281
|
+
params["preprocessing"] = preprocessing_config
|
|
1282
|
+
|
|
1283
|
+
self._check_kb_input_params(params)
|
|
1284
|
+
|
|
1285
|
+
# get project id
|
|
1286
|
+
project = self.session.database_controller.get_project(project_name)
|
|
1287
|
+
project_id = project.id
|
|
1288
|
+
|
|
1289
|
+
# get existed KB
|
|
1290
|
+
kb = self.get(name.lower(), project_id)
|
|
1291
|
+
if kb is None:
|
|
1292
|
+
raise EntityNotExistsError("Knowledge base doesn't exists", name)
|
|
1293
|
+
|
|
1294
|
+
if "embedding_model" in params:
|
|
1295
|
+
new_config = params["embedding_model"]
|
|
1296
|
+
# update embedding
|
|
1297
|
+
embed_params = kb.params.get("embedding_model", {})
|
|
1298
|
+
if not embed_params:
|
|
1299
|
+
# maybe old version of KB
|
|
1300
|
+
raise ValueError("No embedding config to update")
|
|
1301
|
+
|
|
1302
|
+
# some parameters are not allowed to update
|
|
1303
|
+
for key in ("provider", "model_name"):
|
|
1304
|
+
if key in new_config and new_config[key] != embed_params.get(key):
|
|
1305
|
+
raise ValueError(f"You can't update '{key}' setting")
|
|
1306
|
+
|
|
1307
|
+
embed_params.update(new_config)
|
|
1308
|
+
|
|
1309
|
+
self._check_embedding_model(
|
|
1310
|
+
project.name,
|
|
1311
|
+
params=embed_params,
|
|
1312
|
+
kb_name=name,
|
|
1313
|
+
)
|
|
1314
|
+
kb.params["embedding_model"] = embed_params
|
|
1315
|
+
|
|
1316
|
+
if "reranking_model" in params:
|
|
1317
|
+
new_config = params["reranking_model"]
|
|
1318
|
+
# update embedding
|
|
1319
|
+
rerank_params = kb.params.get("reranking_model", {})
|
|
1320
|
+
|
|
1321
|
+
if new_config is False:
|
|
1322
|
+
# disable reranking
|
|
1323
|
+
rerank_params = {}
|
|
1324
|
+
elif "provider" in new_config and new_config["provider"] != rerank_params.get("provider"):
|
|
1325
|
+
# use new config (and include default config)
|
|
1326
|
+
rerank_params = get_model_params(new_config, "default_reranking_model")
|
|
1327
|
+
else:
|
|
1328
|
+
# update current config
|
|
1329
|
+
rerank_params.update(new_config)
|
|
1330
|
+
|
|
1331
|
+
if rerank_params:
|
|
1332
|
+
self._test_reranking(rerank_params)
|
|
1333
|
+
|
|
1334
|
+
kb.params["reranking_model"] = rerank_params
|
|
1335
|
+
|
|
1336
|
+
# update other keys
|
|
1337
|
+
for key in ["id_column", "metadata_columns", "content_columns", "preprocessing"]:
|
|
1338
|
+
if key in params:
|
|
1339
|
+
kb.params[key] = params[key]
|
|
1340
|
+
|
|
1341
|
+
flag_modified(kb, "params")
|
|
1342
|
+
db.session.commit()
|
|
1343
|
+
|
|
1344
|
+
return self.get(name.lower(), project_id)
|
|
1345
|
+
|
|
1346
|
+
def _test_reranking(self, params):
    """Check a reranking model config by scoring a dummy query/document pair.

    A reranker is built from ``params`` and asked to score ``"test"`` against
    ``["test"]``. If that raises ``ValueError`` or ``RuntimeError`` and the
    provider is ``openai`` / ``azure_openai`` with a method other than
    ``no-logprobs``, ``params["method"]`` is set to ``"no-logprobs"`` in place
    and the check is repeated once with the modified config.

    :param params: reranking model config (dict); may be modified in place
    :raises RuntimeError: when the check fails and no fallback applies, or the
        fallback check fails as well
    """
    try:
        reranker = get_reranking_model_from_params(params)
        reranker.get_scores("test", ["test"])
    except (ValueError, RuntimeError) as e:
        # Use .get() for 'provider': a config without that key must report the
        # original problem instead of a KeyError raised inside this handler.
        can_fallback = (
            params.get("provider") in ("azure_openai", "openai")
            and params.get("method") != "no-logprobs"
        )
        if not can_fallback:
            raise RuntimeError(f"Problem with reranker config: {e}") from e

        # check with no-logprobs
        params["method"] = "no-logprobs"
        self._test_reranking(params)
        # reached only when the no-logprobs check did not raise
        logger.warning(
            f"logprobs is not supported for this model: {params.get('model_name')}. using no-logprobs mode"
        )
|
|
1360
|
+
|
|
1194
1361
|
def _create_persistent_pgvector(self, params=None):
|
|
1195
1362
|
"""Create default vector database for knowledge base, if not specified"""
|
|
1196
1363
|
vector_store_name = "kb_pgvector_store"
|
|
@@ -1217,11 +1384,11 @@ class KnowledgeBaseController:
|
|
|
1217
1384
|
self.session.integration_controller.add(vector_store_name, engine, connection_args)
|
|
1218
1385
|
return vector_store_name
|
|
1219
1386
|
|
|
1220
|
-
def
|
|
1221
|
-
"""
|
|
1222
|
-
model_name = f"kb_embedding_{kb_name}"
|
|
1387
|
+
def _check_embedding_model(self, project_name, params: dict = None, kb_name=""):
|
|
1388
|
+
"""check embedding model for knowledge base"""
|
|
1223
1389
|
|
|
1224
|
-
#
|
|
1390
|
+
# if mindsdb model from old KB exists - drop it
|
|
1391
|
+
model_name = f"kb_embedding_{kb_name}"
|
|
1225
1392
|
try:
|
|
1226
1393
|
model = self.session.model_controller.get_model(model_name, project_name=project_name)
|
|
1227
1394
|
if model is not None:
|
|
@@ -1233,63 +1400,18 @@ class KnowledgeBaseController:
|
|
|
1233
1400
|
raise ValueError("'provider' parameter is required for embedding model")
|
|
1234
1401
|
|
|
1235
1402
|
# check available providers
|
|
1236
|
-
avail_providers = ("openai", "azure_openai", "bedrock", "gemini", "google")
|
|
1403
|
+
avail_providers = ("openai", "azure_openai", "bedrock", "gemini", "google", "ollama")
|
|
1237
1404
|
if params["provider"] not in avail_providers:
|
|
1238
1405
|
raise ValueError(
|
|
1239
1406
|
f"Wrong embedding provider: {params['provider']}. Available providers: {', '.join(avail_providers)}"
|
|
1240
1407
|
)
|
|
1241
1408
|
|
|
1242
|
-
|
|
1243
|
-
# try use litellm
|
|
1244
|
-
try:
|
|
1245
|
-
KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
|
|
1246
|
-
except Exception as e:
|
|
1247
|
-
raise RuntimeError(f"Problem with embedding model config: {e}")
|
|
1248
|
-
return
|
|
1249
|
-
|
|
1250
|
-
params = copy.deepcopy(params)
|
|
1251
|
-
if "provider" in params:
|
|
1252
|
-
engine = params.pop("provider").lower()
|
|
1253
|
-
|
|
1254
|
-
api_key = get_api_key(engine, params, strict=False)
|
|
1255
|
-
if api_key is None:
|
|
1256
|
-
if "api_key" in params:
|
|
1257
|
-
params.pop("api_key")
|
|
1258
|
-
else:
|
|
1259
|
-
raise ValueError("'api_key' parameter is required for embedding model")
|
|
1260
|
-
|
|
1261
|
-
if engine == "azure_openai":
|
|
1262
|
-
engine = "openai"
|
|
1263
|
-
params["provider"] = "azure"
|
|
1264
|
-
|
|
1265
|
-
if engine == "openai":
|
|
1266
|
-
if "question_column" not in params:
|
|
1267
|
-
params["question_column"] = "content"
|
|
1268
|
-
if api_key:
|
|
1269
|
-
params[f"{engine}_api_key"] = api_key
|
|
1270
|
-
if "api_key" in params:
|
|
1271
|
-
params.pop("api_key")
|
|
1272
|
-
if "base_url" in params:
|
|
1273
|
-
params["api_base"] = params.pop("base_url")
|
|
1274
|
-
|
|
1275
|
-
params["engine"] = engine
|
|
1276
|
-
params["join_learn_process"] = True
|
|
1277
|
-
params["mode"] = "embedding"
|
|
1278
|
-
|
|
1279
|
-
# Include API key if provided.
|
|
1280
|
-
statement = CreatePredictor(
|
|
1281
|
-
name=Identifier(parts=[project_name, model_name]),
|
|
1282
|
-
using=params,
|
|
1283
|
-
targets=[Identifier(parts=[TableField.EMBEDDINGS.value])],
|
|
1284
|
-
)
|
|
1409
|
+
llm_client = LLMClient(params, session=self.session)
|
|
1285
1410
|
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
if record["STATUS"] == "error":
|
|
1291
|
-
raise ValueError("Embedding model error:" + record["ERROR"])
|
|
1292
|
-
return model_name
|
|
1411
|
+
try:
|
|
1412
|
+
llm_client.embeddings(["test"])
|
|
1413
|
+
except Exception as e:
|
|
1414
|
+
raise RuntimeError(f"Problem with embedding model config: {e}") from e
|
|
1293
1415
|
|
|
1294
1416
|
def delete(self, name: str, project_name: int, if_exists: bool = False) -> None:
|
|
1295
1417
|
"""
|
|
@@ -1297,8 +1419,8 @@ class KnowledgeBaseController:
|
|
|
1297
1419
|
"""
|
|
1298
1420
|
try:
|
|
1299
1421
|
project = self.session.database_controller.get_project(project_name)
|
|
1300
|
-
except ValueError:
|
|
1301
|
-
raise ValueError(f"Project not found: {project_name}")
|
|
1422
|
+
except ValueError as e:
|
|
1423
|
+
raise ValueError(f"Project not found: {project_name}") from e
|
|
1302
1424
|
project_id = project.id
|
|
1303
1425
|
|
|
1304
1426
|
# check if knowledge base exists
|
|
@@ -1395,12 +1517,6 @@ class KnowledgeBaseController:
|
|
|
1395
1517
|
kb_table = self.get_table(table_name, project_id)
|
|
1396
1518
|
kb_table.create_index()
|
|
1397
1519
|
|
|
1398
|
-
def update(self, name: str, project_id: int, **kwargs) -> db.KnowledgeBase:
|
|
1399
|
-
"""
|
|
1400
|
-
Update a knowledge base record
|
|
1401
|
-
"""
|
|
1402
|
-
raise NotImplementedError()
|
|
1403
|
-
|
|
1404
1520
|
def evaluate(self, table_name: str, project_name: str, params: dict = None) -> pd.DataFrame:
|
|
1405
1521
|
"""
|
|
1406
1522
|
Run evaluate and/or create test data for evaluation
|