MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +228 -110
- mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +58 -40
- mindsdb/utilities/exception.py +58 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import copy
|
|
3
|
-
from typing import Dict, List, Optional
|
|
3
|
+
from typing import Dict, List, Optional, Any, Text
|
|
4
4
|
import json
|
|
5
5
|
import decimal
|
|
6
6
|
|
|
7
7
|
import pandas as pd
|
|
8
8
|
import numpy as np
|
|
9
|
+
from pydantic import BaseModel, ValidationError
|
|
10
|
+
from sqlalchemy.orm.attributes import flag_modified
|
|
9
11
|
|
|
10
12
|
from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
|
|
11
13
|
from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
|
|
12
14
|
from mindsdb_sql_parser import parse_sql
|
|
13
15
|
|
|
16
|
+
from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
|
|
14
17
|
from mindsdb.integrations.utilities.query_traversal import query_traversal
|
|
15
18
|
|
|
16
19
|
import mindsdb.interfaces.storage.db as db
|
|
@@ -33,9 +36,10 @@ from mindsdb.interfaces.variables.variables_controller import variables_controll
|
|
|
33
36
|
from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
|
|
34
37
|
from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
|
|
35
38
|
from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
|
|
39
|
+
from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
|
|
36
40
|
from mindsdb.interfaces.model.functions import PredictorRecordNotFound
|
|
37
41
|
from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
|
|
38
|
-
from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
|
|
42
|
+
from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator, KeywordSearchArgs
|
|
39
43
|
from mindsdb.utilities.config import config
|
|
40
44
|
from mindsdb.utilities.context import context as ctx
|
|
41
45
|
|
|
@@ -46,7 +50,19 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMRe
|
|
|
46
50
|
|
|
47
51
|
logger = log.getLogger(__name__)
|
|
48
52
|
|
|
49
|
-
|
|
53
|
+
|
|
54
|
+
class KnowledgeBaseInputParams(BaseModel):
|
|
55
|
+
metadata_columns: List[str] | None = None
|
|
56
|
+
content_columns: List[str] | None = None
|
|
57
|
+
id_column: str | None = None
|
|
58
|
+
kb_no_upsert: bool = False
|
|
59
|
+
embedding_model: Dict[Text, Any] | None = None
|
|
60
|
+
is_sparse: bool = False
|
|
61
|
+
vector_size: int | None = None
|
|
62
|
+
reranking_model: Dict[Text, Any] | None = None
|
|
63
|
+
|
|
64
|
+
class Config:
|
|
65
|
+
extra = "forbid"
|
|
50
66
|
|
|
51
67
|
|
|
52
68
|
def get_model_params(model_params: dict, default_config_key: str):
|
|
@@ -101,7 +117,10 @@ def get_reranking_model_from_params(reranking_model_params: dict):
|
|
|
101
117
|
|
|
102
118
|
if "api_key" not in params_copy:
|
|
103
119
|
params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
|
|
104
|
-
|
|
120
|
+
|
|
121
|
+
if "model_name" not in params_copy:
|
|
122
|
+
raise ValueError("'model_name' must be provided for reranking model")
|
|
123
|
+
params_copy["model"] = params_copy.pop("model_name")
|
|
105
124
|
|
|
106
125
|
return BaseLLMReranker(**params_copy)
|
|
107
126
|
|
|
@@ -140,23 +159,29 @@ class KnowledgeBaseTable:
|
|
|
140
159
|
self.document_loader = None
|
|
141
160
|
self.model_params = None
|
|
142
161
|
|
|
162
|
+
self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
|
|
163
|
+
if self._kb.params.get("version", 0) < 2:
|
|
164
|
+
self.kb_to_vector_columns["id"] = "original_doc_id"
|
|
165
|
+
|
|
143
166
|
def configure_preprocessing(self, config: Optional[dict] = None):
|
|
144
167
|
"""Configure preprocessing for the knowledge base table"""
|
|
145
168
|
logger.debug(f"Configuring preprocessing with config: {config}")
|
|
146
169
|
self.document_preprocessor = None # Reset existing preprocessor
|
|
147
|
-
if config is
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
170
|
+
if config is None:
|
|
171
|
+
config = {}
|
|
172
|
+
|
|
173
|
+
# Ensure content_column is set for JSON chunking if not already specified
|
|
174
|
+
if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
|
|
175
|
+
if "content_column" not in config["json_chunking_config"]:
|
|
176
|
+
config["json_chunking_config"]["content_column"] = "content"
|
|
177
|
+
|
|
178
|
+
preprocessing_config = PreprocessingConfig(**config)
|
|
179
|
+
self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
|
|
180
|
+
|
|
181
|
+
# set doc_id column name
|
|
182
|
+
self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
|
|
183
|
+
|
|
184
|
+
logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")
|
|
160
185
|
|
|
161
186
|
def select_query(self, query: Select) -> pd.DataFrame:
|
|
162
187
|
"""
|
|
@@ -165,6 +190,33 @@ class KnowledgeBaseTable:
|
|
|
165
190
|
:param query: query to KB table
|
|
166
191
|
:return: dataframe with the result table
|
|
167
192
|
"""
|
|
193
|
+
|
|
194
|
+
# Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
|
|
195
|
+
query_copy = copy.deepcopy(query)
|
|
196
|
+
|
|
197
|
+
executor = KnowledgeBaseQueryExecutor(self)
|
|
198
|
+
df = executor.run(query)
|
|
199
|
+
|
|
200
|
+
if (
|
|
201
|
+
query_copy.group_by is not None
|
|
202
|
+
or query_copy.order_by is not None
|
|
203
|
+
or query_copy.having is not None
|
|
204
|
+
or query_copy.distinct is True
|
|
205
|
+
or len(query_copy.targets) != 1
|
|
206
|
+
or not isinstance(query_copy.targets[0], Star)
|
|
207
|
+
):
|
|
208
|
+
query_copy.where = None
|
|
209
|
+
if "metadata" in df.columns:
|
|
210
|
+
df["metadata"] = df["metadata"].apply(to_json)
|
|
211
|
+
|
|
212
|
+
if query_copy.from_table is None:
|
|
213
|
+
query_copy.from_table = Identifier(parts=[self._kb.name])
|
|
214
|
+
|
|
215
|
+
df = query_df(df, query_copy, session=self.session)
|
|
216
|
+
|
|
217
|
+
return df
|
|
218
|
+
|
|
219
|
+
def select(self, query, disable_reranking=False):
|
|
168
220
|
logger.debug(f"Processing select query: {query}")
|
|
169
221
|
|
|
170
222
|
# Extract the content query text for potential reranking
|
|
@@ -176,9 +228,6 @@ class KnowledgeBaseTable:
|
|
|
176
228
|
query.from_table = Identifier(parts=[self._kb.vector_database_table])
|
|
177
229
|
logger.debug(f"Set table name to: {self._kb.vector_database_table}")
|
|
178
230
|
|
|
179
|
-
# Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
|
|
180
|
-
query_copy = copy.deepcopy(query)
|
|
181
|
-
|
|
182
231
|
query.targets = [
|
|
183
232
|
Identifier(TableField.ID.value),
|
|
184
233
|
Identifier(TableField.CONTENT.value),
|
|
@@ -191,9 +240,12 @@ class KnowledgeBaseTable:
|
|
|
191
240
|
|
|
192
241
|
# extract values from conditions and prepare for vectordb
|
|
193
242
|
conditions = []
|
|
243
|
+
keyword_search_conditions = []
|
|
244
|
+
keyword_search_cols_and_values = []
|
|
194
245
|
query_text = None
|
|
195
246
|
relevance_threshold = None
|
|
196
247
|
reranking_enabled_flag = True
|
|
248
|
+
hybrid_search_enabled_flag = False
|
|
197
249
|
query_conditions = db_handler.extract_conditions(query.where)
|
|
198
250
|
if query_conditions is not None:
|
|
199
251
|
for item in query_conditions:
|
|
@@ -213,6 +265,13 @@ class KnowledgeBaseTable:
|
|
|
213
265
|
# cast to boolean
|
|
214
266
|
if isinstance(reranking_enabled_flag, str):
|
|
215
267
|
reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
|
|
268
|
+
elif item.column == "hybrid_search":
|
|
269
|
+
hybrid_search_enabled_flag = item.value
|
|
270
|
+
# cast to boolean
|
|
271
|
+
if isinstance(hybrid_search_enabled_flag, str):
|
|
272
|
+
hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
|
|
273
|
+
if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
|
|
274
|
+
disable_reranking = True
|
|
216
275
|
elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
|
|
217
276
|
raise ValueError(
|
|
218
277
|
f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
|
|
@@ -228,8 +287,16 @@ class KnowledgeBaseTable:
|
|
|
228
287
|
op=FilterOperator.EQUAL,
|
|
229
288
|
)
|
|
230
289
|
)
|
|
290
|
+
keyword_search_cols_and_values.append((TableField.CONTENT.value, item.value))
|
|
231
291
|
else:
|
|
232
292
|
conditions.append(item)
|
|
293
|
+
keyword_search_conditions.append(item) # keyword search conditions do not use embeddings
|
|
294
|
+
|
|
295
|
+
if len(keyword_search_cols_and_values) > 1:
|
|
296
|
+
raise ValueError(
|
|
297
|
+
"Multiple content columns found in query conditions. "
|
|
298
|
+
"Only one content column is allowed for keyword search."
|
|
299
|
+
)
|
|
233
300
|
|
|
234
301
|
logger.debug(f"Extracted query text: {query_text}")
|
|
235
302
|
|
|
@@ -244,66 +311,92 @@ class KnowledgeBaseTable:
|
|
|
244
311
|
limit = 100
|
|
245
312
|
query.limit = Constant(limit)
|
|
246
313
|
|
|
247
|
-
|
|
314
|
+
allowed_metadata_columns = self._get_allowed_metadata_columns()
|
|
315
|
+
df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
|
|
248
316
|
df = self.addapt_result_columns(df)
|
|
249
|
-
|
|
250
317
|
logger.debug(f"Query returned {len(df)} rows")
|
|
251
318
|
logger.debug(f"Columns in response: {df.columns.tolist()}")
|
|
252
|
-
# Check if we have a rerank_model configured in KB params
|
|
253
|
-
df = self.add_relevance(df, query_text, relevance_threshold, reranking_enabled_flag)
|
|
254
319
|
|
|
255
|
-
if (
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
320
|
+
if hybrid_search_enabled_flag and not isinstance(db_handler, KeywordSearchBase):
|
|
321
|
+
raise ValueError(f"Hybrid search is enabled but the db_handler {type(db_handler)} does not support it. ")
|
|
322
|
+
# check if db_handler inherits from KeywordSearchBase
|
|
323
|
+
if hybrid_search_enabled_flag and isinstance(db_handler, KeywordSearchBase):
|
|
324
|
+
# If query_text is present, use it for keyword search
|
|
325
|
+
logger.debug(f"Performing keyword search with query text: {query_text}")
|
|
326
|
+
keyword_search_args = KeywordSearchArgs(query=query_text, column=TableField.CONTENT.value)
|
|
327
|
+
keyword_query_obj = copy.deepcopy(query)
|
|
328
|
+
|
|
329
|
+
keyword_query_obj.targets = [
|
|
330
|
+
Identifier(TableField.ID.value),
|
|
331
|
+
Identifier(TableField.CONTENT.value),
|
|
332
|
+
Identifier(TableField.METADATA.value),
|
|
333
|
+
]
|
|
266
334
|
|
|
267
|
-
|
|
335
|
+
df_keyword_select = db_handler.dispatch_select(
|
|
336
|
+
keyword_query_obj, keyword_search_conditions, keyword_search_args=keyword_search_args
|
|
337
|
+
)
|
|
338
|
+
df_keyword_select = self.addapt_result_columns(df_keyword_select)
|
|
339
|
+
logger.debug(f"Keyword search returned {len(df_keyword_select)} rows")
|
|
340
|
+
logger.debug(f"Columns in keyword search response: {df_keyword_select.columns.tolist()}")
|
|
341
|
+
# ensure df and df_keyword_select have exactly the same columns
|
|
342
|
+
if not df_keyword_select.empty:
|
|
343
|
+
if set(df.columns) != set(df_keyword_select.columns):
|
|
344
|
+
raise ValueError(
|
|
345
|
+
f"Keyword search returned different columns: {df_keyword_select.columns} "
|
|
346
|
+
f"than expected: {df.columns}"
|
|
347
|
+
)
|
|
348
|
+
df = pd.concat([df, df_keyword_select], ignore_index=True)
|
|
349
|
+
# if chunk_id column exists remove duplicates based on chunk_id
|
|
350
|
+
if "chunk_id" in df.columns:
|
|
351
|
+
df = df.drop_duplicates(subset=["chunk_id"])
|
|
352
|
+
|
|
353
|
+
# Check if we have a rerank_model configured in KB params
|
|
354
|
+
df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)
|
|
268
355
|
|
|
269
356
|
return df
|
|
270
357
|
|
|
358
|
+
def _get_allowed_metadata_columns(self) -> List[str] | None:
|
|
359
|
+
# Return list of KB columns to restrict querying, if None: no restrictions
|
|
360
|
+
|
|
361
|
+
if self._kb.params.get("version", 0) < 2:
|
|
362
|
+
# disable for old version KBs
|
|
363
|
+
return None
|
|
364
|
+
|
|
365
|
+
user_columns = self._kb.params.get("metadata_columns", [])
|
|
366
|
+
dynamic_columns = self._kb.params.get("inserted_metadata", [])
|
|
367
|
+
|
|
368
|
+
columns = set(user_columns) | set(dynamic_columns)
|
|
369
|
+
return [col.lower() for col in columns]
|
|
370
|
+
|
|
271
371
|
def score_documents(self, query_text, documents, reranking_model_params):
|
|
272
372
|
reranker = get_reranking_model_from_params(reranking_model_params)
|
|
273
373
|
return reranker.get_scores(query_text, documents)
|
|
274
374
|
|
|
275
|
-
def add_relevance(self, df, query_text, relevance_threshold=None,
|
|
375
|
+
def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
|
|
276
376
|
relevance_column = TableField.RELEVANCE.value
|
|
277
377
|
|
|
278
378
|
reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
|
|
279
|
-
if reranking_model_params and query_text and len(df) > 0 and
|
|
379
|
+
if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
|
|
280
380
|
# Use reranker for relevance score
|
|
281
|
-
try:
|
|
282
|
-
logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
|
|
283
|
-
# Apply custom filtering threshold if provided
|
|
284
|
-
if relevance_threshold is not None:
|
|
285
|
-
reranking_model_params["filtering_threshold"] = relevance_threshold
|
|
286
|
-
logger.info(f"Using custom filtering threshold: {relevance_threshold}")
|
|
287
381
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
logger.info("No distance or reranker available")
|
|
382
|
+
logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
|
|
383
|
+
# Apply custom filtering threshold if provided
|
|
384
|
+
if relevance_threshold is not None:
|
|
385
|
+
reranking_model_params["filtering_threshold"] = relevance_threshold
|
|
386
|
+
logger.info(f"Using custom filtering threshold: {relevance_threshold}")
|
|
387
|
+
|
|
388
|
+
reranker = get_reranking_model_from_params(reranking_model_params)
|
|
389
|
+
# Get documents to rerank
|
|
390
|
+
documents = df["chunk_content"].tolist()
|
|
391
|
+
# Use the get_scores method with disable_events=True
|
|
392
|
+
scores = reranker.get_scores(query_text, documents)
|
|
393
|
+
# Add scores as the relevance column
|
|
394
|
+
df[relevance_column] = scores
|
|
395
|
+
|
|
396
|
+
# Filter by threshold
|
|
397
|
+
scores_array = np.array(scores)
|
|
398
|
+
df = df[scores_array > reranker.filtering_threshold]
|
|
399
|
+
logger.debug(f"Applied reranking with params: {reranking_model_params}")
|
|
307
400
|
|
|
308
401
|
elif "distance" in df.columns:
|
|
309
402
|
# Calculate relevance from distance
|
|
@@ -323,12 +416,12 @@ class KnowledgeBaseTable:
|
|
|
323
416
|
if conditions is None:
|
|
324
417
|
return
|
|
325
418
|
for condition in conditions:
|
|
326
|
-
if condition.column in
|
|
327
|
-
condition.column =
|
|
419
|
+
if condition.column in self.kb_to_vector_columns:
|
|
420
|
+
condition.column = self.kb_to_vector_columns[condition.column]
|
|
328
421
|
|
|
329
422
|
def addapt_result_columns(self, df):
|
|
330
423
|
col_update = {}
|
|
331
|
-
for kb_col, vec_col in
|
|
424
|
+
for kb_col, vec_col in self.kb_to_vector_columns.items():
|
|
332
425
|
if vec_col in df.columns:
|
|
333
426
|
col_update[vec_col] = kb_col
|
|
334
427
|
|
|
@@ -337,7 +430,7 @@ class KnowledgeBaseTable:
|
|
|
337
430
|
columns = list(df.columns)
|
|
338
431
|
# update id, get from metadata
|
|
339
432
|
df[TableField.ID.value] = df[TableField.METADATA.value].apply(
|
|
340
|
-
lambda m: None if m is None else m.get("
|
|
433
|
+
lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
|
|
341
434
|
)
|
|
342
435
|
|
|
343
436
|
# id on first place
|
|
@@ -524,8 +617,8 @@ class KnowledgeBaseTable:
|
|
|
524
617
|
|
|
525
618
|
metadata = {
|
|
526
619
|
**base_metadata,
|
|
527
|
-
"
|
|
528
|
-
"
|
|
620
|
+
"_original_row_index": str(idx), # provide link to original row index
|
|
621
|
+
"_content_column": col,
|
|
529
622
|
}
|
|
530
623
|
|
|
531
624
|
raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
|
|
@@ -620,16 +713,22 @@ class KnowledgeBaseTable:
|
|
|
620
713
|
metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
|
|
621
714
|
logger.debug(f"Mapped metadata columns: {metadata_columns}")
|
|
622
715
|
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
|
|
716
|
+
content_columns = list(set(content_columns).intersection(columns))
|
|
717
|
+
if len(content_columns) == 0:
|
|
718
|
+
raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
|
|
627
719
|
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
720
|
+
if metadata_columns is not None:
|
|
721
|
+
metadata_columns = list(set(metadata_columns).intersection(columns))
|
|
722
|
+
else:
|
|
723
|
+
# all the rest columns
|
|
724
|
+
metadata_columns = list(set(columns).difference(content_columns))
|
|
725
|
+
|
|
726
|
+
# update list of used columns
|
|
727
|
+
inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
|
|
728
|
+
inserted_metadata.update(metadata_columns)
|
|
729
|
+
self._kb.params["inserted_metadata"] = list(inserted_metadata)
|
|
730
|
+
flag_modified(self._kb, "params")
|
|
731
|
+
db.session.commit()
|
|
633
732
|
|
|
634
733
|
# Add content columns directly (don't combine them)
|
|
635
734
|
for col in content_columns:
|
|
@@ -655,7 +754,7 @@ class KnowledgeBaseTable:
|
|
|
655
754
|
elif isinstance(value, dict):
|
|
656
755
|
metadata.update(value)
|
|
657
756
|
continue
|
|
658
|
-
|
|
757
|
+
elif value is not None:
|
|
659
758
|
value = str(value)
|
|
660
759
|
metadata[col] = value
|
|
661
760
|
return metadata
|
|
@@ -712,8 +811,7 @@ class KnowledgeBaseTable:
|
|
|
712
811
|
if model_id is None:
|
|
713
812
|
# call litellm handler
|
|
714
813
|
messages = list(df[TableField.CONTENT.value])
|
|
715
|
-
embedding_params =
|
|
716
|
-
embedding_params.update(self._kb.params["embedding_model"])
|
|
814
|
+
embedding_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
|
|
717
815
|
results = self.call_litellm_embedding(self.session, embedding_params, messages)
|
|
718
816
|
results = [[val] for val in results]
|
|
719
817
|
return pd.DataFrame(results, columns=[TableField.EMBEDDINGS.value])
|
|
@@ -759,18 +857,16 @@ class KnowledgeBaseTable:
|
|
|
759
857
|
def call_litellm_embedding(session, model_params, messages):
|
|
760
858
|
args = copy.deepcopy(model_params)
|
|
761
859
|
|
|
860
|
+
if "model_name" not in args:
|
|
861
|
+
raise ValueError("'model_name' must be provided for embedding model")
|
|
862
|
+
|
|
762
863
|
llm_model = args.pop("model_name")
|
|
763
864
|
engine = args.pop("provider")
|
|
764
865
|
|
|
765
|
-
llm_model = f"{engine}/{llm_model}"
|
|
766
|
-
|
|
767
|
-
if "base_url" in args:
|
|
768
|
-
args["api_base"] = args.pop("base_url")
|
|
769
|
-
|
|
770
866
|
module = session.integration_controller.get_handler_module("litellm")
|
|
771
867
|
if module is None or module.Handler is None:
|
|
772
868
|
raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
|
|
773
|
-
return module.Handler.embeddings(llm_model, messages, args)
|
|
869
|
+
return module.Handler.embeddings(engine, llm_model, messages, args)
|
|
774
870
|
|
|
775
871
|
def build_rag_pipeline(self, retrieval_config: dict):
|
|
776
872
|
"""
|
|
@@ -892,6 +988,8 @@ class KnowledgeBaseController:
|
|
|
892
988
|
manages knowledge bases
|
|
893
989
|
"""
|
|
894
990
|
|
|
991
|
+
KB_VERSION = 2
|
|
992
|
+
|
|
895
993
|
def __init__(self, session) -> None:
|
|
896
994
|
self.session = session
|
|
897
995
|
|
|
@@ -903,6 +1001,7 @@ class KnowledgeBaseController:
|
|
|
903
1001
|
params: dict,
|
|
904
1002
|
preprocessing_config: Optional[dict] = None,
|
|
905
1003
|
if_not_exists: bool = False,
|
|
1004
|
+
keyword_search_enabled: bool = False,
|
|
906
1005
|
# embedding_model: Identifier = None, # Legacy: Allow MindsDB models to be passed as embedding_model.
|
|
907
1006
|
) -> db.KnowledgeBase:
|
|
908
1007
|
"""
|
|
@@ -914,6 +1013,24 @@ class KnowledgeBaseController:
|
|
|
914
1013
|
# fill variables
|
|
915
1014
|
params = variables_controller.fill_parameters(params)
|
|
916
1015
|
|
|
1016
|
+
try:
|
|
1017
|
+
KnowledgeBaseInputParams.model_validate(params)
|
|
1018
|
+
except ValidationError as e:
|
|
1019
|
+
problems = []
|
|
1020
|
+
for error in e.errors():
|
|
1021
|
+
parameter = ".".join([str(i) for i in error["loc"]])
|
|
1022
|
+
param_type = error["type"]
|
|
1023
|
+
if param_type == "extra_forbidden":
|
|
1024
|
+
msg = f"Parameter '{parameter}' is not allowed"
|
|
1025
|
+
else:
|
|
1026
|
+
msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
|
|
1027
|
+
problems.append(msg)
|
|
1028
|
+
|
|
1029
|
+
msg = "\n".join(problems)
|
|
1030
|
+
if len(problems) > 1:
|
|
1031
|
+
msg = "\n" + msg
|
|
1032
|
+
raise ValueError(f"Problem with knowledge base parameters: {msg}")
|
|
1033
|
+
|
|
917
1034
|
# Validate preprocessing config first if provided
|
|
918
1035
|
if preprocessing_config is not None:
|
|
919
1036
|
PreprocessingConfig(**preprocessing_config) # Validate before storing
|
|
@@ -939,24 +1056,6 @@ class KnowledgeBaseController:
|
|
|
939
1056
|
return kb
|
|
940
1057
|
raise EntityExistsError("Knowledge base already exists", name)
|
|
941
1058
|
|
|
942
|
-
embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
|
|
943
|
-
|
|
944
|
-
# Legacy
|
|
945
|
-
# model_name = None
|
|
946
|
-
# model_project = project
|
|
947
|
-
# if embedding_model:
|
|
948
|
-
# model_name = embedding_model.parts[-1]
|
|
949
|
-
# if len(embedding_model.parts) > 1:
|
|
950
|
-
# model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
|
|
951
|
-
|
|
952
|
-
# elif "embedding_model" in params:
|
|
953
|
-
# if isinstance(params["embedding_model"], str):
|
|
954
|
-
# # it is model name
|
|
955
|
-
# model_name = params["embedding_model"]
|
|
956
|
-
# else:
|
|
957
|
-
# # it is params for model
|
|
958
|
-
# embedding_params.update(params["embedding_model"])
|
|
959
|
-
|
|
960
1059
|
embedding_params = get_model_params(params.get("embedding_model", {}), "default_embedding_model")
|
|
961
1060
|
|
|
962
1061
|
# if model_name is None: # Legacy
|
|
@@ -987,7 +1086,11 @@ class KnowledgeBaseController:
|
|
|
987
1086
|
if reranking_model_params:
|
|
988
1087
|
# Get reranking model from params.
|
|
989
1088
|
# This is called here to check validaity of the parameters.
|
|
990
|
-
|
|
1089
|
+
try:
|
|
1090
|
+
reranker = get_reranking_model_from_params(reranking_model_params)
|
|
1091
|
+
reranker.get_scores("test", ["test"])
|
|
1092
|
+
except (ValueError, RuntimeError) as e:
|
|
1093
|
+
raise RuntimeError(f"Problem with reranker config: {e}")
|
|
991
1094
|
|
|
992
1095
|
# search for the vector database table
|
|
993
1096
|
if storage is None:
|
|
@@ -1016,7 +1119,10 @@ class KnowledgeBaseController:
|
|
|
1016
1119
|
vector_db_name, vector_table_name = storage.parts
|
|
1017
1120
|
|
|
1018
1121
|
# create table in vectordb before creating KB
|
|
1019
|
-
self.session.datahub.get(vector_db_name).integration_handler
|
|
1122
|
+
vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
|
|
1123
|
+
vector_store_handler.create_table(vector_table_name)
|
|
1124
|
+
if keyword_search_enabled:
|
|
1125
|
+
vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
|
|
1020
1126
|
vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]
|
|
1021
1127
|
|
|
1022
1128
|
# Store sparse vector settings in params if specified
|
|
@@ -1026,6 +1132,7 @@ class KnowledgeBaseController:
|
|
|
1026
1132
|
if vector_size is not None:
|
|
1027
1133
|
params["vector_config"]["vector_size"] = vector_size
|
|
1028
1134
|
|
|
1135
|
+
params["version"] = self.KB_VERSION
|
|
1029
1136
|
kb = db.KnowledgeBase(
|
|
1030
1137
|
name=name,
|
|
1031
1138
|
project_id=project_id,
|
|
@@ -1076,15 +1183,26 @@ class KnowledgeBaseController:
|
|
|
1076
1183
|
except PredictorRecordNotFound:
|
|
1077
1184
|
pass
|
|
1078
1185
|
|
|
1079
|
-
if
|
|
1186
|
+
if "provider" not in params:
|
|
1187
|
+
raise ValueError("'provider' parameter is required for embedding model")
|
|
1188
|
+
|
|
1189
|
+
if params["provider"] not in ("openai", "azure_openai"):
|
|
1080
1190
|
# try use litellm
|
|
1081
|
-
|
|
1191
|
+
try:
|
|
1192
|
+
KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
|
|
1193
|
+
except Exception as e:
|
|
1194
|
+
raise RuntimeError(f"Problem with embedding model config: {e}")
|
|
1082
1195
|
return
|
|
1083
1196
|
|
|
1084
1197
|
if "provider" in params:
|
|
1085
1198
|
engine = params.pop("provider").lower()
|
|
1086
1199
|
|
|
1087
|
-
api_key = get_api_key(engine, params, strict=False)
|
|
1200
|
+
api_key = get_api_key(engine, params, strict=False)
|
|
1201
|
+
if api_key is None:
|
|
1202
|
+
if "api_key" in params:
|
|
1203
|
+
params.pop("api_key")
|
|
1204
|
+
else:
|
|
1205
|
+
raise ValueError("'api_key' parameter is required for embedding model")
|
|
1088
1206
|
|
|
1089
1207
|
if engine == "azure_openai":
|
|
1090
1208
|
engine = "openai"
|
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
import datetime as dt
|
|
8
8
|
|
|
9
9
|
from mindsdb.api.executor.sql_query.result_set import ResultSet
|
|
10
|
-
from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
|
|
10
|
+
from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
|
|
11
11
|
from mindsdb.utilities import log
|
|
12
12
|
|
|
13
13
|
from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
|
|
@@ -90,7 +90,7 @@ class EvaluateBase:
|
|
|
90
90
|
df = response.data_frame
|
|
91
91
|
|
|
92
92
|
if "content" not in df.columns:
|
|
93
|
-
raise ValueError("`content` column isn't found in
|
|
93
|
+
raise ValueError(f"`content` column isn't found in provided sql: {gen_params['from_sql']}")
|
|
94
94
|
|
|
95
95
|
df.rename(columns={"content": "chunk_content"}, inplace=True)
|
|
96
96
|
else:
|
|
@@ -130,6 +130,8 @@ class EvaluateBase:
|
|
|
130
130
|
integration_name = table_name.parts[0]
|
|
131
131
|
table_name = Identifier(parts=table_name.parts[1:])
|
|
132
132
|
dn = self.session.datahub.get(integration_name)
|
|
133
|
+
if dn is None:
|
|
134
|
+
raise ValueError(f"Can't find database: {integration_name}")
|
|
133
135
|
return dn, table_name
|
|
134
136
|
|
|
135
137
|
def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
|
|
@@ -184,7 +186,7 @@ class EvaluateBase:
|
|
|
184
186
|
to_table = params["save_to"]
|
|
185
187
|
if isinstance(to_table, str):
|
|
186
188
|
to_table = Identifier(to_table)
|
|
187
|
-
self.save_to_table(to_table, scores)
|
|
189
|
+
self.save_to_table(to_table, scores.copy())
|
|
188
190
|
|
|
189
191
|
return scores
|
|
190
192
|
|
|
@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):
|
|
|
256
258
|
|
|
257
259
|
start_time = time.time()
|
|
258
260
|
logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
|
|
259
|
-
df_answers = self.kb.select_query(
|
|
261
|
+
df_answers = self.kb.select_query(
|
|
262
|
+
Select(
|
|
263
|
+
targets=[Identifier("chunk_content")],
|
|
264
|
+
where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
|
|
265
|
+
limit=Constant(self.TOP_K),
|
|
266
|
+
)
|
|
267
|
+
)
|
|
260
268
|
query_time = time.time() - start_time
|
|
261
269
|
|
|
262
270
|
proposed_responses = list(df_answers["chunk_content"])
|
|
@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
|
|
|
410
418
|
Checks if ID in response from KB is matched with doc ID in test dataset
|
|
411
419
|
"""
|
|
412
420
|
|
|
413
|
-
TOP_K =
|
|
421
|
+
TOP_K = 20
|
|
414
422
|
|
|
415
423
|
def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
|
|
416
424
|
if "id" not in sampled_df.columns:
|
|
@@ -462,7 +470,11 @@ class EvaluateDocID(EvaluateBase):
|
|
|
462
470
|
start_time = time.time()
|
|
463
471
|
logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
|
|
464
472
|
df_answers = self.kb.select_query(
|
|
465
|
-
Select(
|
|
473
|
+
Select(
|
|
474
|
+
targets=[Identifier("chunk_content"), Identifier("id")],
|
|
475
|
+
where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
|
|
476
|
+
limit=Constant(self.TOP_K),
|
|
477
|
+
)
|
|
466
478
|
)
|
|
467
479
|
query_time = time.time() - start_time
|
|
468
480
|
|