MindsDB 25.6.4.0__py3-none-any.whl → 25.7.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of MindsDB has been flagged as possibly problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/knowledge_base/controller.py +115 -89
- mindsdb/interfaces/knowledge_base/evaluate.py +16 -4
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +12 -1
- mindsdb/utilities/exception.py +47 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +248 -262
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +46 -45
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
--- a/mindsdb/interfaces/agents/mindsdb_database_agent.py
+++ b/mindsdb/interfaces/agents/mindsdb_database_agent.py
@@ -96,27 +96,7 @@ class MindsDBSQL(SQLDatabase):
             # Log the query for debugging
             logger.info(f"Executing SQL query: {command}")

-
-            # remove backticks
-            # command = command.replace('`', '')
-
-            # Parse the SQL string to an AST object first
-            from mindsdb_sql_parser import parse_sql
-
-            ast_query = parse_sql(command)
-
-            # Now execute the parsed query
-            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
-                ast_query, database_name="mindsdb"
-            )
-
-            # Convert ExecuteAnswer to a DataFrame for easier manipulation
-            if result.data is not None:
-                df = result.data.to_df()
-                return df.to_string(index=False)
-
-            else:
-                return "Query executed successfully, but returned no data."
+            return self._sql_agent.query(command)

         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
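The removed block parsed and executed SQL inline; the replacement delegates the whole pipeline to the SQL agent. For reference, a condensed sketch of the flow the deleted lines implemented, using only names that appear in the removed code (SQLAgent.query is assumed to encapsulate the same steps):

    from mindsdb_sql_parser import parse_sql

    def run_inline(sql_agent, command: str) -> str:
        # Parse SQL text into an AST, execute it against the "mindsdb"
        # database, and render the result as a string.
        ast_query = parse_sql(command)
        result = sql_agent.skill_tool.get_command_executor().execute_command(
            ast_query, database_name="mindsdb"
        )
        if result.data is not None:
            return result.data.to_df().to_string(index=False)
        return "Query executed successfully, but returned no data."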
@@ -127,28 +107,6 @@ class MindsDBSQL(SQLDatabase):
                 return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
             return f"Error: {str(e)}"

-    # def run_no_throw(self, command: str, fetch: str = "all") -> str:
-    #     """Execute a SQL command and return the result as a string.
-    #
-    #     This method catches any exceptions and returns an error message instead of raising an exception.
-    #
-    #     Args:
-    #         command: The SQL command to execute
-    #         fetch: Whether to fetch 'all' results or just 'one'
-    #
-    #     Returns:
-    #         A string representation of the result or an error message
-    #     """
-    #     command = extract_essential(command)
-    #     try:
-    #         return self._sql_agent.query_safe(command)
-    #     except Exception as e:
-    #         logger.error(f"Error executing SQL command: {str(e)}")
-    #         # If this is a knowledge base query, provide a more helpful error message
-    #         if "knowledge_base" in command.lower() or any(kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()):
-    #             return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
-    #         return f"Error: {str(e)}"
-
     def get_usable_knowledge_base_names(self) -> List[str]:
         """Get a list of usable knowledge base names.
@@ -160,3 +118,12 @@ class MindsDBSQL(SQLDatabase):
         except Exception as e:
             logger.error(f"Error getting usable knowledge base names: {str(e)}")
             return []
+
+    def check_knowledge_base_permission(self, name):
+        """Get a list of usable knowledge base names.
+
+        Returns:
+            A list of knowledge base names that can be used in queries
+        """
+
+        return self._sql_agent.check_knowledge_base_permission(name)
--- a/mindsdb/interfaces/data_catalog/data_catalog_reader.py
+++ b/mindsdb/interfaces/data_catalog/data_catalog_reader.py
@@ -18,7 +18,9 @@ class DataCatalogReader(BaseDataCatalog):

         metadata_str = "Data Catalog: \n"
         if hasattr(self.data_handler, "meta_get_handler_info"):
-            ...
+            info = self.data_handler.meta_get_handler_info()
+            if info:
+                metadata_str += info + "\n\n"

         for table in tables:
             metadata_str += table.as_string() + "\n\n"
--- a/mindsdb/interfaces/knowledge_base/controller.py
+++ b/mindsdb/interfaces/knowledge_base/controller.py
@@ -6,6 +6,7 @@ import decimal

 import pandas as pd
 import numpy as np
+from sqlalchemy.orm.attributes import flag_modified

 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
@@ -33,6 +34,7 @@ from mindsdb.interfaces.variables.variables_controller import variables_controller
 from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
+from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
@@ -46,8 +48,6 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMRe

 logger = log.getLogger(__name__)

-KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
-

 def get_model_params(model_params: dict, default_config_key: str):
     """
@@ -140,23 +140,29 @@ class KnowledgeBaseTable:
         self.document_loader = None
         self.model_params = None

+        self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+        if self._kb.params.get("version", 0) < 2:
+            self.kb_to_vector_columns["id"] = "original_doc_id"
+
     def configure_preprocessing(self, config: Optional[dict] = None):
         """Configure preprocessing for the knowledge base table"""
         logger.debug(f"Configuring preprocessing with config: {config}")
         self.document_preprocessor = None  # Reset existing preprocessor
-        if config is
-        ...
+        if config is None:
+            config = {}
+
+        # Ensure content_column is set for JSON chunking if not already specified
+        if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
+            if "content_column" not in config["json_chunking_config"]:
+                config["json_chunking_config"]["content_column"] = "content"
+
+        preprocessing_config = PreprocessingConfig(**config)
+        self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
+
+        # set doc_id column name
+        self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
+
+        logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")

     def select_query(self, query: Select) -> pd.DataFrame:
         """
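The new instance attribute makes the KB-to-vector-store column mapping version-dependent: v2 knowledge bases store the document id under _original_doc_id (underscore-prefixed, presumably to avoid colliding with user metadata keys), while pre-v2 KBs keep original_doc_id. A runnable sketch of just that selection logic:

    def kb_column_map(params):
        # Mapping used for v2 KBs; the "id" entry is downgraded for older ones.
        mapping = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
        if params.get("version", 0) < 2:
            mapping["id"] = "original_doc_id"
        return mapping

    assert kb_column_map({"version": 2})["id"] == "_original_doc_id"
    assert kb_column_map({})["id"] == "original_doc_id"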
@@ -165,6 +171,30 @@ class KnowledgeBaseTable:
         :param query: query to KB table
         :return: dataframe with the result table
         """
+
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
+
+        executor = KnowledgeBaseQueryExecutor(self)
+        df = executor.run(query)
+
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            if "metadata" in df.columns:
+                df["metadata"] = df["metadata"].apply(to_json)
+
+            df = query_df(df, query_copy, session=self.session)
+
+        return df
+
+    def select(self, query, disable_reranking=False):
         logger.debug(f"Processing select query: {query}")

         # Extract the content query text for potential reranking
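select_query now runs in two stages: the new KnowledgeBaseQueryExecutor performs retrieval, and any SQL shape the vector store cannot express (DISTINCT, GROUP BY, ORDER BY, HAVING, non-star targets) is re-applied over the resulting dataframe, with the WHERE clause stripped because stage one already consumed it. A condensed sketch of that control flow, with the MindsDB internals passed in as callables so it stands alone:

    import copy

    def two_stage_select(query, retrieve, run_df_sql, star_type):
        # retrieve: stage-1 retrieval (KnowledgeBaseQueryExecutor(...).run in the hunk)
        # run_df_sql: stage-2 dataframe SQL over DuckDB (query_df in the hunk)
        # star_type: the parser's Star node class
        query_copy = copy.deepcopy(query)
        df = retrieve(query)
        if (
            query.group_by is not None or query.order_by is not None
            or query.having is not None or query.distinct is True
            or len(query.targets) != 1 or not isinstance(query.targets[0], star_type)
        ):
            query_copy.where = None  # WHERE was already applied during retrieval
            df = run_df_sql(df, query_copy)
        return df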
@@ -176,9 +206,6 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
-        query_copy = copy.deepcopy(query)
-
         query.targets = [
             Identifier(TableField.ID.value),
             Identifier(TableField.CONTENT.value),
@@ -193,7 +220,6 @@ class KnowledgeBaseTable:
         conditions = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:
@@ -209,10 +235,9 @@ class KnowledgeBaseTable:
                     logger.error(error_msg)
                     raise ValueError(error_msg)
                 elif item.column == "reranking":
-                    ...
-                    reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
+
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
                     )
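Per-query reranking control moved from the string-typed reranking_enabled_flag to the method's disable_reranking argument, set when the WHERE clause contains reranking = false. A runnable sketch of the new predicate:

    def reranking_disabled(value):
        # Accept both a parsed boolean and the string literal 'false'.
        return value is False or (isinstance(value, str) and value.lower() == "false")

    assert reranking_disabled(False)
    assert reranking_disabled("FALSE")
    assert not reranking_disabled(True)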
@@ -244,66 +269,59 @@ class KnowledgeBaseTable:
             limit = 100
             query.limit = Constant(limit)

-        ...
+        allowed_metadata_columns = self._get_allowed_metadata_columns()
+        df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
         df = self.addapt_result_columns(df)

         logger.debug(f"Query returned {len(df)} rows")
         logger.debug(f"Columns in response: {df.columns.tolist()}")
         # Check if we have a rerank_model configured in KB params
-        df = self.add_relevance(df, query_text, relevance_threshold, ...
+        df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)

-        if (
-            query.group_by is not None
-            or query.order_by is not None
-            or query.having is not None
-            or query.distinct is True
-            or len(query.targets) != 1
-            or not isinstance(query.targets[0], Star)
-        ):
-            query_copy.where = None
-            if "metadata" in df.columns:
-                df["metadata"] = df["metadata"].apply(to_json)
+        return df

-        ...
+    def _get_allowed_metadata_columns(self) -> List[str] | None:
+        # Return list of KB columns to restrict querying, if None: no restrictions

-        ...
+        if self._kb.params.get("version", 0) < 2:
+            # disable for old version KBs
+            return None
+
+        user_columns = self._kb.params.get("metadata_columns", [])
+        dynamic_columns = self._kb.params.get("inserted_metadata", [])
+
+        columns = set(user_columns) | set(dynamic_columns)
+        return [col.lower() for col in columns]

     def score_documents(self, query_text, documents, reranking_model_params):
         reranker = get_reranking_model_from_params(reranking_model_params)
         return reranker.get_scores(query_text, documents)

-    def add_relevance(self, df, query_text, relevance_threshold=None, ...
+    def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
         relevance_column = TableField.RELEVANCE.value

         reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
-        if reranking_model_params and query_text and len(df) > 0 and ...
+        if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
-            ...
-            except Exception as e:
-                logger.error(f"Error during reranking: {str(e)}")
-                # Fallback to distance-based relevance
-                if "distance" in df.columns:
-                    df[relevance_column] = 1 / (1 + df["distance"])
-                else:
-                    logger.info("No distance or reranker available")
+
+            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
+            # Apply custom filtering threshold if provided
+            if relevance_threshold is not None:
+                reranking_model_params["filtering_threshold"] = relevance_threshold
+                logger.info(f"Using custom filtering threshold: {relevance_threshold}")
+
+            reranker = get_reranking_model_from_params(reranking_model_params)
+            # Get documents to rerank
+            documents = df["chunk_content"].tolist()
+            # Use the get_scores method with disable_events=True
+            scores = reranker.get_scores(query_text, documents)
+            # Add scores as the relevance column
+            df[relevance_column] = scores
+
+            # Filter by threshold
+            scores_array = np.array(scores)
+            df = df[scores_array > reranker.filtering_threshold]
+            logger.debug(f"Applied reranking with params: {reranking_model_params}")

         elif "distance" in df.columns:
             # Calculate relevance from distance
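Two behavior changes land in the hunk above: dispatch_select is now restricted to an allow-list of metadata columns, and reranker scores are used both as the relevance column and as a row filter. The allow-list logic is self-contained enough to sketch runnably (params stands in for the KB's stored params dict):

    def allowed_metadata_columns(params):
        # Pre-v2 knowledge bases keep the old unrestricted behavior.
        if params.get("version", 0) < 2:
            return None
        # Union of user-declared columns and columns seen during inserts.
        columns = set(params.get("metadata_columns", [])) | set(params.get("inserted_metadata", []))
        return [col.lower() for col in columns]

    print(allowed_metadata_columns({"version": 2, "metadata_columns": ["Author"], "inserted_metadata": ["year"]}))
    # prints ['author', 'year'] (order depends on set iteration)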
@@ -323,12 +341,12 @@ class KnowledgeBaseTable:
         if conditions is None:
             return
         for condition in conditions:
-            if condition.column in KB_TO_VECTORDB_COLUMNS:
-                condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
+            if condition.column in self.kb_to_vector_columns:
+                condition.column = self.kb_to_vector_columns[condition.column]

     def addapt_result_columns(self, df):
         col_update = {}
-        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
+        for kb_col, vec_col in self.kb_to_vector_columns.items():
             if vec_col in df.columns:
                 col_update[vec_col] = kb_col
@@ -337,7 +355,7 @@ class KnowledgeBaseTable:
             columns = list(df.columns)
             # update id, get from metadata
             df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-                lambda m: None if m is None else m.get("original_doc_id")
+                lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
             )

             # id on first place
@@ -524,8 +542,8 @@ class KnowledgeBaseTable:

                 metadata = {
                     **base_metadata,
-                    "...
-                    "...
+                    "_original_row_index": str(idx),  # provide link to original row index
+                    "_content_column": col,
                 }

                 raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
@@ -620,16 +638,22 @@ class KnowledgeBaseTable:
         metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
         logger.debug(f"Mapped metadata columns: {metadata_columns}")

-        ...
-            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
+        content_columns = list(set(content_columns).intersection(columns))
+        if len(content_columns) == 0:
+            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")

-        ...
+        if metadata_columns is not None:
+            metadata_columns = list(set(metadata_columns).intersection(columns))
+        else:
+            # all the rest columns
+            metadata_columns = list(set(columns).difference(content_columns))
+
+        # update list of used columns
+        inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
+        inserted_metadata.update(metadata_columns)
+        self._kb.params["inserted_metadata"] = list(inserted_metadata)
+        flag_modified(self._kb, "params")
+        db.session.commit()

         # Add content columns directly (don't combine them)
         for col in content_columns:
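The flag_modified import added at the top of controller.py exists for this hunk: params is stored as a JSON column, and mutating the dict in place is invisible to SQLAlchemy's change tracking, so the attribute must be flagged dirty before commit. A minimal sketch, assuming kb is a mapped ORM object with a JSON params column and session is its active session:

    from sqlalchemy.orm.attributes import flag_modified

    def record_inserted_metadata(kb, session, new_columns):
        # In-place mutation of a JSON column does not mark it dirty;
        # flag_modified forces the UPDATE to be emitted on commit.
        seen = set(kb.params.get("inserted_metadata", []))
        seen.update(new_columns)
        kb.params["inserted_metadata"] = list(seen)
        flag_modified(kb, "params")
        session.commit()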
@@ -655,7 +679,7 @@ class KnowledgeBaseTable:
             elif isinstance(value, dict):
                 metadata.update(value)
                 continue
-            ...
+            elif value is not None:
                 value = str(value)
             metadata[col] = value
         return metadata
@@ -762,15 +786,10 @@ class KnowledgeBaseTable:
         llm_model = args.pop("model_name")
         engine = args.pop("provider")

-        llm_model = f"{engine}/{llm_model}"
-
-        if "base_url" in args:
-            args["api_base"] = args.pop("base_url")
-
         module = session.integration_controller.get_handler_module("litellm")
         if module is None or module.Handler is None:
             raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
-        return module.Handler.embeddings(llm_model, messages, args)
+        return module.Handler.embeddings(engine, llm_model, messages, args)

     def build_rag_pipeline(self, retrieval_config: dict):
         """
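Callers of the litellm handler's embeddings entry point must mirror the signature change: the provider is now its own argument instead of being folded into the model string (the matching change is in litellm_handler.py, +22 -15 in the file list above). Side by side, as a runnable sketch:

    def embed(module, engine, llm_model, messages, args):
        # 25.6.4 folded the provider into the model string:
        #   module.Handler.embeddings(f"{engine}/{llm_model}", messages, args)
        # 25.7.1 passes the provider separately:
        return module.Handler.embeddings(engine, llm_model, messages, args)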
@@ -892,6 +911,8 @@ class KnowledgeBaseController:
     manages knowledge bases
     """

+    KB_VERSION = 2
+
     def __init__(self, session) -> None:
         self.session = session
@@ -903,6 +924,7 @@ class KnowledgeBaseController:
         params: dict,
         preprocessing_config: Optional[dict] = None,
         if_not_exists: bool = False,
+        keyword_search_enabled: bool = False,
         # embedding_model: Identifier = None,  # Legacy: Allow MindsDB models to be passed as embedding_model.
     ) -> db.KnowledgeBase:
         """
@@ -1016,7 +1038,10 @@ class KnowledgeBaseController:
         vector_db_name, vector_table_name = storage.parts

         # create table in vectordb before creating KB
-        self.session.datahub.get(vector_db_name).integration_handler.create_table(vector_table_name)
+        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
+        vector_store_handler.create_table(vector_table_name)
+        if keyword_search_enabled:
+            vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
         vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]

         # Store sparse vector settings in params if specified
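Combined with the keyword_search_enabled parameter added to the create signature above, KB creation now provisions an optional full-text index on the content column right after the backing table is created, rather than requiring it to be added later. Isolated as a sketch (handler stands in for the vector-store handler, which per this release exposes both methods):

    def provision_kb_storage(handler, table_name, keyword_search_enabled, content_column="content"):
        # Create the backing vector table, then optionally index its
        # content column for keyword (full-text) search.
        handler.create_table(table_name)
        if keyword_search_enabled:
            handler.add_full_text_index(table_name, content_column)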
@@ -1026,6 +1051,7 @@ class KnowledgeBaseController:
         if vector_size is not None:
             params["vector_config"]["vector_size"] = vector_size

+        params["version"] = self.KB_VERSION
         kb = db.KnowledgeBase(
             name=name,
             project_id=project_id,
--- a/mindsdb/interfaces/knowledge_base/evaluate.py
+++ b/mindsdb/interfaces/knowledge_base/evaluate.py
@@ -7,7 +7,7 @@ import pandas as pd
 import datetime as dt

 from mindsdb.api.executor.sql_query.result_set import ResultSet
-from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log

 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
@@ -130,6 +130,8 @@ class EvaluateBase:
         integration_name = table_name.parts[0]
         table_name = Identifier(parts=table_name.parts[1:])
         dn = self.session.datahub.get(integration_name)
+        if dn is None:
+            raise ValueError(f"Can't find database: {integration_name}")
         return dn, table_name

     def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):

             start_time = time.time()
             logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
-            df_answers = self.kb.select_query(...
+            df_answers = self.kb.select_query(
+                Select(
+                    targets=[Identifier("chunk_content")],
+                    where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                    limit=Constant(self.TOP_K),
+                )
+            )
             query_time = time.time() - start_time

             proposed_responses = list(df_answers["chunk_content"])
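Both evaluators now construct the KB query as an explicit AST, which is why BinaryOperation joined the parser imports at the top of the file. A standalone equivalent of SELECT chunk_content FROM <kb> WHERE content = '<question>' LIMIT 20, with an illustrative question string:

    from mindsdb_sql_parser import Identifier, Select, Constant, BinaryOperation

    query = Select(
        targets=[Identifier("chunk_content")],
        where=BinaryOperation(op="=", args=[Identifier("content"), Constant("What does the contract cover?")]),
        limit=Constant(20),
    )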
@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
     Checks if ID in response from KB is matched with doc ID in test dataset
     """

-    TOP_K = ...
+    TOP_K = 20

     def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
         if "id" not in sampled_df.columns:
@@ -462,7 +470,11 @@ class EvaluateDocID(EvaluateBase):
             start_time = time.time()
             logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
             df_answers = self.kb.select_query(
-                Select(...
+                Select(
+                    targets=[Identifier("chunk_content"), Identifier("id")],
+                    where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                    limit=Constant(self.TOP_K),
+                )
             )
             query_time = time.time() - start_time