MindsDB 25.6.3.1__py3-none-any.whl → 25.7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB has been flagged as potentially problematic in the registry.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +72 -44
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +14 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_planner.py +7 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/api_handler.py +6 -7
- mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/constants.py +44 -0
- mindsdb/interfaces/agents/langchain_agent.py +15 -6
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +22 -3
- mindsdb/interfaces/knowledge_base/controller.py +121 -102
- mindsdb/interfaces/knowledge_base/evaluate.py +19 -7
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +26 -22
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
- mindsdb/interfaces/skills/skill_tool.py +91 -88
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +12 -1
- mindsdb/utilities/exception.py +47 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +239 -251
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +55 -54
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/agents/mcp_client_agent.py

@@ -71,11 +71,11 @@ class MCPLangchainAgent(LangchainAgent):
         self,
         agent: db.Agents,
         model: dict = None,
-
+        llm_params: dict = None,
         mcp_host: str = "127.0.0.1",
         mcp_port: int = 47337,
     ):
-        super().__init__(agent, model,
+        super().__init__(agent, model, llm_params)
         self.mcp_host = mcp_host
         self.mcp_port = mcp_port
         self.exit_stack = AsyncExitStack()

@@ -251,10 +251,10 @@ def create_mcp_agent(
         raise ValueError(f"Agent {agent_name} not found in project {project_name}")

     # Get merged parameters (defaults + agent params)
-
+    llm_params = agent_controller.get_agent_llm_params(agent_db.params)

     # Create MCP agent with merged parameters
-    mcp_agent = MCPLangchainAgent(agent_db,
+    mcp_agent = MCPLangchainAgent(agent_db, llm_params=llm_params, mcp_host=mcp_host, mcp_port=mcp_port)

     # Wrap for LiteLLM compatibility
     return LiteLLMAgentWrapper(mcp_agent)
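Note: the agent's LLM parameters are now resolved once via `agent_controller.get_agent_llm_params` and passed straight through to the constructor. A minimal sketch of the "defaults + agent params" merge the call-site comment describes (the helper's real body is not part of this diff, so the names below are illustrative):

    # Hypothetical illustration of "defaults + agent params" merging;
    # get_agent_llm_params itself is not shown in this diff.
    def merge_llm_params(default_params: dict, agent_params: dict | None) -> dict:
        merged = dict(default_params)      # start from configured defaults
        merged.update(agent_params or {})  # agent-level settings win on conflict
        return merged

    # e.g. defaults {"model_name": "gpt-4o", "temperature": 0.0} merged with
    # agent params {"temperature": 0.7} -> {"model_name": "gpt-4o", "temperature": 0.7}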
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -96,27 +96,7 @@ class MindsDBSQL(SQLDatabase):
             # Log the query for debugging
             logger.info(f"Executing SQL query: {command}")

-
-            # remove backticks
-            # command = command.replace('`', '')
-
-            # Parse the SQL string to an AST object first
-            from mindsdb_sql_parser import parse_sql
-
-            ast_query = parse_sql(command)
-
-            # Now execute the parsed query
-            result = self._sql_agent.skill_tool.get_command_executor().execute_command(
-                ast_query, database_name="mindsdb"
-            )
-
-            # Convert ExecuteAnswer to a DataFrame for easier manipulation
-            if result.data is not None:
-                df = result.data.to_df()
-                return df.to_string(index=False)
-
-            else:
-                return "Query executed successfully, but returned no data."
+            return self._sql_agent.query(command)

         except Exception as e:
             logger.error(f"Error executing SQL command: {str(e)}\n{traceback.format_exc()}")
@@ -127,28 +107,6 @@ class MindsDBSQL(SQLDatabase):
                 return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
             return f"Error: {str(e)}"

-    # def run_no_throw(self, command: str, fetch: str = "all") -> str:
-    #     """Execute a SQL command and return the result as a string.
-    #
-    #     This method catches any exceptions and returns an error message instead of raising an exception.
-    #
-    #     Args:
-    #         command: The SQL command to execute
-    #         fetch: Whether to fetch 'all' results or just 'one'
-    #
-    #     Returns:
-    #         A string representation of the result or an error message
-    #     """
-    #     command = extract_essential(command)
-    #     try:
-    #         return self._sql_agent.query_safe(command)
-    #     except Exception as e:
-    #         logger.error(f"Error executing SQL command: {str(e)}")
-    #         # If this is a knowledge base query, provide a more helpful error message
-    #         if "knowledge_base" in command.lower() or any(kb in command for kb in self._sql_agent.get_usable_knowledge_base_names()):
-    #             return f"Error executing knowledge base query: {str(e)}. Please check that the knowledge base exists and your query syntax is correct."
-    #         return f"Error: {str(e)}"
-
     def get_usable_knowledge_base_names(self) -> List[str]:
         """Get a list of usable knowledge base names.

@@ -160,3 +118,12 @@ class MindsDBSQL(SQLDatabase):
         except Exception as e:
             logger.error(f"Error getting usable knowledge base names: {str(e)}")
             return []
+
+    def check_knowledge_base_permission(self, name):
+        """Get a list of usable knowledge base names.
+
+        Returns:
+            A list of knowledge base names that can be used in queries
+        """
+
+        return self._sql_agent.check_knowledge_base_permission(name)
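Note: `check_knowledge_base_permission` simply delegates to the SQL agent, giving tool code a way to validate access to a knowledge base before issuing a query. A hedged caller-side sketch (the tool wiring and the delegate's exact return/raise semantics are not shown in this diff):

    # Hypothetical guard; sql_agent defines what the check returns or raises.
    if db_tool.check_knowledge_base_permission("my_kb"):
        result = db_tool.run("SELECT * FROM my_kb WHERE content = 'refund policy' LIMIT 5;")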
mindsdb/interfaces/data_catalog/data_catalog_reader.py

@@ -11,8 +11,6 @@ class DataCatalogReader(BaseDataCatalog):
         """
         Read the metadata from the data catalog and return it as a string.
        """
-        if not self.is_data_catalog_supported():
-            return f"Data catalog is not supported for database '{self.database_name}'."
         tables = self._read_metadata()
         if not tables:
             self.logger.warning(f"No metadata found for database '{self.database_name}'")

@@ -20,16 +18,37 @@ class DataCatalogReader(BaseDataCatalog):

         metadata_str = "Data Catalog: \n"
         if hasattr(self.data_handler, "meta_get_handler_info"):
-
+            info = self.data_handler.meta_get_handler_info()
+            if info:
+                metadata_str += info + "\n\n"

         for table in tables:
             metadata_str += table.as_string() + "\n\n"
         return metadata_str

+    def read_metadata_as_records(self) -> list:
+        """
+        Read the metadata from the data catalog and return it as a list of database records.
+        """
+        tables = self._read_metadata()
+        if not tables:
+            self.logger.warning(f"No metadata found for database '{self.database_name}'")
+            return []
+        return tables
+
+    def get_handler_info(self) -> str:
+        """
+        Get the handler info for the database.
+        """
+        return self.data_handler.meta_get_handler_info()
+
     def _read_metadata(self) -> list:
         """
         Read the metadata from the data catalog and return it in a structured format.
         """
+        if not self.is_data_catalog_supported():
+            return f"Data catalog is not supported for database '{self.database_name}'."
+
         query = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id)
         if self.table_names:
             cleaned_table_names = [name.strip("`").split(".")[-1] for name in self.table_names]
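Note: the reader now serves two consumers: `read_metadata` renders a prompt-ready string, while `read_metadata_as_records` returns the raw `MetaTables` records. A hedged usage sketch (construction of the reader is outside this diff):

    reader = DataCatalogReader(...)               # constructed elsewhere
    prompt_text = reader.read_metadata()          # "Data Catalog: ..." string for LLM prompts
    records = reader.read_metadata_as_records()   # structured records for programmatic use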
mindsdb/interfaces/knowledge_base/controller.py

@@ -6,6 +6,7 @@ import decimal

 import pandas as pd
 import numpy as np
+from sqlalchemy.orm.attributes import flag_modified

 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor

@@ -33,6 +34,7 @@ from mindsdb.interfaces.variables.variables_controller import variables_controller
 from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
+from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator

@@ -46,25 +48,21 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker

 logger = log.getLogger(__name__)

-KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
-

 def get_model_params(model_params: dict, default_config_key: str):
     """
     Get model parameters by combining default config with user provided parameters.
     """
-    # If the default config key is for reranking and the switch to use the default LLM is enabled,
-    # switch to the default LLM model.
-    if default_config_key == "default_reranking_model" and config.get("default_reranking_model").get(
-        "use_default_llm", False
-    ):
-        default_config_key = "default_llm_model"
-
     combined_model_params = copy.deepcopy(config.get(default_config_key, {}))

     if model_params:
+        if not isinstance(model_params, dict):
+            raise ValueError("Model parameters must be passed as a JSON object")
+
         combined_model_params.update(model_params)

+    combined_model_params.pop("use_default_llm", None)
+
     return combined_model_params
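Note: `get_model_params` is now the single place where configured defaults and user parameters are combined, with type validation added and the internal `use_default_llm` switch stripped before the result reaches any model constructor. A standalone sketch of the contract, with the config lookup inlined and made-up values:

    import copy

    def merge_model_params(user_params, default_config: dict) -> dict:
        # mirrors get_model_params above, minus the global config object
        combined = copy.deepcopy(default_config)
        if user_params:
            if not isinstance(user_params, dict):
                raise ValueError("Model parameters must be passed as a JSON object")
            combined.update(user_params)
        combined.pop("use_default_llm", None)  # internal switch never leaks out
        return combined

    defaults = {"provider": "openai", "model_name": "gpt-4o", "use_default_llm": True}
    merge_model_params({"model_name": "my-reranker"}, defaults)
    # -> {"provider": "openai", "model_name": "my-reranker"}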
@@ -105,8 +103,6 @@ def get_reranking_model_from_params(reranking_model_params: dict):
     params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
     params_copy["model"] = params_copy.pop("model_name", None)

-    params_copy.pop("use_default_llm", None)
-
     return BaseLLMReranker(**params_copy)
@@ -144,23 +140,29 @@ class KnowledgeBaseTable:
         self.document_loader = None
         self.model_params = None

+        self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+        if self._kb.params.get("version", 0) < 2:
+            self.kb_to_vector_columns["id"] = "original_doc_id"
+
     def configure_preprocessing(self, config: Optional[dict] = None):
         """Configure preprocessing for the knowledge base table"""
         logger.debug(f"Configuring preprocessing with config: {config}")
         self.document_preprocessor = None  # Reset existing preprocessor
-        if config is
-
-
-
-
-
-
-
-
-
-
-
+        if config is None:
+            config = {}
+
+        # Ensure content_column is set for JSON chunking if not already specified
+        if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
+            if "content_column" not in config["json_chunking_config"]:
+                config["json_chunking_config"]["content_column"] = "content"
+
+        preprocessing_config = PreprocessingConfig(**config)
+        self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
+
+        # set doc_id column name
+        self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
+
+        logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")

     def select_query(self, query: Select) -> pd.DataFrame:
         """
@@ -169,6 +171,30 @@ class KnowledgeBaseTable:
         :param query: query to KB table
         :return: dataframe with the result table
         """
+
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
+
+        executor = KnowledgeBaseQueryExecutor(self)
+        df = executor.run(query)
+
+        if (
+            query.group_by is not None
+            or query.order_by is not None
+            or query.having is not None
+            or query.distinct is True
+            or len(query.targets) != 1
+            or not isinstance(query.targets[0], Star)
+        ):
+            query_copy.where = None
+            if "metadata" in df.columns:
+                df["metadata"] = df["metadata"].apply(to_json)
+
+            df = query_df(df, query_copy, session=self.session)
+
+        return df
+
+    def select(self, query, disable_reranking=False):
         logger.debug(f"Processing select query: {query}")

         # Extract the content query text for potential reranking
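Note: `select_query` now always retrieves through `KnowledgeBaseQueryExecutor`, and only queries with DISTINCT, GROUP BY, ORDER BY, HAVING, or non-`*` targets take a second pass through `query_df` (DuckDB) with the WHERE clause stripped, since filtering already happened in the vector store. A hedged SQL illustration of a query that triggers that second pass (`my_kb` is a hypothetical knowledge base):

    -- DISTINCT forces the query_df post-processing path
    SELECT DISTINCT id
    FROM my_kb
    WHERE content = 'refund policy';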
@@ -180,9 +206,6 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
-        query_copy = copy.deepcopy(query)
-
         query.targets = [
             Identifier(TableField.ID.value),
             Identifier(TableField.CONTENT.value),

@@ -197,7 +220,6 @@ class KnowledgeBaseTable:
         conditions = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:

@@ -213,10 +235,9 @@ class KnowledgeBaseTable:
                     logger.error(error_msg)
                     raise ValueError(error_msg)
                 elif item.column == "reranking":
-
-
-
-                    reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
+
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
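Note: per-query control of reranking is now explicit: a `reranking` condition that is boolean `false` or the string `'false'` disables the reranker, and `relevance` conditions accept only `>=`. A hedged SQL sketch against a hypothetical knowledge base:

    SELECT *
    FROM my_kb
    WHERE content = 'refund policy'
      AND reranking = false     -- skip LLM reranking for this query
      AND relevance >= 0.6;     -- only >= is accepted for relevance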
@@ -248,66 +269,59 @@ class KnowledgeBaseTable:
             limit = 100
             query.limit = Constant(limit)

-        df = db_handler.dispatch_select(query, conditions)
+        allowed_metadata_columns = self._get_allowed_metadata_columns()
+        df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
         df = self.addapt_result_columns(df)

         logger.debug(f"Query returned {len(df)} rows")
         logger.debug(f"Columns in response: {df.columns.tolist()}")
         # Check if we have a rerank_model configured in KB params
-        df = self.add_relevance(df, query_text, relevance_threshold,
+        df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)

-        if (
-            query.group_by is not None
-            or query.order_by is not None
-            or query.having is not None
-            or query.distinct is True
-            or len(query.targets) != 1
-            or not isinstance(query.targets[0], Star)
-        ):
-            query_copy.where = None
-            if "metadata" in df.columns:
-                df["metadata"] = df["metadata"].apply(to_json)
+        return df

-            df = query_df(df, query_copy, session=self.session)
+    def _get_allowed_metadata_columns(self) -> List[str] | None:
+        # Return list of KB columns to restrict querying, if None: no restrictions

-        return df
+        if self._kb.params.get("version", 0) < 2:
+            # disable for old version KBs
+            return None
+
+        user_columns = self._kb.params.get("metadata_columns", [])
+        dynamic_columns = self._kb.params.get("inserted_metadata", [])
+
+        columns = set(user_columns) | set(dynamic_columns)
+        return [col.lower() for col in columns]

     def score_documents(self, query_text, documents, reranking_model_params):
         reranker = get_reranking_model_from_params(reranking_model_params)
         return reranker.get_scores(query_text, documents)

-    def add_relevance(self, df, query_text, relevance_threshold=None,
+    def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
         relevance_column = TableField.RELEVANCE.value

         reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
-        if reranking_model_params and query_text and len(df) > 0 and
+        if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            except Exception as e:
-                logger.error(f"Error during reranking: {str(e)}")
-                # Fallback to distance-based relevance
-                if "distance" in df.columns:
-                    df[relevance_column] = 1 / (1 + df["distance"])
-                else:
-                    logger.info("No distance or reranker available")
+
+            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
+            # Apply custom filtering threshold if provided
+            if relevance_threshold is not None:
+                reranking_model_params["filtering_threshold"] = relevance_threshold
+                logger.info(f"Using custom filtering threshold: {relevance_threshold}")
+
+            reranker = get_reranking_model_from_params(reranking_model_params)
+            # Get documents to rerank
+            documents = df["chunk_content"].tolist()
+            # Use the get_scores method with disable_events=True
+            scores = reranker.get_scores(query_text, documents)
+            # Add scores as the relevance column
+            df[relevance_column] = scores
+
+            # Filter by threshold
+            scores_array = np.array(scores)
+            df = df[scores_array > reranker.filtering_threshold]
+            logger.debug(f"Applied reranking with params: {reranking_model_params}")

         elif "distance" in df.columns:
             # Calculate relevance from distance
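Note: the reranking branch no longer swallows errors with a distance fallback; it scores, writes the relevance column, and filters in place. A small pandas illustration of that filter step with made-up scores:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"chunk_content": ["a", "b", "c"]})
    scores = [0.91, 0.42, 0.77]            # reranker scores, one per row
    df["relevance"] = scores

    scores_array = np.array(scores)
    df = df[scores_array > 0.5]            # reranker.filtering_threshold analogue
    # rows "a" and "c" survive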
@@ -327,12 +341,12 @@ class KnowledgeBaseTable:
         if conditions is None:
             return
         for condition in conditions:
-            if condition.column in KB_TO_VECTORDB_COLUMNS:
-                condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
+            if condition.column in self.kb_to_vector_columns:
+                condition.column = self.kb_to_vector_columns[condition.column]

     def addapt_result_columns(self, df):
         col_update = {}
-        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
+        for kb_col, vec_col in self.kb_to_vector_columns.items():
             if vec_col in df.columns:
                 col_update[vec_col] = kb_col

@@ -341,7 +355,7 @@ class KnowledgeBaseTable:
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get("original_doc_id")
+            lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
         )

         # id on first place
@@ -528,8 +542,8 @@ class KnowledgeBaseTable:

         metadata = {
             **base_metadata,
-            "original_row_index": str(idx),
-            "content_column": col,
+            "_original_row_index": str(idx),  # provide link to original row index
+            "_content_column": col,
         }

         raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
@@ -624,16 +638,22 @@ class KnowledgeBaseTable:
         metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
         logger.debug(f"Mapped metadata columns: {metadata_columns}")

-
-
-
-            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
+        content_columns = list(set(content_columns).intersection(columns))
+        if len(content_columns) == 0:
+            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")

-
-
-
-
-
+        if metadata_columns is not None:
+            metadata_columns = list(set(metadata_columns).intersection(columns))
+        else:
+            # all the rest columns
+            metadata_columns = list(set(columns).difference(content_columns))
+
+        # update list of used columns
+        inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
+        inserted_metadata.update(metadata_columns)
+        self._kb.params["inserted_metadata"] = list(inserted_metadata)
+        flag_modified(self._kb, "params")
+        db.session.commit()

         # Add content columns directly (don't combine them)
         for col in content_columns:

@@ -659,7 +679,7 @@ class KnowledgeBaseTable:
             elif isinstance(value, dict):
                 metadata.update(value)
                 continue
-
+            elif value is not None:
                 value = str(value)
             metadata[col] = value
         return metadata
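Note: `inserted_metadata` is accumulated inside the JSON `params` column, and in-place mutation of a JSON attribute does not mark it dirty, so SQLAlchemy would silently skip it on commit; that is what the new `flag_modified` import is for. A minimal sketch of the pattern (model and session names are illustrative):

    from sqlalchemy.orm.attributes import flag_modified

    kb.params["inserted_metadata"] = ["category", "source"]  # in-place mutation
    flag_modified(kb, "params")   # explicitly mark the attribute as changed
    session.commit()              # the UPDATE now includes params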
@@ -766,15 +786,10 @@ class KnowledgeBaseTable:
         llm_model = args.pop("model_name")
         engine = args.pop("provider")

-        llm_model = f"{engine}/{llm_model}"
-
-        if "base_url" in args:
-            args["api_base"] = args.pop("base_url")
-
         module = session.integration_controller.get_handler_module("litellm")
         if module is None or module.Handler is None:
             raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
-        return module.Handler.embeddings(llm_model, messages, args)
+        return module.Handler.embeddings(engine, llm_model, messages, args)

     def build_rag_pipeline(self, retrieval_config: dict):
         """
@@ -896,6 +911,8 @@ class KnowledgeBaseController:
     manages knowledge bases
     """

+    KB_VERSION = 2
+
     def __init__(self, session) -> None:
         self.session = session

@@ -907,6 +924,7 @@ class KnowledgeBaseController:
         params: dict,
         preprocessing_config: Optional[dict] = None,
         if_not_exists: bool = False,
+        keyword_search_enabled: bool = False,
         # embedding_model: Identifier = None,  # Legacy: Allow MindsDB models to be passed as embedding_model.
     ) -> db.KnowledgeBase:
         """

@@ -961,10 +979,7 @@ class KnowledgeBaseController:
         # # it is params for model
         # embedding_params.update(params["embedding_model"])

-
-        if not isinstance(params["embedding_model"], dict):
-            raise ValueError("embedding_model should be JSON object with model parameters.")
-        embedding_params.update(params["embedding_model"])
+        embedding_params = get_model_params(params.get("embedding_model", {}), "default_embedding_model")

         # if model_name is None:  # Legacy
         model_name = self._create_embedding_model(

@@ -1023,7 +1038,10 @@ class KnowledgeBaseController:
         vector_db_name, vector_table_name = storage.parts

         # create table in vectordb before creating KB
-        self.session.datahub.get(vector_db_name).integration_handler
+        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
+        vector_store_handler.create_table(vector_table_name)
+        if keyword_search_enabled:
+            vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
         vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]

         # Store sparse vector settings in params if specified

@@ -1033,6 +1051,7 @@ class KnowledgeBaseController:
         if vector_size is not None:
             params["vector_config"]["vector_size"] = vector_size

+        params["version"] = self.KB_VERSION
         kb = db.KnowledgeBase(
             name=name,
             project_id=project_id,
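Note: newly created knowledge bases are stamped with `params["version"] = 2` (`KB_VERSION`), and the table code branches on that stamp for doc-id naming and metadata-column restrictions. A sketch of the gate, using the values visible in this diff:

    kb_version = kb_params.get("version", 0)   # absent on KBs created before this release
    if kb_version < 2:
        doc_id_column = "original_doc_id"      # legacy mapping, no column restrictions
    else:
        doc_id_column = "_original_doc_id"     # v2 internal column; metadata columns restricted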
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -7,7 +7,7 @@ import pandas as pd
 import datetime as dt

 from mindsdb.api.executor.sql_query.result_set import ResultSet
-from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log

 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient

@@ -130,6 +130,8 @@ class EvaluateBase:
         integration_name = table_name.parts[0]
         table_name = Identifier(parts=table_name.parts[1:])
         dn = self.session.datahub.get(integration_name)
+        if dn is None:
+            raise ValueError(f"Can't find database: {integration_name}")
         return dn, table_name

     def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
@@ -168,13 +170,13 @@ class EvaluateBase:
             test_data = self.generate_test_data(gen_params)

             self.save_to_table(test_table, test_data, is_replace=True)
-        else:
-            test_data = self.read_from_table(test_table)

         if params.get("evaluate", True) is False:
             # no evaluate is required
             return pd.DataFrame()

+        test_data = self.read_from_table(test_table)
+
         scores = self.evaluate(test_data)
         scores["name"] = self.name
         scores["created_at"] = dt.datetime.now()

@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):

         start_time = time.time()
         logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
-        df_answers = self.kb.select_query(
+        df_answers = self.kb.select_query(
+            Select(
+                targets=[Identifier("chunk_content")],
+                where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                limit=Constant(self.TOP_K),
+            )
+        )
         query_time = time.time() - start_time

         proposed_responses = list(df_answers["chunk_content"])

@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
     Checks if ID in response from KB is matched with doc ID in test dataset
     """

-    TOP_K =
+    TOP_K = 20

     def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
         if "id" not in sampled_df.columns:

@@ -462,7 +470,11 @@ class EvaluateDocID(EvaluateBase):
         start_time = time.time()
         logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
         df_answers = self.kb.select_query(
-            Select(
+            Select(
+                targets=[Identifier("chunk_content"), Identifier("id")],
+                where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                limit=Constant(self.TOP_K),
+            )
         )
         query_time = time.time() - start_time

@@ -511,6 +523,6 @@ class EvaluateDocID(EvaluateBase):
             "total": total_questions,
             "total_found": total_found,
             "retrieved_in_top_10": accurate_in_top_10,
-            "cumulative_recall": cumulative_recall,
+            "cumulative_recall": json.dumps(cumulative_recall),
             "avg_query_time": avg_query_time,
         }
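Note: both evaluators now build the knowledge-base query as an explicit AST (a `Select` with a `BinaryOperation` on `content` and a `LIMIT`). The SQL equivalent of the AST built above, against a hypothetical knowledge base:

    SELECT chunk_content, id
    FROM my_kb
    WHERE content = 'What is the refund policy?'
    LIMIT 20;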