MindsDB 25.9.1.2 → 25.9.3rc1 (py3-none-any.whl)
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +39 -20
- mindsdb/api/a2a/agent.py +7 -9
- mindsdb/api/a2a/common/server/server.py +3 -3
- mindsdb/api/a2a/common/server/task_manager.py +4 -4
- mindsdb/api/a2a/task_manager.py +15 -17
- mindsdb/api/common/middleware.py +9 -11
- mindsdb/api/executor/command_executor.py +2 -4
- mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +100 -48
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/exceptions.py +29 -10
- mindsdb/api/executor/planner/plan_join.py +17 -3
- mindsdb/api/executor/sql_query/sql_query.py +74 -74
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
- mindsdb/api/executor/utilities/functions.py +6 -6
- mindsdb/api/executor/utilities/sql.py +32 -16
- mindsdb/api/http/gui.py +5 -11
- mindsdb/api/http/initialize.py +8 -10
- mindsdb/api/http/namespaces/agents.py +10 -12
- mindsdb/api/http/namespaces/analysis.py +13 -20
- mindsdb/api/http/namespaces/auth.py +1 -1
- mindsdb/api/http/namespaces/config.py +15 -11
- mindsdb/api/http/namespaces/databases.py +140 -201
- mindsdb/api/http/namespaces/file.py +15 -4
- mindsdb/api/http/namespaces/handlers.py +7 -2
- mindsdb/api/http/namespaces/knowledge_bases.py +8 -7
- mindsdb/api/http/namespaces/models.py +94 -126
- mindsdb/api/http/namespaces/projects.py +13 -22
- mindsdb/api/http/namespaces/sql.py +33 -25
- mindsdb/api/http/namespaces/tab.py +27 -37
- mindsdb/api/http/namespaces/views.py +1 -1
- mindsdb/api/http/start.py +14 -8
- mindsdb/api/mcp/__init__.py +2 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
- mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +6 -13
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +40 -28
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +168 -185
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +11 -5
- mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
- mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +20 -2
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +18 -3
- mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
- mindsdb/integrations/libs/api_handler.py +10 -10
- mindsdb/integrations/libs/base.py +4 -4
- mindsdb/integrations/libs/llm/utils.py +2 -2
- mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
- mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
- mindsdb/integrations/libs/process_cache.py +132 -140
- mindsdb/integrations/libs/response.py +18 -12
- mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
- mindsdb/integrations/utilities/files/file_reader.py +6 -7
- mindsdb/integrations/utilities/rag/config_loader.py +37 -26
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +59 -9
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
- mindsdb/integrations/utilities/rag/settings.py +58 -133
- mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
- mindsdb/interfaces/agents/agents_controller.py +2 -1
- mindsdb/interfaces/agents/constants.py +0 -2
- mindsdb/interfaces/agents/litellm_server.py +34 -58
- mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
- mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
- mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
- mindsdb/interfaces/chatbot/polling.py +30 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +10 -10
- mindsdb/interfaces/database/integrations.py +19 -2
- mindsdb/interfaces/file/file_controller.py +6 -6
- mindsdb/interfaces/functions/controller.py +1 -1
- mindsdb/interfaces/functions/to_markdown.py +2 -2
- mindsdb/interfaces/jobs/jobs_controller.py +5 -5
- mindsdb/interfaces/jobs/scheduler.py +3 -8
- mindsdb/interfaces/knowledge_base/controller.py +54 -25
- mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
- mindsdb/interfaces/model/model_controller.py +170 -166
- mindsdb/interfaces/query_context/context_controller.py +14 -2
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +6 -4
- mindsdb/interfaces/skills/retrieval_tool.py +43 -50
- mindsdb/interfaces/skills/skill_tool.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +25 -19
- mindsdb/interfaces/storage/fs.py +114 -169
- mindsdb/interfaces/storage/json.py +19 -18
- mindsdb/interfaces/storage/model_fs.py +54 -92
- mindsdb/interfaces/tabs/tabs_controller.py +49 -72
- mindsdb/interfaces/tasks/task_monitor.py +3 -9
- mindsdb/interfaces/tasks/task_thread.py +7 -9
- mindsdb/interfaces/triggers/trigger_task.py +7 -13
- mindsdb/interfaces/triggers/triggers_controller.py +47 -50
- mindsdb/migrations/migrate.py +16 -16
- mindsdb/utilities/api_status.py +58 -0
- mindsdb/utilities/config.py +49 -0
- mindsdb/utilities/exception.py +40 -1
- mindsdb/utilities/fs.py +0 -1
- mindsdb/utilities/hooks/profiling.py +17 -14
- mindsdb/utilities/langfuse.py +40 -45
- mindsdb/utilities/log.py +272 -0
- mindsdb/utilities/ml_task_queue/consumer.py +52 -58
- mindsdb/utilities/ml_task_queue/producer.py +26 -30
- mindsdb/utilities/render/sqlalchemy_render.py +8 -7
- mindsdb/utilities/utils.py +2 -2
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/METADATA +266 -261
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/RECORD +119 -119
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/WHEEL +0 -0
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/jobs/scheduler.py +3 -8

@@ -14,7 +14,6 @@ logger = log.getLogger(__name__)
 
 
 def execute_async(q_in, q_out):
-
     while True:
         task = q_in.get()
 
@@ -44,7 +43,7 @@ class Scheduler:
         self.q_in = queue.Queue()
         self.q_out = queue.Queue()
         self.work_thread = threading.Thread(
-            target=execute_async, args=(self.q_in, self.q_out), name=
+            target=execute_async, args=(self.q_in, self.q_out), name="Scheduler.execute_async"
         )
         self.work_thread.start()
 
@@ -58,14 +57,13 @@ class Scheduler:
         check_interval = self.config.get("jobs", {}).get("check_interval", 30)
 
         while True:
-
             logger.debug("Scheduler check timetable")
             try:
                 self.check_timetable()
             except (SystemExit, KeyboardInterrupt):
                 raise
-            except Exception
-                logger.
+            except Exception:
+                logger.exception("Error in 'scheduler_monitor'")
 
         # different instances should start in not the same time
 
@@ -83,7 +81,6 @@ class Scheduler:
         db.session.remove()
 
     def execute_task(self, record_id, exec_method):
-
         executor = JobsExecutor()
         if exec_method == "local":
             history_id = executor.lock_record(record_id)
@@ -117,7 +114,6 @@ class Scheduler:
         raise NotImplementedError()
 
     def start(self):
-
         config = Config()
         db.init()
         self.config = config
@@ -127,7 +123,6 @@ class Scheduler:
         try:
             self.scheduler_monitor()
         except (KeyboardInterrupt, SystemExit):
-
             self.stop_thread()
             pass
 
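Most of the error-handling fixes in this release follow the pattern in the hunk above: a bare or truncated logger call inside an except block becomes logger.exception(...). A minimal standalone sketch of the pattern, where check_timetable is a stand-in for the scheduler's real work, not code from this package:

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def check_timetable():
    # Stand-in for the scheduler's real work.
    raise RuntimeError("simulated scheduler failure")


try:
    check_timetable()
except (SystemExit, KeyboardInterrupt):
    raise  # interpreter-level exits are re-raised, as in the diff
except Exception:
    # logger.exception() logs at ERROR level and appends the active
    # traceback -- equivalent to logger.error(..., exc_info=True).
    logger.exception("Error in 'scheduler_monitor'")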
mindsdb/interfaces/knowledge_base/controller.py +54 -25

@@ -22,12 +22,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     TableField,
     VectorStoreHandler,
 )
-from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
-from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
 from mindsdb.integrations.utilities.handler_utils import get_api_key
-from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
-    construct_model_from_args,
-)
 
 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS, MAX_INSERT_BATCH_SIZE
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -56,6 +51,7 @@ class KnowledgeBaseInputParams(BaseModel):
     content_columns: List[str] | None = None
     id_column: str | None = None
     kb_no_upsert: bool = False
+    kb_skip_existing: bool = False
     embedding_model: Dict[Text, Any] | None = None
     is_sparse: bool = False
     vector_size: int | None = None
@@ -83,9 +79,9 @@ def get_model_params(model_params: dict, default_config_key: str):
     return combined_model_params
 
 
-def
+def adapt_embedding_model_params(embedding_model_params: dict):
     """
-
+    Prepare parameters for embedding model.
     """
     params_copy = copy.deepcopy(embedding_model_params)
     provider = params_copy.pop("provider", None).lower()
@@ -106,7 +102,7 @@ def get_embedding_model_from_params(embedding_model_params: dict):
     params_copy.pop("api_key", None)
     params_copy["model"] = params_copy.pop("model_name", None)
 
-    return
+    return params_copy
 
 
 def get_reranking_model_from_params(reranking_model_params: dict):
@@ -265,9 +261,9 @@ class KnowledgeBaseTable:
                     gt_filtering = True
                    logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
                except (ValueError, TypeError) as e:
-                    error_msg = f"Invalid relevance_threshold value: {item.value}. {
+                    error_msg = f"Invalid relevance_threshold value: {item.value}. {e}"
                    logger.error(error_msg)
-                    raise ValueError(error_msg)
+                    raise ValueError(error_msg) from e
            elif (item.column == "relevance") and (item.op.value not in relevance_threshold_allowed_operators):
                raise ValueError(
                    f"Invalid operator for relevance: {item.op.value}. Only the following operators are allowed: "
@@ -318,13 +314,20 @@ class KnowledgeBaseTable:
         self.addapt_conditions_columns(conditions)
 
         # Set default limit if query is present
+        limit = query.limit.value if query.limit is not None else None
         if query_text is not None:
-            limit = query.limit.value if query.limit is not None else None
             if limit is None:
                 limit = 10
             elif limit > 100:
                 limit = 100
-
+
+            if not disable_reranking:
+                # expand limit, get more records before reranking usage:
+                # get twice size of input but not greater than 30
+                query_limit = min(limit * 2, limit + 30)
+            else:
+                query_limit = limit
+            query.limit = Constant(query_limit)
 
         allowed_metadata_columns = self._get_allowed_metadata_columns()
         df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
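The hunk above over-fetches candidates when reranking is enabled: the vector query limit is doubled, capped at 30 extra rows, and the result is trimmed back to the caller's limit after relevance scoring (see the df = df[:limit] hunk that follows). A small sketch of the arithmetic, with illustrative inputs:

def candidate_limit(limit: int, disable_reranking: bool) -> int:
    # Over-fetch for the reranker: twice the request, but never more
    # than 30 extra rows; with reranking disabled, fetch exactly `limit`.
    if not disable_reranking:
        return min(limit * 2, limit + 30)
    return limit


for limit in (5, 10, 40, 100):
    print(limit, "->", candidate_limit(limit, disable_reranking=False))
# 5 -> 10, 10 -> 20, 40 -> 70, 100 -> 130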
@@ -375,6 +378,8 @@ class KnowledgeBaseTable:
 
         # Check if we have a rerank_model configured in KB params
         df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)
+        if limit is not None:
+            df = df[:limit]
 
         # if relevance filtering method is strictly GREATER THAN we filter the df
         if gt_filtering:
@@ -407,7 +412,6 @@ class KnowledgeBaseTable:
         if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
 
-            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
             # Apply custom filtering threshold if provided
             if relevance_threshold is not None:
                 reranking_model_params["filtering_threshold"] = relevance_threshold
@@ -424,7 +428,6 @@ class KnowledgeBaseTable:
             # Filter by threshold
             scores_array = np.array(scores)
             df = df[scores_array >= reranker.filtering_threshold]
-            logger.debug(f"Applied reranking with params: {reranking_model_params}")
 
         elif "distance" in df.columns:
             # Calculate relevance from distance
@@ -547,7 +550,7 @@ class KnowledgeBaseTable:
             if processed_chunks:
                 content.value = processed_chunks[0].content
 
-        query.update_columns[emb_col] = Constant(self._content_to_embeddings(content))
+        query.update_columns[emb_col] = Constant(self._content_to_embeddings(content.value))
 
         if "metadata" not in query.update_columns:
             query.update_columns["metadata"] = Constant({})
@@ -678,6 +681,25 @@ class KnowledgeBaseTable:
             logger.warning("No valid content found in any content columns")
             return
 
+        # Check if we should skip existing items (before calculating embeddings)
+        if params is not None and params.get("kb_skip_existing", False):
+            logger.debug(f"Checking for existing items to skip before processing {len(df)} items")
+            db_handler = self.get_vector_db()
+
+            # Get list of IDs from current batch
+            current_ids = df[TableField.ID.value].dropna().astype(str).tolist()
+            if current_ids:
+                # Check which IDs already exist
+                existing_ids = db_handler.check_existing_ids(self._kb.vector_database_table, current_ids)
+                if existing_ids:
+                    # Filter out existing items
+                    df = df[~df[TableField.ID.value].astype(str).isin(existing_ids)]
+                    logger.info(f"Skipped {len(existing_ids)} existing items, processing {len(df)} new items")
+
+                if df.empty:
+                    logger.info("All items already exist, nothing to insert")
+                    return
+
         # add embeddings and send to vector db
         df_emb = self._df_to_embeddings(df)
         df = pd.concat([df, df_emb], axis=1)
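A hedged sketch of the kb_skip_existing flow added above: the batch is filtered against IDs already present in the vector store before any embeddings are computed. fake_check_existing_ids is a stand-in for the handler's check_existing_ids (added to vectordatabase_handler.py in this release); the data is illustrative:

import pandas as pd


def fake_check_existing_ids(table_name, ids):
    # Stand-in: pretend rows "1" and "3" are already stored.
    stored = {"1", "3"}
    return [i for i in ids if i in stored]


df = pd.DataFrame({"id": ["1", "2", "3", "4"], "content": ["a", "b", "c", "d"]})

current_ids = df["id"].dropna().astype(str).tolist()
existing_ids = fake_check_existing_ids("my_kb_table", current_ids)
if existing_ids:
    # Drop rows whose IDs already exist, before any embedding work.
    df = df[~df["id"].astype(str).isin(existing_ids)]

print(df)  # only ids "2" and "4" remain to be embedded and inserted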
@@ -915,7 +937,12 @@ class KnowledgeBaseTable:
             ValueError: If the configuration is invalid or required components are missing
         """
         # Get embedding model from knowledge base
-
+        from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
+            construct_model_from_args,
+        )
+        from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
+        from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
+
         embedding_model_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
         if self._kb.embedding_model:
             # Extract embedding model args from knowledge base table
@@ -924,7 +951,7 @@ class KnowledgeBaseTable:
             embeddings_model = construct_model_from_args(embedding_args)
             logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
         elif embedding_model_params:
-            embeddings_model =
+            embeddings_model = construct_model_from_args(adapt_embedding_model_params(embedding_model_params))
             logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
         else:
             embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
@@ -952,8 +979,8 @@ class KnowledgeBaseTable:
             return rag
 
         except Exception as e:
-            logger.
-            raise ValueError(f"Failed to build RAG pipeline: {str(e)}")
+            logger.exception("Error building RAG pipeline:")
+            raise ValueError(f"Failed to build RAG pipeline: {str(e)}") from e
 
     def _parse_metadata(self, base_metadata):
         """Helper function to robustly parse metadata string to dict"""
@@ -1065,7 +1092,7 @@ class KnowledgeBaseController:
             msg = "\n".join(problems)
             if len(problems) > 1:
                 msg = "\n" + msg
-            raise ValueError(f"Problem with knowledge base parameters: {msg}")
+            raise ValueError(f"Problem with knowledge base parameters: {msg}") from e
 
         # Validate preprocessing config first if provided
         if preprocessing_config is not None:
@@ -1110,6 +1137,9 @@ class KnowledgeBaseController:
             model_record = db.Predictor.query.get(model["id"])
             embedding_model_id = model_record.id
 
+            if model_record.learn_args.get("using", {}).get("sparse"):
+                is_sparse = True
+
         # if params.get("reranking_model", {}) is bool and False we evaluate it to empty dictionary
         reranking_model_params = params.get("reranking_model", {})
 
@@ -1128,7 +1158,7 @@ class KnowledgeBaseController:
                 reranker = get_reranking_model_from_params(reranking_model_params)
                 reranker.get_scores("test", ["test"])
             except (ValueError, RuntimeError) as e:
-                raise RuntimeError(f"Problem with reranker config: {e}")
+                raise RuntimeError(f"Problem with reranker config: {e}") from e
 
         # search for the vector database table
         if storage is None:
@@ -1138,7 +1168,6 @@ class KnowledgeBaseController:
             # Add sparse vector support for pgvector
             vector_db_params = {}
             # Check both explicit parameter and model configuration
-            is_sparse = is_sparse or model_record.learn_args.get("using", {}).get("sparse")
             if is_sparse:
                 vector_db_params["is_sparse"] = True
             if vector_size is not None:
@@ -1242,7 +1271,7 @@ class KnowledgeBaseController:
         try:
             KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
         except Exception as e:
-            raise RuntimeError(f"Problem with embedding model config: {e}")
+            raise RuntimeError(f"Problem with embedding model config: {e}") from e
         return
 
         params = copy.deepcopy(params)
@@ -1295,8 +1324,8 @@ class KnowledgeBaseController:
         """
         try:
             project = self.session.database_controller.get_project(project_name)
-        except ValueError:
-            raise ValueError(f"Project not found: {project_name}")
+        except ValueError as e:
+            raise ValueError(f"Project not found: {project_name}") from e
         project_id = project.id
 
         # check if knowledge base exists
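Several hunks in this file append `from e` to re-raised errors. A minimal sketch of what that changes: explicit chaining records the original exception as __cause__, so tracebacks mark it as the direct cause instead of only implicit context (names below are illustrative):

def get_project(name):
    # Stand-in for the controller lookup that fails.
    raise ValueError("no such record")


try:
    try:
        get_project("my_project")
    except ValueError as e:
        raise ValueError("Project not found: my_project") from e
except ValueError as err:
    assert isinstance(err.__cause__, ValueError)
    print("cause:", err.__cause__)  # -> cause: no such record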
mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61

@@ -1,13 +1,10 @@
-
+import ast
 import json
+from typing import List, Dict, Any, Optional
+
 import pandas as pd
-import ast
 
-from mindsdb.interfaces.knowledge_base.preprocessing.models import (
-    Document,
-    ProcessedChunk,
-    JSONChunkingConfig
-)
+from mindsdb.interfaces.knowledge_base.preprocessing.models import Document, ProcessedChunk, JSONChunkingConfig
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import DocumentPreprocessor
 from mindsdb.utilities import log
 
@@ -50,7 +47,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 chunks = self._process_json_data(json_data, doc)
                 all_chunks.extend(chunks)
             except Exception as e:
-                logger.
+                logger.exception(f"Error processing document {doc.id}:")
                 error_chunk = self._create_error_chunk(doc, str(e))
                 all_chunks.append(error_chunk)
 
@@ -76,8 +73,8 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
            # If JSON parsing fails, try as Python literal
            try:
                return ast.literal_eval(doc.content)
-            except (SyntaxError, ValueError)
-                logger.
+            except (SyntaxError, ValueError):
+                logger.exception(f"Error parsing content for document {doc.id}:")
                # We'll create the error chunk in the main process_documents method
                return None
 
@@ -117,7 +114,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return ProcessedChunk(
             id=f"{doc.id}_error",
             content=f"Error processing document: {error_message}",
-            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata)
+            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata),
         )
 
     def _process_json_list(self, json_list: List, doc: Document) -> List[ProcessedChunk]:
@@ -132,20 +129,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             elif isinstance(item, list):
                 # Handle nested lists by converting to string representation
                 chunk = self._create_chunk_from_primitive(
-                    json.dumps(item),
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
+                    json.dumps(item), doc, chunk_index=i, total_chunks=total_objects
                 )
                 chunks.append(chunk)
             else:
                 # Handle primitive values
-                chunk = self._create_chunk_from_primitive(
-                    item,
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
-                )
+                chunk = self._create_chunk_from_primitive(item, doc, chunk_index=i, total_chunks=total_objects)
                 chunks.append(chunk)
 
         return chunks
@@ -159,7 +148,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return [self._create_error_chunk(doc, "Invalid JSON string")]
 
         # Filter fields based on include/exclude lists
@@ -190,31 +179,25 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 start_char=0,
                 end_char=len(field_content),
                 provided_id=doc.id,
-                content_column=self.config.content_column
+                content_column=self.config.content_column,
             )
 
             # Create and add the chunk
-            chunk = ProcessedChunk(
-                id=chunk_id,
-                content=field_content,
-                metadata=metadata
-            )
+            chunk = ProcessedChunk(id=chunk_id, content=field_content, metadata=metadata)
             chunks.append(chunk)
 
         return chunks
 
-    def _create_chunk_from_dict(
-
-
-        chunk_index: int,
-        total_chunks: int) -> ProcessedChunk:
+    def _create_chunk_from_dict(
+        self, json_dict: Dict, doc: Document, chunk_index: int, total_chunks: int
+    ) -> ProcessedChunk:
         """Create a chunk from a JSON dictionary"""
         # Ensure we're working with a dictionary
         if isinstance(json_dict, str):
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return self._create_error_chunk(doc, "Invalid JSON string")
 
         # Format the content
@@ -223,9 +206,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             filtered_dict = self._filter_fields(flattened)
             content = self._dict_to_text(filtered_dict)
         else:
-            filtered_dict = {
-
-
+            filtered_dict = {
+                k: v
+                for k, v in json_dict.items()
+                if (not self.config.include_fields or k in self.config.include_fields)
+                and k not in self.config.exclude_fields
+            }
             content = json.dumps(filtered_dict, indent=2)
 
         # Create metadata
@@ -241,22 +227,23 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
     def _filter_fields(self, flattened_dict: Dict) -> Dict:
         """Filter fields based on include/exclude configuration"""
         # If include_fields is specified, only keep those fields
         if self.config.include_fields:
-            filtered_dict = {
-
-
+            filtered_dict = {
+                k: v
+                for k, v in flattened_dict.items()
+                if any(
+                    k == field or k.startswith(field + self.config.nested_delimiter)
+                    for field in self.config.include_fields
+                )
+            }
         else:
             filtered_dict = flattened_dict.copy()
 
@@ -276,11 +263,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return filtered_dict
 
     def _create_chunk_from_primitive(
-
-        value: Any,
-        doc: Document,
-        chunk_index: int = 0,
-        total_chunks: int = 1
+        self, value: Any, doc: Document, chunk_index: int = 0, total_chunks: int = 1
     ) -> ProcessedChunk:
         """Create a chunk from a primitive value"""
         content = str(value)
@@ -300,16 +283,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
-    def _flatten_dict(self, d: Dict, delimiter: str =
+    def _flatten_dict(self, d: Dict, delimiter: str = ".", prefix: str = "") -> Dict:
         """Flatten a nested dictionary structure"""
         result = {}
         for k, v in d.items():
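The hunk above shows only the new _flatten_dict signature. A hedged sketch of what a flattener with that signature plausibly does -- the body below is an assumption, not code from the diff:

from typing import Any, Dict


def flatten_dict(d: Dict[str, Any], delimiter: str = ".", prefix: str = "") -> Dict[str, Any]:
    # Join nested keys with the delimiter; non-dict values are leaves.
    result: Dict[str, Any] = {}
    for k, v in d.items():
        key = f"{prefix}{delimiter}{k}" if prefix else k
        if isinstance(v, dict):
            result.update(flatten_dict(v, delimiter, key))
        else:
            result[key] = v
    return result


print(flatten_dict({"user": {"name": "ada", "tags": [1, 2]}, "id": 7}))
# {'user.name': 'ada', 'user.tags': [1, 2], 'id': 7}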
@@ -337,7 +316,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 # Format list of dictionaries
                 lines.append(f"{key}:")
                 for i, item in enumerate(value):
-                    lines.append(f" Item {i+1}:")
+                    lines.append(f" Item {i + 1}:")
                     for k, v in item.items():
                         lines.append(f" {k}: {v}")
             else:
@@ -362,7 +341,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             # Format list of dictionaries
             lines = [f"{key}:"]
             for i, item in enumerate(value):
-                lines.append(f" Item {i+1}:")
+                lines.append(f" Item {i + 1}:")
                 for k, v in item.items():
                     lines.append(f" {k}: {v}")
             return "\n".join(lines)
@@ -380,7 +359,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         try:
             json_dict = json.loads(json_dict)
         except json.JSONDecodeError:
-            logger.
+            logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
             return
 
         # Always flatten the dictionary for metadata extraction