MindsDB 25.9.2.0a1__py3-none-any.whl → 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +40 -29
- mindsdb/api/a2a/__init__.py +1 -1
- mindsdb/api/a2a/agent.py +16 -10
- mindsdb/api/a2a/common/server/server.py +7 -3
- mindsdb/api/a2a/common/server/task_manager.py +12 -5
- mindsdb/api/a2a/common/types.py +66 -0
- mindsdb/api/a2a/task_manager.py +65 -17
- mindsdb/api/common/middleware.py +10 -12
- mindsdb/api/executor/command_executor.py +51 -40
- mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -13
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +101 -49
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
- mindsdb/api/executor/datahub/datanodes/system_tables.py +3 -2
- mindsdb/api/executor/exceptions.py +29 -10
- mindsdb/api/executor/planner/plan_join.py +17 -3
- mindsdb/api/executor/planner/query_prepare.py +2 -20
- mindsdb/api/executor/sql_query/sql_query.py +74 -74
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
- mindsdb/api/executor/utilities/functions.py +6 -6
- mindsdb/api/executor/utilities/sql.py +37 -20
- mindsdb/api/http/gui.py +5 -11
- mindsdb/api/http/initialize.py +75 -61
- mindsdb/api/http/namespaces/agents.py +10 -15
- mindsdb/api/http/namespaces/analysis.py +13 -20
- mindsdb/api/http/namespaces/auth.py +1 -1
- mindsdb/api/http/namespaces/chatbots.py +0 -5
- mindsdb/api/http/namespaces/config.py +15 -11
- mindsdb/api/http/namespaces/databases.py +140 -201
- mindsdb/api/http/namespaces/file.py +17 -4
- mindsdb/api/http/namespaces/handlers.py +17 -7
- mindsdb/api/http/namespaces/knowledge_bases.py +28 -7
- mindsdb/api/http/namespaces/models.py +94 -126
- mindsdb/api/http/namespaces/projects.py +13 -22
- mindsdb/api/http/namespaces/sql.py +33 -25
- mindsdb/api/http/namespaces/tab.py +27 -37
- mindsdb/api/http/namespaces/views.py +1 -1
- mindsdb/api/http/start.py +16 -10
- mindsdb/api/mcp/__init__.py +2 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
- mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +8 -2
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +165 -190
- mindsdb/integrations/handlers/databricks_handler/databricks_handler.py +98 -46
- mindsdb/integrations/handlers/druid_handler/druid_handler.py +32 -40
- mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
- mindsdb/integrations/handlers/gitlab_handler/gitlab_handler.py +5 -2
- mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +438 -100
- mindsdb/integrations/handlers/mssql_handler/requirements_odbc.txt +3 -0
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +235 -3
- mindsdb/integrations/handlers/oracle_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/oracle_handler/connection_args.py +7 -1
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +321 -16
- mindsdb/integrations/handlers/oracle_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +14 -2
- mindsdb/integrations/handlers/shopify_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +80 -13
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
- mindsdb/integrations/handlers/zendesk_handler/zendesk_tables.py +144 -111
- mindsdb/integrations/libs/api_handler.py +10 -10
- mindsdb/integrations/libs/base.py +4 -4
- mindsdb/integrations/libs/llm/utils.py +2 -2
- mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
- mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
- mindsdb/integrations/libs/process_cache.py +132 -140
- mindsdb/integrations/libs/response.py +18 -12
- mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
- mindsdb/integrations/utilities/files/file_reader.py +6 -7
- mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/__init__.py +1 -0
- mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/snowflake_jwt_gen.py +151 -0
- mindsdb/integrations/utilities/rag/config_loader.py +37 -26
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +83 -30
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
- mindsdb/integrations/utilities/rag/settings.py +58 -133
- mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
- mindsdb/interfaces/agents/agents_controller.py +2 -3
- mindsdb/interfaces/agents/constants.py +0 -2
- mindsdb/interfaces/agents/litellm_server.py +34 -58
- mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
- mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
- mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
- mindsdb/interfaces/chatbot/polling.py +30 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +16 -17
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +15 -4
- mindsdb/interfaces/database/data_handlers_cache.py +190 -0
- mindsdb/interfaces/database/database.py +3 -3
- mindsdb/interfaces/database/integrations.py +7 -110
- mindsdb/interfaces/database/projects.py +2 -6
- mindsdb/interfaces/database/views.py +1 -4
- mindsdb/interfaces/file/file_controller.py +6 -6
- mindsdb/interfaces/functions/controller.py +1 -1
- mindsdb/interfaces/functions/to_markdown.py +2 -2
- mindsdb/interfaces/jobs/jobs_controller.py +5 -9
- mindsdb/interfaces/jobs/scheduler.py +3 -9
- mindsdb/interfaces/knowledge_base/controller.py +244 -128
- mindsdb/interfaces/knowledge_base/evaluate.py +36 -41
- mindsdb/interfaces/knowledge_base/executor.py +11 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +51 -17
- mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
- mindsdb/interfaces/model/model_controller.py +172 -168
- mindsdb/interfaces/query_context/context_controller.py +14 -2
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +10 -14
- mindsdb/interfaces/skills/retrieval_tool.py +43 -50
- mindsdb/interfaces/skills/skill_tool.py +2 -2
- mindsdb/interfaces/skills/skills_controller.py +1 -4
- mindsdb/interfaces/skills/sql_agent.py +25 -19
- mindsdb/interfaces/storage/db.py +16 -6
- mindsdb/interfaces/storage/fs.py +114 -169
- mindsdb/interfaces/storage/json.py +19 -18
- mindsdb/interfaces/tabs/tabs_controller.py +49 -72
- mindsdb/interfaces/tasks/task_monitor.py +3 -9
- mindsdb/interfaces/tasks/task_thread.py +7 -9
- mindsdb/interfaces/triggers/trigger_task.py +7 -13
- mindsdb/interfaces/triggers/triggers_controller.py +47 -52
- mindsdb/migrations/migrate.py +16 -16
- mindsdb/utilities/api_status.py +58 -0
- mindsdb/utilities/config.py +68 -2
- mindsdb/utilities/exception.py +40 -1
- mindsdb/utilities/fs.py +0 -1
- mindsdb/utilities/hooks/profiling.py +17 -14
- mindsdb/utilities/json_encoder.py +24 -10
- mindsdb/utilities/langfuse.py +40 -45
- mindsdb/utilities/log.py +272 -0
- mindsdb/utilities/ml_task_queue/consumer.py +52 -58
- mindsdb/utilities/ml_task_queue/producer.py +26 -30
- mindsdb/utilities/render/sqlalchemy_render.py +22 -20
- mindsdb/utilities/starters.py +0 -10
- mindsdb/utilities/utils.py +2 -2
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0.dist-info}/METADATA +286 -267
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0.dist-info}/RECORD +145 -159
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
- mindsdb/api/postgres/__init__.py +0 -0
- mindsdb/api/postgres/postgres_proxy/__init__.py +0 -0
- mindsdb/api/postgres/postgres_proxy/executor/__init__.py +0 -1
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +0 -189
- mindsdb/api/postgres/postgres_proxy/postgres_packets/__init__.py +0 -0
- mindsdb/api/postgres/postgres_proxy/postgres_packets/errors.py +0 -322
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_fields.py +0 -34
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message.py +0 -31
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +0 -1265
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_identifiers.py +0 -31
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +0 -253
- mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +0 -477
- mindsdb/api/postgres/postgres_proxy/utilities/__init__.py +0 -10
- mindsdb/api/postgres/start.py +0 -11
- mindsdb/integrations/handlers/mssql_handler/tests/__init__.py +0 -0
- mindsdb/integrations/handlers/mssql_handler/tests/test_mssql_handler.py +0 -169
- mindsdb/integrations/handlers/oracle_handler/tests/__init__.py +0 -0
- mindsdb/integrations/handlers/oracle_handler/tests/test_oracle_handler.py +0 -32
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -2,6 +2,7 @@ import json
 import math
 import re
 import time
+import copy
 from typing import List
 
 import pandas as pd
@@ -10,6 +11,7 @@ import datetime as dt
 from mindsdb.api.executor.sql_query.result_set import ResultSet
 from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log
+from mindsdb.utilities.config import config
 
 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
 
@@ -105,7 +107,12 @@ class EvaluateBase:
         if llm_params is None:
             llm_params = self.kb._kb.params.get("reranking_model")
 
-
+        params = copy.deepcopy(config.get("default_llm", {}))
+
+        if llm_params:
+            params.update(llm_params)
+
+        self.llm_client = LLMClient(params)
 
     def generate_test_data(self, gen_params: dict) -> pd.DataFrame:
         # Extract source data (from users query or from KB itself) and call `generate` to get test data
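The rewritten constructor above now seeds LLM parameters from the server-wide `default_llm` config and layers any per-knowledge-base overrides on top. A minimal sketch of that merge, with an invented `default_llm` dict standing in for `config.get("default_llm", {})`:

```python
import copy
from typing import Optional

# Stand-in for config.get("default_llm", {}); values are illustrative only.
default_llm = {"provider": "openai", "model_name": "gpt-4o", "temperature": 0}


def build_llm_params(llm_params: Optional[dict]) -> dict:
    # deepcopy so per-call overrides never mutate the shared defaults
    params = copy.deepcopy(default_llm)
    if llm_params:
        params.update(llm_params)
    return params


print(build_llm_params({"model_name": "gpt-4o-mini"}))
# {'provider': 'openai', 'model_name': 'gpt-4o-mini', 'temperature': 0}
```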
@@ -241,6 +248,26 @@ class EvaluateBase:
 
         return cls(session, kb_table).run_evaluate(params)
 
+    def generate_question_answer(self, text: str) -> (str, str):
+        messages = [
+            {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
+            {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
+        ]
+        answer = self.llm_client.completion(messages, json_output=True)[0]
+
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
+
+        try:
+            output = json.loads(sanitized_answer)
+        except json.JSONDecodeError:
+            raise ValueError(f"Could not parse response from LLM: {answer}")
+
+        if "query" not in output or "reference_answer" not in output:
+            raise ValueError("Cant find question/answer in LLM response")
+
+        return output.get("query"), output.get("reference_answer")
+
 
 class EvaluateRerank(EvaluateBase):
     """
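The consolidated `generate_question_answer` relies on `sanitize_json_response` to strip markdown fences before parsing; that helper's body is not part of this diff. A plausible minimal version, offered only as a sketch of the sanitizing step:

```python
import json
import re


def sanitize_json_response(answer: str) -> str:
    """Hypothetical sketch: drop a ```json ... ``` fence an LLM may add around JSON."""
    text = answer.strip()
    text = re.sub(r"^```[a-zA-Z]*\s*", "", text)  # opening fence, e.g. ```json
    text = re.sub(r"\s*```$", "", text)  # closing fence
    return text


raw = '```json\n{"query": "What is X?", "reference_answer": "Y."}\n```'
output = json.loads(sanitize_json_response(raw))
assert {"query", "reference_answer"} <= output.keys()
```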
@@ -268,28 +295,12 @@ class EvaluateRerank(EvaluateBase):
         df["id"] = df.index
         return df
 
-    def generate_question_answer(self, text: str) -> (str, str):
-        messages = [
-            {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
-            {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
-        ]
-        answer = self.llm_client.completion(messages, json_output=True)
-
-        # Sanitize the response by removing markdown code block formatting like ```json
-        sanitized_answer = sanitize_json_response(answer)
-
-        try:
-            output = json.loads(sanitized_answer)
-        except json.JSONDecodeError:
-            raise ValueError(f"Could not parse response from LLM: {answer}")
-
-        if "query" not in output or "reference_answer" not in output:
-            raise ValueError("Cant find question/answer in LLM response")
-
-        return output.get("query"), output.get("reference_answer")
-
     def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
         json_to_log_list = []
+        if {"question", "answer"} - set(test_data.columns):
+            raise KeyError(
+                f'Test data must contain "question" and "answer" columns. Columns in the provided test data: {list(test_data.columns)}'
+            )
         questions = test_data.to_dict("records")
 
         for i, item in enumerate(questions):
@@ -483,28 +494,12 @@ class EvaluateDocID(EvaluateBase):
         df = pd.DataFrame(qa_data)
         return df
 
-    def generate_question_answer(self, text: str) -> (str, str):
-        messages = [
-            {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
-            {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
-        ]
-        answer = self.llm_client.completion(messages, json_output=True)
-
-        # Sanitize the response by removing markdown code block formatting like ```json
-        sanitized_answer = sanitize_json_response(answer)
-
-        try:
-            output = json.loads(sanitized_answer)
-        except json.JSONDecodeError:
-            raise ValueError(f"Could not parse response from LLM: {answer}")
-
-        if "query" not in output or "reference_answer" not in output:
-            raise ValueError("Cant find question/answer in LLM response")
-
-        return output.get("query"), output.get("reference_answer")
-
     def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
         stats = []
+        if {"question", "doc_id"} - set(test_data.columns):
+            raise KeyError(
+                f'Test data must contain "question" and "doc_id" columns. Columns in the provided test data: {list(test_data.columns)}'
+            )
         questions = test_data.to_dict("records")
 
         for i, item in enumerate(questions):
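Both `evaluate` overrides now validate their input with the same set-difference idiom: subtracting the frame's columns from the required set yields exactly the missing names, and a non-empty set is truthy. A self-contained illustration with invented data:

```python
import pandas as pd

test_data = pd.DataFrame({"question": ["q1"], "answer_text": ["a1"]})

missing = {"question", "answer"} - set(test_data.columns)
if missing:
    # truthy only when at least one required column is absent
    raise KeyError(f"Test data is missing required columns: {sorted(missing)}")
```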
mindsdb/interfaces/knowledge_base/executor.py

@@ -43,7 +43,18 @@ class KnowledgeBaseQueryExecutor:
         if isinstance(node, BinaryOperation):
             if isinstance(node.args[0], Identifier):
                 parts = node.args[0].parts
+
+                # map chunk_content to content
+                if parts[0].lower() == "chunk_content":
+                    parts[0] = self.content_column
+
                 if len(parts) == 1 and parts[0].lower() == self.content_column:
+                    if "LIKE" in node.op.upper():
+                        # remove '%'
+                        arg = node.args[1]
+                        if isinstance(arg, Constant) and isinstance(arg.value, str):
+                            arg.value = arg.value.strip(" %")
+
                     return True
         return False
 
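This hunk maps the user-facing `chunk_content` column onto the configured content column and, for LIKE comparisons against it, strips wildcard padding so the pattern can be treated as a plain semantic-search query. A standalone sketch of the wildcard-stripping step, using plain strings instead of mindsdb_sql_parser nodes:

```python
def normalize_like_pattern(op: str, value: str) -> str:
    # content LIKE '%neural networks%' -> search for "neural networks"
    if "LIKE" in op.upper():
        return value.strip(" %")
    return value


assert normalize_like_pattern("like", "%neural networks%") == "neural networks"
assert normalize_like_pattern("=", "exact match") == "exact match"
```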
mindsdb/interfaces/knowledge_base/llm_client.py

@@ -1,11 +1,23 @@
-import copy
 import os
 from typing import List
 
 from openai import OpenAI, AzureOpenAI
 
 from mindsdb.integrations.utilities.handler_utils import get_api_key
-
+
+try:
+    from mindsdb.integrations.handlers.openai_handler.helpers import retry_with_exponential_backoff
+except ImportError:
+
+    def retry_with_exponential_backoff(func):
+        """
+        An empty decorator
+        """
+
+        def wrapper(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        return wrapper
 
 
 class LLMClient:
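The guarded import above degrades to a pass-through decorator when the OpenAI handler is unavailable. Note that the helper is later applied as `@retry_with_exponential_backoff()`, i.e. as a decorator factory; a generic sketch of that import-fallback pattern (written by hand here, not MindsDB's helper, with a placeholder module name) would be:

```python
try:
    from some_optional_package import retry_with_exponential_backoff  # placeholder import
except ImportError:

    def retry_with_exponential_backoff(*_factory_args, **_factory_kwargs):
        """Fallback: a no-op decorator factory with the same call shape."""

        def decorator(func):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)

            return wrapper

        return decorator


@retry_with_exponential_backoff()
def flaky_call() -> str:
    return "ok"


print(flaky_call())  # "ok"; retries happen only when the real helper imports
```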
@@ -14,12 +26,8 @@ class LLMClient:
     It chooses openai client or litellm handler depending on the config
     """
 
-    def __init__(self,
-
-
-        if llm_params:
-            params.update(llm_params)
-
+    def __init__(self, params: dict = None, session=None):
+        self._session = session
         self.params = params
 
         self.provider = params.get("provider", "openai")
@@ -27,11 +35,13 @@ class LLMClient:
         if "api_key" not in params:
             params["api_key"] = get_api_key(self.provider, params, strict=False)
 
+        self.engine = "openai"
+
         if self.provider == "azure_openai":
             azure_api_key = params.get("api_key") or os.getenv("AZURE_OPENAI_API_KEY")
             azure_api_endpoint = params.get("base_url") or os.environ.get("AZURE_OPENAI_ENDPOINT")
             azure_api_version = params.get("api_version") or os.environ.get("AZURE_OPENAI_API_VERSION")
-            self.
+            self.client = AzureOpenAI(
                 api_key=azure_api_key, azure_endpoint=azure_api_endpoint, api_version=azure_api_version, max_retries=2
             )
         elif self.provider == "openai":
@@ -41,34 +51,58 @@ class LLMClient:
             if base_url:
                 kwargs["base_url"] = base_url
             self.client = OpenAI(**kwargs)
-
+        elif self.provider == "ollama":
+            kwargs = params.copy()
+            kwargs.pop("model_name")
+            kwargs.pop("provider", None)
+            if kwargs["api_key"] is None:
+                kwargs["api_key"] = "n/a"
+            self.client = OpenAI(**kwargs)
         else:
             # try to use litellm
-
+            if self._session is None:
+                from mindsdb.api.executor.controllers.session_controller import SessionController
 
-
-            module =
+                self._session = SessionController()
+            module = self._session.integration_controller.get_handler_module("litellm")
 
             if module is None or module.Handler is None:
                 raise ValueError(f'Unable to use "{self.provider}" provider. Litellm handler is not installed')
 
             self.client = module.Handler
+            self.engine = "litellm"
+
+    @retry_with_exponential_backoff()
+    def embeddings(self, messages: List[str]):
+        params = self.params
+        if self.engine == "openai":
+            response = self.client.embeddings.create(
+                model=params["model_name"],
+                input=messages,
+            )
+            return [item.embedding for item in response.data]
+        else:
+            kwargs = params.copy()
+            model = kwargs.pop("model_name")
+            kwargs.pop("provider", None)
+
+            return self.client.embeddings(self.provider, model=model, messages=messages, args=kwargs)
 
-    def completion(self, messages: List[dict], json_output: bool = False) -> str:
+    def completion(self, messages: List[dict], json_output: bool = False) -> List[str]:
         """
         Call LLM completion and get response
         """
         params = self.params
         params["json_output"] = json_output
-        if self.
+        if self.engine == "openai":
             response = self.client.chat.completions.create(
                 model=params["model_name"],
                 messages=messages,
             )
-            return
+            return [item.message.content for item in response.choices]
         else:
             kwargs = params.copy()
             model = kwargs.pop("model_name")
             kwargs.pop("provider", None)
             response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
-            return
+            return [item.message.content for item in response.choices]
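Taken together, these hunks give `LLMClient` an `engine` attribute that routes both `completion` and the new `embeddings` method to the OpenAI SDK or the litellm handler, and both methods now return lists. A hedged usage sketch based only on the signatures visible in this diff (model name, prompt, and a configured API key are assumed):

```python
from mindsdb.interfaces.knowledge_base.llm_client import LLMClient

# provider selects the engine; model_name selects the model (values invented)
client = LLMClient({"provider": "openai", "model_name": "gpt-4o-mini"})

# completion() now returns List[str], one entry per choice
answer = client.completion([{"role": "user", "content": "Say hi"}])[0]

# embeddings() is new in this release: one vector per input string
vectors = client.embeddings(["hello", "world"])
```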
mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py

@@ -1,13 +1,10 @@
-
+import ast
 import json
+from typing import List, Dict, Any, Optional
+
 import pandas as pd
-import ast
 
-from mindsdb.interfaces.knowledge_base.preprocessing.models import
-    Document,
-    ProcessedChunk,
-    JSONChunkingConfig
-)
+from mindsdb.interfaces.knowledge_base.preprocessing.models import Document, ProcessedChunk, JSONChunkingConfig
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import DocumentPreprocessor
 from mindsdb.utilities import log
 
@@ -50,7 +47,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 chunks = self._process_json_data(json_data, doc)
                 all_chunks.extend(chunks)
             except Exception as e:
-                logger.
+                logger.exception(f"Error processing document {doc.id}:")
                 error_chunk = self._create_error_chunk(doc, str(e))
                 all_chunks.append(error_chunk)
 
@@ -76,8 +73,8 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         # If JSON parsing fails, try as Python literal
         try:
             return ast.literal_eval(doc.content)
-        except (SyntaxError, ValueError)
-            logger.
+        except (SyntaxError, ValueError):
+            logger.exception(f"Error parsing content for document {doc.id}:")
             # We'll create the error chunk in the main process_documents method
             return None
 
@@ -117,7 +114,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return ProcessedChunk(
             id=f"{doc.id}_error",
             content=f"Error processing document: {error_message}",
-            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata)
+            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata),
         )
 
     def _process_json_list(self, json_list: List, doc: Document) -> List[ProcessedChunk]:
@@ -132,20 +129,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             elif isinstance(item, list):
                 # Handle nested lists by converting to string representation
                 chunk = self._create_chunk_from_primitive(
-                    json.dumps(item),
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
+                    json.dumps(item), doc, chunk_index=i, total_chunks=total_objects
                 )
                 chunks.append(chunk)
             else:
                 # Handle primitive values
-                chunk = self._create_chunk_from_primitive(
-                    item,
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
-                )
+                chunk = self._create_chunk_from_primitive(item, doc, chunk_index=i, total_chunks=total_objects)
                 chunks.append(chunk)
 
         return chunks
@@ -159,7 +148,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         try:
             json_dict = json.loads(json_dict)
         except json.JSONDecodeError:
-            logger.
+            logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
             return [self._create_error_chunk(doc, "Invalid JSON string")]
 
         # Filter fields based on include/exclude lists
@@ -190,31 +179,25 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 start_char=0,
                 end_char=len(field_content),
                 provided_id=doc.id,
-                content_column=self.config.content_column
+                content_column=self.config.content_column,
             )
 
             # Create and add the chunk
-            chunk = ProcessedChunk(
-                id=chunk_id,
-                content=field_content,
-                metadata=metadata
-            )
+            chunk = ProcessedChunk(id=chunk_id, content=field_content, metadata=metadata)
             chunks.append(chunk)
 
         return chunks
 
-    def _create_chunk_from_dict(
-
-
-        chunk_index: int,
-        total_chunks: int) -> ProcessedChunk:
+    def _create_chunk_from_dict(
+        self, json_dict: Dict, doc: Document, chunk_index: int, total_chunks: int
+    ) -> ProcessedChunk:
         """Create a chunk from a JSON dictionary"""
         # Ensure we're working with a dictionary
         if isinstance(json_dict, str):
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return self._create_error_chunk(doc, "Invalid JSON string")
 
         # Format the content
@@ -223,9 +206,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             filtered_dict = self._filter_fields(flattened)
             content = self._dict_to_text(filtered_dict)
         else:
-            filtered_dict = {
-
-
+            filtered_dict = {
+                k: v
+                for k, v in json_dict.items()
+                if (not self.config.include_fields or k in self.config.include_fields)
+                and k not in self.config.exclude_fields
+            }
             content = json.dumps(filtered_dict, indent=2)
 
         # Create metadata
@@ -241,22 +227,23 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
     def _filter_fields(self, flattened_dict: Dict) -> Dict:
         """Filter fields based on include/exclude configuration"""
         # If include_fields is specified, only keep those fields
         if self.config.include_fields:
-            filtered_dict = {
-
-
+            filtered_dict = {
+                k: v
+                for k, v in flattened_dict.items()
+                if any(
+                    k == field or k.startswith(field + self.config.nested_delimiter)
+                    for field in self.config.include_fields
+                )
+            }
         else:
             filtered_dict = flattened_dict.copy()
 
@@ -276,11 +263,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return filtered_dict
 
     def _create_chunk_from_primitive(
-
-        value: Any,
-        doc: Document,
-        chunk_index: int = 0,
-        total_chunks: int = 1
+        self, value: Any, doc: Document, chunk_index: int = 0, total_chunks: int = 1
     ) -> ProcessedChunk:
         """Create a chunk from a primitive value"""
         content = str(value)
@@ -300,16 +283,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
-    def _flatten_dict(self, d: Dict, delimiter: str =
+    def _flatten_dict(self, d: Dict, delimiter: str = ".", prefix: str = "") -> Dict:
         """Flatten a nested dictionary structure"""
         result = {}
         for k, v in d.items():
@@ -337,7 +316,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 # Format list of dictionaries
                 lines.append(f"{key}:")
                 for i, item in enumerate(value):
-                    lines.append(f"  Item {i+1}:")
+                    lines.append(f"  Item {i + 1}:")
                     for k, v in item.items():
                         lines.append(f"    {k}: {v}")
             else:
@@ -362,7 +341,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             # Format list of dictionaries
             lines = [f"{key}:"]
             for i, item in enumerate(value):
-                lines.append(f"  Item {i+1}:")
+                lines.append(f"  Item {i + 1}:")
                 for k, v in item.items():
                     lines.append(f"    {k}: {v}")
             return "\n".join(lines)
@@ -380,7 +359,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         try:
             json_dict = json.loads(json_dict)
         except json.JSONDecodeError:
-            logger.
+            logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
             return
 
         # Always flatten the dictionary for metadata extraction