MindsDB 25.9.2.0a1__py3-none-any.whl → 25.10.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (163)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +40 -29
  3. mindsdb/api/a2a/__init__.py +1 -1
  4. mindsdb/api/a2a/agent.py +16 -10
  5. mindsdb/api/a2a/common/server/server.py +7 -3
  6. mindsdb/api/a2a/common/server/task_manager.py +12 -5
  7. mindsdb/api/a2a/common/types.py +66 -0
  8. mindsdb/api/a2a/task_manager.py +65 -17
  9. mindsdb/api/common/middleware.py +10 -12
  10. mindsdb/api/executor/command_executor.py +51 -40
  11. mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
  12. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -13
  13. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +101 -49
  14. mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
  15. mindsdb/api/executor/datahub/datanodes/system_tables.py +3 -2
  16. mindsdb/api/executor/exceptions.py +29 -10
  17. mindsdb/api/executor/planner/plan_join.py +17 -3
  18. mindsdb/api/executor/planner/query_prepare.py +2 -20
  19. mindsdb/api/executor/sql_query/sql_query.py +74 -74
  20. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
  21. mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
  22. mindsdb/api/executor/utilities/functions.py +6 -6
  23. mindsdb/api/executor/utilities/sql.py +37 -20
  24. mindsdb/api/http/gui.py +5 -11
  25. mindsdb/api/http/initialize.py +75 -61
  26. mindsdb/api/http/namespaces/agents.py +10 -15
  27. mindsdb/api/http/namespaces/analysis.py +13 -20
  28. mindsdb/api/http/namespaces/auth.py +1 -1
  29. mindsdb/api/http/namespaces/chatbots.py +0 -5
  30. mindsdb/api/http/namespaces/config.py +15 -11
  31. mindsdb/api/http/namespaces/databases.py +140 -201
  32. mindsdb/api/http/namespaces/file.py +17 -4
  33. mindsdb/api/http/namespaces/handlers.py +17 -7
  34. mindsdb/api/http/namespaces/knowledge_bases.py +28 -7
  35. mindsdb/api/http/namespaces/models.py +94 -126
  36. mindsdb/api/http/namespaces/projects.py +13 -22
  37. mindsdb/api/http/namespaces/sql.py +33 -25
  38. mindsdb/api/http/namespaces/tab.py +27 -37
  39. mindsdb/api/http/namespaces/views.py +1 -1
  40. mindsdb/api/http/start.py +16 -10
  41. mindsdb/api/mcp/__init__.py +2 -1
  42. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
  43. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
  44. mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
  45. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +8 -2
  46. mindsdb/integrations/handlers/byom_handler/byom_handler.py +165 -190
  47. mindsdb/integrations/handlers/databricks_handler/databricks_handler.py +98 -46
  48. mindsdb/integrations/handlers/druid_handler/druid_handler.py +32 -40
  49. mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
  50. mindsdb/integrations/handlers/gitlab_handler/gitlab_handler.py +5 -2
  51. mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
  52. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +438 -100
  53. mindsdb/integrations/handlers/mssql_handler/requirements_odbc.txt +3 -0
  54. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +235 -3
  55. mindsdb/integrations/handlers/oracle_handler/__init__.py +2 -0
  56. mindsdb/integrations/handlers/oracle_handler/connection_args.py +7 -1
  57. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +321 -16
  58. mindsdb/integrations/handlers/oracle_handler/requirements.txt +1 -1
  59. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +14 -2
  60. mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
  61. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
  62. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  63. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  64. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
  65. mindsdb/integrations/handlers/zendesk_handler/zendesk_tables.py +144 -111
  66. mindsdb/integrations/libs/api_handler.py +10 -10
  67. mindsdb/integrations/libs/base.py +4 -4
  68. mindsdb/integrations/libs/llm/utils.py +2 -2
  69. mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
  70. mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
  71. mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
  72. mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
  73. mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
  74. mindsdb/integrations/libs/process_cache.py +132 -140
  75. mindsdb/integrations/libs/response.py +18 -12
  76. mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
  77. mindsdb/integrations/utilities/files/file_reader.py +6 -7
  78. mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/__init__.py +1 -0
  79. mindsdb/integrations/utilities/handlers/auth_utilities/snowflake/snowflake_jwt_gen.py +151 -0
  80. mindsdb/integrations/utilities/rag/config_loader.py +37 -26
  81. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +83 -30
  82. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
  83. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
  84. mindsdb/integrations/utilities/rag/settings.py +58 -133
  85. mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
  86. mindsdb/interfaces/agents/agents_controller.py +2 -3
  87. mindsdb/interfaces/agents/constants.py +0 -2
  88. mindsdb/interfaces/agents/litellm_server.py +34 -58
  89. mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
  90. mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
  91. mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
  92. mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
  93. mindsdb/interfaces/chatbot/polling.py +30 -18
  94. mindsdb/interfaces/data_catalog/data_catalog_loader.py +16 -17
  95. mindsdb/interfaces/data_catalog/data_catalog_reader.py +15 -4
  96. mindsdb/interfaces/database/data_handlers_cache.py +190 -0
  97. mindsdb/interfaces/database/database.py +3 -3
  98. mindsdb/interfaces/database/integrations.py +7 -110
  99. mindsdb/interfaces/database/projects.py +2 -6
  100. mindsdb/interfaces/database/views.py +1 -4
  101. mindsdb/interfaces/file/file_controller.py +6 -6
  102. mindsdb/interfaces/functions/controller.py +1 -1
  103. mindsdb/interfaces/functions/to_markdown.py +2 -2
  104. mindsdb/interfaces/jobs/jobs_controller.py +5 -9
  105. mindsdb/interfaces/jobs/scheduler.py +3 -9
  106. mindsdb/interfaces/knowledge_base/controller.py +244 -128
  107. mindsdb/interfaces/knowledge_base/evaluate.py +36 -41
  108. mindsdb/interfaces/knowledge_base/executor.py +11 -0
  109. mindsdb/interfaces/knowledge_base/llm_client.py +51 -17
  110. mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
  111. mindsdb/interfaces/model/model_controller.py +172 -168
  112. mindsdb/interfaces/query_context/context_controller.py +14 -2
  113. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +10 -14
  114. mindsdb/interfaces/skills/retrieval_tool.py +43 -50
  115. mindsdb/interfaces/skills/skill_tool.py +2 -2
  116. mindsdb/interfaces/skills/skills_controller.py +1 -4
  117. mindsdb/interfaces/skills/sql_agent.py +25 -19
  118. mindsdb/interfaces/storage/db.py +16 -6
  119. mindsdb/interfaces/storage/fs.py +114 -169
  120. mindsdb/interfaces/storage/json.py +19 -18
  121. mindsdb/interfaces/tabs/tabs_controller.py +49 -72
  122. mindsdb/interfaces/tasks/task_monitor.py +3 -9
  123. mindsdb/interfaces/tasks/task_thread.py +7 -9
  124. mindsdb/interfaces/triggers/trigger_task.py +7 -13
  125. mindsdb/interfaces/triggers/triggers_controller.py +47 -52
  126. mindsdb/migrations/migrate.py +16 -16
  127. mindsdb/utilities/api_status.py +58 -0
  128. mindsdb/utilities/config.py +68 -2
  129. mindsdb/utilities/exception.py +40 -1
  130. mindsdb/utilities/fs.py +0 -1
  131. mindsdb/utilities/hooks/profiling.py +17 -14
  132. mindsdb/utilities/json_encoder.py +24 -10
  133. mindsdb/utilities/langfuse.py +40 -45
  134. mindsdb/utilities/log.py +272 -0
  135. mindsdb/utilities/ml_task_queue/consumer.py +52 -58
  136. mindsdb/utilities/ml_task_queue/producer.py +26 -30
  137. mindsdb/utilities/render/sqlalchemy_render.py +22 -20
  138. mindsdb/utilities/starters.py +0 -10
  139. mindsdb/utilities/utils.py +2 -2
  140. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/METADATA +293 -276
  141. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/RECORD +144 -158
  142. mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
  143. mindsdb/api/postgres/__init__.py +0 -0
  144. mindsdb/api/postgres/postgres_proxy/__init__.py +0 -0
  145. mindsdb/api/postgres/postgres_proxy/executor/__init__.py +0 -1
  146. mindsdb/api/postgres/postgres_proxy/executor/executor.py +0 -189
  147. mindsdb/api/postgres/postgres_proxy/postgres_packets/__init__.py +0 -0
  148. mindsdb/api/postgres/postgres_proxy/postgres_packets/errors.py +0 -322
  149. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_fields.py +0 -34
  150. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message.py +0 -31
  151. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_formats.py +0 -1265
  152. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_message_identifiers.py +0 -31
  153. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +0 -253
  154. mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +0 -477
  155. mindsdb/api/postgres/postgres_proxy/utilities/__init__.py +0 -10
  156. mindsdb/api/postgres/start.py +0 -11
  157. mindsdb/integrations/handlers/mssql_handler/tests/__init__.py +0 -0
  158. mindsdb/integrations/handlers/mssql_handler/tests/test_mssql_handler.py +0 -169
  159. mindsdb/integrations/handlers/oracle_handler/tests/__init__.py +0 -0
  160. mindsdb/integrations/handlers/oracle_handler/tests/test_oracle_handler.py +0 -32
  161. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/WHEEL +0 -0
  162. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/licenses/LICENSE +0 -0
  163. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.10.0rc1.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@ import json
2
2
  import math
3
3
  import re
4
4
  import time
5
+ import copy
5
6
  from typing import List
6
7
 
7
8
  import pandas as pd
@@ -10,6 +11,7 @@ import datetime as dt
10
11
  from mindsdb.api.executor.sql_query.result_set import ResultSet
11
12
  from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
12
13
  from mindsdb.utilities import log
14
+ from mindsdb.utilities.config import config
13
15
 
14
16
  from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
15
17
 
@@ -105,7 +107,12 @@ class EvaluateBase:
105
107
  if llm_params is None:
106
108
  llm_params = self.kb._kb.params.get("reranking_model")
107
109
 
108
- self.llm_client = LLMClient(llm_params)
110
+ params = copy.deepcopy(config.get("default_llm", {}))
111
+
112
+ if llm_params:
113
+ params.update(llm_params)
114
+
115
+ self.llm_client = LLMClient(params)
109
116
 
110
117
  def generate_test_data(self, gen_params: dict) -> pd.DataFrame:
111
118
  # Extract source data (from users query or from KB itself) and call `generate` to get test data
@@ -241,6 +248,26 @@ class EvaluateBase:
241
248
 
242
249
  return cls(session, kb_table).run_evaluate(params)
243
250
 
251
+ def generate_question_answer(self, text: str) -> (str, str):
252
+ messages = [
253
+ {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
254
+ {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
255
+ ]
256
+ answer = self.llm_client.completion(messages, json_output=True)[0]
257
+
258
+ # Sanitize the response by removing markdown code block formatting like ```json
259
+ sanitized_answer = sanitize_json_response(answer)
260
+
261
+ try:
262
+ output = json.loads(sanitized_answer)
263
+ except json.JSONDecodeError:
264
+ raise ValueError(f"Could not parse response from LLM: {answer}")
265
+
266
+ if "query" not in output or "reference_answer" not in output:
267
+ raise ValueError("Cant find question/answer in LLM response")
268
+
269
+ return output.get("query"), output.get("reference_answer")
270
+
244
271
 
245
272
  class EvaluateRerank(EvaluateBase):
246
273
  """
@@ -268,28 +295,12 @@ class EvaluateRerank(EvaluateBase):
268
295
  df["id"] = df.index
269
296
  return df
270
297
 
271
- def generate_question_answer(self, text: str) -> (str, str):
272
- messages = [
273
- {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
274
- {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
275
- ]
276
- answer = self.llm_client.completion(messages, json_output=True)
277
-
278
- # Sanitize the response by removing markdown code block formatting like ```json
279
- sanitized_answer = sanitize_json_response(answer)
280
-
281
- try:
282
- output = json.loads(sanitized_answer)
283
- except json.JSONDecodeError:
284
- raise ValueError(f"Could not parse response from LLM: {answer}")
285
-
286
- if "query" not in output or "reference_answer" not in output:
287
- raise ValueError("Cant find question/answer in LLM response")
288
-
289
- return output.get("query"), output.get("reference_answer")
290
-
291
298
  def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
292
299
  json_to_log_list = []
300
+ if {"question", "answer"} - set(test_data.columns):
301
+ raise KeyError(
302
+ f'Test data must contain "question" and "answer" columns. Columns in the provided test data: {list(test_data.columns)}'
303
+ )
293
304
  questions = test_data.to_dict("records")
294
305
 
295
306
  for i, item in enumerate(questions):
@@ -483,28 +494,12 @@ class EvaluateDocID(EvaluateBase):
483
494
  df = pd.DataFrame(qa_data)
484
495
  return df
485
496
 
486
- def generate_question_answer(self, text: str) -> (str, str):
487
- messages = [
488
- {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
489
- {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
490
- ]
491
- answer = self.llm_client.completion(messages, json_output=True)
492
-
493
- # Sanitize the response by removing markdown code block formatting like ```json
494
- sanitized_answer = sanitize_json_response(answer)
495
-
496
- try:
497
- output = json.loads(sanitized_answer)
498
- except json.JSONDecodeError:
499
- raise ValueError(f"Could not parse response from LLM: {answer}")
500
-
501
- if "query" not in output or "reference_answer" not in output:
502
- raise ValueError("Cant find question/answer in LLM response")
503
-
504
- return output.get("query"), output.get("reference_answer")
505
-
506
497
  def evaluate(self, test_data: pd.DataFrame) -> pd.DataFrame:
507
498
  stats = []
499
+ if {"question", "doc_id"} - set(test_data.columns):
500
+ raise KeyError(
501
+ f'Test data must contain "question" and "doc_id" columns. Columns in the provided test data: {list(test_data.columns)}'
502
+ )
508
503
  questions = test_data.to_dict("records")
509
504
 
510
505
  for i, item in enumerate(questions):
@@ -43,7 +43,18 @@ class KnowledgeBaseQueryExecutor:
43
43
  if isinstance(node, BinaryOperation):
44
44
  if isinstance(node.args[0], Identifier):
45
45
  parts = node.args[0].parts
46
+
47
+ # map chunk_content to content
48
+ if parts[0].lower() == "chunk_content":
49
+ parts[0] = self.content_column
50
+
46
51
  if len(parts) == 1 and parts[0].lower() == self.content_column:
52
+ if "LIKE" in node.op.upper():
53
+ # remove '%'
54
+ arg = node.args[1]
55
+ if isinstance(arg, Constant) and isinstance(arg.value, str):
56
+ arg.value = arg.value.strip(" %")
57
+
47
58
  return True
48
59
  return False
49
60
 
@@ -1,11 +1,23 @@
1
- import copy
2
1
  import os
3
2
  from typing import List
4
3
 
5
4
  from openai import OpenAI, AzureOpenAI
6
5
 
7
6
  from mindsdb.integrations.utilities.handler_utils import get_api_key
8
- from mindsdb.utilities.config import config
7
+
8
+ try:
9
+ from mindsdb.integrations.handlers.openai_handler.helpers import retry_with_exponential_backoff
10
+ except ImportError:
11
+
12
+ def retry_with_exponential_backoff(func):
13
+ """
14
+ An empty decorator
15
+ """
16
+
17
+ def wrapper(*args, **kwargs):
18
+ return func(*args, **kwargs)
19
+
20
+ return wrapper
9
21
 
10
22
 
11
23
  class LLMClient:
@@ -14,12 +26,8 @@ class LLMClient:
14
26
  It chooses openai client or litellm handler depending on the config
15
27
  """
16
28
 
17
- def __init__(self, llm_params: dict = None):
18
- params = copy.deepcopy(config.get("default_llm", {}))
19
-
20
- if llm_params:
21
- params.update(llm_params)
22
-
29
+ def __init__(self, params: dict = None, session=None):
30
+ self._session = session
23
31
  self.params = params
24
32
 
25
33
  self.provider = params.get("provider", "openai")
@@ -27,11 +35,13 @@ class LLMClient:
27
35
  if "api_key" not in params:
28
36
  params["api_key"] = get_api_key(self.provider, params, strict=False)
29
37
 
38
+ self.engine = "openai"
39
+
30
40
  if self.provider == "azure_openai":
31
41
  azure_api_key = params.get("api_key") or os.getenv("AZURE_OPENAI_API_KEY")
32
42
  azure_api_endpoint = params.get("base_url") or os.environ.get("AZURE_OPENAI_ENDPOINT")
33
43
  azure_api_version = params.get("api_version") or os.environ.get("AZURE_OPENAI_API_VERSION")
34
- self._llm_client = AzureOpenAI(
44
+ self.client = AzureOpenAI(
35
45
  api_key=azure_api_key, azure_endpoint=azure_api_endpoint, api_version=azure_api_version, max_retries=2
36
46
  )
37
47
  elif self.provider == "openai":
@@ -41,34 +51,58 @@ class LLMClient:
41
51
  if base_url:
42
52
  kwargs["base_url"] = base_url
43
53
  self.client = OpenAI(**kwargs)
44
-
54
+ elif self.provider == "ollama":
55
+ kwargs = params.copy()
56
+ kwargs.pop("model_name")
57
+ kwargs.pop("provider", None)
58
+ if kwargs["api_key"] is None:
59
+ kwargs["api_key"] = "n/a"
60
+ self.client = OpenAI(**kwargs)
45
61
  else:
46
62
  # try to use litellm
47
- from mindsdb.api.executor.controllers.session_controller import SessionController
63
+ if self._session is None:
64
+ from mindsdb.api.executor.controllers.session_controller import SessionController
48
65
 
49
- session = SessionController()
50
- module = session.integration_controller.get_handler_module("litellm")
66
+ self._session = SessionController()
67
+ module = self._session.integration_controller.get_handler_module("litellm")
51
68
 
52
69
  if module is None or module.Handler is None:
53
70
  raise ValueError(f'Unable to use "{self.provider}" provider. Litellm handler is not installed')
54
71
 
55
72
  self.client = module.Handler
73
+ self.engine = "litellm"
74
+
75
+ @retry_with_exponential_backoff()
76
+ def embeddings(self, messages: List[str]):
77
+ params = self.params
78
+ if self.engine == "openai":
79
+ response = self.client.embeddings.create(
80
+ model=params["model_name"],
81
+ input=messages,
82
+ )
83
+ return [item.embedding for item in response.data]
84
+ else:
85
+ kwargs = params.copy()
86
+ model = kwargs.pop("model_name")
87
+ kwargs.pop("provider", None)
88
+
89
+ return self.client.embeddings(self.provider, model=model, messages=messages, args=kwargs)
56
90
 
57
- def completion(self, messages: List[dict], json_output: bool = False) -> str:
91
+ def completion(self, messages: List[dict], json_output: bool = False) -> List[str]:
58
92
  """
59
93
  Call LLM completion and get response
60
94
  """
61
95
  params = self.params
62
96
  params["json_output"] = json_output
63
- if self.provider in ("azure_openai", "openai"):
97
+ if self.engine == "openai":
64
98
  response = self.client.chat.completions.create(
65
99
  model=params["model_name"],
66
100
  messages=messages,
67
101
  )
68
- return response.choices[0].message.content
102
+ return [item.message.content for item in response.choices]
69
103
  else:
70
104
  kwargs = params.copy()
71
105
  model = kwargs.pop("model_name")
72
106
  kwargs.pop("provider", None)
73
107
  response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
74
- return response.choices[0].message.content
108
+ return [item.message.content for item in response.choices]
@@ -1,13 +1,10 @@
1
- from typing import List, Dict, Any, Optional
1
+ import ast
2
2
  import json
3
+ from typing import List, Dict, Any, Optional
4
+
3
5
  import pandas as pd
4
- import ast
5
6
 
6
- from mindsdb.interfaces.knowledge_base.preprocessing.models import (
7
- Document,
8
- ProcessedChunk,
9
- JSONChunkingConfig
10
- )
7
+ from mindsdb.interfaces.knowledge_base.preprocessing.models import Document, ProcessedChunk, JSONChunkingConfig
11
8
  from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import DocumentPreprocessor
12
9
  from mindsdb.utilities import log
13
10
 
@@ -50,7 +47,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
50
47
  chunks = self._process_json_data(json_data, doc)
51
48
  all_chunks.extend(chunks)
52
49
  except Exception as e:
53
- logger.error(f"Error processing document {doc.id}: {e}")
50
+ logger.exception(f"Error processing document {doc.id}:")
54
51
  error_chunk = self._create_error_chunk(doc, str(e))
55
52
  all_chunks.append(error_chunk)
56
53
 
@@ -76,8 +73,8 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
76
73
  # If JSON parsing fails, try as Python literal
77
74
  try:
78
75
  return ast.literal_eval(doc.content)
79
- except (SyntaxError, ValueError) as e:
80
- logger.error(f"Error parsing content for document {doc.id}: {e}")
76
+ except (SyntaxError, ValueError):
77
+ logger.exception(f"Error parsing content for document {doc.id}:")
81
78
  # We'll create the error chunk in the main process_documents method
82
79
  return None
83
80
 
@@ -117,7 +114,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
117
114
  return ProcessedChunk(
118
115
  id=f"{doc.id}_error",
119
116
  content=f"Error processing document: {error_message}",
120
- metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata)
117
+ metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata),
121
118
  )
122
119
 
123
120
  def _process_json_list(self, json_list: List, doc: Document) -> List[ProcessedChunk]:
@@ -132,20 +129,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
132
129
  elif isinstance(item, list):
133
130
  # Handle nested lists by converting to string representation
134
131
  chunk = self._create_chunk_from_primitive(
135
- json.dumps(item),
136
- doc,
137
- chunk_index=i,
138
- total_chunks=total_objects
132
+ json.dumps(item), doc, chunk_index=i, total_chunks=total_objects
139
133
  )
140
134
  chunks.append(chunk)
141
135
  else:
142
136
  # Handle primitive values
143
- chunk = self._create_chunk_from_primitive(
144
- item,
145
- doc,
146
- chunk_index=i,
147
- total_chunks=total_objects
148
- )
137
+ chunk = self._create_chunk_from_primitive(item, doc, chunk_index=i, total_chunks=total_objects)
149
138
  chunks.append(chunk)
150
139
 
151
140
  return chunks
@@ -159,7 +148,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
159
148
  try:
160
149
  json_dict = json.loads(json_dict)
161
150
  except json.JSONDecodeError:
162
- logger.error(f"Error parsing JSON string: {json_dict[:100]}...")
151
+ logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
163
152
  return [self._create_error_chunk(doc, "Invalid JSON string")]
164
153
 
165
154
  # Filter fields based on include/exclude lists
@@ -190,31 +179,25 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
190
179
  start_char=0,
191
180
  end_char=len(field_content),
192
181
  provided_id=doc.id,
193
- content_column=self.config.content_column
182
+ content_column=self.config.content_column,
194
183
  )
195
184
 
196
185
  # Create and add the chunk
197
- chunk = ProcessedChunk(
198
- id=chunk_id,
199
- content=field_content,
200
- metadata=metadata
201
- )
186
+ chunk = ProcessedChunk(id=chunk_id, content=field_content, metadata=metadata)
202
187
  chunks.append(chunk)
203
188
 
204
189
  return chunks
205
190
 
206
- def _create_chunk_from_dict(self,
207
- json_dict: Dict,
208
- doc: Document,
209
- chunk_index: int,
210
- total_chunks: int) -> ProcessedChunk:
191
+ def _create_chunk_from_dict(
192
+ self, json_dict: Dict, doc: Document, chunk_index: int, total_chunks: int
193
+ ) -> ProcessedChunk:
211
194
  """Create a chunk from a JSON dictionary"""
212
195
  # Ensure we're working with a dictionary
213
196
  if isinstance(json_dict, str):
214
197
  try:
215
198
  json_dict = json.loads(json_dict)
216
199
  except json.JSONDecodeError:
217
- logger.error(f"Error parsing JSON string: {json_dict[:100]}...")
200
+ logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
218
201
  return self._create_error_chunk(doc, "Invalid JSON string")
219
202
 
220
203
  # Format the content
@@ -223,9 +206,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
223
206
  filtered_dict = self._filter_fields(flattened)
224
207
  content = self._dict_to_text(filtered_dict)
225
208
  else:
226
- filtered_dict = {k: v for k, v in json_dict.items()
227
- if (not self.config.include_fields or k in self.config.include_fields)
228
- and k not in self.config.exclude_fields}
209
+ filtered_dict = {
210
+ k: v
211
+ for k, v in json_dict.items()
212
+ if (not self.config.include_fields or k in self.config.include_fields)
213
+ and k not in self.config.exclude_fields
214
+ }
229
215
  content = json.dumps(filtered_dict, indent=2)
230
216
 
231
217
  # Create metadata
@@ -241,22 +227,23 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
241
227
  start_char=0,
242
228
  end_char=len(content),
243
229
  provided_id=doc.id,
244
- content_column=self.config.content_column
230
+ content_column=self.config.content_column,
245
231
  )
246
232
 
247
- return ProcessedChunk(
248
- id=chunk_id,
249
- content=content,
250
- metadata=metadata
251
- )
233
+ return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
252
234
 
253
235
  def _filter_fields(self, flattened_dict: Dict) -> Dict:
254
236
  """Filter fields based on include/exclude configuration"""
255
237
  # If include_fields is specified, only keep those fields
256
238
  if self.config.include_fields:
257
- filtered_dict = {k: v for k, v in flattened_dict.items()
258
- if any(k == field or k.startswith(field + self.config.nested_delimiter)
259
- for field in self.config.include_fields)}
239
+ filtered_dict = {
240
+ k: v
241
+ for k, v in flattened_dict.items()
242
+ if any(
243
+ k == field or k.startswith(field + self.config.nested_delimiter)
244
+ for field in self.config.include_fields
245
+ )
246
+ }
260
247
  else:
261
248
  filtered_dict = flattened_dict.copy()
262
249
 
@@ -276,11 +263,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
276
263
  return filtered_dict
277
264
 
278
265
  def _create_chunk_from_primitive(
279
- self,
280
- value: Any,
281
- doc: Document,
282
- chunk_index: int = 0,
283
- total_chunks: int = 1
266
+ self, value: Any, doc: Document, chunk_index: int = 0, total_chunks: int = 1
284
267
  ) -> ProcessedChunk:
285
268
  """Create a chunk from a primitive value"""
286
269
  content = str(value)
@@ -300,16 +283,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
300
283
  start_char=0,
301
284
  end_char=len(content),
302
285
  provided_id=doc.id,
303
- content_column=self.config.content_column
286
+ content_column=self.config.content_column,
304
287
  )
305
288
 
306
- return ProcessedChunk(
307
- id=chunk_id,
308
- content=content,
309
- metadata=metadata
310
- )
289
+ return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
311
290
 
312
- def _flatten_dict(self, d: Dict, delimiter: str = '.', prefix: str = '') -> Dict:
291
+ def _flatten_dict(self, d: Dict, delimiter: str = ".", prefix: str = "") -> Dict:
313
292
  """Flatten a nested dictionary structure"""
314
293
  result = {}
315
294
  for k, v in d.items():
@@ -337,7 +316,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
337
316
  # Format list of dictionaries
338
317
  lines.append(f"{key}:")
339
318
  for i, item in enumerate(value):
340
- lines.append(f" Item {i+1}:")
319
+ lines.append(f" Item {i + 1}:")
341
320
  for k, v in item.items():
342
321
  lines.append(f" {k}: {v}")
343
322
  else:
@@ -362,7 +341,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
362
341
  # Format list of dictionaries
363
342
  lines = [f"{key}:"]
364
343
  for i, item in enumerate(value):
365
- lines.append(f" Item {i+1}:")
344
+ lines.append(f" Item {i + 1}:")
366
345
  for k, v in item.items():
367
346
  lines.append(f" {k}: {v}")
368
347
  return "\n".join(lines)
@@ -380,7 +359,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
380
359
  try:
381
360
  json_dict = json.loads(json_dict)
382
361
  except json.JSONDecodeError:
383
- logger.error(f"Error parsing JSON string: {json_dict[:100]}...")
362
+ logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
384
363
  return
385
364
 
386
365
  # Always flatten the dictionary for metadata extraction