MindsDB 25.9.2.0a1__py3-none-any.whl → 25.9.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (116)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +39 -20
  3. mindsdb/api/a2a/agent.py +7 -9
  4. mindsdb/api/a2a/common/server/server.py +3 -3
  5. mindsdb/api/a2a/common/server/task_manager.py +4 -4
  6. mindsdb/api/a2a/task_manager.py +15 -17
  7. mindsdb/api/common/middleware.py +9 -11
  8. mindsdb/api/executor/command_executor.py +2 -4
  9. mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
  10. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +100 -48
  11. mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
  12. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  13. mindsdb/api/executor/exceptions.py +29 -10
  14. mindsdb/api/executor/planner/plan_join.py +17 -3
  15. mindsdb/api/executor/sql_query/sql_query.py +74 -74
  16. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
  17. mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
  18. mindsdb/api/executor/utilities/functions.py +6 -6
  19. mindsdb/api/executor/utilities/sql.py +32 -16
  20. mindsdb/api/http/gui.py +5 -11
  21. mindsdb/api/http/initialize.py +8 -10
  22. mindsdb/api/http/namespaces/agents.py +10 -12
  23. mindsdb/api/http/namespaces/analysis.py +13 -20
  24. mindsdb/api/http/namespaces/auth.py +1 -1
  25. mindsdb/api/http/namespaces/config.py +15 -11
  26. mindsdb/api/http/namespaces/databases.py +140 -201
  27. mindsdb/api/http/namespaces/file.py +15 -4
  28. mindsdb/api/http/namespaces/handlers.py +7 -2
  29. mindsdb/api/http/namespaces/knowledge_bases.py +8 -7
  30. mindsdb/api/http/namespaces/models.py +94 -126
  31. mindsdb/api/http/namespaces/projects.py +13 -22
  32. mindsdb/api/http/namespaces/sql.py +33 -25
  33. mindsdb/api/http/namespaces/tab.py +27 -37
  34. mindsdb/api/http/namespaces/views.py +1 -1
  35. mindsdb/api/http/start.py +14 -8
  36. mindsdb/api/mcp/__init__.py +2 -1
  37. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
  38. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
  39. mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
  40. mindsdb/api/postgres/postgres_proxy/executor/executor.py +6 -13
  41. mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +40 -28
  42. mindsdb/integrations/handlers/byom_handler/byom_handler.py +168 -185
  43. mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
  44. mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
  45. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +13 -1
  46. mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
  47. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
  48. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  49. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  50. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
  51. mindsdb/integrations/libs/api_handler.py +10 -10
  52. mindsdb/integrations/libs/base.py +4 -4
  53. mindsdb/integrations/libs/llm/utils.py +2 -2
  54. mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
  55. mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
  56. mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
  57. mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
  58. mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
  59. mindsdb/integrations/libs/process_cache.py +132 -140
  60. mindsdb/integrations/libs/response.py +18 -12
  61. mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
  62. mindsdb/integrations/utilities/files/file_reader.py +6 -7
  63. mindsdb/integrations/utilities/rag/config_loader.py +37 -26
  64. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +59 -9
  65. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
  66. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
  67. mindsdb/integrations/utilities/rag/settings.py +58 -133
  68. mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
  69. mindsdb/interfaces/agents/agents_controller.py +2 -1
  70. mindsdb/interfaces/agents/constants.py +0 -2
  71. mindsdb/interfaces/agents/litellm_server.py +34 -58
  72. mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
  73. mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
  74. mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
  75. mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
  76. mindsdb/interfaces/chatbot/polling.py +30 -18
  77. mindsdb/interfaces/data_catalog/data_catalog_loader.py +10 -10
  78. mindsdb/interfaces/database/integrations.py +19 -2
  79. mindsdb/interfaces/file/file_controller.py +6 -6
  80. mindsdb/interfaces/functions/controller.py +1 -1
  81. mindsdb/interfaces/functions/to_markdown.py +2 -2
  82. mindsdb/interfaces/jobs/jobs_controller.py +5 -5
  83. mindsdb/interfaces/jobs/scheduler.py +3 -8
  84. mindsdb/interfaces/knowledge_base/controller.py +50 -23
  85. mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
  86. mindsdb/interfaces/model/model_controller.py +170 -166
  87. mindsdb/interfaces/query_context/context_controller.py +14 -2
  88. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +6 -4
  89. mindsdb/interfaces/skills/retrieval_tool.py +43 -50
  90. mindsdb/interfaces/skills/skill_tool.py +2 -2
  91. mindsdb/interfaces/skills/sql_agent.py +25 -19
  92. mindsdb/interfaces/storage/fs.py +114 -169
  93. mindsdb/interfaces/storage/json.py +19 -18
  94. mindsdb/interfaces/tabs/tabs_controller.py +49 -72
  95. mindsdb/interfaces/tasks/task_monitor.py +3 -9
  96. mindsdb/interfaces/tasks/task_thread.py +7 -9
  97. mindsdb/interfaces/triggers/trigger_task.py +7 -13
  98. mindsdb/interfaces/triggers/triggers_controller.py +47 -50
  99. mindsdb/migrations/migrate.py +16 -16
  100. mindsdb/utilities/api_status.py +58 -0
  101. mindsdb/utilities/config.py +49 -0
  102. mindsdb/utilities/exception.py +40 -1
  103. mindsdb/utilities/fs.py +0 -1
  104. mindsdb/utilities/hooks/profiling.py +17 -14
  105. mindsdb/utilities/langfuse.py +40 -45
  106. mindsdb/utilities/log.py +272 -0
  107. mindsdb/utilities/ml_task_queue/consumer.py +52 -58
  108. mindsdb/utilities/ml_task_queue/producer.py +26 -30
  109. mindsdb/utilities/render/sqlalchemy_render.py +7 -6
  110. mindsdb/utilities/utils.py +2 -2
  111. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/METADATA +269 -264
  112. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/RECORD +115 -115
  113. mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
  114. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/WHEEL +0 -0
  115. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/licenses/LICENSE +0 -0
  116. {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/jobs/scheduler.py

@@ -14,7 +14,6 @@ logger = log.getLogger(__name__)
 
 
 def execute_async(q_in, q_out):
-
     while True:
         task = q_in.get()
 
@@ -44,7 +43,7 @@ class Scheduler:
         self.q_in = queue.Queue()
         self.q_out = queue.Queue()
         self.work_thread = threading.Thread(
-            target=execute_async, args=(self.q_in, self.q_out), name='Scheduler.execute_async'
+            target=execute_async, args=(self.q_in, self.q_out), name="Scheduler.execute_async"
         )
         self.work_thread.start()
 
@@ -58,14 +57,13 @@ class Scheduler:
         check_interval = self.config.get("jobs", {}).get("check_interval", 30)
 
         while True:
-
             logger.debug("Scheduler check timetable")
             try:
                 self.check_timetable()
             except (SystemExit, KeyboardInterrupt):
                 raise
-            except Exception as e:
-                logger.error(e)
+            except Exception:
+                logger.exception("Error in 'scheduler_monitor'")
 
             # different instances should start in not the same time
 
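
Note on the logging change above: this release repeatedly replaces logger.error(e) with logger.exception(...) inside except blocks. A minimal standalone sketch (plain stdlib logging, not MindsDB code) of why that matters: logger.exception logs at ERROR level and automatically appends the active traceback, whereas logger.error(e) records only the exception's message text.

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def check_timetable():
    # Stand-in for the real Scheduler.check_timetable.
    raise RuntimeError("job table unavailable")

try:
    check_timetable()
except (SystemExit, KeyboardInterrupt):
    raise
except Exception:
    # The full stack trace is attached automatically; no need to bind
    # the exception to a name just to log it.
    logger.exception("Error in 'scheduler_monitor'")
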
@@ -83,7 +81,6 @@ class Scheduler:
         db.session.remove()
 
     def execute_task(self, record_id, exec_method):
-
        executor = JobsExecutor()
        if exec_method == "local":
            history_id = executor.lock_record(record_id)
@@ -117,7 +114,6 @@
            raise NotImplementedError()
 
    def start(self):
-
        config = Config()
        db.init()
        self.config = config
@@ -127,7 +123,6 @@
        try:
            self.scheduler_monitor()
        except (KeyboardInterrupt, SystemExit):
-
            self.stop_thread()
            pass
 
mindsdb/interfaces/knowledge_base/controller.py

@@ -22,12 +22,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     TableField,
     VectorStoreHandler,
 )
-from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
-from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
 from mindsdb.integrations.utilities.handler_utils import get_api_key
-from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
-    construct_model_from_args,
-)
 
 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS, MAX_INSERT_BATCH_SIZE
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -56,6 +51,7 @@ class KnowledgeBaseInputParams(BaseModel):
     content_columns: List[str] | None = None
     id_column: str | None = None
     kb_no_upsert: bool = False
+    kb_skip_existing: bool = False
     embedding_model: Dict[Text, Any] | None = None
     is_sparse: bool = False
     vector_size: int | None = None
@@ -83,9 +79,9 @@ def get_model_params(model_params: dict, default_config_key: str):
     return combined_model_params
 
 
-def get_embedding_model_from_params(embedding_model_params: dict):
+def adapt_embedding_model_params(embedding_model_params: dict):
     """
-    Create embedding model from parameters.
+    Prepare parameters for embedding model.
     """
     params_copy = copy.deepcopy(embedding_model_params)
     provider = params_copy.pop("provider", None).lower()
@@ -106,7 +102,7 @@ def get_embedding_model_from_params(embedding_model_params: dict):
     params_copy.pop("api_key", None)
     params_copy["model"] = params_copy.pop("model_name", None)
 
-    return construct_model_from_args(params_copy)
+    return params_copy
 
 
 def get_reranking_model_from_params(reranking_model_params: dict):
@@ -265,9 +261,9 @@ class KnowledgeBaseTable:
                     gt_filtering = True
                     logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
                 except (ValueError, TypeError) as e:
-                    error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
+                    error_msg = f"Invalid relevance_threshold value: {item.value}. {e}"
                     logger.error(error_msg)
-                    raise ValueError(error_msg)
+                    raise ValueError(error_msg) from e
             elif (item.column == "relevance") and (item.op.value not in relevance_threshold_allowed_operators):
                 raise ValueError(
                     f"Invalid operator for relevance: {item.op.value}. Only the following operators are allowed: "
@@ -318,13 +314,20 @@ class KnowledgeBaseTable:
         self.addapt_conditions_columns(conditions)
 
         # Set default limit if query is present
+        limit = query.limit.value if query.limit is not None else None
         if query_text is not None:
-            limit = query.limit.value if query.limit is not None else None
             if limit is None:
                 limit = 10
             elif limit > 100:
                 limit = 100
-            query.limit = Constant(limit)
+
+            if not disable_reranking:
+                # expand limit, get more records before reranking usage:
+                # get twice size of input but not greater than 30
+                query_limit = min(limit * 2, limit + 30)
+            else:
+                query_limit = limit
+            query.limit = Constant(query_limit)
 
         allowed_metadata_columns = self._get_allowed_metadata_columns()
         df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
@@ -375,6 +378,8 @@ class KnowledgeBaseTable:
 
         # Check if we have a rerank_model configured in KB params
         df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)
+        if limit is not None:
+            df = df[:limit]
 
         # if relevance filtering method is strictly GREATER THAN we filter the df
         if gt_filtering:
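
Note on the two hunks above: when reranking is enabled, the select now over-fetches candidates (twice the requested limit, capped at limit + 30) and truncates back to the caller's limit only after relevance scores are computed. A hedged standalone sketch of that over-fetch-then-truncate pattern; fetch_candidates and rerank_scores below are hypothetical stand-ins, not MindsDB APIs.

import pandas as pd

def fetch_candidates(query: str, k: int) -> pd.DataFrame:
    # Stub for the vector-store lookup (the real code calls db_handler.dispatch_select).
    return pd.DataFrame({"content": [f"doc-{i} about {query}" for i in range(k)]})

def rerank_scores(query: str, texts: list) -> list:
    # Stub reranker (the real code builds one from reranking_model_params).
    return [1.0 / (i + 1) for i in range(len(texts))]

def search(query: str, limit: int = 10, disable_reranking: bool = False) -> pd.DataFrame:
    # Same expansion rule as the diff: twice the limit, capped at limit + 30.
    query_limit = limit if disable_reranking else min(limit * 2, limit + 30)
    df = fetch_candidates(query, query_limit)
    if not disable_reranking:
        df = df.assign(relevance=rerank_scores(query, df["content"].tolist()))
        df = df.sort_values("relevance", ascending=False)
    return df[:limit]  # truncate back to what the caller asked for

The extra candidates give the reranker a wider pool to reorder, so a relevant row ranked 15th by raw vector distance can still surface in a top-10 result.
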
@@ -407,7 +412,6 @@ class KnowledgeBaseTable:
         if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
 
-            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
             # Apply custom filtering threshold if provided
             if relevance_threshold is not None:
                 reranking_model_params["filtering_threshold"] = relevance_threshold
@@ -424,7 +428,6 @@
             # Filter by threshold
             scores_array = np.array(scores)
             df = df[scores_array >= reranker.filtering_threshold]
-            logger.debug(f"Applied reranking with params: {reranking_model_params}")
 
         elif "distance" in df.columns:
             # Calculate relevance from distance
@@ -678,6 +681,25 @@ class KnowledgeBaseTable:
             logger.warning("No valid content found in any content columns")
             return
 
+        # Check if we should skip existing items (before calculating embeddings)
+        if params is not None and params.get("kb_skip_existing", False):
+            logger.debug(f"Checking for existing items to skip before processing {len(df)} items")
+            db_handler = self.get_vector_db()
+
+            # Get list of IDs from current batch
+            current_ids = df[TableField.ID.value].dropna().astype(str).tolist()
+            if current_ids:
+                # Check which IDs already exist
+                existing_ids = db_handler.check_existing_ids(self._kb.vector_database_table, current_ids)
+                if existing_ids:
+                    # Filter out existing items
+                    df = df[~df[TableField.ID.value].astype(str).isin(existing_ids)]
+                    logger.info(f"Skipped {len(existing_ids)} existing items, processing {len(df)} new items")
+
+                    if df.empty:
+                        logger.info("All items already exist, nothing to insert")
+                        return
+
         # add embeddings and send to vector db
         df_emb = self._df_to_embeddings(df)
         df = pd.concat([df, df_emb], axis=1)
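
Note on the kb_skip_existing block above: the new flag filters out rows whose IDs are already in the vector store before any embeddings are computed, so re-ingesting a partially loaded dataset does not pay for duplicate embedding calls. A hedged pandas sketch of the same filter; check_existing_ids here is a stub standing in for the handler method the diff calls (presumably the +26 lines added to vectordatabase_handler.py in this release, not shown in this section).

import pandas as pd

def check_existing_ids(store_ids: set, candidate_ids: list) -> set:
    # Stub: the real handler queries the vector database table; here the
    # "store" is just a set of previously inserted string IDs.
    return store_ids.intersection(candidate_ids)

def filter_new_rows(df: pd.DataFrame, store_ids: set) -> pd.DataFrame:
    current_ids = df["id"].dropna().astype(str).tolist()
    existing = check_existing_ids(store_ids, current_ids)
    # Keep only rows whose IDs were not seen before, as the diff does.
    return df[~df["id"].astype(str).isin(existing)]

df = pd.DataFrame({"id": ["a", "b", "c"], "content": ["x", "y", "z"]})
new_rows = filter_new_rows(df, store_ids={"b"})
print(new_rows["id"].tolist())  # ['a', 'c'] -- 'b' is skipped before embedding
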
@@ -915,7 +937,12 @@ class KnowledgeBaseTable:
             ValueError: If the configuration is invalid or required components are missing
         """
         # Get embedding model from knowledge base
-        embeddings_model = None
+        from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
+            construct_model_from_args,
+        )
+        from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
+        from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
+
         embedding_model_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
         if self._kb.embedding_model:
             # Extract embedding model args from knowledge base table
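
Note on the import change above: the RAG and langchain-embedding imports removed from module scope in this file's first hunk reappear here inside the method. That is a lazy-import pattern: importing the controller stays cheap, and the heavy optional dependencies (plus any ImportError they raise) are only loaded when a RAG pipeline is actually built. A generic sketch of the pattern, not MindsDB code:

class PipelineBuilder:
    def build(self, config: dict):
        # Stand-in for a heavy optional import such as the langchain
        # embedding handler; the cost is paid at call time, not when the
        # enclosing module is first imported.
        import json

        return json.dumps(config)

print(PipelineBuilder().build({"retriever": "vector_store"}))
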
@@ -924,7 +951,7 @@
             embeddings_model = construct_model_from_args(embedding_args)
             logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
         elif embedding_model_params:
-            embeddings_model = get_embedding_model_from_params(embedding_model_params)
+            embeddings_model = construct_model_from_args(adapt_embedding_model_params(embedding_model_params))
             logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
         else:
             embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
@@ -952,8 +979,8 @@
             return rag
 
         except Exception as e:
-            logger.error(f"Error building RAG pipeline: {str(e)}")
-            raise ValueError(f"Failed to build RAG pipeline: {str(e)}")
+            logger.exception("Error building RAG pipeline:")
+            raise ValueError(f"Failed to build RAG pipeline: {str(e)}") from e
 
     def _parse_metadata(self, base_metadata):
         """Helper function to robustly parse metadata string to dict"""
@@ -1065,7 +1092,7 @@ class KnowledgeBaseController:
             msg = "\n".join(problems)
             if len(problems) > 1:
                 msg = "\n" + msg
-            raise ValueError(f"Problem with knowledge base parameters: {msg}")
+            raise ValueError(f"Problem with knowledge base parameters: {msg}") from e
 
         # Validate preprocessing config first if provided
         if preprocessing_config is not None:
@@ -1131,7 +1158,7 @@
             reranker = get_reranking_model_from_params(reranking_model_params)
             reranker.get_scores("test", ["test"])
         except (ValueError, RuntimeError) as e:
-            raise RuntimeError(f"Problem with reranker config: {e}")
+            raise RuntimeError(f"Problem with reranker config: {e}") from e
 
         # search for the vector database table
         if storage is None:
@@ -1244,7 +1271,7 @@
             try:
                 KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
             except Exception as e:
-                raise RuntimeError(f"Problem with embedding model config: {e}")
+                raise RuntimeError(f"Problem with embedding model config: {e}") from e
             return
 
         params = copy.deepcopy(params)
@@ -1297,8 +1324,8 @@
         """
         try:
             project = self.session.database_controller.get_project(project_name)
-        except ValueError:
-            raise ValueError(f"Project not found: {project_name}")
+        except ValueError as e:
+            raise ValueError(f"Project not found: {project_name}") from e
         project_id = project.id
 
         # check if knowledge base exists
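
Note on the recurring `raise ... from e` change in the hunks above: explicit chaining sets __cause__ on the new exception, so tracebacks report the original error as the direct cause of the friendlier one instead of the misleading "During handling of the above exception, another exception occurred". A minimal runnable sketch:

def get_project(name: str):
    # Stand-in for database_controller.get_project failing internally.
    raise ValueError("row missing in metadata store")

try:
    try:
        get_project("mindsdb")
    except ValueError as e:
        raise ValueError("Project not found: mindsdb") from e
except ValueError as wrapped:
    print(wrapped)            # Project not found: mindsdb
    print(wrapped.__cause__)  # row missing in metadata store
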
mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py

@@ -1,13 +1,10 @@
-from typing import List, Dict, Any, Optional
+import ast
 import json
+from typing import List, Dict, Any, Optional
+
 import pandas as pd
-import ast
 
-from mindsdb.interfaces.knowledge_base.preprocessing.models import (
-    Document,
-    ProcessedChunk,
-    JSONChunkingConfig
-)
+from mindsdb.interfaces.knowledge_base.preprocessing.models import Document, ProcessedChunk, JSONChunkingConfig
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import DocumentPreprocessor
 from mindsdb.utilities import log
 
@@ -50,7 +47,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 chunks = self._process_json_data(json_data, doc)
                 all_chunks.extend(chunks)
             except Exception as e:
-                logger.error(f"Error processing document {doc.id}: {e}")
+                logger.exception(f"Error processing document {doc.id}:")
                 error_chunk = self._create_error_chunk(doc, str(e))
                 all_chunks.append(error_chunk)
 
@@ -76,8 +73,8 @@
             # If JSON parsing fails, try as Python literal
             try:
                 return ast.literal_eval(doc.content)
-            except (SyntaxError, ValueError) as e:
-                logger.error(f"Error parsing content for document {doc.id}: {e}")
+            except (SyntaxError, ValueError):
+                logger.exception(f"Error parsing content for document {doc.id}:")
                 # We'll create the error chunk in the main process_documents method
                 return None
 
@@ -117,7 +114,7 @@
         return ProcessedChunk(
             id=f"{doc.id}_error",
             content=f"Error processing document: {error_message}",
-            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata)
+            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata),
         )
 
     def _process_json_list(self, json_list: List, doc: Document) -> List[ProcessedChunk]:
@@ -132,20 +129,12 @@
             elif isinstance(item, list):
                 # Handle nested lists by converting to string representation
                 chunk = self._create_chunk_from_primitive(
-                    json.dumps(item),
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
+                    json.dumps(item), doc, chunk_index=i, total_chunks=total_objects
                 )
                 chunks.append(chunk)
             else:
                 # Handle primitive values
-                chunk = self._create_chunk_from_primitive(
-                    item,
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
-                )
+                chunk = self._create_chunk_from_primitive(item, doc, chunk_index=i, total_chunks=total_objects)
                 chunks.append(chunk)
 
         return chunks
@@ -159,7 +148,7 @@
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.error(f"Error parsing JSON string: {json_dict[:100]}...")
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return [self._create_error_chunk(doc, "Invalid JSON string")]
 
         # Filter fields based on include/exclude lists
@@ -190,31 +179,25 @@
                 start_char=0,
                 end_char=len(field_content),
                 provided_id=doc.id,
-                content_column=self.config.content_column
+                content_column=self.config.content_column,
             )
 
             # Create and add the chunk
-            chunk = ProcessedChunk(
-                id=chunk_id,
-                content=field_content,
-                metadata=metadata
-            )
+            chunk = ProcessedChunk(id=chunk_id, content=field_content, metadata=metadata)
             chunks.append(chunk)
 
         return chunks
 
-    def _create_chunk_from_dict(self,
-                                json_dict: Dict,
-                                doc: Document,
-                                chunk_index: int,
-                                total_chunks: int) -> ProcessedChunk:
+    def _create_chunk_from_dict(
+        self, json_dict: Dict, doc: Document, chunk_index: int, total_chunks: int
+    ) -> ProcessedChunk:
         """Create a chunk from a JSON dictionary"""
         # Ensure we're working with a dictionary
         if isinstance(json_dict, str):
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.error(f"Error parsing JSON string: {json_dict[:100]}...")
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return self._create_error_chunk(doc, "Invalid JSON string")
 
         # Format the content
@@ -223,9 +206,12 @@
             filtered_dict = self._filter_fields(flattened)
             content = self._dict_to_text(filtered_dict)
         else:
-            filtered_dict = {k: v for k, v in json_dict.items()
-                             if (not self.config.include_fields or k in self.config.include_fields)
-                             and k not in self.config.exclude_fields}
+            filtered_dict = {
+                k: v
+                for k, v in json_dict.items()
+                if (not self.config.include_fields or k in self.config.include_fields)
+                and k not in self.config.exclude_fields
+            }
             content = json.dumps(filtered_dict, indent=2)
 
         # Create metadata
@@ -241,22 +227,23 @@
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
     def _filter_fields(self, flattened_dict: Dict) -> Dict:
         """Filter fields based on include/exclude configuration"""
         # If include_fields is specified, only keep those fields
         if self.config.include_fields:
-            filtered_dict = {k: v for k, v in flattened_dict.items()
-                             if any(k == field or k.startswith(field + self.config.nested_delimiter)
-                                    for field in self.config.include_fields)}
+            filtered_dict = {
+                k: v
+                for k, v in flattened_dict.items()
+                if any(
+                    k == field or k.startswith(field + self.config.nested_delimiter)
+                    for field in self.config.include_fields
+                )
+            }
         else:
             filtered_dict = flattened_dict.copy()
 
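
Note on _filter_fields above: a flattened key survives the include filter either on exact match or when it is nested under an included field (prefix plus the configured delimiter). A standalone sketch of the same logic as a plain function, outside the class:

def filter_fields(flattened: dict, include_fields: list, delimiter: str = ".") -> dict:
    if not include_fields:
        return flattened.copy()
    return {
        k: v
        for k, v in flattened.items()
        if any(k == field or k.startswith(field + delimiter) for field in include_fields)
    }

flat = {"user.name": "Ada", "user.email": "ada@example.com", "debug.trace": "..."}
print(filter_fields(flat, ["user"]))  # keeps user.name and user.email only
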
@@ -276,11 +263,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return filtered_dict
 
     def _create_chunk_from_primitive(
-        self,
-        value: Any,
-        doc: Document,
-        chunk_index: int = 0,
-        total_chunks: int = 1
+        self, value: Any, doc: Document, chunk_index: int = 0, total_chunks: int = 1
     ) -> ProcessedChunk:
         """Create a chunk from a primitive value"""
         content = str(value)
@@ -300,16 +283,12 @@
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
-    def _flatten_dict(self, d: Dict, delimiter: str = '.', prefix: str = '') -> Dict:
+    def _flatten_dict(self, d: Dict, delimiter: str = ".", prefix: str = "") -> Dict:
         """Flatten a nested dictionary structure"""
         result = {}
         for k, v in d.items():
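
Note on _flatten_dict above: only the signature and first lines appear in this hunk, so the body below is an illustrative reconstruction of what a flattener with this signature typically does (recursively joining nested keys with the delimiter), not the package's exact code:

from typing import Dict

def flatten_dict(d: Dict, delimiter: str = ".", prefix: str = "") -> Dict:
    result = {}
    for k, v in d.items():
        key = f"{prefix}{delimiter}{k}" if prefix else str(k)
        if isinstance(v, dict):
            result.update(flatten_dict(v, delimiter, key))  # recurse into sub-dicts
        else:
            result[key] = v
    return result

print(flatten_dict({"a": {"b": 1, "c": {"d": 2}}, "e": 3}))
# {'a.b': 1, 'a.c.d': 2, 'e': 3}
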
@@ -337,7 +316,7 @@
                 # Format list of dictionaries
                 lines.append(f"{key}:")
                 for i, item in enumerate(value):
-                    lines.append(f" Item {i+1}:")
+                    lines.append(f" Item {i + 1}:")
                     for k, v in item.items():
                         lines.append(f" {k}: {v}")
             else:
@@ -362,7 +341,7 @@
             # Format list of dictionaries
             lines = [f"{key}:"]
             for i, item in enumerate(value):
-                lines.append(f" Item {i+1}:")
+                lines.append(f" Item {i + 1}:")
                 for k, v in item.items():
                     lines.append(f" {k}: {v}")
             return "\n".join(lines)
@@ -380,7 +359,7 @@
         try:
             json_dict = json.loads(json_dict)
         except json.JSONDecodeError:
-            logger.error(f"Error parsing JSON string: {json_dict[:100]}...")
+            logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
             return
 
         # Always flatten the dictionary for metadata extraction