MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/executor.py (new file)
@@ -0,0 +1,346 @@
+ from dataclasses import dataclass
+ import copy
+ from typing import List, Optional, Union
+
+ from mindsdb_sql_parser.ast import (
+     BinaryOperation,
+     Identifier,
+     Constant,
+     UnaryOperation,
+     Select,
+     Star,
+     Tuple,
+     ASTNode,
+     BetweenOperation,
+     NullConstant,
+ )
+ import pandas as pd
+
+ from mindsdb.integrations.utilities.query_traversal import query_traversal
+
+
+ @dataclass
+ class ConditionBlock:
+     op: str
+     items: list
+
+
+ class KnowledgeBaseQueryExecutor:
+     def __init__(self, kb, content_column="content", id_column="chunk_id"):
+         self.kb = kb
+         self.content_column = content_column.lower()
+         self.id_column = id_column
+         self.limit = None
+         self._negative_set_size = 100
+         self._negative_set_threshold = 0.5
+
+     def is_content_condition(self, node: ASTNode) -> bool:
+         """
+         Checks whether the node is a condition on the content column.
+
+         :param node: condition to check
+         """
+         if isinstance(node, BinaryOperation):
+             if isinstance(node.args[0], Identifier):
+                 parts = node.args[0].parts
+                 if len(parts) == 1 and parts[0].lower() == self.content_column:
+                     return True
+         return False
+
+     @staticmethod
+     def invert_content_op(node: BinaryOperation) -> BinaryOperation:
+         # Change the operator of the binary operation to its opposite
+         op_map = {"=": "!=", "!=": "=", "LIKE": "!=", "NOT LIKE": "=", "IN": "NOT IN", "NOT IN": "IN"}
+         if node.op.upper() not in op_map:
+             raise NotImplementedError(f"Can't handle condition: '{str(node)}'")
+         node.op = op_map[node.op.upper()]
+         return node
+
+     def convert_unary_ops(self, node: ASTNode, callstack: List[ASTNode], **kwargs) -> ASTNode:
+         """
+         Tries to remove a unary operator and apply it to the binary operation.
+         Supported cases:
+         - "NOT content <op> value" => "content <!op> value"
+         - "content <op> NOT value" => "content <!op> value"
+
+         Where <!op> is the inverted operator of <op>
+         """
+
+         if isinstance(node, UnaryOperation):
+             if node.op.upper() == "NOT":
+                 # two options:
+                 # 1. NOT content <op> value
+                 if self.is_content_condition(node.args[0]):
+                     self.invert_content_op(node.args[0])
+                     return node.args[0]
+
+                 # 2. content <op> NOT value
+                 if self.is_content_condition(callstack[0]):
+                     self.invert_content_op(callstack[0])
+                     return node.args[0]
+
+     def union(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+         # combine the dataframes from the input list into a single one
+
+         if len(results) == 1:
+             return results[0]
+
+         res = pd.concat(results)
+         df = res.drop_duplicates(subset=[self.id_column]).reset_index()
+         return df
+
+     def intersect(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+         # intersect the dataframes from the input list: return rows that exist in all input dataframes
+
+         if len(results) == 1:
+             return results[0]
+
+         item = results[0]
+         for item2 in results[1:]:
+             item = item[item[self.id_column].isin(item2[self.id_column])]
+
+         df = item
+         return df
+
+     @classmethod
+     def flatten_conditions(cls, node: ASTNode) -> Union[ASTNode, ConditionBlock]:
+         """
+         Recursively inspect the conditions tree and move conditions joined by the same 'OR' or 'AND' operator
+         at the same level into one ConditionBlock.
+         Example: or (a=1, or (b=2, c=3))
+         is converted to: ConditionBlock(or, [a=1, b=2, c=3])
+         """
+
+         if isinstance(node, BinaryOperation):
+             op = node.op.upper()
+             if op in ("AND", "OR"):
+                 block = ConditionBlock(op, [])
+                 for arg in node.args:
+                     item = cls.flatten_conditions(arg)
+                     if isinstance(item, ConditionBlock):
+                         if item.op == block.op:
+                             block.items.extend(item.items)
+                         else:
+                             # new type of block
+                             block.items.append(item)
+                     else:
+                         block.items.append(item)
+                 return block
+             else:
+                 node.op = node.op.upper()
+                 return node
+
+         elif isinstance(node, BetweenOperation):
+             block = ConditionBlock(
+                 "AND",
+                 [
+                     BinaryOperation(">=", args=[node.args[0], node.args[1]]),
+                     BinaryOperation("<=", args=[node.args[0], node.args[2]]),
+                 ],
+             )
+             return block
+
+         raise NotImplementedError(f"Unknown node '{node}'")
+
+     def call_kb(
+         self, conditions: List[BinaryOperation], disable_reranking: bool = False, limit: int = None
+     ) -> pd.DataFrame:
+         """
+         Call the KB with a list of prepared conditions.
+
+         :param conditions: input conditions
+         :param disable_reranking: flag to disable reranking
+         :param limit: use a custom limit
+         :return: result of querying the KB
+         """
+
+         where = None
+         for condition in conditions:
+             if where is None:
+                 where = condition
+             else:
+                 where = BinaryOperation("AND", args=[where, condition])
+
+         query = Select(targets=[Star()], where=where)
+
+         if limit is not None:
+             query.limit = Constant(limit)
+         elif self.limit is not None:
+             query.limit = Constant(self.limit)
+
+         return self.kb.select(query, disable_reranking=disable_reranking)
+
+     def execute_content_condition(
+         self,
+         content_condition: BinaryOperation,
+         other_conditions: List[BinaryOperation] = None,
+         disable_reranking: bool = False,
+         limit: int = None,
+     ) -> pd.DataFrame:
+         """
+         Call the KB using a content condition. Only positive conditions on content can appear here;
+         negative conditions can only be applied as a filter on ID.
+         :param content_condition: condition on the content column
+         :param other_conditions: conditions on other columns
+         :param disable_reranking: turn off reranking
+         :param limit: override the default limit
+         :return: result of the query
+         """
+
+         if other_conditions is None:
+             other_conditions = []
+
+         if content_condition.op == "IN":
+             # (select where content = 'a') UNION (select where content = 'b')
+             results = []
+             for el in content_condition.args[1].items:
+                 el_cond = BinaryOperation(op="=", args=[Identifier(self.content_column), el])
+                 results.append(
+                     self.call_kb([el_cond] + other_conditions, disable_reranking=disable_reranking, limit=limit)
+                 )
+             return self.union(results)
+
+         elif content_condition.op in ("=", "LIKE"):
+             # just '='
+             content_condition2 = copy.deepcopy(content_condition)
+             content_condition2.op = "="
+             return self.call_kb([content_condition2] + other_conditions)
+
+         elif content_condition.op == "IS" and isinstance(content_condition.args[1], NullConstant):
+             # return an empty dataset; the call is made only to get the column names
+             return self.call_kb([], limit=1)[:0]
+         elif content_condition.op == "IS NOT" and isinstance(content_condition.args[1], NullConstant):
+             # execute without conditions
+             return self.call_kb([])
+         else:
+             raise NotImplementedError(
+                 f'Operator "{content_condition.op}" is not supported for condition: {content_condition}'
+             )
+
+     def to_excluded_ids(
+         self, content_condition: BinaryOperation, other_conditions: List[BinaryOperation]
+     ) -> Optional[List[str]]:
+         """
+         Handles negative conditions on content. If the condition is negative: extract and return the list of IDs
+         that have to be excluded by the parent query.
+
+         :param content_condition: condition on the content column
+         :param other_conditions: conditions on other columns
+         :return: list of IDs to exclude, or None
+         """
+
+         if content_condition.op in ("!=", "<>", "NOT LIKE"):
+             # id NOT IN (
+             #    SELECT id FROM kb WHERE content = '...' limit X
+             # )
+             el_cond = BinaryOperation(op="=", args=content_condition.args)
+             threshold = BinaryOperation(op=">=", args=[Identifier("relevance"), Constant(self._negative_set_threshold)])
+             res = self.call_kb(
+                 [el_cond, threshold] + other_conditions, disable_reranking=True, limit=self._negative_set_size
+             )
+
+             return list(res[self.id_column])
+
+         elif content_condition.op == "NOT IN":
+             # id NOT IN (
+             #    select id where content in ('a', 'b')
+             # )
+             content_condition2 = copy.deepcopy(content_condition)
+             content_condition2.op = "IN"
+
+             threshold = BinaryOperation(op=">=", args=[Identifier("relevance"), Constant(self._negative_set_threshold)])
+             res = self.execute_content_condition(
+                 content_condition2,
+                 other_conditions + [threshold],
+                 disable_reranking=True,
+                 limit=self._negative_set_size,
+             )
+
+             return list(res[self.id_column])
+         else:
+             return None
+
+     def execute_blocks(self, block: ConditionBlock) -> pd.DataFrame:
+         """
+         Split the block into a set of calls with conditions and execute them. Nested blocks are supported.
+
+         :param block: condition block (or single condition) to execute
+         :return: dataframe with the result of the block execution
+         """
+
+         if not isinstance(block, ConditionBlock):
+             # single condition
+             if self.is_content_condition(block):
+                 return self.execute_content_condition(block)
+             else:
+                 return self.call_kb([block])
+
+         if block.op == "AND":
+             results = []
+
+             content_filters, other_filters = [], []
+             for item in block.items:
+                 if isinstance(item, ConditionBlock):
+                     results.append(self.execute_blocks(item))
+                 else:
+                     if self.is_content_condition(item):
+                         content_filters.append(item)
+                     else:
+                         other_filters.append(item)
+             if len(content_filters) > 0:
+                 content_filters2 = []
+                 exclude_ids = set()
+                 # exclude content conditions
+                 for condition in content_filters:
+                     ids = self.to_excluded_ids(condition, other_filters)
+                     if ids is not None:
+                         exclude_ids.update(ids)
+                     else:
+                         # keep the original content filter
+                         content_filters2.append(condition)
+
+                 if exclude_ids:
+                     # add to the filter
+                     values = [Constant(i) for i in exclude_ids]
+                     condition = BinaryOperation(op="NOT IN", args=[Identifier(self.id_column), Tuple(values)])
+                     other_filters.append(condition)
+                 # execute content filters
+                 for condition in content_filters2:
+                     result = self.execute_content_condition(condition, other_filters)
+                     results.append(result)
+             elif len(other_filters) > 0:
+                 results.append(self.call_kb(other_filters))
+
+             return self.intersect(results)
+
+         elif block.op == "OR":
+             results = []
+             for item in block.items:
+                 results.append(self.execute_blocks(item))
+
+             return self.union(results)
+
+     def run(self, query: Select) -> pd.DataFrame:
+         """
+         Plan and execute a query to the KB. If the query has complex conditions:
+         - convert them into several queries with simple conditions, execute those, and combine the results.
+
+         Stages:
+         - Remove unary NOT from conditions: try to apply it to the related operator
+         - Flatten the conditions tree into condition blocks:
+           - conditions under the same operator at the same level go into the same block
+         - Recursively execute the blocks:
+           - get data from OR blocks and union it
+           - get data from AND blocks and intersect it
+
+         :param query: select query
+         :return: results
+         """
+         if query.where is not None:
+             query_traversal(query.where, self.convert_unary_ops)
+             blocks_tree = self.flatten_conditions(query.where)
+             if query.limit is not None:
+                 self.limit = query.limit.value
+             return self.execute_blocks(blocks_tree)
+         else:
+             return self.kb.select(query)
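
Note: a minimal usage sketch of the new executor, not part of the diff. It assumes only what the class itself requires: a knowledge-base object exposing select(query, disable_reranking=...) that returns a pandas DataFrame. The FakeKB stub and the parse_sql import are illustrative assumptions.

    import pandas as pd
    from mindsdb_sql_parser import parse_sql  # assumed export of the parser package imported above
    from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor

    class FakeKB:
        """Stub standing in for a real knowledge-base table; only .select is required."""
        def select(self, query, disable_reranking=False):
            # a real KB would run a semantic search here
            return pd.DataFrame({"chunk_id": ["c1", "c2"], "content": ["pricing...", "billing..."]})

    # The OR over content becomes two KB searches whose results are unioned;
    # the metadata condition runs as a separate KB call, and the two result
    # sets are intersected on chunk_id.
    query = parse_sql(
        "SELECT * FROM kb WHERE (content = 'pricing' OR content = 'billing') AND category = 'docs' LIMIT 10"
    )
    executor = KnowledgeBaseQueryExecutor(FakeKB(), content_column="content", id_column="chunk_id")
    df = executor.run(query)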
mindsdb/interfaces/knowledge_base/llm_client.py
@@ -36,8 +36,11 @@ class LLMClient:
              )
          elif self.provider == "openai":
              openai_api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY")
+             kwargs = {"api_key": openai_api_key, "max_retries": 2}
              base_url = params.get("base_url")
-             self.client = OpenAI(api_key=openai_api_key, base_url=base_url, max_retries=2)
+             if base_url:
+                 kwargs["base_url"] = base_url
+             self.client = OpenAI(**kwargs)

          else:
              # try to use litellm
@@ -67,9 +70,5 @@ class LLMClient:
          kwargs = params.copy()
          model = kwargs.pop("model_name")

-         base_url = params.pop("base_url", None)
-         if base_url is not None:
-             kwargs["api_base"] = base_url
-
-         response = self.client.completion(model=f"{self.provider}/{model}", messages=messages, args=kwargs)
+         response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
          return response.choices[0].message.content
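
Note: the first hunk above builds the client kwargs conditionally, so a missing base_url is simply omitted rather than passed as an explicit None. A standalone sketch of the pattern, assuming the standard openai client; the helper name is illustrative:

    from openai import OpenAI

    def make_openai_client(api_key: str, base_url: str | None = None) -> OpenAI:
        # Only forward base_url when it is set, letting the client fall back
        # to its default endpoint instead of receiving base_url=None.
        kwargs = {"api_key": api_key, "max_retries": 2}
        if base_url:
            kwargs["base_url"] = base_url
        return OpenAI(**kwargs)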
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py
@@ -31,17 +31,10 @@ _DEFAULT_CONTENT_COLUMN_NAME = "content"
  class DocumentPreprocessor:
      """Base class for document preprocessing"""

-     RESERVED_METADATA_FIELDS = {
-         "content",
-         "id",
-         "embeddings",
-         "original_doc_id",
-         "chunk_index",
-     }
-
      def __init__(self):
          """Initialize preprocessor"""
          self.splitter = None  # Will be set by child classes
+         self.config = None

      def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
          """Base implementation - should be overridden by child classes
@@ -57,15 +50,10 @@ class DocumentPreprocessor:
              raise ValueError("Splitter not configured")

          # Convert to langchain Document for splitting
-         langchain_doc = LangchainDocument(
-             page_content=doc.content, metadata=doc.metadata or {}
-         )
+         langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
          # Split and convert back to our Document type
          split_docs = self.splitter.split_documents([langchain_doc])
-         return [
-             Document(content=split_doc.page_content, metadata=split_doc.metadata)
-             for split_doc in split_docs
-         ]
+         return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]

      def _get_source(self) -> str:
          """Get the source identifier for this preprocessor"""
@@ -118,14 +106,14 @@ class DocumentPreprocessor:

          # Always preserve original document ID
          if doc_id is not None:
-             metadata["original_doc_id"] = doc_id
+             metadata[self.config.doc_id_column_name] = doc_id

          # Add chunk index only for multi-chunk cases
          if chunk_index is not None:
-             metadata["chunk_index"] = chunk_index
+             metadata["_chunk_index"] = chunk_index

          # Always set source
-         metadata["source"] = self._get_source()
+         metadata["_source"] = self._get_source()

          return metadata
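
Note: the last hunk renames the preprocessor-managed metadata keys to underscore-prefixed names and makes the document-ID key configurable. A hedged sketch of the resulting layout; the helper below is illustrative, not part of the diff:

    def build_chunk_metadata(user_metadata: dict, doc_id_column: str, doc_id, chunk_index: int, source: str) -> dict:
        # Underscore-prefixed keys mark fields owned by the preprocessor,
        # keeping unprefixed names free for user-supplied metadata.
        metadata = dict(user_metadata or {})
        metadata[doc_id_column] = doc_id        # e.g. the configured doc_id_column_name
        metadata["_chunk_index"] = chunk_index
        metadata["_source"] = source
        return metadata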
@@ -148,9 +136,7 @@ Please give a short succinct context to situate this chunk within the overall document
          super().__init__()
          self.config = config
          self.splitter = FileSplitter(
-             FileSplitterConfig(
-                 chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap
-             )
+             FileSplitterConfig(chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap)
          )
          self.llm = create_chat_model(
              {
@@ -162,28 +148,22 @@ Please give a short succinct context to situate this chunk within the overall document
          self.context_template = config.context_template or self.DEFAULT_CONTEXT_TEMPLATE
          self.summarize = self.config.summarize

-     def _prepare_prompts(
-         self, chunk_contents: list[str], full_documents: list[str]
-     ) -> list[str]:
+     def _prepare_prompts(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
          prompts = [
-             self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document)
-             for full_document in full_documents
+             self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document) for full_document in full_documents
          ]
          prompts = [
-             prompt.replace("{{CHUNK_CONTENT}}", chunk_content)
-             for prompt, chunk_content in zip(prompts, chunk_contents)
+             prompt.replace("{{CHUNK_CONTENT}}", chunk_content) for prompt, chunk_content in zip(prompts, chunk_contents)
          ]

          return prompts

-     def _generate_context(
-         self, chunk_contents: list[str], full_documents: list[str]
-     ) -> list[str]:
+     def _generate_context(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
          """Generate contextual description for a chunk using LLM"""
          prompts = self._prepare_prompts(chunk_contents, full_documents)

          # Check if LLM supports async
-         if hasattr(self.llm, 'abatch'):
+         if hasattr(self.llm, "abatch"):
              loop = asyncio.new_event_loop()
              asyncio.set_event_loop(loop)
              try:
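
Note: the abatch branch drives an async LangChain call from synchronous code by creating a private event loop. A minimal standalone sketch of that pattern; the coroutine is a stand-in for self.llm.abatch(prompts):

    import asyncio

    async def fake_abatch(prompts: list[str]) -> list[str]:
        # stand-in for llm.abatch(prompts)
        return [f"context for: {p}" for p in prompts]

    def generate_sync(prompts: list[str]) -> list[str]:
        # A fresh loop works even in worker threads that have no running
        # loop; closing it in finally avoids leaking the loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(fake_abatch(prompts))
        finally:
            loop.close()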
@@ -211,7 +191,6 @@ Please give a short succinct context to situate this chunk within the overall document
          processed_chunks = []

          for doc_index, doc in enumerate(documents):
-
              # Document ID must be provided by this point
              if doc.id is None:
                  raise ValueError("Document ID must be provided before preprocessing")
@@ -247,12 +226,8 @@ Please give a short succinct context to situate this chunk within the overall document
              chunk_contents = [chunk_doc.content for chunk_doc in chunks_list]
              contexts = self._generate_context(chunk_contents, doc_contents)

-             for context, chunk_doc, chunk_index, doc_index in zip(
-                 contexts, chunks_list, chunk_index_list, doc_index_list
-             ):
-                 processed_content = (
-                     context if self.summarize else f"{context}\n\n{chunk_doc.content}"
-                 )
+             for context, chunk_doc, chunk_index, doc_index in zip(contexts, chunks_list, chunk_index_list, doc_index_list):
+                 processed_content = context if self.summarize else f"{context}\n\n{chunk_doc.content}"
                  doc = documents[doc_index]

                  # Initialize metadata
@@ -261,7 +236,7 @@ Please give a short succinct context to situate this chunk within the overall document
                  metadata.update(doc.metadata)

                  # Get content_column from metadata or use default
-                 content_column = metadata.get('content_column')
+                 content_column = metadata.get("_content_column")
                  if content_column is None:
                      # If content_column is not in metadata, use the default column name
                      content_column = _DEFAULT_CONTENT_COLUMN_NAME
@@ -305,7 +280,6 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
          processed_chunks = []

          for doc in documents:
-
              # Document ID must be provided by this point
              if doc.id is None:
                  raise ValueError("Document ID must be provided before preprocessing")
@@ -334,13 +308,13 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
                  metadata.update(doc.metadata)

                  # Add position metadata
-                 metadata["start_char"] = start_char
-                 metadata["end_char"] = end_char
+                 metadata["_start_char"] = start_char
+                 metadata["_end_char"] = end_char

                  # Get content_column from metadata or use default
                  content_column = None
                  if doc.metadata:
-                     content_column = doc.metadata.get('content_column')
+                     content_column = doc.metadata.get("_content_column")

                  if content_column is None:
                      # If content_column is not in metadata, use the default column name
@@ -353,7 +327,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
                      start_char=start_char,
                      end_char=end_char,
                      provided_id=doc.id,
-                     content_column=content_column
+                     content_column=content_column,
                  )

                  processed_chunks.append(
@@ -392,6 +366,7 @@ class PreprocessorFactory:
          elif config.type == PreprocessorType.JSON_CHUNKING:
              # Import here to avoid circular imports
              from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingPreprocessor
+
              return JSONChunkingPreprocessor(config.json_chunking_config)
          else:
              raise ValueError(f"Unknown preprocessor type: {config.type}")
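
Note: the final hunk only adds a blank line after the function-local import, but the comment there ("Import here to avoid circular imports") is the interesting part. A small sketch of why deferring an import to call time breaks a module cycle; module names here are hypothetical:

    # Suppose a.py imports b.py at module level while b.py needs a symbol
    # from a.py: both fail at load time. Importing inside the function
    # defers resolution until first call, after both modules initialize.
    def make_processor(kind: str):
        if kind == "json":
            from json_chunker import JSONChunkingPreprocessor  # hypothetical module
            return JSONChunkingPreprocessor()
        raise ValueError(f"Unknown preprocessor type: {kind}")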