MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +228 -110
- mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +58 -40
- mindsdb/utilities/exception.py +58 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0

mindsdb/interfaces/knowledge_base/executor.py (new file)

@@ -0,0 +1,346 @@
+from dataclasses import dataclass
+import copy
+from typing import List, Optional, Union
+
+from mindsdb_sql_parser.ast import (
+    BinaryOperation,
+    Identifier,
+    Constant,
+    UnaryOperation,
+    Select,
+    Star,
+    Tuple,
+    ASTNode,
+    BetweenOperation,
+    NullConstant,
+)
+import pandas as pd
+
+from mindsdb.integrations.utilities.query_traversal import query_traversal
+
+
+@dataclass
+class ConditionBlock:
+    op: str
+    items: list
+
+
+class KnowledgeBaseQueryExecutor:
+    def __init__(self, kb, content_column="content", id_column="chunk_id"):
+        self.kb = kb
+        self.content_column = content_column.lower()
+        self.id_column = id_column
+        self.limit = None
+        self._negative_set_size = 100
+        self._negative_set_threshold = 0.5
+
+    def is_content_condition(self, node: ASTNode) -> bool:
+        """
+        Checks if the node is a condition to Content column
+
+        :param node: condition to check
+        """
+        if isinstance(node, BinaryOperation):
+            if isinstance(node.args[0], Identifier):
+                parts = node.args[0].parts
+                if len(parts) == 1 and parts[0].lower() == self.content_column:
+                    return True
+        return False
+
+    @staticmethod
+    def invert_content_op(node: BinaryOperation) -> BinaryOperation:
+        # Change operator of binary operation to opposite one
+        op_map = {"=": "!=", "!=": "=", "LIKE": "!=", "NOT LIKE": "=", "IN": "NOT IN", "NOT IN": "IN"}
+        if node.op.upper() not in op_map:
+            raise NotImplementedError(f"Can't handle condition: '{str(node)}'")
+        node.op = op_map[node.op.upper()]
+        return node
+
+    def convert_unary_ops(self, node: ASTNode, callstack: List[ASTNode], **kwargs) -> ASTNode:
+        """
+        Tries to remove unary operator and apply it to Binary operation.
+        Supported cases:
+        - "NOT content <op> value" => "content <!op> value"
+        - "content <op> NOT value" => "content <!op> value"
+
+        Where <!op> is inverted operator of <op>
+        """
+
+        if isinstance(node, UnaryOperation):
+            if node.op.upper() == "NOT":
+                # two options:
+                # 1. NOT content <op> value
+                if self.is_content_condition(node.args[0]):
+                    self.invert_content_op(node.args[0])
+                    return node.args[0]
+
+                # 2. content <op> NOT value
+                if self.is_content_condition(callstack[0]):
+                    self.invert_content_op(callstack[0])
+                    return node.args[0]
+
+    def union(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+        # combine dataframes from input list to single one
+
+        if len(results) == 1:
+            return results[0]
+
+        res = pd.concat(results)
+        df = res.drop_duplicates(subset=[self.id_column]).reset_index()
+        return df
+
+    def intersect(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+        # intersect dataframes from input list: return dataframe with rows that exist in all input dataframes
+
+        if len(results) == 1:
+            return results[0]
+
+        item = results[0]
+        for item2 in results[1:]:
+            item = item[item[self.id_column].isin(item2[self.id_column])]
+
+        df = item
+        return df
+
+    @classmethod
+    def flatten_conditions(cls, node: ASTNode) -> Union[ASTNode, ConditionBlock]:
+        """
+        Recursively inspect conditions tree and move conditions related to 'OR' or 'AND' operators of the same level
+        to same ConditionBlock
+        Example: or (a=1, or (b=2, c=3))
+        is converted to: ConditionBlock(or, [a=1, b=2, c=3])
+        """
+
+        if isinstance(node, BinaryOperation):
+            op = node.op.upper()
+            if op in ("AND", "OR"):
+                block = ConditionBlock(op, [])
+                for arg in node.args:
+                    item = cls.flatten_conditions(arg)
+                    if isinstance(item, ConditionBlock):
+                        if item.op == block.op:
+                            block.items.extend(item.items)
+                        else:
+                            # new type of block
+                            block.items.append(item)
+                    else:
+                        block.items.append(item)
+                return block
+            else:
+                node.op = node.op.upper()
+                return node
+
+        elif isinstance(node, BetweenOperation):
+            block = ConditionBlock(
+                "AND",
+                [
+                    BinaryOperation(">=", args=[node.args[0], node.args[1]]),
+                    BinaryOperation("<=", args=[node.args[0], node.args[2]]),
+                ],
+            )
+            return block
+
+        raise NotImplementedError(f"Unknown node '{node}'")
+
+    def call_kb(
+        self, conditions: List[BinaryOperation], disable_reranking: bool = False, limit: int = None
+    ) -> pd.DataFrame:
+        """
+        Call KB with list of prepared conditions
+
+        :param conditions: input conditions
+        :param disable_reranking: flag to disable reranking
+        :param limit: use custom limit
+        :return: result of querying KB
+        """
+
+        where = None
+        for condition in conditions:
+            if where is None:
+                where = condition
+            else:
+                where = BinaryOperation("AND", args=[where, condition])
+
+        query = Select(targets=[Star()], where=where)
+
+        if limit is not None:
+            query.limit = Constant(limit)
+        elif self.limit is not None:
+            query.limit = Constant(self.limit)
+
+        return self.kb.select(query, disable_reranking=disable_reranking)
+
+    def execute_content_condition(
+        self,
+        content_condition: BinaryOperation,
+        other_conditions: List[BinaryOperation] = None,
+        disable_reranking: bool = False,
+        limit: int = None,
+    ) -> pd.DataFrame:
+        """
+        Call KB using content condition. Only positive conditions for content can be here.
+        Negative conditions can be applied only as a filter of ID
+        :param content_condition: condition for Content column
+        :param other_conditions: conditions for other columns
+        :param disable_reranking: turn off reranking
+        :param limit: override default limit
+        :return: result of the query
+        """
+
+        if other_conditions is None:
+            other_conditions = []
+
+        if content_condition.op == "IN":
+            # (select where content = 'a') UNION (select where content = 'b')
+            results = []
+            for el in content_condition.args[1].items:
+                el_cond = BinaryOperation(op="=", args=[Identifier(self.content_column), el])
+                results.append(
+                    self.call_kb([el_cond] + other_conditions, disable_reranking=disable_reranking, limit=limit)
+                )
+            return self.union(results)
+
+        elif content_condition.op in ("=", "LIKE"):
+            # just '='
+            content_condition2 = copy.deepcopy(content_condition)
+            content_condition2.op = "="
+            return self.call_kb([content_condition2] + other_conditions)
+
+        elif content_condition.op == "IS" and isinstance(content_condition.args[1], NullConstant):
+            # return empty dataset, call to get column names
+            return self.call_kb([], limit=1)[:0]
+        elif content_condition.op == "IS NOT" and isinstance(content_condition.args[1], NullConstant):
+            # execute without conditions
+            return self.call_kb([])
+        else:
+            raise NotImplementedError(
+                f'Operator "{content_condition.op}" is not supported for condition: {content_condition}'
+            )
+
+    def to_excluded_ids(
+        self, content_condition: BinaryOperation, other_conditions: List[BinaryOperation]
+    ) -> Optional[List[str]]:
+        """
+        Handles negative conditions for content. If it is a negative condition: extract and return list of IDs
+        that have to be excluded by parent query
+
+        :param content_condition: condition for Content column
+        :param other_conditions: conditions for other columns
+        :return: list of IDs to exclude or None
+        """
+
+        if content_condition.op in ("!=", "<>", "NOT LIKE"):
+            # id NOT IN (
+            #     SELECT id FROM kb WHERE content = '...' limit X
+            # )
+            el_cond = BinaryOperation(op="=", args=content_condition.args)
+            threshold = BinaryOperation(op=">=", args=[Identifier("relevance"), Constant(self._negative_set_threshold)])
+            res = self.call_kb(
+                [el_cond, threshold] + other_conditions, disable_reranking=True, limit=self._negative_set_size
+            )
+
+            return list(res[self.id_column])
+
+        elif content_condition.op == "NOT IN":
+            # id NOT IN (
+            #     select id where content in ('a', 'b')
+            # )
+            content_condition2 = copy.deepcopy(content_condition)
+            content_condition2.op = "IN"
+
+            threshold = BinaryOperation(op=">=", args=[Identifier("relevance"), Constant(self._negative_set_threshold)])
+            res = self.execute_content_condition(
+                content_condition2,
+                other_conditions + [threshold],
+                disable_reranking=True,
+                limit=self._negative_set_size,
+            )
+
+            return list(res[self.id_column])
+        else:
+            return None
+
+    def execute_blocks(self, block: ConditionBlock) -> pd.DataFrame:
+        """
+        Split block to set of calls with conditions and execute them. Nested blocks are supported
+
+        :param block:
+        :return: dataframe with result of block execution
+        """
+
+        if not isinstance(block, ConditionBlock):
+            # single condition
+            if self.is_content_condition(block):
+                return self.execute_content_condition(block)
+            else:
+                return self.call_kb([block])
+
+        if block.op == "AND":
+            results = []
+
+            content_filters, other_filters = [], []
+            for item in block.items:
+                if isinstance(item, ConditionBlock):
+                    results.append(self.execute_blocks(item))
+                else:
+                    if self.is_content_condition(item):
+                        content_filters.append(item)
+                    else:
+                        other_filters.append(item)
+            if len(content_filters) > 0:
+                content_filters2 = []
+                exclude_ids = set()
+                # exclude content conditions
+                for condition in content_filters:
+                    ids = self.to_excluded_ids(condition, other_filters)
+                    if ids is not None:
+                        exclude_ids.update(ids)
+                    else:
+                        # keep origin content filter
+                        content_filters2.append(condition)
+
+                if exclude_ids:
+                    # add to filter
+                    values = [Constant(i) for i in exclude_ids]
+                    condition = BinaryOperation(op="NOT IN", args=[Identifier(self.id_column), Tuple(values)])
+                    other_filters.append(condition)
+                # execute content filters
+                for condition in content_filters2:
+                    result = self.execute_content_condition(condition, other_filters)
+                    results.append(result)
+            elif len(other_filters) > 0:
+                results.append(self.call_kb(other_filters))
+
+            return self.intersect(results)
+
+        elif block.op == "OR":
+            results = []
+            for item in block.items:
+                results.append(self.execute_blocks(item))
+
+            return self.union(results)
+
+    def run(self, query: Select) -> pd.DataFrame:
+        """
+        Plan and execute query to KB. If query has complex conditions:
+        - convert them to several queries with simple conditions, execute them and combine results
+
+        Stages:
+        - Remove unary NOT from condition: try to apply it to related operator
+        - Flatten conditions tree: convert into condition blocks,
+          keeping same operators of the same level in the same block
+        - Recursively execute blocks:
+          - get data from OR blocks and union them
+          - get data from AND blocks and intersect them
+
+        :param query: select query
+        :return: results
+        """
+        if query.where is not None:
+            query_traversal(query.where, self.convert_unary_ops)
+            blocks_tree = self.flatten_conditions(query.where)
+            if query.limit is not None:
+                self.limit = query.limit.value
+            return self.execute_blocks(blocks_tree)
+        else:
+            return self.kb.select(query)
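The run() docstring above lays out the executor's plan: strip unary NOTs, flatten the condition tree into blocks, then execute blocks and combine results (union for OR, intersect for AND). A minimal, self-contained sketch of the flattening stage, using a hypothetical Cond class in place of mindsdb_sql_parser's BinaryOperation so it runs without MindsDB installed, shows how nested same-operator conditions collapse into one block:

    # Hypothetical stand-ins: Cond mirrors BinaryOperation, flatten() mirrors
    # KnowledgeBaseQueryExecutor.flatten_conditions; not MindsDB's actual API.
    from dataclasses import dataclass, field

    @dataclass
    class Cond:
        op: str                      # "AND", "OR", or a comparison such as "="
        args: list = field(default_factory=list)

    @dataclass
    class ConditionBlock:            # same shape as the dataclass added above
        op: str
        items: list

    def flatten(node):
        """Collapse nested AND/OR nodes with the same operator into one block."""
        if node.op.upper() in ("AND", "OR"):
            block = ConditionBlock(node.op.upper(), [])
            for arg in node.args:
                item = flatten(arg)
                if isinstance(item, ConditionBlock) and item.op == block.op:
                    block.items.extend(item.items)  # merge same-operator child
                else:
                    block.items.append(item)
            return block
        return node                  # leaf comparison: returned unchanged

    # or(a=1, or(b=2, c=3))  ->  ConditionBlock("OR", [a=1, b=2, c=3])
    tree = Cond("OR", [Cond("=", ["a", 1]), Cond("OR", [Cond("=", ["b", 2]), Cond("=", ["c", 3])])])
    print(flatten(tree))             # one OR block holding three leaf conditions

Executing that single OR block then takes one union over three simple KB calls instead of a nested evaluation.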
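to_excluded_ids handles the awkward case of negative content filters: a similarity index cannot answer content != 'x' directly, so the executor runs the positive query with a relevance floor (_negative_set_threshold, 0.5) and a capped result set (_negative_set_size, 100), and the parent query then excludes the returned IDs. A toy sketch of that inversion, with a hypothetical search() standing in for call_kb and pre-baked relevance scores so it runs standalone:

    # Toy model of the negative-filter trick; search() is an assumed stand-in
    # for call_kb and returns pre-baked rows rather than real vector matches.
    def search(value: str, limit: int, min_relevance: float) -> list:
        corpus = [
            {"chunk_id": "c1", "content": "apples and pears", "relevance": 0.9},
            {"chunk_id": "c2", "content": "mostly apples", "relevance": 0.7},
            {"chunk_id": "c3", "content": "bananas", "relevance": 0.2},
        ]
        return [row for row in corpus if row["relevance"] >= min_relevance][:limit]

    def excluded_ids_for(value: str, size: int = 100, threshold: float = 0.5) -> set:
        # "content != value" is rewritten as "chunk_id NOT IN (top matches for value)"
        return {row["chunk_id"] for row in search(value, size, threshold)}

    print(excluded_ids_for("apples"))  # {'c1', 'c2'}: close matches get excluded upstream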
mindsdb/interfaces/knowledge_base/llm_client.py

@@ -36,8 +36,11 @@ class LLMClient:
             )
         elif self.provider == "openai":
             openai_api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY")
+            kwargs = {"api_key": openai_api_key, "max_retries": 2}
             base_url = params.get("base_url")
-
+            if base_url:
+                kwargs["base_url"] = base_url
+            self.client = OpenAI(**kwargs)
 
         else:
             # try to use litellm
@@ -67,9 +70,5 @@ class LLMClient:
         kwargs = params.copy()
         model = kwargs.pop("model_name")
 
-
-        if base_url is not None:
-            kwargs["api_base"] = base_url
-
-        response = self.client.completion(model=f"{self.provider}/{model}", messages=messages, args=kwargs)
+        response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
         return response.choices[0].message.content
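The first hunk builds the OpenAI client from a kwargs dict so that base_url reaches the constructor only when it is actually configured, rather than being passed as None. A minimal sketch of that pattern, with a stand-in client class so the snippet runs without the openai package installed:

    import os

    class StubOpenAI:
        # stand-in for openai.OpenAI; accepts the same keyword arguments
        def __init__(self, api_key=None, base_url=None, max_retries=2):
            self.api_key, self.base_url, self.max_retries = api_key, base_url, max_retries

    def build_client(params: dict) -> StubOpenAI:
        kwargs = {"api_key": params.get("api_key") or os.getenv("OPENAI_API_KEY"), "max_retries": 2}
        base_url = params.get("base_url")
        if base_url:  # only forward base_url when explicitly set
            kwargs["base_url"] = base_url
        return StubOpenAI(**kwargs)

    client = build_client({"api_key": "sk-test", "base_url": "http://localhost:11434/v1"})
    assert client.base_url == "http://localhost:11434/v1"
    assert build_client({"api_key": "sk-test"}).base_url is None  # client default preserved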
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py

@@ -31,17 +31,10 @@ _DEFAULT_CONTENT_COLUMN_NAME = "content"
 class DocumentPreprocessor:
     """Base class for document preprocessing"""
 
-    RESERVED_METADATA_FIELDS = {
-        "content",
-        "id",
-        "embeddings",
-        "original_doc_id",
-        "chunk_index",
-    }
-
     def __init__(self):
         """Initialize preprocessor"""
         self.splitter = None  # Will be set by child classes
+        self.config = None
 
     def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
         """Base implementation - should be overridden by child classes
@@ -57,15 +50,10 @@ class DocumentPreprocessor:
             raise ValueError("Splitter not configured")
 
         # Convert to langchain Document for splitting
-        langchain_doc = LangchainDocument(
-            page_content=doc.content, metadata=doc.metadata or {}
-        )
+        langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
         # Split and convert back to our Document type
         split_docs = self.splitter.split_documents([langchain_doc])
-        return [
-            Document(content=split_doc.page_content, metadata=split_doc.metadata)
-            for split_doc in split_docs
-        ]
+        return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]
 
     def _get_source(self) -> str:
         """Get the source identifier for this preprocessor"""
@@ -118,14 +106,14 @@ class DocumentPreprocessor:
 
         # Always preserve original document ID
         if doc_id is not None:
-            metadata[
+            metadata[self.config.doc_id_column_name] = doc_id
 
         # Add chunk index only for multi-chunk cases
         if chunk_index is not None:
-            metadata["
+            metadata["_chunk_index"] = chunk_index
 
         # Always set source
-        metadata["
+        metadata["_source"] = self._get_source()
 
         return metadata
 
@@ -148,9 +136,7 @@ Please give a short succinct context to situate this chunk within the overall docu
         super().__init__()
         self.config = config
         self.splitter = FileSplitter(
-            FileSplitterConfig(
-                chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap
-            )
+            FileSplitterConfig(chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap)
         )
         self.llm = create_chat_model(
             {
@@ -162,28 +148,22 @@ Please give a short succinct context to situate this chunk within the overall docu
         self.context_template = config.context_template or self.DEFAULT_CONTEXT_TEMPLATE
         self.summarize = self.config.summarize
 
-    def _prepare_prompts(
-        self, chunk_contents: list[str], full_documents: list[str]
-    ) -> list[str]:
+    def _prepare_prompts(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
         prompts = [
-            self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document)
-            for full_document in full_documents
+            self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document) for full_document in full_documents
         ]
         prompts = [
-            prompt.replace("{{CHUNK_CONTENT}}", chunk_content)
-            for prompt, chunk_content in zip(prompts, chunk_contents)
+            prompt.replace("{{CHUNK_CONTENT}}", chunk_content) for prompt, chunk_content in zip(prompts, chunk_contents)
         ]
 
         return prompts
 
-    def _generate_context(
-        self, chunk_contents: list[str], full_documents: list[str]
-    ) -> list[str]:
+    def _generate_context(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
         """Generate contextual description for a chunk using LLM"""
         prompts = self._prepare_prompts(chunk_contents, full_documents)
 
         # Check if LLM supports async
-        if hasattr(self.llm,
+        if hasattr(self.llm, "abatch"):
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             try:
@@ -211,7 +191,6 @@ Please give a short succinct context to situate this chunk within the overall docu
         processed_chunks = []
 
         for doc_index, doc in enumerate(documents):
-
             # Document ID must be provided by this point
             if doc.id is None:
                 raise ValueError("Document ID must be provided before preprocessing")
@@ -247,12 +226,8 @@ Please give a short succinct context to situate this chunk within the overall docu
         chunk_contents = [chunk_doc.content for chunk_doc in chunks_list]
         contexts = self._generate_context(chunk_contents, doc_contents)
 
-        for context, chunk_doc, chunk_index, doc_index in zip(
-            contexts, chunks_list, chunk_index_list, doc_index_list
-        ):
-            processed_content = (
-                context if self.summarize else f"{context}\n\n{chunk_doc.content}"
-            )
+        for context, chunk_doc, chunk_index, doc_index in zip(contexts, chunks_list, chunk_index_list, doc_index_list):
+            processed_content = context if self.summarize else f"{context}\n\n{chunk_doc.content}"
             doc = documents[doc_index]
 
             # Initialize metadata
@@ -261,7 +236,7 @@ Please give a short succinct context to situate this chunk within the overall docu
             metadata.update(doc.metadata)
 
             # Get content_column from metadata or use default
-            content_column = metadata.get(
+            content_column = metadata.get("_content_column")
             if content_column is None:
                 # If content_column is not in metadata, use the default column name
                 content_column = _DEFAULT_CONTENT_COLUMN_NAME
@@ -305,7 +280,6 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
         processed_chunks = []
 
         for doc in documents:
-
             # Document ID must be provided by this point
             if doc.id is None:
                 raise ValueError("Document ID must be provided before preprocessing")
@@ -334,13 +308,13 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
             metadata.update(doc.metadata)
 
             # Add position metadata
-            metadata["
-            metadata["
+            metadata["_start_char"] = start_char
+            metadata["_end_char"] = end_char
 
             # Get content_column from metadata or use default
             content_column = None
             if doc.metadata:
-                content_column = doc.metadata.get(
+                content_column = doc.metadata.get("_content_column")
 
             if content_column is None:
                 # If content_column is not in metadata, use the default column name
@@ -353,7 +327,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
                 start_char=start_char,
                 end_char=end_char,
                 provided_id=doc.id,
-                content_column=content_column
+                content_column=content_column,
             )
 
             processed_chunks.append(
@@ -392,6 +366,7 @@ class PreprocessorFactory:
         elif config.type == PreprocessorType.JSON_CHUNKING:
             # Import here to avoid circular imports
            from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingPreprocessor
+
             return JSONChunkingPreprocessor(config.json_chunking_config)
         else:
             raise ValueError(f"Unknown preprocessor type: {config.type}")
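Across the preprocessor hunks the reserved-key handling changes shape: the RESERVED_METADATA_FIELDS set is removed, and internal bookkeeping keys are written with a leading underscore (_chunk_index, _source, _start_char, _end_char, _content_column), presumably so they cannot collide with user-supplied metadata. A short sketch of that convention; build_chunk_metadata is a hypothetical helper, not MindsDB's API:

    # Hypothetical helper illustrating the underscore-prefix convention above.
    def build_chunk_metadata(user_metadata: dict, chunk_index: int, source: str,
                             start_char: int, end_char: int) -> dict:
        metadata = dict(user_metadata)       # copy user keys defensively
        metadata["_chunk_index"] = chunk_index
        metadata["_source"] = source
        metadata["_start_char"] = start_char
        metadata["_end_char"] = end_char
        return metadata

    meta = build_chunk_metadata({"author": "alice", "source": "crm"}, 0, "TextChunkingPreprocessor", 0, 512)
    # The user's own "source" key survives next to the internal "_source" key.
    assert meta["source"] == "crm" and meta["_source"] == "TextChunkingPreprocessor"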