MindsDB 25.4.2.1__py3-none-any.whl → 25.4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +30 -7
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +12 -69
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -1
- mindsdb/integrations/libs/vectordatabase_handler.py +9 -1
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +1 -1
- mindsdb/interfaces/database/projects.py +7 -1
- mindsdb/interfaces/knowledge_base/controller.py +41 -34
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +43 -62
- mindsdb/interfaces/knowledge_base/utils.py +28 -0
- mindsdb/utilities/auth.py +5 -1
- mindsdb/utilities/cache.py +4 -1
- {mindsdb-25.4.2.1.dist-info → mindsdb-25.4.3.0.dist-info}/METADATA +223 -223
- {mindsdb-25.4.2.1.dist-info → mindsdb-25.4.3.0.dist-info}/RECORD +17 -16
- {mindsdb-25.4.2.1.dist-info → mindsdb-25.4.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.2.1.dist-info → mindsdb-25.4.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.2.1.dist-info → mindsdb-25.4.3.0.dist-info}/top_level.txt +0 -0
mindsdb/__about__.py
CHANGED
@@ -1,6 +1,6 @@
 __title__ = 'MindsDB'
 __package_name__ = 'mindsdb'
-__version__ = '25.4.2.1'
+__version__ = '25.4.3.0'
 __description__ = "MindsDB's AI SQL Server enables developers to build AI tools that need access to real-time data to perform their tasks"
 __email__ = "jorge@mindsdb.com"
 __author__ = 'MindsDB Inc'

mindsdb/__main__.py
CHANGED
@@ -299,15 +299,38 @@ if __name__ == '__main__':
     logger.debug(f"Checking if default project {config.get('default_project')} exists")
     project_controller = ProjectController()
 
-
-
+    try:
+        current_default_project = project_controller.get(is_default=True)
+    except EntityNotExistsError:
+        # In previous versions, the default project could be deleted. This is no longer possible.
+        current_default_project = None
+
+    if current_default_project:
+        if current_default_project.record.name != config.get('default_project'):
+            try:
+                project_controller.get(name=config.get('default_project'))
+                log.critical(f"A project with the name '{config.get('default_project')}' already exists")
+                sys.exit(1)
+            except EntityNotExistsError:
+                pass
+            project_controller.update(current_default_project.record.id, new_name=config.get('default_project'))
+
+    # Legacy: If the default project does not exist, mark the new one as default.
+    else:
         try:
-
-            log.critical(f"A project with the name '{config.get('default_project')}' already exists")
-            sys.exit(1)
+            project_controller.get(name=config.get('default_project'))
         except EntityNotExistsError:
-
-
+            log.critical(
+                f"A project with the name '{config.get('default_project')}' does not exist"
+            )
+            raise
+
+        project_controller.update(
+            name=config.get('default_project'),
+            new_metadata={
+                "is_default": True
+            }
+        )
 
     apis = os.getenv('MINDSDB_APIS') or config.cmd_args.api
 

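Note: the startup check above resolves the desired default project name via config.get('default_project'). As a hedged illustration of how that name can be supplied (the environment variable is the one named in the new Project.delete() error message in the projects.py diff further down; the config-file key shape is an assumption, not something shown in this diff):

    import os

    # Environment-variable route, per the error message added in projects.py below
    os.environ["MINDSDB_DEFAULT_PROJECT"] = "analytics"

    # Config-file route (assumed shape): {"default_project": "analytics"}
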
mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py
CHANGED

@@ -1,6 +1,4 @@
 import pandas as pd
-import threading
-import queue
 from typing import List
 
 from mindsdb_sql_parser import ASTNode

@@ -11,9 +9,10 @@ from mindsdb.interfaces.query_context.context_controller import RunningQuery
 from mindsdb.api.executor.sql_query.result_set import ResultSet
 from mindsdb.utilities import log
 from mindsdb.utilities.config import Config
-from mindsdb.utilities.context import Context, context as ctx
 from mindsdb.utilities.partitioning import get_max_thread_count, split_data_frame
 from mindsdb.api.executor.sql_query.steps.fetch_dataframe import get_table_alias, get_fill_param_fnc
+from mindsdb.utilities.context_executor import ContextThreadPoolExecutor
+
 
 from .base import BaseStepCall
 

@@ -178,9 +177,6 @@ class FetchDataframePartitionCall(BaseStepCall):
         """
 
         # create communication queues
-        queue_in = queue.Queue()
-        queue_out = queue.Queue()
-        self.stop_event = threading.Event()
 
         if thread_count is None:
             thread_count = get_max_thread_count()

@@ -191,16 +187,9 @@ class FetchDataframePartitionCall(BaseStepCall):
         if partition_size < 10:
             partition_size = 10
 
-        # create N workers pool
-        workers = []
         results = []
 
-
-        for i in range(thread_count):
-            worker = threading.Thread(target=self._worker, daemon=True, args=(ctx.dump(), queue_in,
-                                                                              queue_out, self.stop_event))
-            worker.start()
-            workers.append(worker)
+        with ContextThreadPoolExecutor(max_workers=thread_count) as executor:
 
             while True:
                 # fetch batch

@@ -220,69 +209,23 @@ class FetchDataframePartitionCall(BaseStepCall):
                 max_track_value = run_query.get_max_track_value(df)
 
                 # split into chunks and send to workers
-
+                futures = []
                 for df2 in split_data_frame(df, partition_size):
-
-                    sent_chunks += 1
+                    futures.append(executor.submit(self.exec_sub_steps, df2))
 
-
-
-
-
+                for future in futures:
+                    try:
+                        results.append(future.result())
+                    except Exception as e:
                         if on_error == 'skip':
-                            logger.error(
+                            logger.error(e)
                         else:
-
-
-                    if res['data']:
-                        batch_results.append(res)
-
-                # sort results
-                batch_results.sort(key=lambda x: x['num'])
-
-                results.append(self.concat_results(
-                    [item['data'] for item in batch_results]
-                ))
+                            executor.shutdown()
+                            raise e
 
                 # TODO
                 # 1. get next batch without updating track_value:
                 #  it allows to keep queue_in filled with data between fetching batches
                 run_query.set_progress(df, max_track_value)
-            finally:
-                self.close_workers(workers)
 
         return self.concat_results(results)
-
-    def close_workers(self, workers: List[threading.Thread]):
-        """
-        Sent signal to workers to stop
-        """
-
-        self.stop_event.set()
-        for worker in workers:
-            if worker.is_alive():
-                worker.join()
-
-    def _worker(self, context: Context, queue_in: queue.Queue, queue_out: queue.Queue, stop_event: threading.Event):
-        """
-        Worker function. Execute incoming tasks unless stop_event is set
-        """
-        ctx.load(context)
-        while True:
-            if stop_event.is_set():
-                break
-
-            try:
-                chunk_num, df = queue_in.get(timeout=1)
-                if df is None:
-                    continue
-
-                sub_data = self.exec_sub_steps(df)
-
-                queue_out.put({'data': sub_data, 'num': chunk_num})
-            except queue.Empty:
-                continue
-
-            except Exception as e:
-                queue_out.put({'error': str(e)})
-                stop_event.set()

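The hand-rolled queue/worker pool removed above is replaced by ContextThreadPoolExecutor from mindsdb.utilities.context_executor, whose implementation is not part of this diff. A minimal sketch of the pattern it presumably follows, capturing the per-request context with ctx.dump() when a task is submitted and restoring it with ctx.load() inside the worker thread, exactly as the deleted _worker method did, could look like this (illustrative only, not the real class):

    from concurrent.futures import ThreadPoolExecutor

    from mindsdb.utilities.context import context as ctx


    class ContextThreadPoolExecutor(ThreadPoolExecutor):
        """Sketch of a context-propagating executor; the real implementation may differ."""

        def submit(self, fn, *args, **kwargs):
            # Capture the caller's context in the submitting thread ...
            context_dump = ctx.dump()

            def wrapped():
                # ... and restore it inside the worker thread before running the task.
                ctx.load(context_dump)
                return fn(*args, **kwargs)

            return super().submit(wrapped)
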
mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py
CHANGED

@@ -244,6 +244,7 @@ class ChromaDBHandler(VectorStoreHandler):
         offset: int = None,
         limit: int = None,
     ) -> pd.DataFrame:
+
         collection = self._client.get_collection(table_name)
         filters = self._translate_metadata_condition(conditions)
 

@@ -313,7 +314,7 @@ class ChromaDBHandler(VectorStoreHandler):
             TableField.ID.value: ids,
             TableField.CONTENT.value: documents,
             TableField.METADATA.value: metadatas,
-            TableField.EMBEDDINGS.value: embeddings,
+            TableField.EMBEDDINGS.value: list(embeddings),
         }
 
         if columns is not None:

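The only functional change in this handler wraps embeddings in list(...) before building the result DataFrame. A plausible motivation (an assumption, the diff does not state it) is that the ChromaDB client can return embeddings as a 2-D NumPy array, which pandas rejects as a single column, whereas a list of per-row vectors is accepted:

    import numpy as np
    import pandas as pd

    embeddings = np.array([[0.1, 0.2], [0.3, 0.4]])  # shape (n_rows, dim)

    # pd.DataFrame({"embeddings": embeddings}) raises: per-column arrays must be 1-dimensional
    df = pd.DataFrame({"embeddings": list(embeddings)})  # a list of 1-D vectors, one per row, works
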
mindsdb/integrations/libs/vectordatabase_handler.py
CHANGED

@@ -278,8 +278,16 @@ class VectorStoreHandler(BaseHandler):
         return self.do_upsert(table_name, df)
 
     def do_upsert(self, table_name, df):
-
+        """Upsert data into table, handling document updates and deletions.
 
+        Args:
+            table_name (str): Name of the table
+            df (pd.DataFrame): DataFrame containing the data to upsert
+
+        The function handles three cases:
+            1. New documents: Insert them
+            2. Updated documents: Delete old chunks and insert new ones
+        """
         id_col = TableField.ID.value
         content_col = TableField.CONTENT.value
 

mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py
CHANGED

@@ -18,7 +18,7 @@ log = logging.getLogger(__name__)
 
 
 class LLMReranker(BaseDocumentCompressor):
-    filtering_threshold: float = 0.
+    filtering_threshold: float = 0.0  # Default threshold for filtering
     model: str = DEFAULT_RERANKING_MODEL  # Model to use for reranking
     temperature: float = 0.0  # Temperature for the model
     openai_api_key: Optional[str] = None

mindsdb/interfaces/database/projects.py
CHANGED

@@ -69,6 +69,12 @@ class Project:
         self.id = record.id
 
     def delete(self):
+        if self.record.metadata_ and self.record.metadata_.get('is_default', False):
+            raise Exception(
+                f"Project '{self.name}' can not be deleted, because it is default project."
+                "The default project can be changed in the config file or by setting the environment variable MINDSDB_DEFAULT_PROJECT."
+            )
+
         tables = self.get_tables()
         tables = [key for key, val in tables.items() if val['type'] != 'table']
         if len(tables) > 0:

@@ -466,7 +472,7 @@ class ProjectController:
 
         if new_metadata is not None:
             project.metadata = new_metadata
-            project.record.
+            project.record.metadata_ = new_metadata
             flag_modified(project.record, 'metadata_')
 
         db.session.commit()

mindsdb/interfaces/knowledge_base/controller.py
CHANGED

@@ -3,7 +3,6 @@ import copy
 from typing import Dict, List, Optional
 
 import pandas as pd
-import hashlib
 import numpy as np
 
 from mindsdb_sql_parser.ast import (

@@ -155,19 +154,19 @@ class KnowledgeBaseTable:
         # extract values from conditions and prepare for vectordb
         conditions = []
         query_text = None
-
+        relevance_threshold = None
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:
-                if item.column == "
+                if item.column == "relevance_threshold" and item.op.value == "=":
                     try:
-
+                        relevance_threshold = float(item.value)
                         # Validate range: must be between 0 and 1
-                        if not (0 <=
-                            raise ValueError(f"
-                        logger.debug(f"Found
+                        if not (0 <= relevance_threshold <= 1):
+                            raise ValueError(f"relevance_threshold must be between 0 and 1, got: {relevance_threshold}")
+                        logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
                     except (ValueError, TypeError) as e:
-                        error_msg = f"Invalid
+                        error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
                         logger.error(error_msg)
                         raise ValueError(error_msg)
                 elif item.column == TableField.CONTENT.value:

@@ -185,6 +184,16 @@ class KnowledgeBaseTable:
         logger.debug(f"Extracted query text: {query_text}")
 
         self.addapt_conditions_columns(conditions)
+
+        # Set default limit if query is present
+        if query_text is not None:
+            limit = query.limit.value if query.limit is not None else None
+            if limit is None:
+                limit = 10
+            elif limit > 100:
+                limit = 100
+            query.limit = Constant(limit)
+
         df = db_handler.dispatch_select(query, conditions)
         df = self.addapt_result_columns(df)
 

@@ -192,14 +201,14 @@ class KnowledgeBaseTable:
         logger.debug(f"Columns in response: {df.columns.tolist()}")
         # Check if we have a rerank_model configured in KB params
 
-        df = self.add_relevance(df, query_text,
+        df = self.add_relevance(df, query_text, relevance_threshold)
 
         # filter by targets
         if requested_kb_columns is not None:
             df = df[requested_kb_columns]
         return df
 
-    def add_relevance(self, df, query_text,
+    def add_relevance(self, df, query_text, relevance_threshold=None):
         relevance_column = TableField.RELEVANCE.value
 
         reranking_model_params = self._kb.params.get("reranking_model")

@@ -208,9 +217,9 @@ class KnowledgeBaseTable:
             try:
                 logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
                 # Apply custom filtering threshold if provided
-                if
-                    reranking_model_params["filtering_threshold"] =
-                    logger.info(f"Using custom filtering threshold: {
+                if relevance_threshold is not None:
+                    reranking_model_params["filtering_threshold"] = relevance_threshold
+                    logger.info(f"Using custom filtering threshold: {relevance_threshold}")
 
                 reranker = get_reranking_model_from_params(reranking_model_params)
                 # Get documents to rerank

@@ -236,8 +245,8 @@ class KnowledgeBaseTable:
             # Calculate relevance from distance
             logger.info("Calculating relevance from vector distance")
             df[relevance_column] = 1 / (1 + df['distance'])
-            if
-                df = df[df[relevance_column] >
+            if relevance_threshold is not None:
+                df = df[df[relevance_column] > relevance_threshold]
 
         else:
             df[relevance_column] = None

@@ -333,12 +342,21 @@ class KnowledgeBaseTable:
 
         emb_col = TableField.EMBEDDINGS.value
         cont_col = TableField.CONTENT.value
+
+        db_handler = self.get_vector_db()
+        conditions = db_handler.extract_conditions(query.where)
+        doc_id = None
+        for condition in conditions:
+            if condition.column == 'chunk_id' and condition.op == FilterOperator.EQUAL:
+                doc_id = condition.value
+
         if cont_col in query.update_columns:
             content = query.update_columns[cont_col]
 
             # Apply preprocessing to content if configured
             if self.document_preprocessor:
                 doc = Document(
+                    id=doc_id,
                     content=content.value,
                     metadata={}  # Empty metadata for content-only updates
                 )

@@ -354,8 +372,6 @@ class KnowledgeBaseTable:
         query.table = Identifier(parts=[self._kb.vector_database_table])
 
         # send to vectordb
-        db_handler = self.get_vector_db()
-        conditions = db_handler.extract_conditions(query.where)
         self.addapt_conditions_columns(conditions)
         db_handler.dispatch_update(query, conditions)
 

@@ -409,7 +425,11 @@ class KnowledgeBaseTable:
         db_handler.delete(self._kb.vector_database_table)
 
     def insert(self, df: pd.DataFrame):
-        """Insert dataframe to KB table.
+        """Insert dataframe to KB table.
+
+        Args:
+            df: DataFrame to insert
+        """
         if df.empty:
             return
 

@@ -754,22 +774,9 @@ class KnowledgeBaseTable:
             return {}
 
     def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
-        """
-
-
-
-        Args:
-            content: The content string
-            content_column: Name of the content column
-            provided_id: Optional user-provided ID
-        Returns:
-            Deterministic document ID
-        """
-        if provided_id is not None:
-            return f"{provided_id}_{content_column}"
-
-        id_string = f"content={content}_column={content_column}"
-        return hashlib.sha256(id_string.encode()).hexdigest()
+        """Generate a deterministic document ID using the utility function."""
+        from mindsdb.interfaces.knowledge_base.utils import generate_document_id
+        return generate_document_id(content, content_column, provided_id)
 
     def _convert_metadata_value(self, value):
         """

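Besides the relevance_threshold plumbing, the select path above now applies a default and a cap to the row limit of semantic queries. Restated as a standalone helper for clarity (an illustrative rewrite of the logic added in the hunk above, not a function that exists in MindsDB):

    def effective_limit(requested_limit):
        """Default to 10 rows when the query has no LIMIT, cap explicit limits at 100."""
        if requested_limit is None:
            return 10
        return min(requested_limit, 100)

    assert effective_limit(None) == 10
    assert effective_limit(500) == 100
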
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py
CHANGED

@@ -1,7 +1,6 @@
 from typing import List, Dict, Optional, Any
 import pandas as pd
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-import hashlib
 import asyncio
 
 

@@ -43,7 +42,11 @@ class DocumentPreprocessor:
         self.splitter = None  # Will be set by child classes
 
     def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
-        """Base implementation - should be overridden by child classes
+        """Base implementation - should be overridden by child classes
+
+        Args:
+            documents: List of documents to process
+        """
         raise NotImplementedError("Subclasses must implement process_documents")
 
     def _split_document(self, doc: Document) -> List[Document]:

@@ -80,27 +83,22 @@ class DocumentPreprocessor:
             metadata=data.get("metadata", {}),
         )
 
-    def _generate_deterministic_id(
-        self, content: str, content_column: str = None, provided_id: str = None
-    ) -> str:
-        """Generate a deterministic ID based on content and column"""
-        if provided_id is not None:
-            return f"{provided_id}_{content_column}"
-
-        id_string = f"content={content}_column={content_column}"
-        return hashlib.sha256(id_string.encode()).hexdigest()
-
     def _generate_chunk_id(
         self,
         chunk_index: Optional[int] = None,
+        total_chunks: Optional[int] = None,
+        start_char: Optional[int] = None,
+        end_char: Optional[int] = None,
        provided_id: str = None,
     ) -> str:
-        """Generate deterministic ID for a chunk
-
-
-
-
-
+        """Generate human-readable deterministic ID for a chunk
+        Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
+        """
+        if provided_id is None:
+            raise ValueError("Document ID must be provided for chunk ID generation")
+
+        chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
+        logger.debug(f"Generated chunk ID: {chunk_id}")
         return chunk_id
 
     def _prepare_chunk_metadata(

@@ -207,14 +205,10 @@ Please give a short succinct context to situate this chunk within the overall do
         processed_chunks = []
 
         for doc_index, doc in enumerate(documents):
-            # Get content_column from metadata if available
-            content_column = (
-                doc.metadata.get("content_column") if doc.metadata else None
-            )
 
-            #
+            # Document ID must be provided by this point
             if doc.id is None:
-
+                raise ValueError("Document ID must be provided before preprocessing")
 
             # Skip empty or whitespace-only content
             if not doc.content or not doc.content.strip():

@@ -298,68 +292,55 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
         processed_chunks = []
 
         for doc in documents:
-            # Get content_column from metadata if available
-            content_column = (
-                doc.metadata.get("content_column") if doc.metadata else None
-            )
 
-            #
+            # Document ID must be provided by this point
             if doc.id is None:
-
+                raise ValueError("Document ID must be provided before preprocessing")
 
             # Skip empty or whitespace-only content
             if not doc.content or not doc.content.strip():
                 continue
 
             chunk_docs = self._split_document(doc)
+            total_chunks = len(chunk_docs)
 
-            #
-
-
+            # Track character positions
+            current_pos = 0
+            for i, chunk_doc in enumerate(chunk_docs):
                 if not chunk_doc.content or not chunk_doc.content.strip():
                     continue
 
+                # Calculate chunk positions
+                start_char = current_pos
+                end_char = start_char + len(chunk_doc.content)
+                current_pos = end_char + 1  # +1 for separator
+
                 # Initialize metadata
                 metadata = {}
                 if doc.metadata:
                     metadata.update(doc.metadata)
 
-                #
-
-
+                # Add position metadata
+                metadata["start_char"] = start_char
+                metadata["end_char"] = end_char
+
+                # Generate chunk ID with total chunks
+                chunk_id = self._generate_chunk_id(
+                    chunk_index=i,
+                    total_chunks=total_chunks,
+                    start_char=start_char,
+                    end_char=end_char,
+                    provided_id=doc.id
                 )
+
                 processed_chunks.append(
                     ProcessedChunk(
-                        id=
+                        id=chunk_id,
                         content=chunk_doc.content,
                         embeddings=doc.embeddings,
-                        metadata=self._prepare_chunk_metadata(doc.id,
+                        metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
                     )
                 )
-            else:
-                # Multiple chunks case
-                for i, chunk_doc in enumerate(chunk_docs):
-                    if not chunk_doc.content or not chunk_doc.content.strip():
-                        continue
-
-                    # Initialize metadata
-                    metadata = {}
-                    if doc.metadata:
-                        metadata.update(doc.metadata)
-
-                    # Pass through doc.id and content_column
-                    chunk_id = self._generate_chunk_id(
-                        chunk_index=i,
-                        provided_id=doc.id,
-                    )
-                    processed_chunks.append(
-                        ProcessedChunk(
-                            id=chunk_id,
-                            content=chunk_doc.content,
-                            embeddings=doc.embeddings,
-                            metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
-                        )
-                    )
 
         return processed_chunks
 

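For reference, the chunk IDs produced by the new _generate_chunk_id follow the format <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>. A worked example with illustrative values:

    # Document "doc-7_content" split into 3 chunks; the second chunk covers characters 500-999
    provided_id, chunk_index, total_chunks, start_char, end_char = "doc-7_content", 1, 3, 500, 999
    chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
    assert chunk_id == "doc-7_content:2of3:500to999"
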
mindsdb/interfaces/knowledge_base/utils.py
ADDED

@@ -0,0 +1,28 @@
+"""Utilities for knowledge base operations."""
+import hashlib
+
+
+def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
+    """
+    Generate a deterministic document ID from content and column name.
+    If provided_id exists, combines it with content_column.
+    For generated IDs, uses a short hash of just the content to ensure
+    same content gets same base ID across different columns.
+
+    Args:
+        content: The content string
+        content_column: Name of the content column
+        provided_id: Optional user-provided ID
+    Returns:
+        Deterministic document ID in format: <base_id>_<column>
+        where base_id is either the provided_id or a 16-char hash of content
+    """
+    if provided_id is not None:
+        base_id = provided_id
+    else:
+        # Generate a shorter 16-character hash based only on content
+        hash_obj = hashlib.md5(content.encode())
+        base_id = hash_obj.hexdigest()[:16]
+
+    # Append column name to maintain uniqueness across columns
+    return f"{base_id}_{content_column}"

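A quick illustration of the ID shapes this helper produces (the hash shown is the 16-character md5 prefix for the literal string "hello world"):

    from mindsdb.interfaces.knowledge_base.utils import generate_document_id

    generate_document_id("hello world", "content", provided_id="doc-7")  # -> "doc-7_content"
    generate_document_id("hello world", "content")                       # -> "5eb63bbbe01eeed0_content"
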
mindsdb/utilities/auth.py
CHANGED
@@ -15,9 +15,11 @@ def get_aws_meta_data() -> dict:
         'ami-id': None,
         'instance-id': None
     }
+    aws_token = requests.put("http://169.254.169.254/latest/api/token", headers={'X-aws-ec2-metadata-token-ttl-seconds': '30'}).text
     for key in aws_meta_data.keys():
         resp = requests.get(
             f'http://169.254.169.254/latest/meta-data/{key}',
+            headers={'X-aws-ec2-metadata-token': aws_token},
             timeout=1
         )
         if resp.status_code != 200:

@@ -35,7 +37,9 @@ def register_oauth_client():
     aws_meta_data = get_aws_meta_data()
 
     current_aws_meta_data = config.get('aws_meta_data', {})
-    oauth_meta = config.get('auth', {}).get('oauth'
+    oauth_meta = config.get('auth', {}).get('oauth')
+    if oauth_meta is None:
+        return
 
     public_hostname = aws_meta_data['public-hostname']
     if (

mindsdb/utilities/cache.py
CHANGED
@@ -56,6 +56,7 @@ import os
 import time
 from abc import ABC
 from pathlib import Path
+import re
 import hashlib
 import typing as t
 

@@ -154,7 +155,9 @@ class FileCache(BaseCache):
         pass
 
     def file_path(self, name):
-
+        # Sanitize the key to avoid table (file) names with backticks and slashes.
+        sanitized_name = re.sub(r'[^\w\-.]', '_', name)
+        return self.path / sanitized_name
 
     def set_df(self, name, df):
         path = self.file_path(name)

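The sanitization keeps word characters, dashes, and dots and replaces everything else with underscores, so cache keys derived from table names can no longer introduce backticks or path separators into file names. For example:

    import re

    re.sub(r'[^\w\-.]', '_', "my`table/name")  # -> 'my_table_name'
    re.sub(r'[^\w\-.]', '_', "db.tbl-v1")      # -> 'db.tbl-v1' (unchanged)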