MindsDB 25.4.2.0__py3-none-any.whl → 25.4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +30 -7
- mindsdb/api/executor/command_executor.py +29 -0
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +3 -2
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_plan.py +1 -0
- mindsdb/api/executor/planner/query_planner.py +86 -14
- mindsdb/api/executor/planner/steps.py +9 -1
- mindsdb/api/executor/sql_query/sql_query.py +37 -6
- mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +231 -0
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -1
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +7 -11
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +28 -4
- mindsdb/integrations/libs/llm/config.py +11 -1
- mindsdb/integrations/libs/llm/utils.py +12 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +9 -1
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +1 -1
- mindsdb/interfaces/agents/constants.py +12 -1
- mindsdb/interfaces/agents/langchain_agent.py +6 -0
- mindsdb/interfaces/database/projects.py +7 -1
- mindsdb/interfaces/knowledge_base/controller.py +166 -74
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +43 -62
- mindsdb/interfaces/knowledge_base/utils.py +28 -0
- mindsdb/interfaces/query_context/context_controller.py +221 -0
- mindsdb/interfaces/storage/db.py +23 -0
- mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
- mindsdb/utilities/auth.py +5 -1
- mindsdb/utilities/cache.py +4 -1
- mindsdb/utilities/context_executor.py +1 -1
- mindsdb/utilities/partitioning.py +35 -20
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/METADATA +221 -219
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/RECORD +39 -36
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,6 @@ import copy
|
|
|
3
3
|
from typing import Dict, List, Optional
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
|
-
import hashlib
|
|
7
6
|
import numpy as np
|
|
8
7
|
|
|
9
8
|
from mindsdb_sql_parser.ast import (
|
|
@@ -27,6 +26,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
|
|
|
27
26
|
)
|
|
28
27
|
from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
|
|
29
28
|
from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
|
|
29
|
+
from mindsdb.integrations.utilities.handler_utils import get_api_key
|
|
30
|
+
from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args, row_to_document
|
|
30
31
|
|
|
31
32
|
from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
|
|
32
33
|
from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
|
|
@@ -36,6 +37,7 @@ from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor impor
|
|
|
36
37
|
from mindsdb.interfaces.model.functions import PredictorRecordNotFound
|
|
37
38
|
from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
|
|
38
39
|
from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
|
|
40
|
+
from mindsdb.utilities.context import context as ctx
|
|
39
41
|
|
|
40
42
|
from mindsdb.api.executor.command_executor import ExecuteCommands
|
|
41
43
|
from mindsdb.utilities import log
|
|
@@ -50,6 +52,42 @@ KB_TO_VECTORDB_COLUMNS = {
|
|
|
50
52
|
}
|
|
51
53
|
|
|
52
54
|
|
|
55
|
+
def get_embedding_model_from_params(embedding_model_params: dict):
|
|
56
|
+
"""
|
|
57
|
+
Create embedding model from parameters.
|
|
58
|
+
"""
|
|
59
|
+
params_copy = copy.deepcopy(embedding_model_params)
|
|
60
|
+
provider = params_copy.pop('provider', None).lower()
|
|
61
|
+
api_key = get_api_key(provider, params_copy, strict=False) or params_copy.get('api_key')
|
|
62
|
+
# Underscores are replaced because the provider name ultimately gets mapped to a class name.
|
|
63
|
+
# This is mostly to support Azure OpenAI (azure_openai); the mapped class name is 'AzureOpenAIEmbeddings'.
|
|
64
|
+
params_copy['class'] = provider.replace('_', '')
|
|
65
|
+
if provider == 'azure_openai':
|
|
66
|
+
# Azure OpenAI expects the api_key to be passed as 'openai_api_key'.
|
|
67
|
+
params_copy['openai_api_key'] = api_key
|
|
68
|
+
else:
|
|
69
|
+
params_copy[f"{provider}_api_key"] = api_key
|
|
70
|
+
params_copy.pop('api_key', None)
|
|
71
|
+
params_copy['model'] = params_copy.pop('model_name', None)
|
|
72
|
+
|
|
73
|
+
return construct_model_from_args(params_copy)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_reranking_model_from_params(reranking_model_params: dict):
|
|
77
|
+
"""
|
|
78
|
+
Create reranking model from parameters.
|
|
79
|
+
"""
|
|
80
|
+
params_copy = copy.deepcopy(reranking_model_params)
|
|
81
|
+
provider = params_copy.pop('provider', "openai").lower()
|
|
82
|
+
if provider != 'openai':
|
|
83
|
+
raise ValueError("Only OpenAI provider is supported for the reranking model.")
|
|
84
|
+
params_copy[f"{provider}_api_key"] = get_api_key(provider, params_copy, strict=False) or params_copy.get('api_key')
|
|
85
|
+
params_copy.pop('api_key', None)
|
|
86
|
+
params_copy['model'] = params_copy.pop('model_name', None)
|
|
87
|
+
|
|
88
|
+
return LLMReranker(**params_copy)
|
|
89
|
+
|
|
90
|
+
|
|
53
91
|
class KnowledgeBaseTable:
|
|
54
92
|
"""
|
|
55
93
|
Knowledge base table interface
|
|
@@ -116,19 +154,19 @@ class KnowledgeBaseTable:
|
|
|
116
154
|
# extract values from conditions and prepare for vectordb
|
|
117
155
|
conditions = []
|
|
118
156
|
query_text = None
|
|
119
|
-
|
|
157
|
+
relevance_threshold = None
|
|
120
158
|
query_conditions = db_handler.extract_conditions(query.where)
|
|
121
159
|
if query_conditions is not None:
|
|
122
160
|
for item in query_conditions:
|
|
123
|
-
if item.column == "
|
|
161
|
+
if item.column == "relevance_threshold" and item.op.value == "=":
|
|
124
162
|
try:
|
|
125
|
-
|
|
163
|
+
relevance_threshold = float(item.value)
|
|
126
164
|
# Validate range: must be between 0 and 1
|
|
127
|
-
if not (0 <=
|
|
128
|
-
raise ValueError(f"
|
|
129
|
-
logger.debug(f"Found
|
|
165
|
+
if not (0 <= relevance_threshold <= 1):
|
|
166
|
+
raise ValueError(f"relevance_threshold must be between 0 and 1, got: {relevance_threshold}")
|
|
167
|
+
logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
|
|
130
168
|
except (ValueError, TypeError) as e:
|
|
131
|
-
error_msg = f"Invalid
|
|
169
|
+
error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
|
|
132
170
|
logger.error(error_msg)
|
|
133
171
|
raise ValueError(error_msg)
|
|
134
172
|
elif item.column == TableField.CONTENT.value:
|
|
@@ -146,6 +184,16 @@ class KnowledgeBaseTable:
|
|
|
146
184
|
logger.debug(f"Extracted query text: {query_text}")
|
|
147
185
|
|
|
148
186
|
self.addapt_conditions_columns(conditions)
|
|
187
|
+
|
|
188
|
+
# Set default limit if query is present
|
|
189
|
+
if query_text is not None:
|
|
190
|
+
limit = query.limit.value if query.limit is not None else None
|
|
191
|
+
if limit is None:
|
|
192
|
+
limit = 10
|
|
193
|
+
elif limit > 100:
|
|
194
|
+
limit = 100
|
|
195
|
+
query.limit = Constant(limit)
|
|
196
|
+
|
|
149
197
|
df = db_handler.dispatch_select(query, conditions)
|
|
150
198
|
df = self.addapt_result_columns(df)
|
|
151
199
|
|
|
@@ -153,28 +201,27 @@ class KnowledgeBaseTable:
|
|
|
153
201
|
logger.debug(f"Columns in response: {df.columns.tolist()}")
|
|
154
202
|
# Check if we have a rerank_model configured in KB params
|
|
155
203
|
|
|
156
|
-
df = self.add_relevance(df, query_text,
|
|
204
|
+
df = self.add_relevance(df, query_text, relevance_threshold)
|
|
157
205
|
|
|
158
206
|
# filter by targets
|
|
159
207
|
if requested_kb_columns is not None:
|
|
160
208
|
df = df[requested_kb_columns]
|
|
161
209
|
return df
|
|
162
210
|
|
|
163
|
-
def add_relevance(self, df, query_text,
|
|
211
|
+
def add_relevance(self, df, query_text, relevance_threshold=None):
|
|
164
212
|
relevance_column = TableField.RELEVANCE.value
|
|
165
213
|
|
|
166
|
-
|
|
167
|
-
if
|
|
214
|
+
reranking_model_params = self._kb.params.get("reranking_model")
|
|
215
|
+
if reranking_model_params and query_text and len(df) > 0:
|
|
168
216
|
# Use reranker for relevance score
|
|
169
217
|
try:
|
|
170
|
-
logger.info(f"Using
|
|
171
|
-
reranker_params = {"model": rerank_model}
|
|
218
|
+
logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
|
|
172
219
|
# Apply custom filtering threshold if provided
|
|
173
|
-
if
|
|
174
|
-
|
|
175
|
-
logger.info(f"Using custom filtering threshold: {
|
|
220
|
+
if relevance_threshold is not None:
|
|
221
|
+
reranking_model_params["filtering_threshold"] = relevance_threshold
|
|
222
|
+
logger.info(f"Using custom filtering threshold: {relevance_threshold}")
|
|
176
223
|
|
|
177
|
-
reranker =
|
|
224
|
+
reranker = get_reranking_model_from_params(reranking_model_params)
|
|
178
225
|
# Get documents to rerank
|
|
179
226
|
documents = df['chunk_content'].tolist()
|
|
180
227
|
# Use the get_scores method with disable_events=True
|
|
@@ -185,7 +232,7 @@ class KnowledgeBaseTable:
|
|
|
185
232
|
# Filter by threshold
|
|
186
233
|
scores_array = np.array(scores)
|
|
187
234
|
df = df[scores_array > reranker.filtering_threshold]
|
|
188
|
-
logger.debug(f"Applied reranking with
|
|
235
|
+
logger.debug(f"Applied reranking with params: {reranking_model_params}")
|
|
189
236
|
except Exception as e:
|
|
190
237
|
logger.error(f"Error during reranking: {str(e)}")
|
|
191
238
|
# Fallback to distance-based relevance
|
|
@@ -198,6 +245,8 @@ class KnowledgeBaseTable:
|
|
|
198
245
|
# Calculate relevance from distance
|
|
199
246
|
logger.info("Calculating relevance from vector distance")
|
|
200
247
|
df[relevance_column] = 1 / (1 + df['distance'])
|
|
248
|
+
if relevance_threshold is not None:
|
|
249
|
+
df = df[df[relevance_column] > relevance_threshold]
|
|
201
250
|
|
|
202
251
|
else:
|
|
203
252
|
df[relevance_column] = None
|
|
@@ -293,12 +342,21 @@ class KnowledgeBaseTable:
|
|
|
293
342
|
|
|
294
343
|
emb_col = TableField.EMBEDDINGS.value
|
|
295
344
|
cont_col = TableField.CONTENT.value
|
|
345
|
+
|
|
346
|
+
db_handler = self.get_vector_db()
|
|
347
|
+
conditions = db_handler.extract_conditions(query.where)
|
|
348
|
+
doc_id = None
|
|
349
|
+
for condition in conditions:
|
|
350
|
+
if condition.column == 'chunk_id' and condition.op == FilterOperator.EQUAL:
|
|
351
|
+
doc_id = condition.value
|
|
352
|
+
|
|
296
353
|
if cont_col in query.update_columns:
|
|
297
354
|
content = query.update_columns[cont_col]
|
|
298
355
|
|
|
299
356
|
# Apply preprocessing to content if configured
|
|
300
357
|
if self.document_preprocessor:
|
|
301
358
|
doc = Document(
|
|
359
|
+
id=doc_id,
|
|
302
360
|
content=content.value,
|
|
303
361
|
metadata={} # Empty metadata for content-only updates
|
|
304
362
|
)
|
|
@@ -314,8 +372,6 @@ class KnowledgeBaseTable:
|
|
|
314
372
|
query.table = Identifier(parts=[self._kb.vector_database_table])
|
|
315
373
|
|
|
316
374
|
# send to vectordb
|
|
317
|
-
db_handler = self.get_vector_db()
|
|
318
|
-
conditions = db_handler.extract_conditions(query.where)
|
|
319
375
|
self.addapt_conditions_columns(conditions)
|
|
320
376
|
db_handler.dispatch_update(query, conditions)
|
|
321
377
|
|
|
@@ -369,10 +425,24 @@ class KnowledgeBaseTable:
|
|
|
369
425
|
db_handler.delete(self._kb.vector_database_table)
|
|
370
426
|
|
|
371
427
|
def insert(self, df: pd.DataFrame):
|
|
372
|
-
"""Insert dataframe to KB table.
|
|
428
|
+
"""Insert dataframe to KB table.
|
|
429
|
+
|
|
430
|
+
Args:
|
|
431
|
+
df: DataFrame to insert
|
|
432
|
+
"""
|
|
373
433
|
if df.empty:
|
|
374
434
|
return
|
|
375
435
|
|
|
436
|
+
try:
|
|
437
|
+
run_query_id = ctx.run_query_id
|
|
438
|
+
# Link current KB to running query (where KB is used to insert data)
|
|
439
|
+
if run_query_id is not None:
|
|
440
|
+
self._kb.query_id = run_query_id
|
|
441
|
+
db.session.commit()
|
|
442
|
+
|
|
443
|
+
except AttributeError:
|
|
444
|
+
...
|
|
445
|
+
|
|
376
446
|
# First adapt column names to identify content and metadata columns
|
|
377
447
|
adapted_df = self._adapt_column_names(df)
|
|
378
448
|
content_columns = self._kb.params.get('content_columns', [TableField.CONTENT.value])
|
|
@@ -577,36 +647,48 @@ class KnowledgeBaseTable:
|
|
|
577
647
|
if df.empty:
|
|
578
648
|
return pd.DataFrame([], columns=[TableField.EMBEDDINGS.value])
|
|
579
649
|
|
|
650
|
+
# keep only content
|
|
651
|
+
df = df[[TableField.CONTENT.value]]
|
|
652
|
+
|
|
580
653
|
model_id = self._kb.embedding_model_id
|
|
581
|
-
|
|
582
|
-
|
|
654
|
+
if model_id:
|
|
655
|
+
# get the input columns
|
|
656
|
+
model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
|
|
583
657
|
|
|
584
|
-
|
|
585
|
-
|
|
658
|
+
assert model_rec is not None, f"Model not found: {model_id}"
|
|
659
|
+
model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
|
|
586
660
|
|
|
587
|
-
|
|
661
|
+
project_datanode = self.session.datahub.get(model_project.name)
|
|
588
662
|
|
|
589
|
-
|
|
590
|
-
|
|
663
|
+
model_using = model_rec.learn_args.get('using', {})
|
|
664
|
+
input_col = model_using.get('question_column')
|
|
665
|
+
if input_col is None:
|
|
666
|
+
input_col = model_using.get('input_column')
|
|
667
|
+
|
|
668
|
+
if input_col is not None and input_col != TableField.CONTENT.value:
|
|
669
|
+
df = df.rename(columns={TableField.CONTENT.value: input_col})
|
|
591
670
|
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
671
|
+
df_out = project_datanode.predict(
|
|
672
|
+
model_name=model_rec.name,
|
|
673
|
+
df=df,
|
|
674
|
+
params=self.model_params
|
|
675
|
+
)
|
|
596
676
|
|
|
597
|
-
|
|
598
|
-
|
|
677
|
+
target = model_rec.to_predict[0]
|
|
678
|
+
if target != TableField.EMBEDDINGS.value:
|
|
679
|
+
# adapt output for vectordb
|
|
680
|
+
df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
|
|
599
681
|
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
682
|
+
elif self._kb.params.get('embedding_model'):
|
|
683
|
+
embedding_model = get_embedding_model_from_params(self._kb.params.get('embedding_model'))
|
|
684
|
+
|
|
685
|
+
df_texts = df.apply(row_to_document, axis=1)
|
|
686
|
+
embeddings = embedding_model.embed_documents(df_texts.tolist())
|
|
687
|
+
df_out = df.copy().assign(**{TableField.EMBEDDINGS.value: embeddings})
|
|
688
|
+
|
|
689
|
+
else:
|
|
690
|
+
raise ValueError("No embedding model found for the knowledge base.")
|
|
605
691
|
|
|
606
|
-
target = model_rec.to_predict[0]
|
|
607
|
-
if target != TableField.EMBEDDINGS.value:
|
|
608
|
-
# adapt output for vectordb
|
|
609
|
-
df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
|
|
610
692
|
df_out = df_out[[TableField.EMBEDDINGS.value]]
|
|
611
693
|
|
|
612
694
|
return df_out
|
|
@@ -640,9 +722,11 @@ class KnowledgeBaseTable:
|
|
|
640
722
|
# Extract embedding model args from knowledge base table
|
|
641
723
|
embedding_args = self._kb.embedding_model.learn_args.get('using', {})
|
|
642
724
|
# Construct the embedding model directly
|
|
643
|
-
from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args
|
|
644
725
|
embeddings_model = construct_model_from_args(embedding_args)
|
|
645
726
|
logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
|
|
727
|
+
elif self._kb.params.get('embedding_model'):
|
|
728
|
+
embeddings_model = get_embedding_model_from_params(self._kb.params['embedding_model'])
|
|
729
|
+
logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
|
|
646
730
|
else:
|
|
647
731
|
embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
|
|
648
732
|
logger.debug("Using default embedding model as knowledge base has no embedding model")
|
|
@@ -690,22 +774,9 @@ class KnowledgeBaseTable:
|
|
|
690
774
|
return {}
|
|
691
775
|
|
|
692
776
|
def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
|
|
693
|
-
"""
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
Args:
|
|
698
|
-
content: The content string
|
|
699
|
-
content_column: Name of the content column
|
|
700
|
-
provided_id: Optional user-provided ID
|
|
701
|
-
Returns:
|
|
702
|
-
Deterministic document ID
|
|
703
|
-
"""
|
|
704
|
-
if provided_id is not None:
|
|
705
|
-
return f"{provided_id}_{content_column}"
|
|
706
|
-
|
|
707
|
-
id_string = f"content={content}_column={content_column}"
|
|
708
|
-
return hashlib.sha256(id_string.encode()).hexdigest()
|
|
777
|
+
"""Generate a deterministic document ID using the utility function."""
|
|
778
|
+
from mindsdb.interfaces.knowledge_base.utils import generate_document_id
|
|
779
|
+
return generate_document_id(content, content_column, provided_id)
|
|
709
780
|
|
|
710
781
|
def _convert_metadata_value(self, value):
|
|
711
782
|
"""
|
|
@@ -788,26 +859,46 @@ class KnowledgeBaseController:
|
|
|
788
859
|
return kb
|
|
789
860
|
raise EntityExistsError("Knowledge base already exists", name)
|
|
790
861
|
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
else:
|
|
796
|
-
# get embedding model from input
|
|
862
|
+
embedding_model_params = params.get('embedding_model', None)
|
|
863
|
+
reranking_model_params = params.get('reranking_model', None)
|
|
864
|
+
|
|
865
|
+
if embedding_model:
|
|
797
866
|
model_name = embedding_model.parts[-1]
|
|
798
867
|
|
|
868
|
+
elif embedding_model_params:
|
|
869
|
+
# Get embedding model from params.
|
|
870
|
+
# This is called here to check validaity of the parameters.
|
|
871
|
+
get_embedding_model_from_params(
|
|
872
|
+
embedding_model_params
|
|
873
|
+
)
|
|
874
|
+
|
|
875
|
+
else:
|
|
876
|
+
model_name = self._get_default_embedding_model(
|
|
877
|
+
project.name,
|
|
878
|
+
params=params
|
|
879
|
+
)
|
|
880
|
+
params['default_embedding_model'] = model_name
|
|
881
|
+
|
|
882
|
+
model_project = None
|
|
799
883
|
if embedding_model is not None and len(embedding_model.parts) > 1:
|
|
800
884
|
# model project is set
|
|
801
885
|
model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
|
|
802
|
-
|
|
886
|
+
elif not embedding_model_params:
|
|
803
887
|
model_project = project
|
|
804
888
|
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
889
|
+
embedding_model_id = None
|
|
890
|
+
if model_project:
|
|
891
|
+
model = self.session.model_controller.get_model(
|
|
892
|
+
name=model_name,
|
|
893
|
+
project_name=model_project.name
|
|
894
|
+
)
|
|
895
|
+
model_record = db.Predictor.query.get(model['id'])
|
|
896
|
+
embedding_model_id = model_record.id
|
|
897
|
+
|
|
898
|
+
if reranking_model_params:
|
|
899
|
+
# Get reranking model from params.
|
|
900
|
+
# This is called here to check validaity of the parameters.
|
|
901
|
+
get_reranking_model_from_params(reranking_model_params)
|
|
811
902
|
|
|
812
903
|
# search for the vector database table
|
|
813
904
|
if storage is None:
|
|
@@ -1029,6 +1120,7 @@ class KnowledgeBaseController:
|
|
|
1029
1120
|
'embedding_model': embedding_model.name if embedding_model is not None else None,
|
|
1030
1121
|
'vector_database': None if vector_database is None else vector_database.name,
|
|
1031
1122
|
'vector_database_table': record.vector_database_table,
|
|
1123
|
+
'query_id': record.query_id,
|
|
1032
1124
|
'params': record.params
|
|
1033
1125
|
})
|
|
1034
1126
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from typing import List, Dict, Optional, Any
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
4
|
-
import hashlib
|
|
5
4
|
import asyncio
|
|
6
5
|
|
|
7
6
|
|
|
@@ -43,7 +42,11 @@ class DocumentPreprocessor:
|
|
|
43
42
|
self.splitter = None # Will be set by child classes
|
|
44
43
|
|
|
45
44
|
def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
|
|
46
|
-
"""Base implementation - should be overridden by child classes
|
|
45
|
+
"""Base implementation - should be overridden by child classes
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
documents: List of documents to process
|
|
49
|
+
"""
|
|
47
50
|
raise NotImplementedError("Subclasses must implement process_documents")
|
|
48
51
|
|
|
49
52
|
def _split_document(self, doc: Document) -> List[Document]:
|
|
@@ -80,27 +83,22 @@ class DocumentPreprocessor:
|
|
|
80
83
|
metadata=data.get("metadata", {}),
|
|
81
84
|
)
|
|
82
85
|
|
|
83
|
-
def _generate_deterministic_id(
|
|
84
|
-
self, content: str, content_column: str = None, provided_id: str = None
|
|
85
|
-
) -> str:
|
|
86
|
-
"""Generate a deterministic ID based on content and column"""
|
|
87
|
-
if provided_id is not None:
|
|
88
|
-
return f"{provided_id}_{content_column}"
|
|
89
|
-
|
|
90
|
-
id_string = f"content={content}_column={content_column}"
|
|
91
|
-
return hashlib.sha256(id_string.encode()).hexdigest()
|
|
92
|
-
|
|
93
86
|
def _generate_chunk_id(
|
|
94
87
|
self,
|
|
95
88
|
chunk_index: Optional[int] = None,
|
|
89
|
+
total_chunks: Optional[int] = None,
|
|
90
|
+
start_char: Optional[int] = None,
|
|
91
|
+
end_char: Optional[int] = None,
|
|
96
92
|
provided_id: str = None,
|
|
97
93
|
) -> str:
|
|
98
|
-
"""Generate deterministic ID for a chunk
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
94
|
+
"""Generate human-readable deterministic ID for a chunk
|
|
95
|
+
Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
|
|
96
|
+
"""
|
|
97
|
+
if provided_id is None:
|
|
98
|
+
raise ValueError("Document ID must be provided for chunk ID generation")
|
|
99
|
+
|
|
100
|
+
chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
|
|
101
|
+
logger.debug(f"Generated chunk ID: {chunk_id}")
|
|
104
102
|
return chunk_id
|
|
105
103
|
|
|
106
104
|
def _prepare_chunk_metadata(
|
|
@@ -207,14 +205,10 @@ Please give a short succinct context to situate this chunk within the overall do
|
|
|
207
205
|
processed_chunks = []
|
|
208
206
|
|
|
209
207
|
for doc_index, doc in enumerate(documents):
|
|
210
|
-
# Get content_column from metadata if available
|
|
211
|
-
content_column = (
|
|
212
|
-
doc.metadata.get("content_column") if doc.metadata else None
|
|
213
|
-
)
|
|
214
208
|
|
|
215
|
-
#
|
|
209
|
+
# Document ID must be provided by this point
|
|
216
210
|
if doc.id is None:
|
|
217
|
-
|
|
211
|
+
raise ValueError("Document ID must be provided before preprocessing")
|
|
218
212
|
|
|
219
213
|
# Skip empty or whitespace-only content
|
|
220
214
|
if not doc.content or not doc.content.strip():
|
|
@@ -298,68 +292,55 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
|
|
|
298
292
|
processed_chunks = []
|
|
299
293
|
|
|
300
294
|
for doc in documents:
|
|
301
|
-
# Get content_column from metadata if available
|
|
302
|
-
content_column = (
|
|
303
|
-
doc.metadata.get("content_column") if doc.metadata else None
|
|
304
|
-
)
|
|
305
295
|
|
|
306
|
-
#
|
|
296
|
+
# Document ID must be provided by this point
|
|
307
297
|
if doc.id is None:
|
|
308
|
-
|
|
298
|
+
raise ValueError("Document ID must be provided before preprocessing")
|
|
309
299
|
|
|
310
300
|
# Skip empty or whitespace-only content
|
|
311
301
|
if not doc.content or not doc.content.strip():
|
|
312
302
|
continue
|
|
313
303
|
|
|
314
304
|
chunk_docs = self._split_document(doc)
|
|
305
|
+
total_chunks = len(chunk_docs)
|
|
315
306
|
|
|
316
|
-
#
|
|
317
|
-
|
|
318
|
-
|
|
307
|
+
# Track character positions
|
|
308
|
+
current_pos = 0
|
|
309
|
+
for i, chunk_doc in enumerate(chunk_docs):
|
|
319
310
|
if not chunk_doc.content or not chunk_doc.content.strip():
|
|
320
311
|
continue
|
|
321
312
|
|
|
313
|
+
# Calculate chunk positions
|
|
314
|
+
start_char = current_pos
|
|
315
|
+
end_char = start_char + len(chunk_doc.content)
|
|
316
|
+
current_pos = end_char + 1 # +1 for separator
|
|
317
|
+
|
|
322
318
|
# Initialize metadata
|
|
323
319
|
metadata = {}
|
|
324
320
|
if doc.metadata:
|
|
325
321
|
metadata.update(doc.metadata)
|
|
326
322
|
|
|
327
|
-
#
|
|
328
|
-
|
|
329
|
-
|
|
323
|
+
# Add position metadata
|
|
324
|
+
metadata["start_char"] = start_char
|
|
325
|
+
metadata["end_char"] = end_char
|
|
326
|
+
|
|
327
|
+
# Generate chunk ID with total chunks
|
|
328
|
+
chunk_id = self._generate_chunk_id(
|
|
329
|
+
chunk_index=i,
|
|
330
|
+
total_chunks=total_chunks,
|
|
331
|
+
start_char=start_char,
|
|
332
|
+
end_char=end_char,
|
|
333
|
+
provided_id=doc.id
|
|
330
334
|
)
|
|
335
|
+
|
|
331
336
|
processed_chunks.append(
|
|
332
337
|
ProcessedChunk(
|
|
333
|
-
id=
|
|
338
|
+
id=chunk_id,
|
|
334
339
|
content=chunk_doc.content,
|
|
335
340
|
embeddings=doc.embeddings,
|
|
336
|
-
metadata=self._prepare_chunk_metadata(doc.id,
|
|
341
|
+
metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
|
|
337
342
|
)
|
|
338
343
|
)
|
|
339
|
-
else:
|
|
340
|
-
# Multiple chunks case
|
|
341
|
-
for i, chunk_doc in enumerate(chunk_docs):
|
|
342
|
-
if not chunk_doc.content or not chunk_doc.content.strip():
|
|
343
|
-
continue
|
|
344
|
-
|
|
345
|
-
# Initialize metadata
|
|
346
|
-
metadata = {}
|
|
347
|
-
if doc.metadata:
|
|
348
|
-
metadata.update(doc.metadata)
|
|
349
|
-
|
|
350
|
-
# Pass through doc.id and content_column
|
|
351
|
-
chunk_id = self._generate_chunk_id(
|
|
352
|
-
chunk_index=i,
|
|
353
|
-
provided_id=doc.id,
|
|
354
|
-
)
|
|
355
|
-
processed_chunks.append(
|
|
356
|
-
ProcessedChunk(
|
|
357
|
-
id=chunk_id,
|
|
358
|
-
content=chunk_doc.content,
|
|
359
|
-
embeddings=doc.embeddings,
|
|
360
|
-
metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
|
|
361
|
-
)
|
|
362
|
-
)
|
|
363
344
|
|
|
364
345
|
return processed_chunks
|
|
365
346
|
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Utilities for knowledge base operations."""
|
|
2
|
+
import hashlib
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
|
|
6
|
+
"""
|
|
7
|
+
Generate a deterministic document ID from content and column name.
|
|
8
|
+
If provided_id exists, combines it with content_column.
|
|
9
|
+
For generated IDs, uses a short hash of just the content to ensure
|
|
10
|
+
same content gets same base ID across different columns.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
content: The content string
|
|
14
|
+
content_column: Name of the content column
|
|
15
|
+
provided_id: Optional user-provided ID
|
|
16
|
+
Returns:
|
|
17
|
+
Deterministic document ID in format: <base_id>_<column>
|
|
18
|
+
where base_id is either the provided_id or a 16-char hash of content
|
|
19
|
+
"""
|
|
20
|
+
if provided_id is not None:
|
|
21
|
+
base_id = provided_id
|
|
22
|
+
else:
|
|
23
|
+
# Generate a shorter 16-character hash based only on content
|
|
24
|
+
hash_obj = hashlib.md5(content.encode())
|
|
25
|
+
base_id = hash_obj.hexdigest()[:16]
|
|
26
|
+
|
|
27
|
+
# Append column name to maintain uniqueness across columns
|
|
28
|
+
return f"{base_id}_{content_column}"
|