MindsDB 25.4.2.0__py3-none-any.whl → 25.4.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (39)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +30 -7
  3. mindsdb/api/executor/command_executor.py +29 -0
  4. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +3 -2
  5. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
  6. mindsdb/api/executor/planner/plan_join.py +1 -1
  7. mindsdb/api/executor/planner/query_plan.py +1 -0
  8. mindsdb/api/executor/planner/query_planner.py +86 -14
  9. mindsdb/api/executor/planner/steps.py +9 -1
  10. mindsdb/api/executor/sql_query/sql_query.py +37 -6
  11. mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +231 -0
  13. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -1
  14. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
  15. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
  16. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +7 -11
  17. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +28 -4
  18. mindsdb/integrations/libs/llm/config.py +11 -1
  19. mindsdb/integrations/libs/llm/utils.py +12 -0
  20. mindsdb/integrations/libs/vectordatabase_handler.py +9 -1
  21. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +1 -1
  22. mindsdb/interfaces/agents/constants.py +12 -1
  23. mindsdb/interfaces/agents/langchain_agent.py +6 -0
  24. mindsdb/interfaces/database/projects.py +7 -1
  25. mindsdb/interfaces/knowledge_base/controller.py +166 -74
  26. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +43 -62
  27. mindsdb/interfaces/knowledge_base/utils.py +28 -0
  28. mindsdb/interfaces/query_context/context_controller.py +221 -0
  29. mindsdb/interfaces/storage/db.py +23 -0
  30. mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
  31. mindsdb/utilities/auth.py +5 -1
  32. mindsdb/utilities/cache.py +4 -1
  33. mindsdb/utilities/context_executor.py +1 -1
  34. mindsdb/utilities/partitioning.py +35 -20
  35. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/METADATA +221 -219
  36. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/RECORD +39 -36
  37. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/WHEEL +0 -0
  38. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/licenses/LICENSE +0 -0
  39. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,6 @@ import copy
3
3
  from typing import Dict, List, Optional
4
4
 
5
5
  import pandas as pd
6
- import hashlib
7
6
  import numpy as np
8
7
 
9
8
  from mindsdb_sql_parser.ast import (
@@ -27,6 +26,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
27
26
  )
28
27
  from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
29
28
  from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
29
+ from mindsdb.integrations.utilities.handler_utils import get_api_key
30
+ from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args, row_to_document
30
31
 
31
32
  from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
32
33
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -36,6 +37,7 @@ from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor impor
36
37
  from mindsdb.interfaces.model.functions import PredictorRecordNotFound
37
38
  from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
38
39
  from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
40
+ from mindsdb.utilities.context import context as ctx
39
41
 
40
42
  from mindsdb.api.executor.command_executor import ExecuteCommands
41
43
  from mindsdb.utilities import log
@@ -50,6 +52,42 @@ KB_TO_VECTORDB_COLUMNS = {
50
52
  }
51
53
 
52
54
 
55
def get_embedding_model_from_params(embedding_model_params: dict):
    """
    Create embedding model from parameters.

    The input dict is deep-copied, so the caller's dict is never mutated.

    Args:
        embedding_model_params: Embedding model settings. Must contain a
            non-empty string 'provider'. May contain 'api_key' and
            'model_name'; every other entry is forwarded unchanged to
            ``construct_model_from_args``.

    Returns:
        The object returned by ``construct_model_from_args``.

    Raises:
        ValueError: If 'provider' is missing, empty, or not a string.
    """
    params_copy = copy.deepcopy(embedding_model_params)

    provider = params_copy.pop('provider', None)
    # Validate before calling .lower(): a missing provider previously surfaced
    # as an opaque AttributeError on None instead of a clear validation error.
    if not isinstance(provider, str) or not provider:
        raise ValueError("A 'provider' must be specified for the embedding model.")
    provider = provider.lower()

    api_key = get_api_key(provider, params_copy, strict=False) or params_copy.get('api_key')
    # Underscores are replaced because the provider name ultimately gets mapped to a class name.
    # This is mostly to support Azure OpenAI (azure_openai); the mapped class name is 'AzureOpenAIEmbeddings'.
    params_copy['class'] = provider.replace('_', '')
    if provider == 'azure_openai':
        # Azure OpenAI expects the api_key to be passed as 'openai_api_key'.
        params_copy['openai_api_key'] = api_key
    else:
        params_copy[f"{provider}_api_key"] = api_key
    # The generic 'api_key' entry has been re-keyed above; drop it.
    params_copy.pop('api_key', None)
    # 'model_name' is passed on under the key 'model'.
    params_copy['model'] = params_copy.pop('model_name', None)

    return construct_model_from_args(params_copy)
74
+
75
+
76
def get_reranking_model_from_params(reranking_model_params: dict):
    """
    Build the reranking model described by a parameter dict.

    Works on a deep copy, so the caller's dict is left untouched.

    Args:
        reranking_model_params: Reranker settings. 'provider' defaults to
            "openai" when absent; 'api_key' and 'model_name' are re-keyed
            before the remaining entries are passed to ``LLMReranker``.

    Returns:
        An ``LLMReranker`` built from the adjusted parameters.

    Raises:
        ValueError: If the provider is anything other than OpenAI.
    """
    reranker_kwargs = copy.deepcopy(reranking_model_params)

    provider_name = reranker_kwargs.pop('provider', "openai").lower()
    if provider_name != 'openai':
        raise ValueError("Only OpenAI provider is supported for the reranking model.")

    # Prefer the key resolved by get_api_key; fall back to an explicit 'api_key' entry.
    resolved_key = get_api_key(provider_name, reranker_kwargs, strict=False)
    if not resolved_key:
        resolved_key = reranker_kwargs.get('api_key')
    reranker_kwargs[f"{provider_name}_api_key"] = resolved_key

    # The generic 'api_key' entry is replaced by the provider-specific one above.
    reranker_kwargs.pop('api_key', None)

    # 'model_name' is passed on under the key 'model'.
    model_value = reranker_kwargs.pop('model_name', None)
    reranker_kwargs['model'] = model_value

    return LLMReranker(**reranker_kwargs)
89
+
90
+
53
91
  class KnowledgeBaseTable:
54
92
  """
55
93
  Knowledge base table interface
@@ -116,19 +154,19 @@ class KnowledgeBaseTable:
116
154
  # extract values from conditions and prepare for vectordb
117
155
  conditions = []
118
156
  query_text = None
119
- reranking_threshold = None
157
+ relevance_threshold = None
120
158
  query_conditions = db_handler.extract_conditions(query.where)
121
159
  if query_conditions is not None:
122
160
  for item in query_conditions:
123
- if item.column == "reranking_threshold" and item.op.value == "=":
161
+ if item.column == "relevance_threshold" and item.op.value == "=":
124
162
  try:
125
- reranking_threshold = float(item.value)
163
+ relevance_threshold = float(item.value)
126
164
  # Validate range: must be between 0 and 1
127
- if not (0 <= reranking_threshold <= 1):
128
- raise ValueError(f"reranking_threshold must be between 0 and 1, got: {reranking_threshold}")
129
- logger.debug(f"Found reranking_threshold in query: {reranking_threshold}")
165
+ if not (0 <= relevance_threshold <= 1):
166
+ raise ValueError(f"relevance_threshold must be between 0 and 1, got: {relevance_threshold}")
167
+ logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
130
168
  except (ValueError, TypeError) as e:
131
- error_msg = f"Invalid reranking_threshold value: {item.value}. {str(e)}"
169
+ error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
132
170
  logger.error(error_msg)
133
171
  raise ValueError(error_msg)
134
172
  elif item.column == TableField.CONTENT.value:
@@ -146,6 +184,16 @@ class KnowledgeBaseTable:
146
184
  logger.debug(f"Extracted query text: {query_text}")
147
185
 
148
186
  self.addapt_conditions_columns(conditions)
187
+
188
+ # Set default limit if query is present
189
+ if query_text is not None:
190
+ limit = query.limit.value if query.limit is not None else None
191
+ if limit is None:
192
+ limit = 10
193
+ elif limit > 100:
194
+ limit = 100
195
+ query.limit = Constant(limit)
196
+
149
197
  df = db_handler.dispatch_select(query, conditions)
150
198
  df = self.addapt_result_columns(df)
151
199
 
@@ -153,28 +201,27 @@ class KnowledgeBaseTable:
153
201
  logger.debug(f"Columns in response: {df.columns.tolist()}")
154
202
  # Check if we have a rerank_model configured in KB params
155
203
 
156
- df = self.add_relevance(df, query_text, reranking_threshold)
204
+ df = self.add_relevance(df, query_text, relevance_threshold)
157
205
 
158
206
  # filter by targets
159
207
  if requested_kb_columns is not None:
160
208
  df = df[requested_kb_columns]
161
209
  return df
162
210
 
163
- def add_relevance(self, df, query_text, reranking_threshold=None):
211
+ def add_relevance(self, df, query_text, relevance_threshold=None):
164
212
  relevance_column = TableField.RELEVANCE.value
165
213
 
166
- rerank_model = self._kb.params.get("rerank_model")
167
- if rerank_model and query_text and len(df) > 0:
214
+ reranking_model_params = self._kb.params.get("reranking_model")
215
+ if reranking_model_params and query_text and len(df) > 0:
168
216
  # Use reranker for relevance score
169
217
  try:
170
- logger.info(f"Using reranker model {rerank_model} for relevance calculation")
171
- reranker_params = {"model": rerank_model}
218
+ logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
172
219
  # Apply custom filtering threshold if provided
173
- if reranking_threshold is not None:
174
- reranker_params["filtering_threshold"] = reranking_threshold
175
- logger.info(f"Using custom filtering threshold: {reranking_threshold}")
220
+ if relevance_threshold is not None:
221
+ reranking_model_params["filtering_threshold"] = relevance_threshold
222
+ logger.info(f"Using custom filtering threshold: {relevance_threshold}")
176
223
 
177
- reranker = LLMReranker(**reranker_params)
224
+ reranker = get_reranking_model_from_params(reranking_model_params)
178
225
  # Get documents to rerank
179
226
  documents = df['chunk_content'].tolist()
180
227
  # Use the get_scores method with disable_events=True
@@ -185,7 +232,7 @@ class KnowledgeBaseTable:
185
232
  # Filter by threshold
186
233
  scores_array = np.array(scores)
187
234
  df = df[scores_array > reranker.filtering_threshold]
188
- logger.debug(f"Applied reranking with model {rerank_model}, threshold: {reranker.filtering_threshold}")
235
+ logger.debug(f"Applied reranking with params: {reranking_model_params}")
189
236
  except Exception as e:
190
237
  logger.error(f"Error during reranking: {str(e)}")
191
238
  # Fallback to distance-based relevance
@@ -198,6 +245,8 @@ class KnowledgeBaseTable:
198
245
  # Calculate relevance from distance
199
246
  logger.info("Calculating relevance from vector distance")
200
247
  df[relevance_column] = 1 / (1 + df['distance'])
248
+ if relevance_threshold is not None:
249
+ df = df[df[relevance_column] > relevance_threshold]
201
250
 
202
251
  else:
203
252
  df[relevance_column] = None
@@ -293,12 +342,21 @@ class KnowledgeBaseTable:
293
342
 
294
343
  emb_col = TableField.EMBEDDINGS.value
295
344
  cont_col = TableField.CONTENT.value
345
+
346
+ db_handler = self.get_vector_db()
347
+ conditions = db_handler.extract_conditions(query.where)
348
+ doc_id = None
349
+ for condition in conditions:
350
+ if condition.column == 'chunk_id' and condition.op == FilterOperator.EQUAL:
351
+ doc_id = condition.value
352
+
296
353
  if cont_col in query.update_columns:
297
354
  content = query.update_columns[cont_col]
298
355
 
299
356
  # Apply preprocessing to content if configured
300
357
  if self.document_preprocessor:
301
358
  doc = Document(
359
+ id=doc_id,
302
360
  content=content.value,
303
361
  metadata={} # Empty metadata for content-only updates
304
362
  )
@@ -314,8 +372,6 @@ class KnowledgeBaseTable:
314
372
  query.table = Identifier(parts=[self._kb.vector_database_table])
315
373
 
316
374
  # send to vectordb
317
- db_handler = self.get_vector_db()
318
- conditions = db_handler.extract_conditions(query.where)
319
375
  self.addapt_conditions_columns(conditions)
320
376
  db_handler.dispatch_update(query, conditions)
321
377
 
@@ -369,10 +425,24 @@ class KnowledgeBaseTable:
369
425
  db_handler.delete(self._kb.vector_database_table)
370
426
 
371
427
  def insert(self, df: pd.DataFrame):
372
- """Insert dataframe to KB table."""
428
+ """Insert dataframe to KB table.
429
+
430
+ Args:
431
+ df: DataFrame to insert
432
+ """
373
433
  if df.empty:
374
434
  return
375
435
 
436
+ try:
437
+ run_query_id = ctx.run_query_id
438
+ # Link current KB to running query (where KB is used to insert data)
439
+ if run_query_id is not None:
440
+ self._kb.query_id = run_query_id
441
+ db.session.commit()
442
+
443
+ except AttributeError:
444
+ ...
445
+
376
446
  # First adapt column names to identify content and metadata columns
377
447
  adapted_df = self._adapt_column_names(df)
378
448
  content_columns = self._kb.params.get('content_columns', [TableField.CONTENT.value])
@@ -577,36 +647,48 @@ class KnowledgeBaseTable:
577
647
  if df.empty:
578
648
  return pd.DataFrame([], columns=[TableField.EMBEDDINGS.value])
579
649
 
650
+ # keep only content
651
+ df = df[[TableField.CONTENT.value]]
652
+
580
653
  model_id = self._kb.embedding_model_id
581
- # get the input columns
582
- model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
654
+ if model_id:
655
+ # get the input columns
656
+ model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
583
657
 
584
- assert model_rec is not None, f"Model not found: {model_id}"
585
- model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
658
+ assert model_rec is not None, f"Model not found: {model_id}"
659
+ model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
586
660
 
587
- project_datanode = self.session.datahub.get(model_project.name)
661
+ project_datanode = self.session.datahub.get(model_project.name)
588
662
 
589
- # keep only content
590
- df = df[[TableField.CONTENT.value]]
663
+ model_using = model_rec.learn_args.get('using', {})
664
+ input_col = model_using.get('question_column')
665
+ if input_col is None:
666
+ input_col = model_using.get('input_column')
667
+
668
+ if input_col is not None and input_col != TableField.CONTENT.value:
669
+ df = df.rename(columns={TableField.CONTENT.value: input_col})
591
670
 
592
- model_using = model_rec.learn_args.get('using', {})
593
- input_col = model_using.get('question_column')
594
- if input_col is None:
595
- input_col = model_using.get('input_column')
671
+ df_out = project_datanode.predict(
672
+ model_name=model_rec.name,
673
+ df=df,
674
+ params=self.model_params
675
+ )
596
676
 
597
- if input_col is not None and input_col != TableField.CONTENT.value:
598
- df = df.rename(columns={TableField.CONTENT.value: input_col})
677
+ target = model_rec.to_predict[0]
678
+ if target != TableField.EMBEDDINGS.value:
679
+ # adapt output for vectordb
680
+ df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
599
681
 
600
- df_out = project_datanode.predict(
601
- model_name=model_rec.name,
602
- df=df,
603
- params=self.model_params
604
- )
682
+ elif self._kb.params.get('embedding_model'):
683
+ embedding_model = get_embedding_model_from_params(self._kb.params.get('embedding_model'))
684
+
685
+ df_texts = df.apply(row_to_document, axis=1)
686
+ embeddings = embedding_model.embed_documents(df_texts.tolist())
687
+ df_out = df.copy().assign(**{TableField.EMBEDDINGS.value: embeddings})
688
+
689
+ else:
690
+ raise ValueError("No embedding model found for the knowledge base.")
605
691
 
606
- target = model_rec.to_predict[0]
607
- if target != TableField.EMBEDDINGS.value:
608
- # adapt output for vectordb
609
- df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
610
692
  df_out = df_out[[TableField.EMBEDDINGS.value]]
611
693
 
612
694
  return df_out
@@ -640,9 +722,11 @@ class KnowledgeBaseTable:
640
722
  # Extract embedding model args from knowledge base table
641
723
  embedding_args = self._kb.embedding_model.learn_args.get('using', {})
642
724
  # Construct the embedding model directly
643
- from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args
644
725
  embeddings_model = construct_model_from_args(embedding_args)
645
726
  logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
727
+ elif self._kb.params.get('embedding_model'):
728
+ embeddings_model = get_embedding_model_from_params(self._kb.params['embedding_model'])
729
+ logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
646
730
  else:
647
731
  embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
648
732
  logger.debug("Using default embedding model as knowledge base has no embedding model")
@@ -690,22 +774,9 @@ class KnowledgeBaseTable:
690
774
  return {}
691
775
 
692
776
  def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
693
- """
694
- Generate a deterministic document ID from content and column name.
695
- If provided_id exists, combines it with content_column.
696
-
697
- Args:
698
- content: The content string
699
- content_column: Name of the content column
700
- provided_id: Optional user-provided ID
701
- Returns:
702
- Deterministic document ID
703
- """
704
- if provided_id is not None:
705
- return f"{provided_id}_{content_column}"
706
-
707
- id_string = f"content={content}_column={content_column}"
708
- return hashlib.sha256(id_string.encode()).hexdigest()
777
+ """Generate a deterministic document ID using the utility function."""
778
+ from mindsdb.interfaces.knowledge_base.utils import generate_document_id
779
+ return generate_document_id(content, content_column, provided_id)
709
780
 
710
781
  def _convert_metadata_value(self, value):
711
782
  """
@@ -788,26 +859,46 @@ class KnowledgeBaseController:
788
859
  return kb
789
860
  raise EntityExistsError("Knowledge base already exists", name)
790
861
 
791
- if embedding_model is None:
792
- # create default embedding model
793
- model_name = self._get_default_embedding_model(project.name, params=params)
794
- params['default_embedding_model'] = model_name
795
- else:
796
- # get embedding model from input
862
+ embedding_model_params = params.get('embedding_model', None)
863
+ reranking_model_params = params.get('reranking_model', None)
864
+
865
+ if embedding_model:
797
866
  model_name = embedding_model.parts[-1]
798
867
 
868
+ elif embedding_model_params:
869
+ # Get embedding model from params.
870
+ # This is called here to check validity of the parameters.
871
+ get_embedding_model_from_params(
872
+ embedding_model_params
873
+ )
874
+
875
+ else:
876
+ model_name = self._get_default_embedding_model(
877
+ project.name,
878
+ params=params
879
+ )
880
+ params['default_embedding_model'] = model_name
881
+
882
+ model_project = None
799
883
  if embedding_model is not None and len(embedding_model.parts) > 1:
800
884
  # model project is set
801
885
  model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
802
- else:
886
+ elif not embedding_model_params:
803
887
  model_project = project
804
888
 
805
- model = self.session.model_controller.get_model(
806
- name=model_name,
807
- project_name=model_project.name
808
- )
809
- model_record = db.Predictor.query.get(model['id'])
810
- embedding_model_id = model_record.id
889
+ embedding_model_id = None
890
+ if model_project:
891
+ model = self.session.model_controller.get_model(
892
+ name=model_name,
893
+ project_name=model_project.name
894
+ )
895
+ model_record = db.Predictor.query.get(model['id'])
896
+ embedding_model_id = model_record.id
897
+
898
+ if reranking_model_params:
899
+ # Get reranking model from params.
900
+ # This is called here to check validity of the parameters.
901
+ get_reranking_model_from_params(reranking_model_params)
811
902
 
812
903
  # search for the vector database table
813
904
  if storage is None:
@@ -1029,6 +1120,7 @@ class KnowledgeBaseController:
1029
1120
  'embedding_model': embedding_model.name if embedding_model is not None else None,
1030
1121
  'vector_database': None if vector_database is None else vector_database.name,
1031
1122
  'vector_database_table': record.vector_database_table,
1123
+ 'query_id': record.query_id,
1032
1124
  'params': record.params
1033
1125
  })
1034
1126
 
@@ -1,7 +1,6 @@
1
1
  from typing import List, Dict, Optional, Any
2
2
  import pandas as pd
3
3
  from langchain_text_splitters import RecursiveCharacterTextSplitter
4
- import hashlib
5
4
  import asyncio
6
5
 
7
6
 
@@ -43,7 +42,11 @@ class DocumentPreprocessor:
43
42
  self.splitter = None # Will be set by child classes
44
43
 
45
44
  def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
46
- """Base implementation - should be overridden by child classes"""
45
+ """Base implementation - should be overridden by child classes
46
+
47
+ Args:
48
+ documents: List of documents to process
49
+ """
47
50
  raise NotImplementedError("Subclasses must implement process_documents")
48
51
 
49
52
  def _split_document(self, doc: Document) -> List[Document]:
@@ -80,27 +83,22 @@ class DocumentPreprocessor:
80
83
  metadata=data.get("metadata", {}),
81
84
  )
82
85
 
83
- def _generate_deterministic_id(
84
- self, content: str, content_column: str = None, provided_id: str = None
85
- ) -> str:
86
- """Generate a deterministic ID based on content and column"""
87
- if provided_id is not None:
88
- return f"{provided_id}_{content_column}"
89
-
90
- id_string = f"content={content}_column={content_column}"
91
- return hashlib.sha256(id_string.encode()).hexdigest()
92
-
93
86
  def _generate_chunk_id(
94
87
  self,
95
88
  chunk_index: Optional[int] = None,
89
+ total_chunks: Optional[int] = None,
90
+ start_char: Optional[int] = None,
91
+ end_char: Optional[int] = None,
96
92
  provided_id: str = None,
97
93
  ) -> str:
98
- """Generate deterministic ID for a chunk"""
99
- base_id = provided_id
100
- chunk_id = (
101
- f"{base_id}_chunk_{chunk_index}" if chunk_index is not None else base_id
102
- )
103
- logger.debug(f"Generated chunk ID: {chunk_id} for content hash: {base_id}")
94
+ """Generate human-readable deterministic ID for a chunk
95
+ Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
96
+ """
97
+ if provided_id is None:
98
+ raise ValueError("Document ID must be provided for chunk ID generation")
99
+
100
+ chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
101
+ logger.debug(f"Generated chunk ID: {chunk_id}")
104
102
  return chunk_id
105
103
 
106
104
  def _prepare_chunk_metadata(
@@ -207,14 +205,10 @@ Please give a short succinct context to situate this chunk within the overall do
207
205
  processed_chunks = []
208
206
 
209
207
  for doc_index, doc in enumerate(documents):
210
- # Get content_column from metadata if available
211
- content_column = (
212
- doc.metadata.get("content_column") if doc.metadata else None
213
- )
214
208
 
215
- # Ensure document has an ID
209
+ # Document ID must be provided by this point
216
210
  if doc.id is None:
217
- doc.id = self._generate_deterministic_id(doc.content, content_column)
211
+ raise ValueError("Document ID must be provided before preprocessing")
218
212
 
219
213
  # Skip empty or whitespace-only content
220
214
  if not doc.content or not doc.content.strip():
@@ -298,68 +292,55 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
298
292
  processed_chunks = []
299
293
 
300
294
  for doc in documents:
301
- # Get content_column from metadata if available
302
- content_column = (
303
- doc.metadata.get("content_column") if doc.metadata else None
304
- )
305
295
 
306
- # Ensure document has an ID
296
+ # Document ID must be provided by this point
307
297
  if doc.id is None:
308
- doc.id = self._generate_deterministic_id(doc.content, content_column)
298
+ raise ValueError("Document ID must be provided before preprocessing")
309
299
 
310
300
  # Skip empty or whitespace-only content
311
301
  if not doc.content or not doc.content.strip():
312
302
  continue
313
303
 
314
304
  chunk_docs = self._split_document(doc)
305
+ total_chunks = len(chunk_docs)
315
306
 
316
- # Single chunk case
317
- if len(chunk_docs) == 1:
318
- chunk_doc = chunk_docs[0]
307
+ # Track character positions
308
+ current_pos = 0
309
+ for i, chunk_doc in enumerate(chunk_docs):
319
310
  if not chunk_doc.content or not chunk_doc.content.strip():
320
311
  continue
321
312
 
313
+ # Calculate chunk positions
314
+ start_char = current_pos
315
+ end_char = start_char + len(chunk_doc.content)
316
+ current_pos = end_char + 1 # +1 for separator
317
+
322
318
  # Initialize metadata
323
319
  metadata = {}
324
320
  if doc.metadata:
325
321
  metadata.update(doc.metadata)
326
322
 
327
- # Pass through doc.id and content_column
328
- id = self._generate_chunk_id(
329
- chunk_index=0, provided_id=doc.id
323
+ # Add position metadata
324
+ metadata["start_char"] = start_char
325
+ metadata["end_char"] = end_char
326
+
327
+ # Generate chunk ID with total chunks
328
+ chunk_id = self._generate_chunk_id(
329
+ chunk_index=i,
330
+ total_chunks=total_chunks,
331
+ start_char=start_char,
332
+ end_char=end_char,
333
+ provided_id=doc.id
330
334
  )
335
+
331
336
  processed_chunks.append(
332
337
  ProcessedChunk(
333
- id=id,
338
+ id=chunk_id,
334
339
  content=chunk_doc.content,
335
340
  embeddings=doc.embeddings,
336
- metadata=self._prepare_chunk_metadata(doc.id, None, metadata),
341
+ metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
337
342
  )
338
343
  )
339
- else:
340
- # Multiple chunks case
341
- for i, chunk_doc in enumerate(chunk_docs):
342
- if not chunk_doc.content or not chunk_doc.content.strip():
343
- continue
344
-
345
- # Initialize metadata
346
- metadata = {}
347
- if doc.metadata:
348
- metadata.update(doc.metadata)
349
-
350
- # Pass through doc.id and content_column
351
- chunk_id = self._generate_chunk_id(
352
- chunk_index=i,
353
- provided_id=doc.id,
354
- )
355
- processed_chunks.append(
356
- ProcessedChunk(
357
- id=chunk_id,
358
- content=chunk_doc.content,
359
- embeddings=doc.embeddings,
360
- metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
361
- )
362
- )
363
344
 
364
345
  return processed_chunks
365
346
 
@@ -0,0 +1,28 @@
1
+ """Utilities for knowledge base operations."""
2
+ import hashlib
3
+
4
+
5
def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
    """
    Generate a deterministic document ID from content and column name.

    If provided_id exists, it is used as the base ID. Otherwise the base ID
    is a short hash of just the content, so the same content gets the same
    base ID across different columns.

    Args:
        content: The content to fingerprint. Non-string values (for example
            numeric cell values) are converted with ``str()`` before hashing.
            Only used when provided_id is None.
        content_column: Name of the content column
        provided_id: Optional user-provided ID
    Returns:
        Deterministic document ID in format: <base_id>_<column>
        where base_id is either the provided_id or the first 16 hex
        characters of the MD5 digest of the content
    Raises:
        ValueError: If both provided_id and content are None, since there is
            nothing to derive an ID from.
    """
    if provided_id is not None:
        base_id = provided_id
    else:
        if content is None:
            raise ValueError("Either content or provided_id is required to generate a document ID")
        # Coerce non-string content so it does not fail on .encode();
        # string content hashes exactly as before.
        text = content if isinstance(content, str) else str(content)
        # MD5 serves only as a short content fingerprint here, not for security.
        base_id = hashlib.md5(text.encode()).hexdigest()[:16]

    # Append column name to maintain uniqueness across columns
    return f"{base_id}_{content_column}"