natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,19 @@
1
1
  """Implementation of the SearchServiceProtocol using Haystack components."""
2
2
 
3
- import copy
4
3
  import logging
5
4
  import os
5
+ import shutil
6
6
  from pathlib import Path
7
7
  from typing import Any, Dict, Iterable, List, Optional, Union
8
8
 
9
9
  from PIL import Image
10
10
 
11
+ # Import sentence-transformers for dimension calculation
12
+ try:
13
+ from sentence_transformers import SentenceTransformer
14
+ except ImportError:
15
+ SentenceTransformer = None
16
+
11
17
  # --- Haystack Imports ---
12
18
  try:
13
19
  import haystack
@@ -17,15 +23,23 @@ try:
17
23
  SentenceTransformersTextEmbedder,
18
24
  )
19
25
 
20
- # Import necessary retrievers, rankers etc. as needed for search()
21
- from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
26
+ # Import InMemory Store & Retriever unconditionally
27
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
22
28
  from haystack.dataclasses import Document as HaystackDocument
23
29
  from haystack.document_stores.in_memory import InMemoryDocumentStore
24
30
  from haystack.document_stores.types import DocumentStore, DuplicatePolicy
25
- from haystack_integrations.components.retrievers.chroma import ( # Use embedding retriever
26
- ChromaEmbeddingRetriever,
27
- )
28
- from haystack_integrations.document_stores.chroma import ChromaDocumentStore
31
+
32
+ # Conditional LanceDB Imports
33
+ try:
34
+ from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
35
+
36
+ LANCEDB_HAYSTACK_AVAILABLE = True
37
+ except ImportError:
38
+ LanceDBDocumentStore = None
39
+ LanceDBEmbeddingRetriever = None
40
+ LANCEDB_HAYSTACK_AVAILABLE = False
41
+
42
+ # Removed Chroma Imports
29
43
 
30
44
  # Need Ranker if used
31
45
  try:
@@ -33,36 +47,35 @@ try:
33
47
  except ImportError:
34
48
  CohereRanker = None
35
49
 
36
- # Don't define here, it's imported later
37
50
  except ImportError:
38
51
  # Set flags/placeholders if Haystack isn't installed
39
- # Don't define here, it's imported later
40
52
  DocumentStore = object
41
53
  HaystackDocument = Dict
42
- ChromaDocumentStore = None
43
54
  InMemoryDocumentStore = None
55
+ LanceDBDocumentStore = None
44
56
  SentenceTransformersDocumentEmbedder = None
45
57
  SentenceTransformersTextEmbedder = None
46
58
  InMemoryEmbeddingRetriever = None
47
- ChromaEmbeddingRetriever = None # Fallback definition
59
+ LanceDBEmbeddingRetriever = None
48
60
  CohereRanker = None
49
61
  Pipeline = None
50
62
  DuplicatePolicy = None
63
+ LANCEDB_HAYSTACK_AVAILABLE = False
51
64
 
52
- # --- ChromaDB Client Import (for management) ---
65
+ # LanceDB Client Import (for management)
53
66
  try:
54
- import chromadb
67
+ import lancedb
55
68
 
56
- CHROMADB_AVAILABLE = True
69
+ LANCEDB_CLIENT_AVAILABLE = True
57
70
  except ImportError:
58
- chromadb = None
59
- CHROMADB_AVAILABLE = False
71
+ lancedb = None
72
+ LANCEDB_CLIENT_AVAILABLE = False
60
73
 
61
- from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
74
+ # Removed ChromaDB Client Import
75
+
76
+ from .haystack_utils import HAS_HAYSTACK_EXTRAS
62
77
  from .search_options import (
63
78
  BaseSearchOptions,
64
- MultiModalSearchOptions,
65
- SearchOptions,
66
79
  TextSearchOptions,
67
80
  )
68
81
 
@@ -70,11 +83,9 @@ from .search_options import (
70
83
  from .search_service_protocol import (
71
84
  Indexable,
72
85
  IndexConfigurationError,
73
- IndexExistsError,
74
86
  SearchServiceProtocol,
75
87
  )
76
88
 
77
- # --- Logging ---
78
89
  logger = logging.getLogger(__name__)
79
90
 
80
91
  # --- Default Configuration Values ---
@@ -86,74 +97,129 @@ class HaystackSearchService(SearchServiceProtocol):
86
97
  """
87
98
  Haystack-based implementation of the search service protocol.
88
99
 
89
- Manages ChromaDB (persistent) or InMemory (non-persistent) DocumentStores
100
+ Manages LanceDB (persistent) or InMemory (non-persistent) DocumentStores
90
101
  and uses Haystack components for embedding and retrieval.
91
- A single instance of this service is tied to a specific collection name.
102
+ A single instance of this service is tied to a specific table name (LanceDB)
103
+ or implicitly managed (InMemory).
92
104
  """
93
105
 
94
106
  def __init__(
95
107
  self,
96
- collection_name: str,
97
- persist: bool = False, # Store type configuration
98
- default_persist_path: str = DEFAULT_PERSIST_PATH,
99
- embedding_model: str = DEFAULT_EMBEDDING_MODEL, # Renamed for clarity
108
+ table_name: str,
109
+ persist: bool = False,
110
+ uri: str = DEFAULT_PERSIST_PATH,
111
+ embedding_model: str = DEFAULT_EMBEDDING_MODEL,
100
112
  ):
101
113
  """
102
- Initialize the service for a specific collection.
114
+ Initialize the service for a specific LanceDB table or an InMemory store.
103
115
 
104
116
  Args:
105
- collection_name: The name of the index/collection this service instance manages.
106
- persist: If True, this service instance manages persistent ChromaDB stores.
107
- If False, it manages transient InMemory stores.
108
- default_persist_path: Default path for persistent ChromaDB storage.
117
+ table_name: The name of the LanceDB table (if persist=True).
118
+ persist: If True, this service instance manages a persistent LanceDB store.
119
+ If False, it manages a transient InMemory store.
120
+ uri: Path/URI for the LanceDB database directory (if persist=True).
109
121
  embedding_model: The embedding model this service instance will use.
122
+ Required for LanceDB to know embedding dimensions.
110
123
  """
111
124
  if not HAS_HAYSTACK_EXTRAS:
112
125
  raise ImportError(
113
126
  "HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
114
127
  )
115
128
 
116
- self.collection_name = collection_name # Store the collection name
117
- self._persist = persist # Store the persistence type for this instance
118
- self._default_persist_path = default_persist_path
119
- self._embedding_model = embedding_model # Store the configured model
129
+ self.table_name = table_name
130
+ self._persist = persist
131
+ self._uri = uri
132
+ self._embedding_model = embedding_model
133
+ self._embedding_dims: Optional[int] = None
120
134
 
121
- # Dictionary to hold InMemoryDocumentStore instances if not persisting
122
- self._in_memory_store: Optional[InMemoryDocumentStore] = (
123
- None if persist else InMemoryDocumentStore()
124
- )
125
- self._chroma_store: Optional[ChromaDocumentStore] = None # Lazy load
135
+ # Store instances (lazy loaded)
136
+ self._in_memory_store: Optional[InMemoryDocumentStore] = None
137
+ self._lancedb_store: Optional[LanceDBDocumentStore] = None
126
138
 
127
- logger.info(
128
- f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'"
129
- )
139
+ # Eagerly create InMemoryStore if not persisting
140
+ if not self._persist:
141
+ if not InMemoryDocumentStore:
142
+ raise ImportError(
143
+ "InMemoryDocumentStore not available. Cannot create non-persistent service."
144
+ )
145
+ self._in_memory_store = InMemoryDocumentStore()
146
+ logger.info(
147
+ f"HaystackSearchService initialized for InMemory store (table_name '{self.table_name}' ignored). Model: '{self._embedding_model}'"
148
+ )
149
+ else:
150
+ # Check LanceDB availability if persisting
151
+ if not LANCEDB_HAYSTACK_AVAILABLE:
152
+ raise ImportError(
153
+ "LanceDB persistent store requires lancedb-haystack. Install with: pip install lancedb-haystack"
154
+ )
155
+ if not SentenceTransformer:
156
+ raise ImportError(
157
+ "LanceDB persistent store requires sentence-transformers to determine embedding dimensions. Install with: pip install sentence-transformers"
158
+ )
159
+ # Calculate embedding dimensions needed for LanceDB initialization
160
+ self._calculate_embedding_dims()
161
+ logger.info(
162
+ f"HaystackSearchService initialized for LanceDB table='{self.table_name}' at uri='{self._uri}'. Model: '{self._embedding_model}', Dims: {self._embedding_dims}"
163
+ )
130
164
 
131
- # --- Internal Helper Methods --- #
165
+ # --- Internal Helper Methods ---
132
166
 
133
- def _get_store(
134
- self,
135
- ) -> DocumentStore:
136
- """Gets or creates the appropriate Haystack DocumentStore instance for this service's collection."""
137
- # Use the instance's configured persistence type and collection name
167
+ def _calculate_embedding_dims(self) -> None:
168
+ """Calculates and stores embedding dimensions from the model name."""
169
+ if self._embedding_dims is None:
170
+ if not SentenceTransformer:
171
+ raise ImportError(
172
+ "sentence-transformers library is required to determine embedding dimensions."
173
+ )
174
+ try:
175
+ model = SentenceTransformer(self._embedding_model)
176
+ dims = model.get_sentence_embedding_dimension()
177
+ if not dims:
178
+ raise ValueError(
179
+ f"Could not determine embedding dimension for model: {self._embedding_model}"
180
+ )
181
+ self._embedding_dims = dims
182
+ logger.debug(
183
+ f"Determined embedding dimension: {self._embedding_dims} for model '{self._embedding_model}'"
184
+ )
185
+ except Exception as e:
186
+ logger.error(
187
+ f"Failed to load SentenceTransformer model '{self._embedding_model}' to get dimensions: {e}",
188
+ exc_info=True,
189
+ )
190
+ raise RuntimeError(
191
+ f"Failed to determine embedding dimension for model '{self._embedding_model}'."
192
+ ) from e
193
+
194
+ def _get_store(self) -> DocumentStore:
195
+ """Gets or creates the appropriate Haystack DocumentStore instance."""
138
196
  if self._persist:
139
- if self._chroma_store is None:
140
- # Lazy load Chroma store
197
+ if not LanceDBDocumentStore:
198
+ raise ImportError("LanceDBDocumentStore not available.")
199
+ if self._lancedb_store is None:
141
200
  logger.debug(
142
- f"Initializing ChromaDocumentStore for collection '{self.collection_name}'."
201
+ f"Initializing LanceDBDocumentStore for table '{self.table_name}' at uri '{self._uri}'."
143
202
  )
144
- self._chroma_store = ChromaDocumentStore(
145
- persist_path=self._default_persist_path,
146
- collection_name=self.collection_name, # Use instance name
203
+ if self._embedding_dims is None:
204
+ logger.warning(
205
+ "Embedding dimensions not calculated before getting store. Calculating now."
206
+ )
207
+ self._calculate_embedding_dims()
208
+
209
+ self._lancedb_store = LanceDBDocumentStore(
210
+ database=self._uri,
211
+ table_name=self.table_name,
212
+ embedding_dims=self._embedding_dims,
147
213
  )
148
- return self._chroma_store
149
- else:
150
- # Return the instance's InMemory store
151
- if (
152
- self._in_memory_store is None
153
- ): # Should have been created in __init__ if persist=False
154
- logger.warning(
155
- f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now."
214
+ logger.info(
215
+ f"Initialized LanceDBDocumentStore for table '{self.table_name}' (Dims: {self._embedding_dims})"
156
216
  )
217
+ return self._lancedb_store
218
+ else:
219
+ if self._in_memory_store is None:
220
+ logger.warning("In-memory store was not initialized. Creating now.")
221
+ if not InMemoryDocumentStore:
222
+ raise ImportError("InMemoryDocumentStore not available.")
157
223
  self._in_memory_store = InMemoryDocumentStore()
158
224
  return self._in_memory_store
159
225
 
@@ -161,7 +227,7 @@ class HaystackSearchService(SearchServiceProtocol):
161
227
  self, device: Optional[str] = None
162
228
  ) -> SentenceTransformersDocumentEmbedder:
163
229
  """Creates the Haystack document embedder component."""
164
- model_name = self._embedding_model # Use instance model
230
+ model_name = self._embedding_model
165
231
  logger.debug(
166
232
  f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
167
233
  )
@@ -187,7 +253,7 @@ class HaystackSearchService(SearchServiceProtocol):
187
253
 
188
254
  def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
189
255
  """Creates the Haystack text embedder component (for queries)."""
190
- model_name = self._embedding_model # Use instance model
256
+ model_name = self._embedding_model
191
257
  logger.debug(
192
258
  f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
193
259
  )
@@ -208,113 +274,97 @@ class HaystackSearchService(SearchServiceProtocol):
208
274
  f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
209
275
  ) from e
210
276
 
211
- def _delete_chroma_collection(self) -> bool:
212
- """Internal helper to delete the ChromaDB collection managed by this service."""
213
- if not CHROMADB_AVAILABLE:
214
- logger.error(
215
- "Cannot delete ChromaDB collection because 'chromadb' library is not installed."
216
- )
217
- raise ImportError("'chromadb' library required for collection deletion.")
277
+ def _delete_lancedb_table(self) -> bool:
278
+ """Internal helper to delete the LanceDB table managed by this service."""
218
279
  if not self._persist:
219
280
  logger.warning(
220
- "Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring."
281
+ "Attempted to delete LanceDB table for a non-persistent service instance. Ignoring."
221
282
  )
222
- return False # Cannot delete if not persistent
283
+ return False
284
+
285
+ if not LANCEDB_CLIENT_AVAILABLE:
286
+ logger.error("Cannot delete LanceDB table because 'lancedb' library is not installed.")
287
+ raise ImportError("'lancedb' library required for table deletion.")
288
+
289
+ table_name_to_delete = self.table_name
290
+ db_uri = self._uri
291
+ logger.warning(
292
+ f"Attempting to delete existing LanceDB table '{table_name_to_delete}' at uri '{db_uri}'."
293
+ )
223
294
  try:
224
- collection_name_to_delete = self.collection_name # Use instance collection name
225
- logger.warning(
226
- f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'."
227
- )
228
- chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
229
- try:
230
- chroma_client.delete_collection(name=collection_name_to_delete)
295
+ db = lancedb.connect(db_uri)
296
+ table_names = db.table_names()
297
+ if table_name_to_delete in table_names:
298
+ db.drop_table(table_name_to_delete)
231
299
  logger.info(
232
- f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'."
300
+ f"Successfully deleted existing LanceDB table '{table_name_to_delete}'."
233
301
  )
234
- self._chroma_store = None # Reset lazy-loaded store
235
- return True
236
- except chromadb.errors.InvalidCollectionException:
302
+ else:
237
303
  logger.info(
238
- f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed."
304
+ f"LanceDB table '{table_name_to_delete}' did not exist. No deletion needed."
239
305
  )
240
- return True # Deletion is effectively successful
241
- finally:
242
- pass # Cleanup if needed
243
- except ImportError as ie:
244
- raise ie
306
+
307
+ self._lancedb_store = None
308
+ return True
245
309
  except Exception as e:
246
310
  logger.error(
247
- f"Error during ChromaDB collection deletion '{self.collection_name}': {e}",
311
+ f"Error during LanceDB table deletion '{table_name_to_delete}' at '{db_uri}': {e}",
248
312
  exc_info=True,
249
313
  )
250
- # Don't raise here, let index() decide based on force_reindex
251
314
  return False
252
315
 
253
- # --- Protocol Methods Implementation --- #
316
+ # --- Protocol Methods Implementation ---
254
317
 
255
318
  def index(
256
319
  self,
257
- documents: Iterable[Indexable], # Accept Indexable objects
320
+ documents: Iterable[Indexable],
258
321
  embedder_device: Optional[str] = None,
259
322
  force_reindex: bool = False,
260
323
  ) -> None:
261
- # Need to consume the iterable to log count, or log differently
262
- # Let's convert to list for now, assuming size isn't prohibitive
263
324
  indexable_list = list(documents)
264
325
  logger.info(
265
- f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
326
+ f"Index request for table='{self.table_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
266
327
  )
267
328
 
268
329
  if not indexable_list:
269
330
  logger.warning("No documents provided for indexing. Skipping.")
270
331
  return
271
332
 
272
- # --- 1. Handle Reindexing (Deletion before store/embedder init) ---
333
+ # Handle Reindexing
273
334
  if force_reindex:
274
- logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
335
+ logger.info(f"Force reindex requested for table '{self.table_name}'.")
275
336
  if self._persist:
276
- # Attempt deletion, raises ImportError if chromadb missing
277
- deleted = self._delete_chroma_collection() # Uses self.collection_name
337
+ deleted = self._delete_lancedb_table()
278
338
  if not deleted:
279
- # If deletion failed for other reasons, log and continue cautiously
280
339
  logger.warning(
281
- "Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
340
+ "LanceDB table deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
282
341
  )
283
342
  else:
284
- # For InMemory, force_reindex means we want a fresh store instance.
285
- # Re-initialize the instance's in-memory store
286
- logger.info(
287
- f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'."
288
- )
289
- self._in_memory_store = InMemoryDocumentStore() # Create a new one
343
+ # For InMemory, re-initialize the instance's store
344
+ logger.info(f"force_reindex=True: Re-initializing InMemory store.")
345
+ if not InMemoryDocumentStore:
346
+ raise ImportError("InMemoryDocumentStore not available.")
347
+ self._in_memory_store = InMemoryDocumentStore()
290
348
 
291
- # REMOVED try...except around store retrieval
292
- # Let store initialization errors propagate directly
293
- store = self._get_store() # No argument needed
349
+ # Get Store
350
+ store = self._get_store()
294
351
 
295
- # --- 3. Create Embedder ---
296
- # Errors during embedder creation will propagate from the helper
352
+ # Create Embedder
297
353
  embedder = self._get_document_embedder(embedder_device)
298
354
 
299
- # --- 4. Convert Indexable to Haystack Docs & Embed ---
355
+ # Convert Indexable to Haystack Docs & Embed
300
356
  haystack_docs_to_embed: List[HaystackDocument] = []
301
357
  logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
302
- # Consume Indexable items using the protocol methods
303
358
  for item in indexable_list:
304
359
  doc_id = item.get_id()
305
360
  metadata = item.get_metadata()
306
- content_obj = item.get_content() # This might be Page, Region, etc.
307
-
308
- # Determine content based on embedder type and content object
309
- # For now, assume text content is needed and try to extract it
361
+ content_obj = item.get_content()
310
362
  content_text = ""
311
363
  if isinstance(content_obj, str):
312
- # If get_content() already returned text
313
364
  content_text = content_obj
314
365
  elif hasattr(content_obj, "extract_text") and callable(
315
366
  getattr(content_obj, "extract_text")
316
367
  ):
317
- # If content object has extract_text (like Page or Region)
318
368
  try:
319
369
  content_text = content_obj.extract_text()
320
370
  if not isinstance(content_text, str):
@@ -329,18 +379,12 @@ class HaystackSearchService(SearchServiceProtocol):
329
379
  )
330
380
  content_text = str(content_obj)
331
381
  else:
332
- # Attempt to convert to string as fallback if no obvious text method
333
382
  logger.warning(
334
383
  f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
335
384
  )
336
385
  content_text = str(content_obj)
337
386
 
338
- # Construct HaystackDocument using data from Indexable protocol methods
339
- haystack_doc = HaystackDocument(
340
- id=doc_id, # Use ID from get_id()
341
- content=content_text,
342
- meta=metadata, # Use metadata from get_metadata()
343
- )
387
+ haystack_doc = HaystackDocument(id=doc_id, content=content_text, meta=metadata)
344
388
  haystack_docs_to_embed.append(haystack_doc)
345
389
 
346
390
  if not haystack_docs_to_embed:
@@ -353,68 +397,57 @@ class HaystackSearchService(SearchServiceProtocol):
353
397
  f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
354
398
  )
355
399
  try:
356
- # Embed the documents
357
400
  embedding_results = embedder.run(documents=haystack_docs_to_embed)
358
401
  embedded_docs = embedding_results["documents"]
359
402
  logger.info(f"Successfully embedded {len(embedded_docs)} documents.")
360
403
 
361
404
  except haystack.errors.dimensionality_mismatch.InvalidDimensionError as dim_error:
362
- # Keep specific catch for dimension mismatch - provides useful context
363
- error_msg = f"Indexing failed for collection '{self.collection_name}'. Dimension mismatch: {dim_error}. "
364
- error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
405
+ error_msg = (
406
+ f"Indexing failed for table '{self.table_name}'. Dimension mismatch: {dim_error}. "
407
+ )
408
+ error_msg += f"Ensure the embedding model ('{self._embedding_model}', Dim: {self._embedding_dims}) matches the expected dimension of the store. "
365
409
  if self._persist:
366
- error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
367
- error_msg += (
368
- "Try deleting the persistent storage directory or using force_reindex=True."
369
- )
410
+ error_msg += f"If the table already exists at '{self._uri}', it might have been created with a different model/dimension. "
411
+ error_msg += f"Try deleting the LanceDB table directory ('{os.path.join(self._uri, self.table_name + '.lance')}') or using force_reindex=True."
370
412
  else:
371
413
  error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
372
414
  logger.error(error_msg, exc_info=True)
373
415
  raise IndexConfigurationError(error_msg) from dim_error
374
- # REMOVED broad except Exception for embedding errors. Let them propagate.
375
416
 
376
- # --- 5. Write Embedded Documents to Store ---
417
+ # Write Embedded Documents to Store
377
418
  logger.info(
378
- f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'..."
419
+ f"Writing {len(embedded_docs)} embedded documents to store (Table/Type: '{self.table_name if self._persist else 'InMemory'}')..."
379
420
  )
380
- # REMOVED try...except around store writing. Let errors propagate.
381
421
  write_result = store.write_documents(
382
- documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE # Or configure as needed
383
- )
384
- logger.info(
385
- f"Successfully wrote {write_result} documents to store '{self.collection_name}'."
422
+ documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE
386
423
  )
387
- # --- Add explicit count check after writing ---
388
- logger.info(
389
- f"Store '{self.collection_name}' document count after write: {store.count_documents()}"
390
- )
391
- # --- End count check ---
424
+ logger.info(f"Successfully wrote {write_result} documents to store.")
425
+ try:
426
+ count = store.count_documents()
427
+ logger.info(f"Store document count after write: {count}")
428
+ except Exception as count_error:
429
+ logger.warning(f"Could not get document count after write: {count_error}")
392
430
 
393
431
  def search(
394
432
  self,
395
- query: Any, # Changed from Union[str, Path, Image.Image] to Any
433
+ query: Any,
396
434
  options: BaseSearchOptions,
397
435
  ) -> List[Dict[str, Any]]:
398
436
  logger.info(
399
- f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}"
437
+ f"Search request for table/store='{self.table_name if self._persist else 'InMemory'}', query_type={type(query).__name__}, options={options}"
400
438
  )
401
439
 
402
- store = self._get_store() # Let errors propagate
440
+ store = self._get_store()
403
441
 
404
- # --- 1. Handle Query Type and Embedding ---
405
- # This implementation currently only supports text query embedding.
406
- # TODO: Refactor or extend for multimodal queries based on service capabilities/options.
442
+ # Handle Query Type and Embedding
407
443
  query_embedding = None
408
444
  query_text = ""
409
445
  if isinstance(query, (str, os.PathLike)):
410
446
  if isinstance(query, os.PathLike):
411
- logger.warning(
412
- "Image path query received, but multimodal search not fully implemented. Treating as text path string."
413
- )
447
+ logger.warning("Image path query received, treating as text path string.")
414
448
  query_text = str(query)
415
449
  else:
416
450
  query_text = query
417
-
418
451
  text_embedder = self._get_text_embedder()
419
452
  embedding_result = text_embedder.run(text=query_text)
420
453
  query_embedding = embedding_result["embedding"]
@@ -423,19 +456,11 @@ class HaystackSearchService(SearchServiceProtocol):
423
456
  logger.debug(
424
457
  f"Successfully generated query text embedding (dim: {len(query_embedding)})."
425
458
  )
426
-
427
459
  elif isinstance(query, Image.Image):
428
- logger.error(
429
- "Multimodal query (PIL Image) is not yet supported by this service implementation."
430
- )
431
- raise NotImplementedError(
432
- "Search with PIL Image queries is not implemented in HaystackSearchService."
433
- )
434
- # Check if query is Indexable and try extracting text?
460
+ logger.error("Multimodal query (PIL Image) is not yet supported.")
461
+ raise NotImplementedError("Search with PIL Image queries is not implemented.")
435
462
  elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
436
- logger.debug(
437
- f"Query type {type(query).__name__} has extract_text. Extracting text for search."
438
- )
463
+ logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text.")
439
464
  try:
440
465
  query_text = query.extract_text()
441
466
  if not query_text or not query_text.strip():
@@ -443,82 +468,121 @@ class HaystackSearchService(SearchServiceProtocol):
443
468
  f"Query object {type(query).__name__} provided empty text. Returning no results."
444
469
  )
445
470
  return []
446
- # Embed the extracted text
447
471
  text_embedder = self._get_text_embedder()
448
472
  embedding_result = text_embedder.run(text=query_text)
449
473
  query_embedding = embedding_result["embedding"]
450
474
  if not query_embedding:
451
475
  raise ValueError(
452
- f"Text embedder did not return an embedding for text extracted from {type(query).__name__}."
476
+ f"Text embedder did not return embedding for text from {type(query).__name__}."
453
477
  )
454
478
  logger.debug(
455
- f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)})."
479
+ f"Generated query embedding from extracted text (dim: {len(query_embedding)})."
456
480
  )
457
481
  except Exception as e:
458
482
  logger.error(
459
- f"Failed to extract or embed text from query object {type(query).__name__}: {e}",
483
+ f"Failed to extract/embed text from query object {type(query).__name__}: {e}",
460
484
  exc_info=True,
461
485
  )
462
486
  raise RuntimeError("Query text extraction or embedding failed.") from e
463
-
464
487
  else:
465
- # Raise specific error for unsupported types by this implementation
466
488
  raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
467
489
 
468
- # --- 2. Select Retriever based on Store Type ---
490
+ # Select Retriever based on Store Type
469
491
  retriever = None
470
- if isinstance(store, ChromaDocumentStore):
471
- if not ChromaEmbeddingRetriever:
472
- raise ImportError("ChromaEmbeddingRetriever is required but not available.")
473
- retriever = ChromaEmbeddingRetriever(document_store=store)
474
- elif isinstance(store, InMemoryDocumentStore):
492
+ # Check if LanceDB is available *before* checking isinstance
493
+ if (
494
+ LANCEDB_HAYSTACK_AVAILABLE
495
+ and LanceDBDocumentStore
496
+ and isinstance(store, LanceDBDocumentStore)
497
+ ):
498
+ if not LanceDBEmbeddingRetriever:
499
+ raise ImportError("LanceDBEmbeddingRetriever is required but not available.")
500
+ retriever = LanceDBEmbeddingRetriever(document_store=store)
501
+ # Check if InMemory is available *before* checking isinstance
502
+ elif (
503
+ InMemoryDocumentStore
504
+ and InMemoryEmbeddingRetriever
505
+ and isinstance(store, InMemoryDocumentStore)
506
+ ):
507
+ # No separate HAS_INMEMORY flag, check if classes are not None
475
508
  retriever = InMemoryEmbeddingRetriever(document_store=store)
476
509
  else:
477
- # Raise specific error for unsupported store
478
- raise TypeError(f"Cannot perform search with store type {type(store)}.")
510
+ # Improved error message if store type is unexpected
511
+ store_type_name = type(store).__name__
512
+ available_integrations = []
513
+ if LANCEDB_HAYSTACK_AVAILABLE and LanceDBDocumentStore:
514
+ available_integrations.append("LanceDB")
515
+ if InMemoryDocumentStore:
516
+ available_integrations.append("InMemory")
517
+
518
+ if not available_integrations:
519
+ raise TypeError(
520
+ f"Cannot perform search: No supported document store integrations (LanceDB, InMemory) seem to be available. "
521
+ f"Check Haystack installation."
522
+ )
523
+ # Check if the store type matches one of the available integrations' expected types
524
+ elif (
525
+ LANCEDB_HAYSTACK_AVAILABLE
526
+ and LanceDBDocumentStore
527
+ and isinstance(store, LanceDBDocumentStore)
528
+ ) or (InMemoryDocumentStore and isinstance(store, InMemoryDocumentStore)):
529
+ # This case implies the retriever class (e.g., LanceDBEmbeddingRetriever) might be missing
530
+ missing_retriever = ""
531
+ if isinstance(store, LanceDBDocumentStore):
532
+ missing_retriever = "LanceDBEmbeddingRetriever"
533
+ if isinstance(store, InMemoryDocumentStore):
534
+ missing_retriever = "InMemoryEmbeddingRetriever"
535
+ raise ImportError(
536
+ f"Store type '{store_type_name}' is supported, but its retriever component '{missing_retriever}' failed to import or is unavailable."
537
+ )
538
+ else: # Store type doesn't match any known/available store type
539
+ raise TypeError(
540
+ f"Cannot perform search with unexpected store type '{store_type_name}'. "
541
+ f"Available integrations: {', '.join(available_integrations)}."
542
+ )
479
543
 
480
- # --- 3. Build Retrieval Pipeline ---
544
+ # This check remains as a final safeguard, though the logic above should catch most issues
545
+ if not retriever:
546
+ raise RuntimeError(
547
+ f"Failed to select a suitable retriever for store type {type(store).__name__}. Please check dependencies and integration availability."
548
+ )
549
+
550
+ logger.debug(f"Selected retriever: {type(retriever).__name__}")
551
+
552
+ # Build Retrieval Pipeline
481
553
  pipeline = Pipeline()
482
554
  pipeline.add_component("retriever", retriever)
483
- # Add Ranker logic (remains the same)
484
- # ... (ranker setup if needed)
485
555
 
486
- # --- 4. Prepare Filters (remains the same) ---
556
+ # Prepare Filters
487
557
  haystack_filters = options.filters
488
558
  if haystack_filters:
489
559
  logger.debug(f"Applying filters: {haystack_filters}")
490
560
 
491
- # --- 5. Prepare Retriever Input Data (Dynamically) ---
561
+ # Prepare Retriever Input Data
492
562
  retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
493
- # Both InMemoryEmbeddingRetriever and ChromaEmbeddingRetriever expect 'query_embedding'
494
563
  retriever_input_data["query_embedding"] = query_embedding
495
564
  logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
496
565
 
497
- # --- 6. Run Retrieval ---
566
+ # Run Retrieval
498
567
  try:
499
- logger.info(f"Running retrieval pipeline for collection '{self.collection_name}'...")
500
- result = pipeline.run(
501
- data={"retriever": retriever_input_data}
502
- # ... (ranker data if needed)
568
+ logger.info(
569
+ f"Running retrieval pipeline for table/store '{self.table_name if self._persist else 'InMemory'}'..."
503
570
  )
571
+ result = pipeline.run(data={"retriever": retriever_input_data})
504
572
 
505
- # --- 7. Format Results ---
573
+ # Format Results
506
574
  if "retriever" in result and "documents" in result["retriever"]:
507
575
  retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
508
576
  logger.info(f"Retrieved {len(retrieved_docs)} documents.")
509
- # Format results (remains the same)
510
577
  final_results = []
511
578
  for doc in retrieved_docs:
512
- # Include content_hash in returned metadata if present
513
579
  meta_with_hash = doc.meta
514
- # No need to explicitly add hash here if Haystack store preserves it
515
580
  result_dict = {
516
581
  "content_snippet": doc.content[:200] if doc.content else "",
517
582
  "score": doc.score if doc.score is not None else 0.0,
518
583
  "page_number": meta_with_hash.get("page_number", None),
519
584
  "pdf_path": meta_with_hash.get("pdf_path", None),
520
- "metadata": meta_with_hash, # Pass full metadata
521
- # "_haystack_document": doc # Optionally include full object
585
+ "metadata": meta_with_hash,
522
586
  }
523
587
  final_results.append(result_dict)
524
588
  return final_results
@@ -527,117 +591,97 @@ class HaystackSearchService(SearchServiceProtocol):
527
591
  return []
528
592
 
529
593
  except FileNotFoundError:
530
- # Keep specific catch for collection not found during retrieval
531
594
  logger.error(
532
- f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'."
595
+ f"Search failed: Could not access path for table/store '{self.table_name if self._persist else 'InMemory'}' (URI: '{self._uri if self._persist else 'N/A'}')."
533
596
  )
534
- raise # Re-raise the specific FileNotFoundError
535
- # REMOVED broad except Exception for pipeline execution. Let errors propagate.
597
+ raise
536
598
 
537
- def delete_index(
538
- self,
539
- ) -> bool:
599
+ def delete_index(self) -> bool:
540
600
  """
541
- Deletes the entire index/collection managed by this service instance.
601
+ Deletes the entire LanceDB table or resets the InMemory store.
542
602
 
543
603
  Returns:
544
- True if deletion was successful or collection didn't exist, False otherwise.
604
+ True if deletion was successful or table/store didn't exist, False otherwise.
545
605
  """
546
- logger.warning(f"Request to delete index for collection '{self.collection_name}'.")
547
606
  if self._persist:
548
- # Delegate to internal ChromaDB deletion helper
549
- return self._delete_chroma_collection()
550
- else:
551
- # For InMemory, "deleting" means re-initializing the store
552
- logger.info(
553
- f"Re-initializing InMemory store for '{self.collection_name}' as deletion request."
607
+ logger.warning(
608
+ f"Request to delete LanceDB table '{self.table_name}' at uri '{self._uri}'."
554
609
  )
610
+ return self._delete_lancedb_table()
611
+ else:
612
+ logger.info("Request to delete InMemory store (re-initializing).)")
613
+ if not InMemoryDocumentStore:
614
+ raise ImportError("InMemoryDocumentStore not available.")
555
615
  self._in_memory_store = InMemoryDocumentStore()
556
- return True # Considered successful
616
+ return True
557
617
 
558
- def index_exists(
559
- self,
560
- ) -> bool:
618
+ def index_exists(self) -> bool:
561
619
  """
562
- Checks if the index/collection managed by this service instance exists.
563
- NOTE: For ChromaDB, this may involve trying to connect.
564
- For InMemory, it checks if the internal store object exists and has documents.
620
+ Checks if the LanceDB table or InMemory store exists and has documents.
621
+ NOTE: For LanceDB, this tries to count documents, implicitly checking connection/table existence.
622
+ For InMemory, it checks if the internal store object exists and has documents.
565
623
  """
566
- logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
567
- store = self._get_store() # Get the store instance
624
+ store_name = self.table_name if self._persist else "InMemory"
625
+ logger.debug(
626
+ f"Checking existence of index for '{store_name}'. URI: '{self._uri if self._persist else 'N/A'}'"
627
+ )
568
628
  try:
629
+ store = self._get_store()
569
630
  count = store.count_documents()
570
631
  exists = count > 0
571
632
  logger.debug(
572
- f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}"
633
+ f"Store type {type(store).__name__} for '{store_name}' exists and has {count} documents: {exists}"
573
634
  )
574
635
  return exists
636
+ except ImportError as ie:
637
+ logger.error(f"Import error checking index existence for '{store_name}': {ie}")
638
+ return False
575
639
  except Exception as e:
576
- # Catch errors during count_documents (e.g., connection error for persistent stores)
577
640
  logger.warning(
578
- f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}",
641
+ f"Could not confirm existence or count documents in store for '{store_name}': {e}",
579
642
  exc_info=False,
580
643
  )
581
- # Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
582
- # Assume not exists if count fails
583
644
  return False
584
645
 
585
646
  # --- Sync Methods Implementation ---
586
647
 
587
648
  def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
588
- """Retrieves documents, required for sync.
589
- NOTE: Haystack's filter_documents is the closest match.
590
- Fetches all docs if filters=None.
591
- """
649
+ """Retrieves documents, required for sync."""
650
+ store_name = self.table_name if self._persist else "InMemory"
592
651
  logger.debug(
593
- f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})..."
652
+ f"Listing documents for '{store_name}' (include_metadata={include_metadata})..."
594
653
  )
595
654
  store = self._get_store()
596
655
  try:
597
- # Use filter_documents with no filters to get all
598
- # This might be inefficient for very large stores.
599
- haystack_docs = store.filter_documents(
600
- filters=kwargs.get("filters")
601
- ) # Pass filters if provided via kwargs
602
- logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
603
- # Convert to simple dicts
656
+ haystack_docs = store.filter_documents(filters=kwargs.get("filters"))
657
+ logger.info(f"Retrieved {len(haystack_docs)} documents from store '{store_name}'.")
604
658
  results = []
605
659
  for doc in haystack_docs:
606
- doc_dict = {"id": doc.id} # ID is essential
660
+ doc_dict = {"id": doc.id}
607
661
  if include_metadata:
608
- # Ensure content_hash is included if it exists in meta
609
662
  doc_dict["meta"] = doc.meta
610
- # Optionally include content? Protocol doesn't require it.
611
- # doc_dict["content"] = doc.content
612
663
  results.append(doc_dict)
613
664
  return results
614
665
  except Exception as e:
615
- logger.error(
616
- f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True
617
- )
618
- raise RuntimeError(
619
- f"Failed to list documents from store '{self.collection_name}'."
620
- ) from e
666
+ logger.error(f"Failed to list documents from store '{store_name}': {e}", exc_info=True)
667
+ raise RuntimeError(f"Failed to list documents from store '{store_name}'.") from e
621
668
 
622
669
  def delete_documents(self, ids: List[str]) -> None:
623
670
  """Deletes documents by ID, required for sync."""
671
+ store_name = self.table_name if self._persist else "InMemory"
624
672
  if not ids:
625
- logger.debug("No document IDs provided for deletion. Skipping.")
673
+ logger.debug(f"No document IDs provided for deletion from '{store_name}'. Skipping.")
626
674
  return
627
- logger.warning(
628
- f"Request to delete {len(ids)} documents from collection '{self.collection_name}'."
629
- )
675
+ logger.warning(f"Request to delete {len(ids)} documents from '{store_name}'.")
630
676
  store = self._get_store()
631
677
  try:
632
678
  store.delete_documents(ids=ids)
633
679
  logger.info(
634
- f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}"
680
+ f"Successfully requested deletion of {len(ids)} documents from '{store_name}'. Store count now: {store.count_documents()}"
635
681
  )
636
682
  except Exception as e:
637
683
  logger.error(
638
- f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}",
684
+ f"Failed to delete documents with IDs {ids} from store '{store_name}': {e}",
639
685
  exc_info=True,
640
686
  )
641
- raise RuntimeError(
642
- f"Failed to delete documents from store '{self.collection_name}'."
643
- ) from e
687
+ raise RuntimeError(f"Failed to delete documents from store '{store_name}'.") from e