natural-pdf 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,520 @@
1
+ """Implementation of the SearchServiceProtocol using Haystack components."""
2
+
3
+ import logging
4
+ import os
5
+ from typing import List, Dict, Any, Optional, Union, Iterable
6
+ from pathlib import Path
7
+ from PIL import Image
8
+ import copy
9
+
10
+ # --- Haystack Imports ---
11
+ try:
12
+ import haystack
13
+ from haystack import Pipeline
14
+ from haystack.dataclasses import Document as HaystackDocument
15
+ from haystack.document_stores.types import DocumentStore, DuplicatePolicy
16
+ from haystack_integrations.document_stores.chroma import ChromaDocumentStore
17
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
18
+ from haystack.components.embedders import (
19
+ SentenceTransformersTextEmbedder,
20
+ SentenceTransformersDocumentEmbedder
21
+ )
22
+ # Import necessary retrievers, rankers etc. as needed for search()
23
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
24
+ from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever # Use embedding retriever
25
+ # Need Ranker if used
26
+ try:
27
+ from haystack.components.rankers import CohereRanker
28
+ except ImportError:
29
+ CohereRanker = None
30
+
31
+ # Don't define here, it's imported later
32
+ except ImportError:
33
+ # Set flags/placeholders if Haystack isn't installed
34
+ # Don't define here, it's imported later
35
+ DocumentStore = object
36
+ HaystackDocument = Dict
37
+ ChromaDocumentStore = None
38
+ InMemoryDocumentStore = None
39
+ SentenceTransformersDocumentEmbedder = None
40
+ SentenceTransformersTextEmbedder = None
41
+ InMemoryEmbeddingRetriever = None
42
+ ChromaEmbeddingRetriever = None # Fallback definition
43
+ CohereRanker = None
44
+ Pipeline = None
45
+ DuplicatePolicy = None
46
+
47
+ # --- ChromaDB Client Import (for management) ---
48
+ try:
49
+ import chromadb
50
+ CHROMADB_AVAILABLE = True
51
+ except ImportError:
52
+ chromadb = None
53
+ CHROMADB_AVAILABLE = False
54
+
55
+ # --- Local Imports ---
56
+ from .search_service_protocol import SearchServiceProtocol, IndexConfigurationError, IndexExistsError
57
+ from .search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
58
+ from .search_service_protocol import Indexable
59
+ from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
60
+
61
+ # --- Logging ---
62
+ logger = logging.getLogger(__name__)
63
+
64
+ # --- Default Configuration Values ---
65
+ DEFAULT_PERSIST_PATH = "./natural_pdf_index"
66
+ DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
67
+
68
+ class HaystackSearchService(SearchServiceProtocol):
69
+ """
70
+ Haystack-based implementation of the search service protocol.
71
+
72
+ Manages ChromaDB (persistent) or InMemory (non-persistent) DocumentStores
73
+ and uses Haystack components for embedding and retrieval.
74
+ A single instance of this service is tied to a specific collection name.
75
+ """
76
+
77
def __init__(
    self,
    collection_name: str,
    persist: bool = False,
    default_persist_path: str = DEFAULT_PERSIST_PATH,
    embedding_model: str = DEFAULT_EMBEDDING_MODEL,
):
    """
    Bind a new service instance to a single collection.

    Args:
        collection_name: Name of the index/collection handled by this instance.
        persist: When True the instance works against a persistent ChromaDB
            store (created lazily on first use); when False an
            InMemoryDocumentStore is created immediately.
        default_persist_path: Directory used for persistent ChromaDB storage.
        embedding_model: Name of the embedding model used for both documents
            and queries.

    Raises:
        ImportError: If the Haystack extras are not installed.
    """
    if not HAS_HAYSTACK_EXTRAS:
        raise ImportError("HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]")

    # Plain configuration, kept verbatim from the constructor arguments.
    self.collection_name = collection_name
    self._persist = persist
    self._default_persist_path = default_persist_path
    self._embedding_model = embedding_model

    # Only one backing store is ever populated: the in-memory one is built
    # eagerly for transient instances, the Chroma one lazily in _get_store().
    if persist:
        self._in_memory_store: Optional[InMemoryDocumentStore] = None
    else:
        self._in_memory_store = InMemoryDocumentStore()
    self._chroma_store: Optional[ChromaDocumentStore] = None

    logger.info(f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'")
107
+
108
+ # --- Internal Helper Methods --- #
109
+
110
def _get_store(
    self,
) -> DocumentStore:
    """Return the DocumentStore for this collection, creating it if it does not exist yet.

    Transient instances get their InMemoryDocumentStore; persistent instances
    get a ChromaDocumentStore that is constructed on first call and cached.
    """
    if not self._persist:
        # Normally created in __init__; rebuild defensively if it went missing.
        if self._in_memory_store is None:
            logger.warning(f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now.")
            self._in_memory_store = InMemoryDocumentStore()
        return self._in_memory_store

    if self._chroma_store is None:
        # First access for a persistent instance: build and cache the store.
        logger.debug(f"Initializing ChromaDocumentStore for collection '{self.collection_name}'.")
        self._chroma_store = ChromaDocumentStore(
            persist_path=self._default_persist_path,
            collection_name=self.collection_name,
        )
    return self._chroma_store
130
+
131
def _get_document_embedder(
    self,
    device: Optional[str] = None
) -> SentenceTransformersDocumentEmbedder:
    """Create and warm up the Haystack document embedder for this service's model.

    Args:
        device: Optional device string (for example "cpu" or "cuda:0"). None or
            an empty string lets Haystack pick the device.

    Returns:
        A warmed-up SentenceTransformersDocumentEmbedder.

    Raises:
        ImportError: If the embedder class is not available.
        RuntimeError: If the embedder cannot be constructed or warmed up.
    """
    model_name = self._embedding_model
    logger.debug(f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}")
    if not SentenceTransformersDocumentEmbedder:
        raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
    try:
        # Haystack 2.x embedders take a ComponentDevice rather than a raw
        # string, so convert when that class is importable.
        component_device = device or None
        if isinstance(component_device, str):
            try:
                from haystack.utils import ComponentDevice
            except ImportError:
                # NOTE(review): presumably a Haystack build that still accepts
                # plain strings; pass the value through unchanged.
                pass
            else:
                component_device = ComponentDevice.from_str(component_device)

        embedder = SentenceTransformersDocumentEmbedder(
            model=model_name,
            device=component_device,
        )
        embedder.warm_up()
        logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
        return embedder
    except Exception as e:
        logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
        raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e
151
+
152
def _get_text_embedder(
    self,
    device: Optional[str] = None
) -> SentenceTransformersTextEmbedder:
    """Create and warm up the Haystack text embedder used for query strings.

    Args:
        device: Optional device string (for example "cpu" or "cuda:0"). None or
            an empty string lets Haystack pick the device.

    Returns:
        A warmed-up SentenceTransformersTextEmbedder.

    Raises:
        ImportError: If the embedder class is not available.
        RuntimeError: If the embedder cannot be constructed or warmed up.
    """
    model_name = self._embedding_model
    logger.debug(f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}")
    if not SentenceTransformersTextEmbedder:
        raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
    try:
        # Haystack 2.x embedders take a ComponentDevice rather than a raw
        # string, so convert when that class is importable.
        component_device = device or None
        if isinstance(component_device, str):
            try:
                from haystack.utils import ComponentDevice
            except ImportError:
                # NOTE(review): presumably a Haystack build that still accepts
                # plain strings; pass the value through unchanged.
                pass
            else:
                component_device = ComponentDevice.from_str(component_device)

        embedder = SentenceTransformersTextEmbedder(model=model_name, device=component_device)
        embedder.warm_up()
        logger.info(f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
        return embedder
    except Exception as e:
        logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
        raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
169
+
170
def _delete_chroma_collection(self) -> bool:
    """Delete the persistent ChromaDB collection managed by this service.

    Returns:
        True if the collection was deleted or did not exist; False if this
        instance is not persistent or the deletion failed for another reason.

    Raises:
        ImportError: If the 'chromadb' library is not installed.
    """
    if not CHROMADB_AVAILABLE:
        logger.error("Cannot delete ChromaDB collection because 'chromadb' library is not installed.")
        raise ImportError("'chromadb' library required for collection deletion.")
    if not self._persist:
        logger.warning("Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring.")
        return False  # Nothing persistent to delete

    collection_name_to_delete = self.collection_name

    # Look the "collection is missing" exception classes up by name instead of
    # hard-coding one attribute: referencing a class that the installed
    # chromadb does not define would raise AttributeError inside the handler
    # and turn a harmless "nothing to delete" into a reported failure.
    chroma_errors = getattr(chromadb, "errors", None)
    missing_collection_errors = tuple(
        exc_type
        for exc_type in (
            getattr(chroma_errors, "InvalidCollectionException", None),
            getattr(chroma_errors, "NotFoundError", None),
        )
        if isinstance(exc_type, type) and issubclass(exc_type, BaseException)
    )

    try:
        logger.warning(f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'.")
        chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
        try:
            chroma_client.delete_collection(name=collection_name_to_delete)
        except missing_collection_errors:
            logger.info(f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed.")
        else:
            logger.info(f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'.")
        # Either way, drop the cached store so the next _get_store() call
        # builds a fresh handle instead of reusing a stale one.
        self._chroma_store = None
        return True
    except ImportError:
        raise
    except Exception as e:
        logger.error(f"Error during ChromaDB collection deletion '{self.collection_name}': {e}", exc_info=True)
        # Don't raise here, let index() decide based on force_reindex
        return False
198
+
199
+ # --- Protocol Methods Implementation --- #
200
+
201
def index(
    self,
    documents: Iterable[Indexable],
    embedder_device: Optional[str] = None,
    force_reindex: bool = False,
) -> None:
    """Embed the given items and write them to this service's document store.

    Args:
        documents: Items exposing get_id(), get_metadata() and get_content().
            The iterable is consumed into a list up front.
        embedder_device: Optional device string forwarded to the document
            embedder.
        force_reindex: If True, the existing collection is deleted (persistent)
            or the in-memory store is replaced before indexing.

    Raises:
        ImportError: If force_reindex is requested for a persistent store and
            'chromadb' is not installed.
        IndexConfigurationError: If embedding or writing fails with a
            recognised dimension-mismatch error.
        RuntimeError: If the document embedder cannot be created.
    """
    # Materialise the iterable so it can be counted and iterated again.
    indexable_list = list(documents)
    logger.info(f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}")

    if not indexable_list:
        logger.warning("No documents provided for indexing. Skipping.")
        return

    # --- 1. Handle reindexing (before the store or embedder is created) ---
    if force_reindex:
        logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
        if self._persist:
            # Raises ImportError if chromadb is missing; returns False on other failures.
            deleted = self._delete_chroma_collection()
            if not deleted:
                logger.warning("Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere.")
        else:
            # For the in-memory backend a reindex simply means a fresh store.
            logger.info(f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'.")
            self._in_memory_store = InMemoryDocumentStore()

    # --- 2. Store and embedder; initialisation errors propagate to the caller ---
    store = self._get_store()
    embedder = self._get_document_embedder(embedder_device)

    # --- 3. Convert Indexable items to Haystack documents ---
    haystack_docs_to_embed: List[HaystackDocument] = []
    logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
    for item in indexable_list:
        doc_id = item.get_id()
        metadata = item.get_metadata()
        content_obj = item.get_content()  # May be text or an object such as a Page/Region

        if isinstance(content_obj, str):
            content_text = content_obj
        elif hasattr(content_obj, 'extract_text') and callable(getattr(content_obj, 'extract_text')):
            try:
                content_text = content_obj.extract_text()
                if not isinstance(content_text, str):
                    logger.warning(f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str().")
                    content_text = str(content_obj)
            except Exception as extraction_error:
                logger.error(f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().", exc_info=False)
                content_text = str(content_obj)
        else:
            # No obvious text accessor: fall back to the string representation.
            logger.warning(f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str().")
            content_text = str(content_obj)

        haystack_docs_to_embed.append(
            HaystackDocument(
                id=doc_id,
                content=content_text,
                # search() calls .get() on the metadata, so never store None.
                meta=metadata if metadata is not None else {},
            )
        )

    if not haystack_docs_to_embed:
        logger.warning("No Haystack documents were prepared. Check conversion logic and input data.")
        return

    # Resolve dimension-mismatch exception classes by name. A hard-coded
    # attribute path in the except clause is only evaluated once something has
    # already been raised; if the path does not exist, the resulting
    # AttributeError would hide the real failure.
    haystack_dim_module = getattr(getattr(haystack, "errors", None), "dimensionality_mismatch", None)
    dimension_error_types = tuple(
        exc_type
        for exc_type in (
            getattr(haystack_dim_module, "InvalidDimensionError", None),
            getattr(getattr(chromadb, "errors", None), "InvalidDimensionException", None),
        )
        if isinstance(exc_type, type) and issubclass(exc_type, BaseException)
    )

    # --- 4. Embed and write ---
    # Both steps sit under the same handler: a mismatch against an existing
    # persistent collection can only be detected when the vectors are written.
    # Every other exception propagates unchanged.
    logger.info(f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'...")
    try:
        embedding_results = embedder.run(documents=haystack_docs_to_embed)
        embedded_docs = embedding_results["documents"]
        logger.info(f"Successfully embedded {len(embedded_docs)} documents.")

        logger.info(f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'...")
        write_result = store.write_documents(
            documents=embedded_docs,
            policy=DuplicatePolicy.OVERWRITE,
        )
    except dimension_error_types as dim_error:
        error_msg = f"Indexing failed for collection '{self.collection_name}'. Dimension mismatch: {dim_error}. "
        error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
        if self._persist:
            error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
            error_msg += "Try deleting the persistent storage directory or using force_reindex=True."
        else:
            error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
        logger.error(error_msg, exc_info=True)
        raise IndexConfigurationError(error_msg) from dim_error

    logger.info(f"Successfully wrote {write_result} documents to store '{self.collection_name}'.")
    # Explicit count check after writing, useful when diagnosing silent write issues.
    logger.info(f"Store '{self.collection_name}' document count after write: {store.count_documents()}")
312
+
313
def search(
    self,
    query: Any,
    options: BaseSearchOptions,
) -> List[Dict[str, Any]]:
    """Run an embedding-based search against this service's collection.

    Args:
        query: A text string, an os.PathLike (treated as its string form), or
            an object exposing a callable extract_text(). PIL images are
            rejected.
        options: Search options; `filters` and `top_k` are read from it.

    Returns:
        One dict per retrieved document with the keys "content_snippet",
        "score", "page_number", "pdf_path" and "metadata". An empty list is
        returned when an extract_text() query yields no text or when the
        pipeline output lacks retriever documents.

    Raises:
        NotImplementedError: For PIL image queries.
        TypeError: For unsupported query or store types.
        ValueError: If no embedding is produced for a string/path query.
        RuntimeError: If text extraction or embedding fails for an
            extract_text() query.
        FileNotFoundError: Re-raised if raised while running retrieval.
    """
    logger.info(f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}")

    store = self._get_store()

    # --- 1. Turn the query into an embedding (text queries only for now) ---
    if isinstance(query, (str, os.PathLike)):
        if isinstance(query, os.PathLike):
            logger.warning("Image path query received, but multimodal search not fully implemented. Treating as text path string.")
            query_text = str(query)
        else:
            query_text = query

        text_embedder = self._get_text_embedder()
        query_embedding = text_embedder.run(text=query_text)["embedding"]
        if not query_embedding:
            raise ValueError("Text embedder did not return an embedding for the query.")
        logger.debug(f"Successfully generated query text embedding (dim: {len(query_embedding)}).")

    elif isinstance(query, Image.Image):
        logger.error("Multimodal query (PIL Image) is not yet supported by this service implementation.")
        raise NotImplementedError("Search with PIL Image queries is not implemented in HaystackSearchService.")

    elif hasattr(query, 'extract_text') and callable(getattr(query, 'extract_text')):
        query_type_name = type(query).__name__
        logger.debug(f"Query type {query_type_name} has extract_text. Extracting text for search.")
        try:
            query_text = query.extract_text()
            if not query_text or not query_text.strip():
                logger.warning(f"Query object {query_type_name} provided empty text. Returning no results.")
                return []
            text_embedder = self._get_text_embedder()
            query_embedding = text_embedder.run(text=query_text)["embedding"]
            if not query_embedding:
                raise ValueError(f"Text embedder did not return an embedding for text extracted from {query_type_name}.")
            logger.debug(f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)}).")
        except Exception as e:
            # Any failure in this branch, including the ValueError above, is
            # reported to the caller as a RuntimeError.
            logger.error(f"Failed to extract or embed text from query object {query_type_name}: {e}", exc_info=True)
            raise RuntimeError("Query text extraction or embedding failed.") from e

    else:
        raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")

    # --- 2. Pick the retriever that matches the store type ---
    if isinstance(store, ChromaDocumentStore):
        if not ChromaEmbeddingRetriever:
            raise ImportError("ChromaEmbeddingRetriever is required but not available.")
        retriever = ChromaEmbeddingRetriever(document_store=store)
    elif isinstance(store, InMemoryDocumentStore):
        retriever = InMemoryEmbeddingRetriever(document_store=store)
    else:
        raise TypeError(f"Cannot perform search with store type {type(store)}.")

    # --- 3. Single-component retrieval pipeline ---
    pipeline = Pipeline()
    pipeline.add_component("retriever", retriever)

    # --- 4. Filters and retriever inputs ---
    haystack_filters = options.filters
    if haystack_filters:
        logger.debug(f"Applying filters: {haystack_filters}")

    retriever_input_data = {
        "filters": haystack_filters,
        "top_k": options.top_k,
        "query_embedding": query_embedding,
    }
    logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")

    # --- 5. Run retrieval and shape the output ---
    try:
        logger.info(f"Running retrieval pipeline for collection '{self.collection_name}'...")
        result = pipeline.run(data={"retriever": retriever_input_data})

        if "retriever" not in result or "documents" not in result["retriever"]:
            logger.warning("Pipeline result did not contain expected retriever output.")
            return []

        retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
        logger.info(f"Retrieved {len(retrieved_docs)} documents.")
        return [
            {
                "content_snippet": doc.content[:200] if doc.content else "",
                "score": doc.score if doc.score is not None else 0.0,
                "page_number": doc.meta.get("page_number", None),
                "pdf_path": doc.meta.get("pdf_path", None),
                "metadata": doc.meta,  # Full metadata is passed through unchanged
            }
            for doc in retrieved_docs
        ]

    except FileNotFoundError:
        # Log with collection/path context, then let the caller handle it.
        logger.error(f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'.")
        raise
436
+
437
def delete_index(
    self,
) -> bool:
    """
    Remove the whole index/collection handled by this service instance.

    Returns:
        For persistent instances, the result of the internal ChromaDB deletion
        helper. For in-memory instances the store is replaced with a new empty
        one and True is returned.
    """
    logger.warning(f"Request to delete index for collection '{self.collection_name}'.")
    if not self._persist:
        # There is nothing on disk; swapping in a fresh store is the deletion.
        logger.info(f"Re-initializing InMemory store for '{self.collection_name}' as deletion request.")
        self._in_memory_store = InMemoryDocumentStore()
        return True

    return self._delete_chroma_collection()
455
+
456
def index_exists(
    self,
) -> bool:
    """
    Report whether this service's collection currently holds any documents.

    The store is obtained via _get_store() and its document count is read;
    the index is considered to exist only when that count is greater than
    zero. Any error while counting is logged and treated as "does not exist".
    """
    logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
    store = self._get_store()
    try:
        doc_total = store.count_documents()
        has_documents = doc_total > 0
        logger.debug(f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {doc_total} documents: {has_documents}")
        return has_documents
    except Exception as e:
        # Counting can fail (for example when a persistent store cannot be
        # reached); a failed count is reported as a missing index.
        logger.warning(f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}", exc_info=False)
        return False
477
+
478
+ # --- Sync Methods Implementation ---
479
+
480
def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
    """Return the stored documents as plain dicts (used by sync).

    Args:
        include_metadata: When True each dict also carries the document's
            metadata under the "meta" key.
        **kwargs: An optional "filters" entry is forwarded to the store's
            filter_documents(); without it, None is passed.

    Returns:
        A list of dicts, each containing at least "id".

    Raises:
        RuntimeError: If the store query fails.
    """
    logger.debug(f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})...")
    store = self._get_store()
    try:
        # filter_documents is the closest Haystack match for "list everything";
        # this may be slow on very large stores.
        haystack_docs = store.filter_documents(filters=kwargs.get('filters'))
        logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
        return [
            {"id": doc.id, "meta": doc.meta} if include_metadata else {"id": doc.id}
            for doc in haystack_docs
        ]
    except Exception as e:
        logger.error(f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True)
        raise RuntimeError(f"Failed to list documents from store '{self.collection_name}'.") from e
506
+
507
+
508
def delete_documents(self, ids: List[str]) -> None:
    """Delete the documents with the given IDs from the store (used by sync).

    Args:
        ids: IDs of the documents to remove. An empty list is a no-op.

    Raises:
        RuntimeError: If the store reports an error while deleting.
    """
    if not ids:
        logger.debug("No document IDs provided for deletion. Skipping.")
        return
    logger.warning(f"Request to delete {len(ids)} documents from collection '{self.collection_name}'.")
    store = self._get_store()
    try:
        # Pass the IDs positionally: Haystack document stores name this
        # parameter `document_ids`, so the keyword `ids=` raises TypeError and
        # made every deletion fail.
        store.delete_documents(ids)
        logger.info(f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}")
    except Exception as e:
        logger.error(f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}", exc_info=True)
        raise RuntimeError(f"Failed to delete documents from store '{self.collection_name}'.") from e