natural-pdf 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +3 -1
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/METADATA +14 -1
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +18 -9
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,520 @@
|
|
1
|
+
"""Implementation of the SearchServiceProtocol using Haystack components."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
from typing import List, Dict, Any, Optional, Union, Iterable
|
6
|
+
from pathlib import Path
|
7
|
+
from PIL import Image
|
8
|
+
import copy
|
9
|
+
|
10
|
+
# --- Haystack Imports ---
|
11
|
+
try:
|
12
|
+
import haystack
|
13
|
+
from haystack import Pipeline
|
14
|
+
from haystack.dataclasses import Document as HaystackDocument
|
15
|
+
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
16
|
+
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
17
|
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
18
|
+
from haystack.components.embedders import (
|
19
|
+
SentenceTransformersTextEmbedder,
|
20
|
+
SentenceTransformersDocumentEmbedder
|
21
|
+
)
|
22
|
+
# Import necessary retrievers, rankers etc. as needed for search()
|
23
|
+
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
|
24
|
+
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever # Use embedding retriever
|
25
|
+
# Need Ranker if used
|
26
|
+
try:
|
27
|
+
from haystack.components.rankers import CohereRanker
|
28
|
+
except ImportError:
|
29
|
+
CohereRanker = None
|
30
|
+
|
31
|
+
# Don't define here, it's imported later
|
32
|
+
except ImportError:
|
33
|
+
# Set flags/placeholders if Haystack isn't installed
|
34
|
+
# Don't define here, it's imported later
|
35
|
+
DocumentStore = object
|
36
|
+
HaystackDocument = Dict
|
37
|
+
ChromaDocumentStore = None
|
38
|
+
InMemoryDocumentStore = None
|
39
|
+
SentenceTransformersDocumentEmbedder = None
|
40
|
+
SentenceTransformersTextEmbedder = None
|
41
|
+
InMemoryEmbeddingRetriever = None
|
42
|
+
ChromaEmbeddingRetriever = None # Fallback definition
|
43
|
+
CohereRanker = None
|
44
|
+
Pipeline = None
|
45
|
+
DuplicatePolicy = None
|
46
|
+
|
47
|
+
# --- ChromaDB Client Import (for management) ---
|
48
|
+
try:
|
49
|
+
import chromadb
|
50
|
+
CHROMADB_AVAILABLE = True
|
51
|
+
except ImportError:
|
52
|
+
chromadb = None
|
53
|
+
CHROMADB_AVAILABLE = False
|
54
|
+
|
55
|
+
# --- Local Imports ---
|
56
|
+
from .search_service_protocol import SearchServiceProtocol, IndexConfigurationError, IndexExistsError
|
57
|
+
from .search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
|
58
|
+
from .search_service_protocol import Indexable
|
59
|
+
from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
|
60
|
+
|
61
|
+
# --- Logging ---
|
62
|
+
logger = logging.getLogger(__name__)
|
63
|
+
|
64
|
+
# --- Default Configuration Values ---
|
65
|
+
DEFAULT_PERSIST_PATH = "./natural_pdf_index"
|
66
|
+
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
67
|
+
|
68
|
+
class HaystackSearchService(SearchServiceProtocol):
|
69
|
+
"""
|
70
|
+
Haystack-based implementation of the search service protocol.
|
71
|
+
|
72
|
+
Manages ChromaDB (persistent) or InMemory (non-persistent) DocumentStores
|
73
|
+
and uses Haystack components for embedding and retrieval.
|
74
|
+
A single instance of this service is tied to a specific collection name.
|
75
|
+
"""
|
76
|
+
|
77
|
+
def __init__(
    self,
    collection_name: str,
    persist: bool = False,
    default_persist_path: str = DEFAULT_PERSIST_PATH,
    embedding_model: str = DEFAULT_EMBEDDING_MODEL,
):
    """
    Bind a new service instance to a single collection.

    Args:
        collection_name: Name of the index/collection this instance manages.
        persist: When True the instance works against a ChromaDocumentStore,
            which is only created on first use by ``_get_store``. When False
            an InMemoryDocumentStore is created immediately.
        default_persist_path: Path handed to ChromaDB for persistent storage.
        embedding_model: Model name used by the embedders this instance creates.

    Raises:
        ImportError: If ``HAS_HAYSTACK_EXTRAS`` is falsy.
    """
    if not HAS_HAYSTACK_EXTRAS:
        raise ImportError("HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]")

    # Plain configuration captured for later use by the helper methods.
    self.collection_name = collection_name
    self._persist = persist
    self._default_persist_path = default_persist_path
    self._embedding_model = embedding_model

    # Transient mode builds its single in-memory store up front; persistent
    # mode leaves it unset because the Chroma store is used instead.
    if persist:
        initial_memory_store = None
    else:
        initial_memory_store = InMemoryDocumentStore()
    self._in_memory_store: Optional[InMemoryDocumentStore] = initial_memory_store
    # The Chroma store is always created lazily.
    self._chroma_store: Optional[ChromaDocumentStore] = None

    logger.info(f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'")
|
107
|
+
|
108
|
+
# --- Internal Helper Methods --- #
|
109
|
+
|
110
|
+
def _get_store(
    self,
) -> DocumentStore:
    """Return the DocumentStore for this collection, creating it on first use.

    Persistent instances get a ChromaDocumentStore built from the configured
    path and collection name; transient instances get their InMemoryDocumentStore.
    """
    if not self._persist:
        # Normally created in __init__; rebuild defensively if it went missing.
        if self._in_memory_store is None:
            logger.warning(f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now.")
            self._in_memory_store = InMemoryDocumentStore()
        return self._in_memory_store

    if self._chroma_store is None:
        # First access in persistent mode: build the store and cache it.
        logger.debug(f"Initializing ChromaDocumentStore for collection '{self.collection_name}'.")
        self._chroma_store = ChromaDocumentStore(
            persist_path=self._default_persist_path,
            collection_name=self.collection_name,
        )
    return self._chroma_store
|
130
|
+
|
131
|
+
def _get_document_embedder(
    self,
    device: Optional[str] = None
) -> SentenceTransformersDocumentEmbedder:
    """Create and warm up the Haystack document embedder.

    Args:
        device: Optional device string (for example "cpu" or "cuda:0").
            None lets the embedder choose its own device.

    Returns:
        A SentenceTransformersDocumentEmbedder on which ``warm_up()`` has run.

    Raises:
        ImportError: If SentenceTransformersDocumentEmbedder is unavailable.
        RuntimeError: If device resolution, construction or warm-up fails.
    """
    model_name = self._embedding_model  # Use instance model
    logger.debug(f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}")
    if not SentenceTransformersDocumentEmbedder:
        raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
    try:
        resolved_device = device
        if isinstance(device, str):
            # Haystack 2.x documents the embedder's ``device`` parameter as a
            # ComponentDevice, not a raw string, so convert when the class is
            # available. NOTE(review): confirm against the pinned Haystack version.
            try:
                from haystack.utils import ComponentDevice
            except ImportError:
                pass  # Older Haystack without ComponentDevice: keep the raw string.
            else:
                resolved_device = ComponentDevice.from_str(device)
        embedder = SentenceTransformersDocumentEmbedder(
            model=model_name,
            device=resolved_device,
        )
        embedder.warm_up()
        logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
        return embedder
    except Exception as e:
        logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
        raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e
|
151
|
+
|
152
|
+
def _get_text_embedder(
    self,
    device: Optional[str] = None
) -> SentenceTransformersTextEmbedder:
    """Create and warm up the Haystack text embedder used for queries.

    Args:
        device: Optional device string (for example "cpu" or "cuda:0").
            None lets the embedder choose its own device.

    Returns:
        A SentenceTransformersTextEmbedder on which ``warm_up()`` has run.

    Raises:
        ImportError: If SentenceTransformersTextEmbedder is unavailable.
        RuntimeError: If device resolution, construction or warm-up fails.
    """
    model_name = self._embedding_model  # Use instance model
    logger.debug(f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}")
    if not SentenceTransformersTextEmbedder:
        raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
    try:
        resolved_device = device
        if isinstance(device, str):
            # Haystack 2.x documents the embedder's ``device`` parameter as a
            # ComponentDevice, not a raw string, so convert when the class is
            # available. NOTE(review): confirm against the pinned Haystack version.
            try:
                from haystack.utils import ComponentDevice
            except ImportError:
                pass  # Older Haystack without ComponentDevice: keep the raw string.
            else:
                resolved_device = ComponentDevice.from_str(device)
        embedder = SentenceTransformersTextEmbedder(model=model_name, device=resolved_device)
        embedder.warm_up()
        logger.info(f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
        return embedder
    except Exception as e:
        logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
        raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
|
169
|
+
|
170
|
+
def _delete_chroma_collection(self) -> bool:
    """Delete the ChromaDB collection managed by this service.

    Returns:
        True if the collection was deleted or did not exist. False if this
        instance is not persistent or the deletion failed for another reason
        (the failure is logged, not raised).

    Raises:
        ImportError: If the ``chromadb`` library is not installed, or if an
            ImportError surfaces while talking to ChromaDB.
    """
    if not CHROMADB_AVAILABLE:
        logger.error("Cannot delete ChromaDB collection because 'chromadb' library is not installed.")
        raise ImportError("'chromadb' library required for collection deletion.")
    if not self._persist:
        logger.warning("Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring.")
        return False  # Cannot delete if not persistent

    collection_name_to_delete = self.collection_name  # Use instance collection name
    try:
        # Resolve the "collection is missing" exception classes up front.
        # Naming a class that this chromadb release does not define directly
        # in an ``except`` clause would raise AttributeError while handling
        # the real error and turn a harmless "nothing to delete" into a failure.
        chroma_errors = getattr(chromadb, "errors", None)
        missing_collection_errors = tuple(
            exc_type
            for exc_type in (
                getattr(chroma_errors, "InvalidCollectionException", None),
                getattr(chroma_errors, "NotFoundError", None),
            )
            if isinstance(exc_type, type) and issubclass(exc_type, BaseException)
        )

        logger.warning(f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'.")
        chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
        try:
            chroma_client.delete_collection(name=collection_name_to_delete)
        except missing_collection_errors:
            logger.info(f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed.")
            return True  # Deletion is effectively successful
        except ValueError as value_error:
            # NOTE(review): older chromadb releases appear to signal a missing
            # collection with a plain ValueError ("... does not exist").
            # Anything else is a genuine failure and goes to the outer handler.
            if "does not exist" not in str(value_error):
                raise
            logger.info(f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed.")
            return True  # Deletion is effectively successful

        logger.info(f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'.")
        self._chroma_store = None  # Reset lazy-loaded store
        return True
    except ImportError:
        raise
    except Exception as e:
        logger.error(f"Error during ChromaDB collection deletion '{self.collection_name}': {e}", exc_info=True)
        # Don't raise here, let index() decide based on force_reindex
        return False
|
198
|
+
|
199
|
+
# --- Protocol Methods Implementation --- #
|
200
|
+
|
201
|
+
def index(
    self,
    documents: Iterable[Indexable],  # Accept Indexable objects
    embedder_device: Optional[str] = None,
    force_reindex: bool = False,
) -> None:
    """Embed the given items and write them to this collection's store.

    Args:
        documents: Items exposing ``get_id()``, ``get_metadata()`` and
            ``get_content()``. The iterable is fully consumed into a list.
        embedder_device: Optional device string forwarded to the document embedder.
        force_reindex: When True, the persistent collection is deleted (or
            the in-memory store replaced) before indexing.

    Raises:
        IndexConfigurationError: If embedding fails with Haystack's
            dimension-mismatch error, when that exception class exists.
        ImportError: Propagated from collection deletion or embedder creation.
        RuntimeError: Propagated from embedder creation.
        Exception: Store initialisation, embedding and write errors are not
            caught here and propagate unchanged.
    """
    # Materialise so the count can be logged and the items iterated afterwards.
    indexable_list = list(documents)
    logger.info(f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}")

    if not indexable_list:
        logger.warning("No documents provided for indexing. Skipping.")
        return

    # --- 1. Handle reindexing (deletion happens before store/embedder init) ---
    if force_reindex:
        logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
        if self._persist:
            # Raises ImportError if chromadb is missing; other failures return False.
            deleted = self._delete_chroma_collection()
            if not deleted:
                logger.warning("Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere.")
        else:
            # For InMemory, a forced reindex simply means a fresh store instance.
            logger.info(f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'.")
            self._in_memory_store = InMemoryDocumentStore()

    # --- 2. Get the store (initialisation errors propagate) ---
    store = self._get_store()

    # --- 3. Create the embedder (errors propagate from the helper) ---
    embedder = self._get_document_embedder(embedder_device)

    # --- 4. Convert Indexable items to Haystack documents ---
    haystack_docs_to_embed: List[HaystackDocument] = []
    logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
    for item in indexable_list:
        doc_id = item.get_id()
        metadata = item.get_metadata()
        content_obj = item.get_content()  # May be text or an object exposing extract_text()

        if isinstance(content_obj, str):
            content_text = content_obj
        elif hasattr(content_obj, 'extract_text') and callable(getattr(content_obj, 'extract_text')):
            try:
                content_text = content_obj.extract_text()
                if not isinstance(content_text, str):
                    logger.warning(f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str().")
                    # NOTE(review): this stringifies the content object, not the
                    # value extract_text() returned; kept as in the original.
                    content_text = str(content_obj)
            except Exception as extraction_error:
                logger.error(f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().", exc_info=False)
                content_text = str(content_obj)
        else:
            logger.warning(f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str().")
            content_text = str(content_obj)

        haystack_docs_to_embed.append(
            HaystackDocument(
                id=doc_id,
                content=content_text,
                meta=metadata,
            )
        )

    # Resolve the dimension-mismatch exception class *before* the try block.
    # An ``except a.b.c.Error`` expression is evaluated only while an
    # exception is propagating; if any attribute on that path is missing, the
    # resulting AttributeError would mask the real embedding failure. An
    # empty tuple makes the handler match nothing.
    dimension_error_types = ()
    candidate = haystack
    for attribute_name in ("errors", "dimensionality_mismatch", "InvalidDimensionError"):
        candidate = getattr(candidate, attribute_name, None)
        if candidate is None:
            break
    if isinstance(candidate, type) and issubclass(candidate, BaseException):
        dimension_error_types = (candidate,)

    logger.info(f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'...")
    try:
        embedding_results = embedder.run(documents=haystack_docs_to_embed)
        embedded_docs = embedding_results["documents"]
        logger.info(f"Successfully embedded {len(embedded_docs)} documents.")

    except dimension_error_types as dim_error:
        # Specific catch for dimension mismatch: adds actionable context.
        error_msg = f"Indexing failed for collection '{self.collection_name}'. Dimension mismatch: {dim_error}. "
        error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
        if self._persist:
            error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
            error_msg += "Try deleting the persistent storage directory or using force_reindex=True."
        else:
            error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
        logger.error(error_msg, exc_info=True)
        raise IndexConfigurationError(error_msg) from dim_error

    # --- 5. Write embedded documents to the store (errors propagate) ---
    logger.info(f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'...")
    write_result = store.write_documents(
        documents=embedded_docs,
        policy=DuplicatePolicy.OVERWRITE  # Or configure as needed
    )
    logger.info(f"Successfully wrote {write_result} documents to store '{self.collection_name}'.")
    # Explicit count check after writing.
    logger.info(f"Store '{self.collection_name}' document count after write: {store.count_documents()}")
|
312
|
+
|
313
|
+
def search(
    self,
    query: Any,
    options: BaseSearchOptions,
) -> List[Dict[str, Any]]:
    """Embed the query and retrieve matching documents from this collection.

    Args:
        query: A string, an ``os.PathLike`` (treated as its string form), or
            an object with a callable ``extract_text``. PIL images are rejected.
        options: Supplies ``filters`` and ``top_k`` for the retriever.

    Returns:
        One dict per retrieved document with the keys ``content_snippet``,
        ``score``, ``page_number``, ``pdf_path`` and ``metadata``. An empty
        list when an ``extract_text`` query yields no text or the pipeline
        output lacks retriever documents.

    Raises:
        NotImplementedError: For PIL image queries.
        TypeError: For unsupported query or store types.
        ValueError: If no embedding comes back for a string/path query.
        RuntimeError: If text extraction or embedding of a query object fails.
        ImportError: If the Chroma retriever is needed but unavailable.
        FileNotFoundError: Re-raised from the retrieval pipeline after logging.
    """
    logger.info(f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}")

    store = self._get_store()  # Errors propagate

    # --- 1. Turn the query into an embedding (text only for now) ---
    query_embedding = None
    if isinstance(query, (str, os.PathLike)):
        if not isinstance(query, os.PathLike):
            query_text = query
        else:
            logger.warning("Image path query received, but multimodal search not fully implemented. Treating as text path string.")
            query_text = str(query)

        embedding_result = self._get_text_embedder().run(text=query_text)
        query_embedding = embedding_result["embedding"]
        if not query_embedding:
            raise ValueError("Text embedder did not return an embedding for the query.")
        logger.debug(f"Successfully generated query text embedding (dim: {len(query_embedding)}).")

    elif isinstance(query, Image.Image):
        logger.error("Multimodal query (PIL Image) is not yet supported by this service implementation.")
        raise NotImplementedError("Search with PIL Image queries is not implemented in HaystackSearchService.")

    elif hasattr(query, 'extract_text') and callable(getattr(query, 'extract_text')):
        logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text for search.")
        try:
            query_text = query.extract_text()
            if not (query_text and query_text.strip()):
                logger.warning(f"Query object {type(query).__name__} provided empty text. Returning no results.")
                return []
            embedding_result = self._get_text_embedder().run(text=query_text)
            query_embedding = embedding_result["embedding"]
            if not query_embedding:
                raise ValueError(f"Text embedder did not return an embedding for text extracted from {type(query).__name__}.")
            logger.debug(f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)}).")
        except Exception as e:
            # Any failure here, including the ValueError above, is wrapped.
            logger.error(f"Failed to extract or embed text from query object {type(query).__name__}: {e}", exc_info=True)
            raise RuntimeError("Query text extraction or embedding failed.") from e

    else:
        raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")

    # --- 2. Pick the retriever that matches the store type ---
    if isinstance(store, ChromaDocumentStore):
        if not ChromaEmbeddingRetriever:
            raise ImportError("ChromaEmbeddingRetriever is required but not available.")
        retriever = ChromaEmbeddingRetriever(document_store=store)
    elif isinstance(store, InMemoryDocumentStore):
        retriever = InMemoryEmbeddingRetriever(document_store=store)
    else:
        raise TypeError(f"Cannot perform search with store type {type(store)}.")

    # --- 3. Build a single-component retrieval pipeline ---
    pipeline = Pipeline()
    pipeline.add_component("retriever", retriever)

    # --- 4. Filters come straight from the options object ---
    haystack_filters = options.filters
    if haystack_filters:
        logger.debug(f"Applying filters: {haystack_filters}")

    # --- 5. Both retrievers take the same three inputs ---
    retriever_input_data = {
        "filters": haystack_filters,
        "top_k": options.top_k,
        "query_embedding": query_embedding,
    }
    logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")

    # --- 6. Run retrieval and format the results ---
    try:
        logger.info(f"Running retrieval pipeline for collection '{self.collection_name}'...")
        result = pipeline.run(data={"retriever": retriever_input_data})

        if "retriever" not in result or "documents" not in result["retriever"]:
            logger.warning("Pipeline result did not contain expected retriever output.")
            return []

        retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
        logger.info(f"Retrieved {len(retrieved_docs)} documents.")
        # The full metadata dict is passed through untouched alongside the
        # two fields lifted out of it for convenience.
        return [
            {
                "content_snippet": doc.content[:200] if doc.content else "",
                "score": doc.score if doc.score is not None else 0.0,
                "page_number": doc.meta.get("page_number", None),
                "pdf_path": doc.meta.get("pdf_path", None),
                "metadata": doc.meta,
            }
            for doc in retrieved_docs
        ]

    except FileNotFoundError:
        # Logged with collection context, then re-raised unchanged.
        logger.error(f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'.")
        raise
|
436
|
+
|
437
|
+
def delete_index(
    self,
) -> bool:
    """
    Remove the whole index/collection handled by this service instance.

    Returns:
        For persistent instances, whatever ``_delete_chroma_collection``
        returns. For in-memory instances, always True after the store has
        been replaced with a fresh one.
    """
    logger.warning(f"Request to delete index for collection '{self.collection_name}'.")
    if not self._persist:
        # An in-memory index is "deleted" by swapping in an empty store.
        logger.info(f"Re-initializing InMemory store for '{self.collection_name}' as deletion request.")
        self._in_memory_store = InMemoryDocumentStore()
        return True

    # Persistent case: hand off to the ChromaDB helper.
    return self._delete_chroma_collection()
|
455
|
+
|
456
|
+
def index_exists(
    self,
) -> bool:
    """
    Report whether this collection's store currently holds any documents.

    The store is obtained via ``_get_store`` (errors from that call
    propagate). Any exception raised while counting is logged and treated
    as "does not exist".

    Returns:
        True if the document count is greater than zero, otherwise False.
    """
    logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
    store = self._get_store()
    try:
        count = store.count_documents()
        exists = count > 0
        logger.debug(f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}")
    except Exception as e:
        # A failed count is taken to mean the index is not there.
        logger.warning(f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}", exc_info=False)
        return False
    else:
        return exists
|
477
|
+
|
478
|
+
# --- Sync Methods Implementation ---
|
479
|
+
|
480
|
+
def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
    """Return the stored documents as plain dicts (used for sync).

    Args:
        include_metadata: When True each dict also carries the document's
            ``meta`` under the key ``"meta"``.
        **kwargs: Only ``filters`` is read; it is forwarded to the store's
            ``filter_documents`` (None when absent).

    Returns:
        A list of ``{"id": ...}`` dicts, with ``"meta"`` added on request.

    Raises:
        RuntimeError: If fetching or converting the documents fails.
    """
    logger.debug(f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})...")
    store = self._get_store()
    try:
        # All matching documents are fetched in one call.
        haystack_docs = store.filter_documents(filters=kwargs.get('filters'))
        logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
        if include_metadata:
            return [{"id": doc.id, "meta": doc.meta} for doc in haystack_docs]
        return [{"id": doc.id} for doc in haystack_docs]
    except Exception as e:
        logger.error(f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True)
        raise RuntimeError(f"Failed to list documents from store '{self.collection_name}'.") from e
|
506
|
+
|
507
|
+
|
508
|
+
def delete_documents(self, ids: List[str]) -> None:
    """Remove the documents with the given IDs from the store (used for sync).

    Args:
        ids: Document IDs to delete. An empty or falsy value is a no-op.

    Raises:
        RuntimeError: If the store's deletion or the follow-up count fails.
    """
    if ids:
        logger.warning(f"Request to delete {len(ids)} documents from collection '{self.collection_name}'.")
        store = self._get_store()
        try:
            store.delete_documents(ids=ids)
            logger.info(f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}")
        except Exception as e:
            logger.error(f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to delete documents from store '{self.collection_name}'.") from e
    else:
        logger.debug("No document IDs provided for deletion. Skipping.")
        return
|