natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,687 +0,0 @@
1
- """Implementation of the SearchServiceProtocol using Haystack components."""
2
-
3
- import logging
4
- import os
5
- import shutil
6
- from pathlib import Path
7
- from typing import Any, Dict, Iterable, List, Optional, Union
8
-
9
- from PIL import Image
10
-
11
- # Import sentence-transformers for dimension calculation
12
- try:
13
- from sentence_transformers import SentenceTransformer
14
- except ImportError:
15
- SentenceTransformer = None
16
-
17
- # --- Haystack Imports ---
18
- try:
19
- import haystack
20
- from haystack import Pipeline
21
- from haystack.components.embedders import (
22
- SentenceTransformersDocumentEmbedder,
23
- SentenceTransformersTextEmbedder,
24
- )
25
-
26
- # Import InMemory Store & Retriever unconditionally
27
- from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
28
- from haystack.dataclasses import Document as HaystackDocument
29
- from haystack.document_stores.in_memory import InMemoryDocumentStore
30
- from haystack.document_stores.types import DocumentStore, DuplicatePolicy
31
-
32
- # Conditional LanceDB Imports
33
- try:
34
- from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
35
-
36
- LANCEDB_HAYSTACK_AVAILABLE = True
37
- except ImportError:
38
- LanceDBDocumentStore = None
39
- LanceDBEmbeddingRetriever = None
40
- LANCEDB_HAYSTACK_AVAILABLE = False
41
-
42
- # Removed Chroma Imports
43
-
44
- # Need Ranker if used
45
- try:
46
- from haystack.components.rankers import CohereRanker
47
- except ImportError:
48
- CohereRanker = None
49
-
50
- except ImportError:
51
- # Set flags/placeholders if Haystack isn't installed
52
- DocumentStore = object
53
- HaystackDocument = Dict
54
- InMemoryDocumentStore = None
55
- LanceDBDocumentStore = None
56
- SentenceTransformersDocumentEmbedder = None
57
- SentenceTransformersTextEmbedder = None
58
- InMemoryEmbeddingRetriever = None
59
- LanceDBEmbeddingRetriever = None
60
- CohereRanker = None
61
- Pipeline = None
62
- DuplicatePolicy = None
63
- LANCEDB_HAYSTACK_AVAILABLE = False
64
-
65
- # LanceDB Client Import (for management)
66
- try:
67
- import lancedb
68
-
69
- LANCEDB_CLIENT_AVAILABLE = True
70
- except ImportError:
71
- lancedb = None
72
- LANCEDB_CLIENT_AVAILABLE = False
73
-
74
- # Removed ChromaDB Client Import
75
-
76
- from .haystack_utils import HAS_HAYSTACK_EXTRAS
77
- from .search_options import (
78
- BaseSearchOptions,
79
- TextSearchOptions,
80
- )
81
-
82
- # --- Local Imports ---
83
- from .search_service_protocol import (
84
- Indexable,
85
- IndexConfigurationError,
86
- SearchServiceProtocol,
87
- )
88
-
89
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# --- Default Configuration Values ---
# Default value of HaystackSearchService's `uri` parameter (LanceDB database directory).
DEFAULT_PERSIST_PATH = "./natural_pdf_index"
# Default value of HaystackSearchService's `embedding_model` parameter.
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
94
-
95
-
96
class HaystackSearchService(SearchServiceProtocol):
    """
    Haystack-based implementation of the search service protocol.

    Manages LanceDB (persistent) or InMemory (non-persistent) DocumentStores
    and uses Haystack components for embedding and retrieval.
    A single instance of this service is tied to a specific table name (LanceDB)
    or implicitly managed (InMemory).
    """

    def __init__(
        self,
        table_name: str,
        persist: bool = False,
        uri: str = DEFAULT_PERSIST_PATH,
        embedding_model: str = DEFAULT_EMBEDDING_MODEL,
    ):
        """
        Initialize the service for a specific LanceDB table or an InMemory store.

        Args:
            table_name: The name of the LanceDB table (if persist=True).
            persist: If True, this service instance manages a persistent LanceDB store.
                     If False, it manages a transient InMemory store.
            uri: Path/URI for the LanceDB database directory (if persist=True).
            embedding_model: The embedding model this service instance will use.
                             Required for LanceDB to know embedding dimensions.

        Raises:
            ImportError: If the Haystack extras, the in-memory store, lancedb-haystack
                or sentence-transformers (the last two only when persist=True) are
                not importable.
            RuntimeError: If the embedding dimension cannot be determined (persist=True).
        """
        if not HAS_HAYSTACK_EXTRAS:
            raise ImportError(
                "HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
            )

        self.table_name = table_name
        self._persist = persist
        self._uri = uri
        self._embedding_model = embedding_model
        self._embedding_dims: Optional[int] = None

        # Store instances (the LanceDB one is created lazily by _get_store()).
        self._in_memory_store: Optional[InMemoryDocumentStore] = None
        self._lancedb_store: Optional[LanceDBDocumentStore] = None

        if not self._persist:
            # Non-persistent services get their in-memory store eagerly.
            if not InMemoryDocumentStore:
                raise ImportError(
                    "InMemoryDocumentStore not available. Cannot create non-persistent service."
                )
            self._in_memory_store = InMemoryDocumentStore()
            logger.info(
                f"HaystackSearchService initialized for InMemory store (table_name '{self.table_name}' ignored). Model: '{self._embedding_model}'"
            )
            return

        # Persistent mode: both the LanceDB integration and sentence-transformers
        # are required before the store can be created.
        if not LANCEDB_HAYSTACK_AVAILABLE:
            raise ImportError(
                "LanceDB persistent store requires lancedb-haystack. Install with: pip install lancedb-haystack"
            )
        if not SentenceTransformer:
            raise ImportError(
                "LanceDB persistent store requires sentence-transformers to determine embedding dimensions. Install with: pip install sentence-transformers"
            )
        # The dimension is passed to LanceDBDocumentStore when it is created.
        self._calculate_embedding_dims()
        logger.info(
            f"HaystackSearchService initialized for LanceDB table='{self.table_name}' at uri='{self._uri}'. Model: '{self._embedding_model}', Dims: {self._embedding_dims}"
        )

    # --- Internal Helper Methods ---

    @property
    def _store_label(self) -> str:
        """Name used in log/error messages: the table name, or 'InMemory'."""
        return self.table_name if self._persist else "InMemory"

    @staticmethod
    def _new_in_memory_store() -> "InMemoryDocumentStore":
        """Create a fresh InMemoryDocumentStore, or raise ImportError if unavailable."""
        if not InMemoryDocumentStore:
            raise ImportError("InMemoryDocumentStore not available.")
        return InMemoryDocumentStore()

    @staticmethod
    def _dimension_error_type() -> Optional[type]:
        """
        Resolve Haystack's dimension-mismatch exception class, if it exists.

        The attribute path is looked up defensively: an ``except`` clause that
        names a missing attribute is only evaluated while another exception is
        propagating, so it would replace the real error with an AttributeError.

        Returns:
            The exception class, or None when it cannot be resolved.
        """
        try:
            candidate = haystack.errors.dimensionality_mismatch.InvalidDimensionError
        except (NameError, AttributeError):
            return None
        if isinstance(candidate, type) and issubclass(candidate, BaseException):
            return candidate
        return None

    def _calculate_embedding_dims(self) -> None:
        """
        Calculates and stores embedding dimensions from the model name.

        Does nothing if the dimension is already known.

        Raises:
            ImportError: If sentence-transformers is not installed.
            RuntimeError: If the model cannot be loaded or reports no dimension.
        """
        if self._embedding_dims is not None:
            return
        if not SentenceTransformer:
            raise ImportError(
                "sentence-transformers library is required to determine embedding dimensions."
            )
        try:
            model = SentenceTransformer(self._embedding_model)
            dims = model.get_sentence_embedding_dimension()
            if not dims:
                raise ValueError(
                    f"Could not determine embedding dimension for model: {self._embedding_model}"
                )
            self._embedding_dims = dims
            logger.debug(
                f"Determined embedding dimension: {self._embedding_dims} for model '{self._embedding_model}'"
            )
        except Exception as e:
            # Also wraps the ValueError raised just above.
            logger.error(
                f"Failed to load SentenceTransformer model '{self._embedding_model}' to get dimensions: {e}",
                exc_info=True,
            )
            raise RuntimeError(
                f"Failed to determine embedding dimension for model '{self._embedding_model}'."
            ) from e

    def _get_store(self) -> DocumentStore:
        """
        Gets or creates the appropriate Haystack DocumentStore instance.

        Returns:
            The cached LanceDB store (persist=True) or in-memory store (persist=False),
            creating it on first use.

        Raises:
            ImportError: If the required store class is not importable.
        """
        if not self._persist:
            if self._in_memory_store is None:
                logger.warning("In-memory store was not initialized. Creating now.")
                self._in_memory_store = self._new_in_memory_store()
            return self._in_memory_store

        if not LanceDBDocumentStore:
            raise ImportError("LanceDBDocumentStore not available.")
        if self._lancedb_store is None:
            logger.debug(
                f"Initializing LanceDBDocumentStore for table '{self.table_name}' at uri '{self._uri}'."
            )
            if self._embedding_dims is None:
                logger.warning(
                    "Embedding dimensions not calculated before getting store. Calculating now."
                )
                self._calculate_embedding_dims()

            self._lancedb_store = LanceDBDocumentStore(
                database=self._uri,
                table_name=self.table_name,
                embedding_dims=self._embedding_dims,
            )
            logger.info(
                f"Initialized LanceDBDocumentStore for table '{self.table_name}' (Dims: {self._embedding_dims})"
            )
        return self._lancedb_store

    def _get_document_embedder(
        self, device: Optional[str] = None
    ) -> SentenceTransformersDocumentEmbedder:
        """
        Creates the Haystack document embedder component.

        Args:
            device: Device passed through to the embedder; None lets it choose.

        Returns:
            A warmed-up SentenceTransformersDocumentEmbedder for this service's model.

        Raises:
            ImportError: If the embedder class is not importable.
            RuntimeError: If construction or warm-up fails.
        """
        model_name = self._embedding_model
        logger.debug(
            f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
        )
        if not SentenceTransformersDocumentEmbedder:
            raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
        try:
            embedder = SentenceTransformersDocumentEmbedder(
                model=model_name,
                device=device,
            )
            embedder.warm_up()
            logger.info(
                f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
            )
            return embedder
        except Exception as e:
            logger.error(
                f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
            )
            raise RuntimeError(
                f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
            ) from e

    def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
        """
        Creates the Haystack text embedder component (for queries).

        Args:
            device: Device passed through to the embedder; None lets it choose.

        Returns:
            A warmed-up SentenceTransformersTextEmbedder for this service's model.

        Raises:
            ImportError: If the embedder class is not importable.
            RuntimeError: If construction or warm-up fails.
        """
        model_name = self._embedding_model
        logger.debug(
            f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
        )
        if not SentenceTransformersTextEmbedder:
            raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
        try:
            embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
            embedder.warm_up()
            logger.info(
                f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
            )
            return embedder
        except Exception as e:
            logger.error(
                f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True
            )
            raise RuntimeError(
                f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
            ) from e

    def _delete_lancedb_table(self) -> bool:
        """
        Internal helper to delete the LanceDB table managed by this service.

        Returns:
            True if the table was dropped or did not exist; False for a
            non-persistent service or when the deletion attempt raised.

        Raises:
            ImportError: If the 'lancedb' client library is not installed.
        """
        if not self._persist:
            logger.warning(
                "Attempted to delete LanceDB table for a non-persistent service instance. Ignoring."
            )
            return False

        if not LANCEDB_CLIENT_AVAILABLE:
            logger.error("Cannot delete LanceDB table because 'lancedb' library is not installed.")
            raise ImportError("'lancedb' library required for table deletion.")

        table_name_to_delete = self.table_name
        db_uri = self._uri
        logger.warning(
            f"Attempting to delete existing LanceDB table '{table_name_to_delete}' at uri '{db_uri}'."
        )
        try:
            db = lancedb.connect(db_uri)
            if table_name_to_delete in db.table_names():
                db.drop_table(table_name_to_delete)
                logger.info(
                    f"Successfully deleted existing LanceDB table '{table_name_to_delete}'."
                )
            else:
                logger.info(
                    f"LanceDB table '{table_name_to_delete}' did not exist. No deletion needed."
                )

            # Drop the cached store so the next _get_store() call builds a new one.
            self._lancedb_store = None
            return True
        except Exception as e:
            logger.error(
                f"Error during LanceDB table deletion '{table_name_to_delete}' at '{db_uri}': {e}",
                exc_info=True,
            )
            return False

    def _reset_for_reindex(self) -> None:
        """Clear existing data before a forced reindex (drop table or new in-memory store)."""
        logger.info(f"Force reindex requested for table '{self.table_name}'.")
        if self._persist:
            if not self._delete_lancedb_table():
                # Best effort: indexing continues even if the old table is still there.
                logger.warning(
                    "LanceDB table deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
                )
            return
        logger.info("force_reindex=True: Re-initializing InMemory store.")
        self._in_memory_store = self._new_in_memory_store()

    @staticmethod
    def _to_haystack_document(item: Indexable) -> HaystackDocument:
        """
        Convert one Indexable into a Haystack document.

        Text comes from the content itself when it is a str, otherwise from its
        ``extract_text()`` method; anything else falls back to ``str(content)``.
        """
        doc_id = item.get_id()
        metadata = item.get_metadata()
        content_obj = item.get_content()

        if isinstance(content_obj, str):
            content_text = content_obj
        elif callable(getattr(content_obj, "extract_text", None)):
            try:
                content_text = content_obj.extract_text()
                if not isinstance(content_text, str):
                    logger.warning(
                        f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str()."
                    )
                    content_text = str(content_obj)
            except Exception as extraction_error:
                logger.error(
                    f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().",
                    exc_info=False,
                )
                content_text = str(content_obj)
        else:
            logger.warning(
                f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
            )
            content_text = str(content_obj)

        return HaystackDocument(id=doc_id, content=content_text, meta=metadata)

    def _embed_query(self, query: Any) -> Optional[List[float]]:
        """
        Turn a search query into an embedding.

        Args:
            query: A str, an os.PathLike (treated as its string form), or an
                object exposing ``extract_text()``.

        Returns:
            The query embedding, or None when an ``extract_text()`` query yields
            empty/whitespace-only text (the caller then returns no results).

        Raises:
            ValueError: If a str/path query produces no embedding.
            NotImplementedError: If the query is a PIL image.
            RuntimeError: If extracting or embedding text from a query object fails.
            TypeError: If the query type is unsupported.
        """
        if isinstance(query, (str, os.PathLike)):
            if isinstance(query, os.PathLike):
                logger.warning("Image path query received, treating as text path string.")
                query_text = str(query)
            else:
                query_text = query
            query_embedding = self._get_text_embedder().run(text=query_text)["embedding"]
            if not query_embedding:
                raise ValueError("Text embedder did not return an embedding for the query.")
            logger.debug(
                f"Successfully generated query text embedding (dim: {len(query_embedding)})."
            )
            return query_embedding

        if isinstance(query, Image.Image):
            logger.error("Multimodal query (PIL Image) is not yet supported.")
            raise NotImplementedError("Search with PIL Image queries is not implemented.")

        if callable(getattr(query, "extract_text", None)):
            logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text.")
            try:
                query_text = query.extract_text()
                if not query_text or not query_text.strip():
                    logger.warning(
                        f"Query object {type(query).__name__} provided empty text. Returning no results."
                    )
                    return None
                query_embedding = self._get_text_embedder().run(text=query_text)["embedding"]
                if not query_embedding:
                    raise ValueError(
                        f"Text embedder did not return embedding for text from {type(query).__name__}."
                    )
                logger.debug(
                    f"Generated query embedding from extracted text (dim: {len(query_embedding)})."
                )
                return query_embedding
            except Exception as e:
                logger.error(
                    f"Failed to extract/embed text from query object {type(query).__name__}: {e}",
                    exc_info=True,
                )
                raise RuntimeError("Query text extraction or embedding failed.") from e

        raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")

    @staticmethod
    def _select_retriever(store: Any) -> Any:
        """
        Build the embedding retriever that matches the store's type.

        Every isinstance() check is guarded by the availability of its class,
        because the optional store classes are None when their package is
        missing and isinstance(x, None) raises TypeError.

        Raises:
            ImportError: If the store type is supported but its retriever class
                is not importable.
            TypeError: If no integration is available or the store type is unexpected.
        """
        store_type_name = type(store).__name__
        lancedb_usable = bool(LANCEDB_HAYSTACK_AVAILABLE and LanceDBDocumentStore)
        in_memory_usable = bool(InMemoryDocumentStore)

        if lancedb_usable and isinstance(store, LanceDBDocumentStore):
            if not LanceDBEmbeddingRetriever:
                raise ImportError("LanceDBEmbeddingRetriever is required but not available.")
            return LanceDBEmbeddingRetriever(document_store=store)

        if in_memory_usable and isinstance(store, InMemoryDocumentStore):
            if not InMemoryEmbeddingRetriever:
                raise ImportError(
                    f"Store type '{store_type_name}' is supported, but its retriever component 'InMemoryEmbeddingRetriever' failed to import or is unavailable."
                )
            return InMemoryEmbeddingRetriever(document_store=store)

        available_integrations = []
        if lancedb_usable:
            available_integrations.append("LanceDB")
        if in_memory_usable:
            available_integrations.append("InMemory")

        if not available_integrations:
            raise TypeError(
                "Cannot perform search: No supported document store integrations (LanceDB, InMemory) seem to be available. "
                "Check Haystack installation."
            )
        raise TypeError(
            f"Cannot perform search with unexpected store type '{store_type_name}'. "
            f"Available integrations: {', '.join(available_integrations)}."
        )

    @staticmethod
    def _format_results(retrieved_docs: List[HaystackDocument]) -> List[Dict[str, Any]]:
        """Convert retrieved Haystack documents into plain result dictionaries."""
        return [
            {
                # Snippet is capped at the first 200 characters of the content.
                "content_snippet": doc.content[:200] if doc.content else "",
                "score": doc.score if doc.score is not None else 0.0,
                "page_number": doc.meta.get("page_number", None),
                "pdf_path": doc.meta.get("pdf_path", None),
                "metadata": doc.meta,
            }
            for doc in retrieved_docs
        ]

    # --- Protocol Methods Implementation ---

    def index(
        self,
        documents: Iterable[Indexable],
        embedder_device: Optional[str] = None,
        force_reindex: bool = False,
    ) -> None:
        """
        Embed the given documents and write them to the store.

        Args:
            documents: Items to index; consumed once into a list.
            embedder_device: Device passed to the document embedder.
            force_reindex: If True, drop the LanceDB table / recreate the
                in-memory store before indexing.

        Raises:
            IndexConfigurationError: If embedding fails with Haystack's
                dimension-mismatch error (when that class is resolvable).
            ImportError / RuntimeError: Propagated from store or embedder creation.
        """
        indexable_list = list(documents)
        logger.info(
            f"Index request for table='{self.table_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
        )

        if not indexable_list:
            logger.warning("No documents provided for indexing. Skipping.")
            return

        if force_reindex:
            self._reset_for_reindex()

        store = self._get_store()
        embedder = self._get_document_embedder(embedder_device)

        logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
        haystack_docs_to_embed: List[HaystackDocument] = [
            self._to_haystack_document(item) for item in indexable_list
        ]

        logger.info(
            f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
        )
        # Resolved before the try so a missing class cannot mask the real error.
        dimension_error_type = self._dimension_error_type()
        try:
            embedding_results = embedder.run(documents=haystack_docs_to_embed)
            embedded_docs = embedding_results["documents"]
            logger.info(f"Successfully embedded {len(embedded_docs)} documents.")
        except Exception as embed_error:
            if dimension_error_type is None or not isinstance(embed_error, dimension_error_type):
                # Not a dimension mismatch: let the original error propagate unchanged.
                raise
            error_msg = (
                f"Indexing failed for table '{self.table_name}'. Dimension mismatch: {embed_error}. "
            )
            error_msg += f"Ensure the embedding model ('{self._embedding_model}', Dim: {self._embedding_dims}) matches the expected dimension of the store. "
            if self._persist:
                table_dir = os.path.join(self._uri, self.table_name + ".lance")
                error_msg += f"If the table already exists at '{self._uri}', it might have been created with a different model/dimension. "
                error_msg += f"Try deleting the LanceDB table directory ('{table_dir}') or using force_reindex=True."
            else:
                error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
            logger.error(error_msg, exc_info=True)
            raise IndexConfigurationError(error_msg) from embed_error

        logger.info(
            f"Writing {len(embedded_docs)} embedded documents to store (Table/Type: '{self._store_label}')..."
        )
        # OVERWRITE replaces any stored document that has the same id.
        write_result = store.write_documents(
            documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE
        )
        logger.info(f"Successfully wrote {write_result} documents to store.")
        try:
            count = store.count_documents()
            logger.info(f"Store document count after write: {count}")
        except Exception as count_error:
            # The count is informational only; a failure here must not fail indexing.
            logger.warning(f"Could not get document count after write: {count_error}")

    def search(
        self,
        query: Any,
        options: BaseSearchOptions,
    ) -> List[Dict[str, Any]]:
        """
        Run an embedding search against the store.

        Args:
            query: A str, os.PathLike, or object with ``extract_text()``.
            options: Search options; ``options.filters`` and ``options.top_k``
                are forwarded to the retriever.

        Returns:
            One dict per retrieved document with keys ``content_snippet``,
            ``score``, ``page_number``, ``pdf_path`` and ``metadata``; an empty
            list when the query text is empty or the pipeline returns no
            retriever output.

        Raises:
            See ``_embed_query`` and ``_select_retriever``; FileNotFoundError
            from the retrieval run is logged and re-raised.
        """
        logger.info(
            f"Search request for table/store='{self._store_label}', query_type={type(query).__name__}, options={options}"
        )

        store = self._get_store()

        query_embedding = self._embed_query(query)
        if query_embedding is None:
            return []

        retriever = self._select_retriever(store)
        logger.debug(f"Selected retriever: {type(retriever).__name__}")

        pipeline = Pipeline()
        pipeline.add_component("retriever", retriever)

        haystack_filters = options.filters
        if haystack_filters:
            logger.debug(f"Applying filters: {haystack_filters}")

        retriever_input_data = {
            "filters": haystack_filters,
            "top_k": options.top_k,
            "query_embedding": query_embedding,
        }
        logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")

        try:
            logger.info(f"Running retrieval pipeline for table/store '{self._store_label}'...")
            result = pipeline.run(data={"retriever": retriever_input_data})

            if "retriever" in result and "documents" in result["retriever"]:
                retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
                logger.info(f"Retrieved {len(retrieved_docs)} documents.")
                return self._format_results(retrieved_docs)

            logger.warning("Pipeline result did not contain expected retriever output.")
            return []
        except FileNotFoundError:
            logger.error(
                f"Search failed: Could not access path for table/store '{self._store_label}' (URI: '{self._uri if self._persist else 'N/A'}')."
            )
            raise

    def delete_index(self) -> bool:
        """
        Deletes the entire LanceDB table or resets the InMemory store.

        Returns:
            True if deletion was successful or table/store didn't exist, False otherwise.

        Raises:
            ImportError: If the required client/store class is not importable.
        """
        if self._persist:
            logger.warning(
                f"Request to delete LanceDB table '{self.table_name}' at uri '{self._uri}'."
            )
            return self._delete_lancedb_table()

        logger.info("Request to delete InMemory store (re-initializing).")
        self._in_memory_store = self._new_in_memory_store()
        return True

    def index_exists(self) -> bool:
        """
        Checks if the LanceDB table or InMemory store exists and has documents.

        The check obtains the store via ``_get_store()`` and counts its documents.

        Returns:
            True if the store reports more than zero documents; False if it is
            empty or if obtaining/counting the store raised any exception.
        """
        store_name = self._store_label
        logger.debug(
            f"Checking existence of index for '{store_name}'. URI: '{self._uri if self._persist else 'N/A'}'"
        )
        try:
            store = self._get_store()
            count = store.count_documents()
            exists = count > 0
            logger.debug(
                f"Store type {type(store).__name__} for '{store_name}' exists and has {count} documents: {exists}"
            )
            return exists
        except ImportError as ie:
            logger.error(f"Import error checking index existence for '{store_name}': {ie}")
            return False
        except Exception as e:
            logger.warning(
                f"Could not confirm existence or count documents in store for '{store_name}': {e}",
                exc_info=False,
            )
            return False

    # --- Sync Methods Implementation ---

    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
        """
        Retrieves documents, required for sync.

        Args:
            include_metadata: If True, each entry also carries the document's ``meta``.
            **kwargs: Only ``filters`` is read; it is passed to the store's
                ``filter_documents``.

        Returns:
            A list of ``{"id": ...}`` dicts, with a ``"meta"`` key when requested.

        Raises:
            RuntimeError: If the store fails to return documents.
        """
        store_name = self._store_label
        logger.debug(
            f"Listing documents for '{store_name}' (include_metadata={include_metadata})..."
        )
        store = self._get_store()
        try:
            haystack_docs = store.filter_documents(filters=kwargs.get("filters"))
            logger.info(f"Retrieved {len(haystack_docs)} documents from store '{store_name}'.")
            if include_metadata:
                return [{"id": doc.id, "meta": doc.meta} for doc in haystack_docs]
            return [{"id": doc.id} for doc in haystack_docs]
        except Exception as e:
            logger.error(f"Failed to list documents from store '{store_name}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to list documents from store '{store_name}'.") from e

    def delete_documents(self, ids: List[str]) -> None:
        """
        Deletes documents by ID, required for sync.

        Args:
            ids: Document IDs to delete; an empty list is a no-op.

        Raises:
            RuntimeError: If the store fails to delete the documents.
        """
        store_name = self._store_label
        if not ids:
            logger.debug(f"No document IDs provided for deletion from '{store_name}'. Skipping.")
            return
        logger.warning(f"Request to delete {len(ids)} documents from '{store_name}'.")
        store = self._get_store()
        try:
            store.delete_documents(ids=ids)
            logger.info(
                f"Successfully requested deletion of {len(ids)} documents from '{store_name}'. Store count now: {store.count_documents()}"
            )
        except Exception as e:
            logger.error(
                f"Failed to delete documents with IDs {ids} from store '{store_name}': {e}",
                exc_info=True,
            )
            raise RuntimeError(f"Failed to delete documents from store '{store_name}'.") from e