natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -1,687 +0,0 @@
|
|
1
|
-
"""Implementation of the SearchServiceProtocol using Haystack components."""
|
2
|
-
|
3
|
-
import logging
|
4
|
-
import os
|
5
|
-
import shutil
|
6
|
-
from pathlib import Path
|
7
|
-
from typing import Any, Dict, Iterable, List, Optional, Union
|
8
|
-
|
9
|
-
from PIL import Image
|
10
|
-
|
11
|
-
# Import sentence-transformers for dimension calculation
|
12
|
-
try:
|
13
|
-
from sentence_transformers import SentenceTransformer
|
14
|
-
except ImportError:
|
15
|
-
SentenceTransformer = None
|
16
|
-
|
17
|
-
# --- Haystack Imports ---
|
18
|
-
try:
|
19
|
-
import haystack
|
20
|
-
from haystack import Pipeline
|
21
|
-
from haystack.components.embedders import (
|
22
|
-
SentenceTransformersDocumentEmbedder,
|
23
|
-
SentenceTransformersTextEmbedder,
|
24
|
-
)
|
25
|
-
|
26
|
-
# Import InMemory Store & Retriever unconditionally
|
27
|
-
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
28
|
-
from haystack.dataclasses import Document as HaystackDocument
|
29
|
-
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
30
|
-
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
31
|
-
|
32
|
-
# Conditional LanceDB Imports
|
33
|
-
try:
|
34
|
-
from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
|
35
|
-
|
36
|
-
LANCEDB_HAYSTACK_AVAILABLE = True
|
37
|
-
except ImportError:
|
38
|
-
LanceDBDocumentStore = None
|
39
|
-
LanceDBEmbeddingRetriever = None
|
40
|
-
LANCEDB_HAYSTACK_AVAILABLE = False
|
41
|
-
|
42
|
-
# Removed Chroma Imports
|
43
|
-
|
44
|
-
# Need Ranker if used
|
45
|
-
try:
|
46
|
-
from haystack.components.rankers import CohereRanker
|
47
|
-
except ImportError:
|
48
|
-
CohereRanker = None
|
49
|
-
|
50
|
-
except ImportError:
|
51
|
-
# Set flags/placeholders if Haystack isn't installed
|
52
|
-
DocumentStore = object
|
53
|
-
HaystackDocument = Dict
|
54
|
-
InMemoryDocumentStore = None
|
55
|
-
LanceDBDocumentStore = None
|
56
|
-
SentenceTransformersDocumentEmbedder = None
|
57
|
-
SentenceTransformersTextEmbedder = None
|
58
|
-
InMemoryEmbeddingRetriever = None
|
59
|
-
LanceDBEmbeddingRetriever = None
|
60
|
-
CohereRanker = None
|
61
|
-
Pipeline = None
|
62
|
-
DuplicatePolicy = None
|
63
|
-
LANCEDB_HAYSTACK_AVAILABLE = False
|
64
|
-
|
65
|
-
# LanceDB Client Import (for management)
|
66
|
-
try:
|
67
|
-
import lancedb
|
68
|
-
|
69
|
-
LANCEDB_CLIENT_AVAILABLE = True
|
70
|
-
except ImportError:
|
71
|
-
lancedb = None
|
72
|
-
LANCEDB_CLIENT_AVAILABLE = False
|
73
|
-
|
74
|
-
# Removed ChromaDB Client Import
|
75
|
-
|
76
|
-
from .haystack_utils import HAS_HAYSTACK_EXTRAS
|
77
|
-
from .search_options import (
|
78
|
-
BaseSearchOptions,
|
79
|
-
TextSearchOptions,
|
80
|
-
)
|
81
|
-
|
82
|
-
# --- Local Imports ---
|
83
|
-
from .search_service_protocol import (
|
84
|
-
Indexable,
|
85
|
-
IndexConfigurationError,
|
86
|
-
SearchServiceProtocol,
|
87
|
-
)
|
88
|
-
|
89
|
-
logger = logging.getLogger(__name__)
|
90
|
-
|
91
|
-
# --- Default Configuration Values ---
|
92
|
-
DEFAULT_PERSIST_PATH = "./natural_pdf_index"
|
93
|
-
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
94
|
-
|
95
|
-
|
96
|
-
class HaystackSearchService(SearchServiceProtocol):
|
97
|
-
"""
|
98
|
-
Haystack-based implementation of the search service protocol.
|
99
|
-
|
100
|
-
Manages LanceDB (persistent) or InMemory (non-persistent) DocumentStores
|
101
|
-
and uses Haystack components for embedding and retrieval.
|
102
|
-
A single instance of this service is tied to a specific table name (LanceDB)
|
103
|
-
or implicitly managed (InMemory).
|
104
|
-
"""
|
105
|
-
|
106
|
-
def __init__(
|
107
|
-
self,
|
108
|
-
table_name: str,
|
109
|
-
persist: bool = False,
|
110
|
-
uri: str = DEFAULT_PERSIST_PATH,
|
111
|
-
embedding_model: str = DEFAULT_EMBEDDING_MODEL,
|
112
|
-
):
|
113
|
-
"""
|
114
|
-
Initialize the service for a specific LanceDB table or an InMemory store.
|
115
|
-
|
116
|
-
Args:
|
117
|
-
table_name: The name of the LanceDB table (if persist=True).
|
118
|
-
persist: If True, this service instance manages a persistent LanceDB store.
|
119
|
-
If False, it manages a transient InMemory store.
|
120
|
-
uri: Path/URI for the LanceDB database directory (if persist=True).
|
121
|
-
embedding_model: The embedding model this service instance will use.
|
122
|
-
Required for LanceDB to know embedding dimensions.
|
123
|
-
"""
|
124
|
-
if not HAS_HAYSTACK_EXTRAS:
|
125
|
-
raise ImportError(
|
126
|
-
"HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
|
127
|
-
)
|
128
|
-
|
129
|
-
self.table_name = table_name
|
130
|
-
self._persist = persist
|
131
|
-
self._uri = uri
|
132
|
-
self._embedding_model = embedding_model
|
133
|
-
self._embedding_dims: Optional[int] = None
|
134
|
-
|
135
|
-
# Store instances (lazy loaded)
|
136
|
-
self._in_memory_store: Optional[InMemoryDocumentStore] = None
|
137
|
-
self._lancedb_store: Optional[LanceDBDocumentStore] = None
|
138
|
-
|
139
|
-
# Eagerly create InMemoryStore if not persisting
|
140
|
-
if not self._persist:
|
141
|
-
if not InMemoryDocumentStore:
|
142
|
-
raise ImportError(
|
143
|
-
"InMemoryDocumentStore not available. Cannot create non-persistent service."
|
144
|
-
)
|
145
|
-
self._in_memory_store = InMemoryDocumentStore()
|
146
|
-
logger.info(
|
147
|
-
f"HaystackSearchService initialized for InMemory store (table_name '{self.table_name}' ignored). Model: '{self._embedding_model}'"
|
148
|
-
)
|
149
|
-
else:
|
150
|
-
# Check LanceDB availability if persisting
|
151
|
-
if not LANCEDB_HAYSTACK_AVAILABLE:
|
152
|
-
raise ImportError(
|
153
|
-
"LanceDB persistent store requires lancedb-haystack. Install with: pip install lancedb-haystack"
|
154
|
-
)
|
155
|
-
if not SentenceTransformer:
|
156
|
-
raise ImportError(
|
157
|
-
"LanceDB persistent store requires sentence-transformers to determine embedding dimensions. Install with: pip install sentence-transformers"
|
158
|
-
)
|
159
|
-
# Calculate embedding dimensions needed for LanceDB initialization
|
160
|
-
self._calculate_embedding_dims()
|
161
|
-
logger.info(
|
162
|
-
f"HaystackSearchService initialized for LanceDB table='{self.table_name}' at uri='{self._uri}'. Model: '{self._embedding_model}', Dims: {self._embedding_dims}"
|
163
|
-
)
|
164
|
-
|
165
|
-
# --- Internal Helper Methods ---
|
166
|
-
|
167
|
-
def _calculate_embedding_dims(self) -> None:
|
168
|
-
"""Calculates and stores embedding dimensions from the model name."""
|
169
|
-
if self._embedding_dims is None:
|
170
|
-
if not SentenceTransformer:
|
171
|
-
raise ImportError(
|
172
|
-
"sentence-transformers library is required to determine embedding dimensions."
|
173
|
-
)
|
174
|
-
try:
|
175
|
-
model = SentenceTransformer(self._embedding_model)
|
176
|
-
dims = model.get_sentence_embedding_dimension()
|
177
|
-
if not dims:
|
178
|
-
raise ValueError(
|
179
|
-
f"Could not determine embedding dimension for model: {self._embedding_model}"
|
180
|
-
)
|
181
|
-
self._embedding_dims = dims
|
182
|
-
logger.debug(
|
183
|
-
f"Determined embedding dimension: {self._embedding_dims} for model '{self._embedding_model}'"
|
184
|
-
)
|
185
|
-
except Exception as e:
|
186
|
-
logger.error(
|
187
|
-
f"Failed to load SentenceTransformer model '{self._embedding_model}' to get dimensions: {e}",
|
188
|
-
exc_info=True,
|
189
|
-
)
|
190
|
-
raise RuntimeError(
|
191
|
-
f"Failed to determine embedding dimension for model '{self._embedding_model}'."
|
192
|
-
) from e
|
193
|
-
|
194
|
-
def _get_store(self) -> DocumentStore:
|
195
|
-
"""Gets or creates the appropriate Haystack DocumentStore instance."""
|
196
|
-
if self._persist:
|
197
|
-
if not LanceDBDocumentStore:
|
198
|
-
raise ImportError("LanceDBDocumentStore not available.")
|
199
|
-
if self._lancedb_store is None:
|
200
|
-
logger.debug(
|
201
|
-
f"Initializing LanceDBDocumentStore for table '{self.table_name}' at uri '{self._uri}'."
|
202
|
-
)
|
203
|
-
if self._embedding_dims is None:
|
204
|
-
logger.warning(
|
205
|
-
"Embedding dimensions not calculated before getting store. Calculating now."
|
206
|
-
)
|
207
|
-
self._calculate_embedding_dims()
|
208
|
-
|
209
|
-
self._lancedb_store = LanceDBDocumentStore(
|
210
|
-
database=self._uri,
|
211
|
-
table_name=self.table_name,
|
212
|
-
embedding_dims=self._embedding_dims,
|
213
|
-
)
|
214
|
-
logger.info(
|
215
|
-
f"Initialized LanceDBDocumentStore for table '{self.table_name}' (Dims: {self._embedding_dims})"
|
216
|
-
)
|
217
|
-
return self._lancedb_store
|
218
|
-
else:
|
219
|
-
if self._in_memory_store is None:
|
220
|
-
logger.warning("In-memory store was not initialized. Creating now.")
|
221
|
-
if not InMemoryDocumentStore:
|
222
|
-
raise ImportError("InMemoryDocumentStore not available.")
|
223
|
-
self._in_memory_store = InMemoryDocumentStore()
|
224
|
-
return self._in_memory_store
|
225
|
-
|
226
|
-
def _get_document_embedder(
|
227
|
-
self, device: Optional[str] = None
|
228
|
-
) -> SentenceTransformersDocumentEmbedder:
|
229
|
-
"""Creates the Haystack document embedder component."""
|
230
|
-
model_name = self._embedding_model
|
231
|
-
logger.debug(
|
232
|
-
f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
|
233
|
-
)
|
234
|
-
if not SentenceTransformersDocumentEmbedder:
|
235
|
-
raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
|
236
|
-
try:
|
237
|
-
embedder = SentenceTransformersDocumentEmbedder(
|
238
|
-
model=model_name,
|
239
|
-
device=device,
|
240
|
-
)
|
241
|
-
embedder.warm_up()
|
242
|
-
logger.info(
|
243
|
-
f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
|
244
|
-
)
|
245
|
-
return embedder
|
246
|
-
except Exception as e:
|
247
|
-
logger.error(
|
248
|
-
f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
|
249
|
-
)
|
250
|
-
raise RuntimeError(
|
251
|
-
f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
|
252
|
-
) from e
|
253
|
-
|
254
|
-
def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
|
255
|
-
"""Creates the Haystack text embedder component (for queries)."""
|
256
|
-
model_name = self._embedding_model
|
257
|
-
logger.debug(
|
258
|
-
f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
|
259
|
-
)
|
260
|
-
if not SentenceTransformersTextEmbedder:
|
261
|
-
raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
|
262
|
-
try:
|
263
|
-
embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
|
264
|
-
embedder.warm_up()
|
265
|
-
logger.info(
|
266
|
-
f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
|
267
|
-
)
|
268
|
-
return embedder
|
269
|
-
except Exception as e:
|
270
|
-
logger.error(
|
271
|
-
f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True
|
272
|
-
)
|
273
|
-
raise RuntimeError(
|
274
|
-
f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
|
275
|
-
) from e
|
276
|
-
|
277
|
-
def _delete_lancedb_table(self) -> bool:
|
278
|
-
"""Internal helper to delete the LanceDB table managed by this service."""
|
279
|
-
if not self._persist:
|
280
|
-
logger.warning(
|
281
|
-
"Attempted to delete LanceDB table for a non-persistent service instance. Ignoring."
|
282
|
-
)
|
283
|
-
return False
|
284
|
-
|
285
|
-
if not LANCEDB_CLIENT_AVAILABLE:
|
286
|
-
logger.error("Cannot delete LanceDB table because 'lancedb' library is not installed.")
|
287
|
-
raise ImportError("'lancedb' library required for table deletion.")
|
288
|
-
|
289
|
-
table_name_to_delete = self.table_name
|
290
|
-
db_uri = self._uri
|
291
|
-
logger.warning(
|
292
|
-
f"Attempting to delete existing LanceDB table '{table_name_to_delete}' at uri '{db_uri}'."
|
293
|
-
)
|
294
|
-
try:
|
295
|
-
db = lancedb.connect(db_uri)
|
296
|
-
table_names = db.table_names()
|
297
|
-
if table_name_to_delete in table_names:
|
298
|
-
db.drop_table(table_name_to_delete)
|
299
|
-
logger.info(
|
300
|
-
f"Successfully deleted existing LanceDB table '{table_name_to_delete}'."
|
301
|
-
)
|
302
|
-
else:
|
303
|
-
logger.info(
|
304
|
-
f"LanceDB table '{table_name_to_delete}' did not exist. No deletion needed."
|
305
|
-
)
|
306
|
-
|
307
|
-
self._lancedb_store = None
|
308
|
-
return True
|
309
|
-
except Exception as e:
|
310
|
-
logger.error(
|
311
|
-
f"Error during LanceDB table deletion '{table_name_to_delete}' at '{db_uri}': {e}",
|
312
|
-
exc_info=True,
|
313
|
-
)
|
314
|
-
return False
|
315
|
-
|
316
|
-
# --- Protocol Methods Implementation ---
|
317
|
-
|
318
|
-
def index(
|
319
|
-
self,
|
320
|
-
documents: Iterable[Indexable],
|
321
|
-
embedder_device: Optional[str] = None,
|
322
|
-
force_reindex: bool = False,
|
323
|
-
) -> None:
|
324
|
-
indexable_list = list(documents)
|
325
|
-
logger.info(
|
326
|
-
f"Index request for table='{self.table_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
|
327
|
-
)
|
328
|
-
|
329
|
-
if not indexable_list:
|
330
|
-
logger.warning("No documents provided for indexing. Skipping.")
|
331
|
-
return
|
332
|
-
|
333
|
-
# Handle Reindexing
|
334
|
-
if force_reindex:
|
335
|
-
logger.info(f"Force reindex requested for table '{self.table_name}'.")
|
336
|
-
if self._persist:
|
337
|
-
deleted = self._delete_lancedb_table()
|
338
|
-
if not deleted:
|
339
|
-
logger.warning(
|
340
|
-
"LanceDB table deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
|
341
|
-
)
|
342
|
-
else:
|
343
|
-
# For InMemory, re-initialize the instance's store
|
344
|
-
logger.info(f"force_reindex=True: Re-initializing InMemory store.")
|
345
|
-
if not InMemoryDocumentStore:
|
346
|
-
raise ImportError("InMemoryDocumentStore not available.")
|
347
|
-
self._in_memory_store = InMemoryDocumentStore()
|
348
|
-
|
349
|
-
# Get Store
|
350
|
-
store = self._get_store()
|
351
|
-
|
352
|
-
# Create Embedder
|
353
|
-
embedder = self._get_document_embedder(embedder_device)
|
354
|
-
|
355
|
-
# Convert Indexable to Haystack Docs & Embed
|
356
|
-
haystack_docs_to_embed: List[HaystackDocument] = []
|
357
|
-
logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
|
358
|
-
for item in indexable_list:
|
359
|
-
doc_id = item.get_id()
|
360
|
-
metadata = item.get_metadata()
|
361
|
-
content_obj = item.get_content()
|
362
|
-
content_text = ""
|
363
|
-
if isinstance(content_obj, str):
|
364
|
-
content_text = content_obj
|
365
|
-
elif hasattr(content_obj, "extract_text") and callable(
|
366
|
-
getattr(content_obj, "extract_text")
|
367
|
-
):
|
368
|
-
try:
|
369
|
-
content_text = content_obj.extract_text()
|
370
|
-
if not isinstance(content_text, str):
|
371
|
-
logger.warning(
|
372
|
-
f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str()."
|
373
|
-
)
|
374
|
-
content_text = str(content_obj)
|
375
|
-
except Exception as extraction_error:
|
376
|
-
logger.error(
|
377
|
-
f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().",
|
378
|
-
exc_info=False,
|
379
|
-
)
|
380
|
-
content_text = str(content_obj)
|
381
|
-
else:
|
382
|
-
logger.warning(
|
383
|
-
f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
|
384
|
-
)
|
385
|
-
content_text = str(content_obj)
|
386
|
-
|
387
|
-
haystack_doc = HaystackDocument(id=doc_id, content=content_text, meta=metadata)
|
388
|
-
haystack_docs_to_embed.append(haystack_doc)
|
389
|
-
|
390
|
-
if not haystack_docs_to_embed:
|
391
|
-
logger.warning(
|
392
|
-
"No Haystack documents were prepared. Check conversion logic and input data."
|
393
|
-
)
|
394
|
-
return
|
395
|
-
|
396
|
-
logger.info(
|
397
|
-
f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
|
398
|
-
)
|
399
|
-
try:
|
400
|
-
embedding_results = embedder.run(documents=haystack_docs_to_embed)
|
401
|
-
embedded_docs = embedding_results["documents"]
|
402
|
-
logger.info(f"Successfully embedded {len(embedded_docs)} documents.")
|
403
|
-
|
404
|
-
except haystack.errors.dimensionality_mismatch.InvalidDimensionError as dim_error:
|
405
|
-
error_msg = (
|
406
|
-
f"Indexing failed for table '{self.table_name}'. Dimension mismatch: {dim_error}. "
|
407
|
-
)
|
408
|
-
error_msg += f"Ensure the embedding model ('{self._embedding_model}', Dim: {self._embedding_dims}) matches the expected dimension of the store. "
|
409
|
-
if self._persist:
|
410
|
-
error_msg += f"If the table already exists at '{self._uri}', it might have been created with a different model/dimension. "
|
411
|
-
error_msg += f"Try deleting the LanceDB table directory ('{os.path.join(self._uri, self.table_name + '.lance')}') or using force_reindex=True."
|
412
|
-
else:
|
413
|
-
error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
|
414
|
-
logger.error(error_msg, exc_info=True)
|
415
|
-
raise IndexConfigurationError(error_msg) from dim_error
|
416
|
-
|
417
|
-
# Write Embedded Documents to Store
|
418
|
-
logger.info(
|
419
|
-
f"Writing {len(embedded_docs)} embedded documents to store (Table/Type: '{self.table_name if self._persist else 'InMemory'}')..."
|
420
|
-
)
|
421
|
-
write_result = store.write_documents(
|
422
|
-
documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE
|
423
|
-
)
|
424
|
-
logger.info(f"Successfully wrote {write_result} documents to store.")
|
425
|
-
try:
|
426
|
-
count = store.count_documents()
|
427
|
-
logger.info(f"Store document count after write: {count}")
|
428
|
-
except Exception as count_error:
|
429
|
-
logger.warning(f"Could not get document count after write: {count_error}")
|
430
|
-
|
431
|
-
def search(
|
432
|
-
self,
|
433
|
-
query: Any,
|
434
|
-
options: BaseSearchOptions,
|
435
|
-
) -> List[Dict[str, Any]]:
|
436
|
-
logger.info(
|
437
|
-
f"Search request for table/store='{self.table_name if self._persist else 'InMemory'}', query_type={type(query).__name__}, options={options}"
|
438
|
-
)
|
439
|
-
|
440
|
-
store = self._get_store()
|
441
|
-
|
442
|
-
# Handle Query Type and Embedding
|
443
|
-
query_embedding = None
|
444
|
-
query_text = ""
|
445
|
-
if isinstance(query, (str, os.PathLike)):
|
446
|
-
if isinstance(query, os.PathLike):
|
447
|
-
logger.warning("Image path query received, treating as text path string.")
|
448
|
-
query_text = str(query)
|
449
|
-
else:
|
450
|
-
query_text = query
|
451
|
-
text_embedder = self._get_text_embedder()
|
452
|
-
embedding_result = text_embedder.run(text=query_text)
|
453
|
-
query_embedding = embedding_result["embedding"]
|
454
|
-
if not query_embedding:
|
455
|
-
raise ValueError("Text embedder did not return an embedding for the query.")
|
456
|
-
logger.debug(
|
457
|
-
f"Successfully generated query text embedding (dim: {len(query_embedding)})."
|
458
|
-
)
|
459
|
-
elif isinstance(query, Image.Image):
|
460
|
-
logger.error("Multimodal query (PIL Image) is not yet supported.")
|
461
|
-
raise NotImplementedError("Search with PIL Image queries is not implemented.")
|
462
|
-
elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
|
463
|
-
logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text.")
|
464
|
-
try:
|
465
|
-
query_text = query.extract_text()
|
466
|
-
if not query_text or not query_text.strip():
|
467
|
-
logger.warning(
|
468
|
-
f"Query object {type(query).__name__} provided empty text. Returning no results."
|
469
|
-
)
|
470
|
-
return []
|
471
|
-
text_embedder = self._get_text_embedder()
|
472
|
-
embedding_result = text_embedder.run(text=query_text)
|
473
|
-
query_embedding = embedding_result["embedding"]
|
474
|
-
if not query_embedding:
|
475
|
-
raise ValueError(
|
476
|
-
f"Text embedder did not return embedding for text from {type(query).__name__}."
|
477
|
-
)
|
478
|
-
logger.debug(
|
479
|
-
f"Generated query embedding from extracted text (dim: {len(query_embedding)})."
|
480
|
-
)
|
481
|
-
except Exception as e:
|
482
|
-
logger.error(
|
483
|
-
f"Failed to extract/embed text from query object {type(query).__name__}: {e}",
|
484
|
-
exc_info=True,
|
485
|
-
)
|
486
|
-
raise RuntimeError("Query text extraction or embedding failed.") from e
|
487
|
-
else:
|
488
|
-
raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
|
489
|
-
|
490
|
-
# Select Retriever based on Store Type
|
491
|
-
retriever = None
|
492
|
-
# Check if LanceDB is available *before* checking isinstance
|
493
|
-
if (
|
494
|
-
LANCEDB_HAYSTACK_AVAILABLE
|
495
|
-
and LanceDBDocumentStore
|
496
|
-
and isinstance(store, LanceDBDocumentStore)
|
497
|
-
):
|
498
|
-
if not LanceDBEmbeddingRetriever:
|
499
|
-
raise ImportError("LanceDBEmbeddingRetriever is required but not available.")
|
500
|
-
retriever = LanceDBEmbeddingRetriever(document_store=store)
|
501
|
-
# Check if InMemory is available *before* checking isinstance
|
502
|
-
elif (
|
503
|
-
InMemoryDocumentStore
|
504
|
-
and InMemoryEmbeddingRetriever
|
505
|
-
and isinstance(store, InMemoryDocumentStore)
|
506
|
-
):
|
507
|
-
# No separate HAS_INMEMORY flag, check if classes are not None
|
508
|
-
retriever = InMemoryEmbeddingRetriever(document_store=store)
|
509
|
-
else:
|
510
|
-
# Improved error message if store type is unexpected
|
511
|
-
store_type_name = type(store).__name__
|
512
|
-
available_integrations = []
|
513
|
-
if LANCEDB_HAYSTACK_AVAILABLE and LanceDBDocumentStore:
|
514
|
-
available_integrations.append("LanceDB")
|
515
|
-
if InMemoryDocumentStore:
|
516
|
-
available_integrations.append("InMemory")
|
517
|
-
|
518
|
-
if not available_integrations:
|
519
|
-
raise TypeError(
|
520
|
-
f"Cannot perform search: No supported document store integrations (LanceDB, InMemory) seem to be available. "
|
521
|
-
f"Check Haystack installation."
|
522
|
-
)
|
523
|
-
# Check if the store type matches one of the available integrations' expected types
|
524
|
-
elif (
|
525
|
-
LANCEDB_HAYSTACK_AVAILABLE
|
526
|
-
and LanceDBDocumentStore
|
527
|
-
and isinstance(store, LanceDBDocumentStore)
|
528
|
-
) or (InMemoryDocumentStore and isinstance(store, InMemoryDocumentStore)):
|
529
|
-
# This case implies the retriever class (e.g., LanceDBEmbeddingRetriever) might be missing
|
530
|
-
missing_retriever = ""
|
531
|
-
if isinstance(store, LanceDBDocumentStore):
|
532
|
-
missing_retriever = "LanceDBEmbeddingRetriever"
|
533
|
-
if isinstance(store, InMemoryDocumentStore):
|
534
|
-
missing_retriever = "InMemoryEmbeddingRetriever"
|
535
|
-
raise ImportError(
|
536
|
-
f"Store type '{store_type_name}' is supported, but its retriever component '{missing_retriever}' failed to import or is unavailable."
|
537
|
-
)
|
538
|
-
else: # Store type doesn't match any known/available store type
|
539
|
-
raise TypeError(
|
540
|
-
f"Cannot perform search with unexpected store type '{store_type_name}'. "
|
541
|
-
f"Available integrations: {', '.join(available_integrations)}."
|
542
|
-
)
|
543
|
-
|
544
|
-
# This check remains as a final safeguard, though the logic above should catch most issues
|
545
|
-
if not retriever:
|
546
|
-
raise RuntimeError(
|
547
|
-
f"Failed to select a suitable retriever for store type {type(store).__name__}. Please check dependencies and integration availability."
|
548
|
-
)
|
549
|
-
|
550
|
-
logger.debug(f"Selected retriever: {type(retriever).__name__}")
|
551
|
-
|
552
|
-
# Build Retrieval Pipeline
|
553
|
-
pipeline = Pipeline()
|
554
|
-
pipeline.add_component("retriever", retriever)
|
555
|
-
|
556
|
-
# Prepare Filters
|
557
|
-
haystack_filters = options.filters
|
558
|
-
if haystack_filters:
|
559
|
-
logger.debug(f"Applying filters: {haystack_filters}")
|
560
|
-
|
561
|
-
# Prepare Retriever Input Data
|
562
|
-
retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
|
563
|
-
retriever_input_data["query_embedding"] = query_embedding
|
564
|
-
logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
|
565
|
-
|
566
|
-
# Run Retrieval
|
567
|
-
try:
|
568
|
-
logger.info(
|
569
|
-
f"Running retrieval pipeline for table/store '{self.table_name if self._persist else 'InMemory'}'..."
|
570
|
-
)
|
571
|
-
result = pipeline.run(data={"retriever": retriever_input_data})
|
572
|
-
|
573
|
-
# Format Results
|
574
|
-
if "retriever" in result and "documents" in result["retriever"]:
|
575
|
-
retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
|
576
|
-
logger.info(f"Retrieved {len(retrieved_docs)} documents.")
|
577
|
-
final_results = []
|
578
|
-
for doc in retrieved_docs:
|
579
|
-
meta_with_hash = doc.meta
|
580
|
-
result_dict = {
|
581
|
-
"content_snippet": doc.content[:200] if doc.content else "",
|
582
|
-
"score": doc.score if doc.score is not None else 0.0,
|
583
|
-
"page_number": meta_with_hash.get("page_number", None),
|
584
|
-
"pdf_path": meta_with_hash.get("pdf_path", None),
|
585
|
-
"metadata": meta_with_hash,
|
586
|
-
}
|
587
|
-
final_results.append(result_dict)
|
588
|
-
return final_results
|
589
|
-
else:
|
590
|
-
logger.warning("Pipeline result did not contain expected retriever output.")
|
591
|
-
return []
|
592
|
-
|
593
|
-
except FileNotFoundError:
|
594
|
-
logger.error(
|
595
|
-
f"Search failed: Could not access path for table/store '{self.table_name if self._persist else 'InMemory'}' (URI: '{self._uri if self._persist else 'N/A'}')."
|
596
|
-
)
|
597
|
-
raise
|
598
|
-
|
599
|
-
def delete_index(self) -> bool:
|
600
|
-
"""
|
601
|
-
Deletes the entire LanceDB table or resets the InMemory store.
|
602
|
-
|
603
|
-
Returns:
|
604
|
-
True if deletion was successful or table/store didn't exist, False otherwise.
|
605
|
-
"""
|
606
|
-
if self._persist:
|
607
|
-
logger.warning(
|
608
|
-
f"Request to delete LanceDB table '{self.table_name}' at uri '{self._uri}'."
|
609
|
-
)
|
610
|
-
return self._delete_lancedb_table()
|
611
|
-
else:
|
612
|
-
logger.info("Request to delete InMemory store (re-initializing).)")
|
613
|
-
if not InMemoryDocumentStore:
|
614
|
-
raise ImportError("InMemoryDocumentStore not available.")
|
615
|
-
self._in_memory_store = InMemoryDocumentStore()
|
616
|
-
return True
|
617
|
-
|
618
|
-
def index_exists(self) -> bool:
|
619
|
-
"""
|
620
|
-
Checks if the LanceDB table or InMemory store exists and has documents.
|
621
|
-
NOTE: For LanceDB, this tries to count documents, implicitly checking connection/table existence.
|
622
|
-
For InMemory, it checks if the internal store object exists and has documents.
|
623
|
-
"""
|
624
|
-
store_name = self.table_name if self._persist else "InMemory"
|
625
|
-
logger.debug(
|
626
|
-
f"Checking existence of index for '{store_name}'. URI: '{self._uri if self._persist else 'N/A'}'"
|
627
|
-
)
|
628
|
-
try:
|
629
|
-
store = self._get_store()
|
630
|
-
count = store.count_documents()
|
631
|
-
exists = count > 0
|
632
|
-
logger.debug(
|
633
|
-
f"Store type {type(store).__name__} for '{store_name}' exists and has {count} documents: {exists}"
|
634
|
-
)
|
635
|
-
return exists
|
636
|
-
except ImportError as ie:
|
637
|
-
logger.error(f"Import error checking index existence for '{store_name}': {ie}")
|
638
|
-
return False
|
639
|
-
except Exception as e:
|
640
|
-
logger.warning(
|
641
|
-
f"Could not confirm existence or count documents in store for '{store_name}': {e}",
|
642
|
-
exc_info=False,
|
643
|
-
)
|
644
|
-
return False
|
645
|
-
|
646
|
-
# --- Sync Methods Implementation ---
|
647
|
-
|
648
|
-
def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
|
649
|
-
"""Retrieves documents, required for sync."""
|
650
|
-
store_name = self.table_name if self._persist else "InMemory"
|
651
|
-
logger.debug(
|
652
|
-
f"Listing documents for '{store_name}' (include_metadata={include_metadata})..."
|
653
|
-
)
|
654
|
-
store = self._get_store()
|
655
|
-
try:
|
656
|
-
haystack_docs = store.filter_documents(filters=kwargs.get("filters"))
|
657
|
-
logger.info(f"Retrieved {len(haystack_docs)} documents from store '{store_name}'.")
|
658
|
-
results = []
|
659
|
-
for doc in haystack_docs:
|
660
|
-
doc_dict = {"id": doc.id}
|
661
|
-
if include_metadata:
|
662
|
-
doc_dict["meta"] = doc.meta
|
663
|
-
results.append(doc_dict)
|
664
|
-
return results
|
665
|
-
except Exception as e:
|
666
|
-
logger.error(f"Failed to list documents from store '{store_name}': {e}", exc_info=True)
|
667
|
-
raise RuntimeError(f"Failed to list documents from store '{store_name}'.") from e
|
668
|
-
|
669
|
-
def delete_documents(self, ids: List[str]) -> None:
|
670
|
-
"""Deletes documents by ID, required for sync."""
|
671
|
-
store_name = self.table_name if self._persist else "InMemory"
|
672
|
-
if not ids:
|
673
|
-
logger.debug(f"No document IDs provided for deletion from '{store_name}'. Skipping.")
|
674
|
-
return
|
675
|
-
logger.warning(f"Request to delete {len(ids)} documents from '{store_name}'.")
|
676
|
-
store = self._get_store()
|
677
|
-
try:
|
678
|
-
store.delete_documents(ids=ids)
|
679
|
-
logger.info(
|
680
|
-
f"Successfully requested deletion of {len(ids)} documents from '{store_name}'. Store count now: {store.count_documents()}"
|
681
|
-
)
|
682
|
-
except Exception as e:
|
683
|
-
logger.error(
|
684
|
-
f"Failed to delete documents with IDs {ids} from store '{store_name}': {e}",
|
685
|
-
exc_info=True,
|
686
|
-
)
|
687
|
-
raise RuntimeError(f"Failed to delete documents from store '{store_name}'.") from e
|