natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,27 +1,32 @@
|
|
1
1
|
"""Implementation of the SearchServiceProtocol using Haystack components."""
|
2
2
|
|
3
|
+
import copy
|
3
4
|
import logging
|
4
5
|
import os
|
5
|
-
from typing import List, Dict, Any, Optional, Union, Iterable
|
6
6
|
from pathlib import Path
|
7
|
+
from typing import Any, Dict, Iterable, List, Optional, Union
|
8
|
+
|
7
9
|
from PIL import Image
|
8
|
-
import copy
|
9
10
|
|
10
11
|
# --- Haystack Imports ---
|
11
12
|
try:
|
12
13
|
import haystack
|
13
14
|
from haystack import Pipeline
|
14
|
-
from haystack.dataclasses import Document as HaystackDocument
|
15
|
-
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
16
|
-
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
17
|
-
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
18
15
|
from haystack.components.embedders import (
|
16
|
+
SentenceTransformersDocumentEmbedder,
|
19
17
|
SentenceTransformersTextEmbedder,
|
20
|
-
SentenceTransformersDocumentEmbedder
|
21
18
|
)
|
19
|
+
|
22
20
|
# Import necessary retrievers, rankers etc. as needed for search()
|
23
|
-
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
24
|
-
from
|
21
|
+
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
|
22
|
+
from haystack.dataclasses import Document as HaystackDocument
|
23
|
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
24
|
+
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
25
|
+
from haystack_integrations.components.retrievers.chroma import ( # Use embedding retriever
|
26
|
+
ChromaEmbeddingRetriever,
|
27
|
+
)
|
28
|
+
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
29
|
+
|
25
30
|
# Need Ranker if used
|
26
31
|
try:
|
27
32
|
from haystack.components.rankers import CohereRanker
|
@@ -39,7 +44,7 @@ except ImportError:
|
|
39
44
|
SentenceTransformersDocumentEmbedder = None
|
40
45
|
SentenceTransformersTextEmbedder = None
|
41
46
|
InMemoryEmbeddingRetriever = None
|
42
|
-
ChromaEmbeddingRetriever = None
|
47
|
+
ChromaEmbeddingRetriever = None # Fallback definition
|
43
48
|
CohereRanker = None
|
44
49
|
Pipeline = None
|
45
50
|
DuplicatePolicy = None
|
@@ -47,16 +52,27 @@ except ImportError:
|
|
47
52
|
# --- ChromaDB Client Import (for management) ---
|
48
53
|
try:
|
49
54
|
import chromadb
|
55
|
+
|
50
56
|
CHROMADB_AVAILABLE = True
|
51
57
|
except ImportError:
|
52
58
|
chromadb = None
|
53
59
|
CHROMADB_AVAILABLE = False
|
54
60
|
|
61
|
+
from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
|
62
|
+
from .search_options import (
|
63
|
+
BaseSearchOptions,
|
64
|
+
MultiModalSearchOptions,
|
65
|
+
SearchOptions,
|
66
|
+
TextSearchOptions,
|
67
|
+
)
|
68
|
+
|
55
69
|
# --- Local Imports ---
|
56
|
-
from .search_service_protocol import
|
57
|
-
|
58
|
-
|
59
|
-
|
70
|
+
from .search_service_protocol import (
|
71
|
+
Indexable,
|
72
|
+
IndexConfigurationError,
|
73
|
+
IndexExistsError,
|
74
|
+
SearchServiceProtocol,
|
75
|
+
)
|
60
76
|
|
61
77
|
# --- Logging ---
|
62
78
|
logger = logging.getLogger(__name__)
|
@@ -65,6 +81,7 @@ logger = logging.getLogger(__name__)
|
|
65
81
|
DEFAULT_PERSIST_PATH = "./natural_pdf_index"
|
66
82
|
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
67
83
|
|
84
|
+
|
68
85
|
class HaystackSearchService(SearchServiceProtocol):
|
69
86
|
"""
|
70
87
|
Haystack-based implementation of the search service protocol.
|
@@ -77,9 +94,9 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
77
94
|
def __init__(
|
78
95
|
self,
|
79
96
|
collection_name: str,
|
80
|
-
persist: bool = False,
|
97
|
+
persist: bool = False, # Store type configuration
|
81
98
|
default_persist_path: str = DEFAULT_PERSIST_PATH,
|
82
|
-
embedding_model: str = DEFAULT_EMBEDDING_MODEL
|
99
|
+
embedding_model: str = DEFAULT_EMBEDDING_MODEL, # Renamed for clarity
|
83
100
|
):
|
84
101
|
"""
|
85
102
|
Initialize the service for a specific collection.
|
@@ -92,18 +109,24 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
92
109
|
embedding_model: The embedding model this service instance will use.
|
93
110
|
"""
|
94
111
|
if not HAS_HAYSTACK_EXTRAS:
|
95
|
-
raise ImportError(
|
112
|
+
raise ImportError(
|
113
|
+
"HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
|
114
|
+
)
|
96
115
|
|
97
|
-
self.collection_name = collection_name
|
98
|
-
self._persist = persist
|
116
|
+
self.collection_name = collection_name # Store the collection name
|
117
|
+
self._persist = persist # Store the persistence type for this instance
|
99
118
|
self._default_persist_path = default_persist_path
|
100
|
-
self._embedding_model = embedding_model
|
101
|
-
|
119
|
+
self._embedding_model = embedding_model # Store the configured model
|
120
|
+
|
102
121
|
# Dictionary to hold InMemoryDocumentStore instances if not persisting
|
103
|
-
self._in_memory_store: Optional[InMemoryDocumentStore] =
|
104
|
-
|
122
|
+
self._in_memory_store: Optional[InMemoryDocumentStore] = (
|
123
|
+
None if persist else InMemoryDocumentStore()
|
124
|
+
)
|
125
|
+
self._chroma_store: Optional[ChromaDocumentStore] = None # Lazy load
|
105
126
|
|
106
|
-
logger.info(
|
127
|
+
logger.info(
|
128
|
+
f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'"
|
129
|
+
)
|
107
130
|
|
108
131
|
# --- Internal Helper Methods --- #
|
109
132
|
|
@@ -114,27 +137,34 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
114
137
|
# Use the instance's configured persistence type and collection name
|
115
138
|
if self._persist:
|
116
139
|
if self._chroma_store is None:
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
140
|
+
# Lazy load Chroma store
|
141
|
+
logger.debug(
|
142
|
+
f"Initializing ChromaDocumentStore for collection '{self.collection_name}'."
|
143
|
+
)
|
144
|
+
self._chroma_store = ChromaDocumentStore(
|
145
|
+
persist_path=self._default_persist_path,
|
146
|
+
collection_name=self.collection_name, # Use instance name
|
147
|
+
)
|
123
148
|
return self._chroma_store
|
124
149
|
else:
|
125
150
|
# Return the instance's InMemory store
|
126
|
-
if
|
127
|
-
|
128
|
-
|
151
|
+
if (
|
152
|
+
self._in_memory_store is None
|
153
|
+
): # Should have been created in __init__ if persist=False
|
154
|
+
logger.warning(
|
155
|
+
f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now."
|
156
|
+
)
|
157
|
+
self._in_memory_store = InMemoryDocumentStore()
|
129
158
|
return self._in_memory_store
|
130
159
|
|
131
160
|
def _get_document_embedder(
|
132
|
-
self,
|
133
|
-
device: Optional[str] = None
|
161
|
+
self, device: Optional[str] = None
|
134
162
|
) -> SentenceTransformersDocumentEmbedder:
|
135
163
|
"""Creates the Haystack document embedder component."""
|
136
|
-
model_name = self._embedding_model
|
137
|
-
logger.debug(
|
164
|
+
model_name = self._embedding_model # Use instance model
|
165
|
+
logger.debug(
|
166
|
+
f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
|
167
|
+
)
|
138
168
|
if not SentenceTransformersDocumentEmbedder:
|
139
169
|
raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
|
140
170
|
try:
|
@@ -143,56 +173,80 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
143
173
|
device=device,
|
144
174
|
)
|
145
175
|
embedder.warm_up()
|
146
|
-
logger.info(
|
176
|
+
logger.info(
|
177
|
+
f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
|
178
|
+
)
|
147
179
|
return embedder
|
148
180
|
except Exception as e:
|
149
|
-
logger.error(
|
150
|
-
|
181
|
+
logger.error(
|
182
|
+
f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
|
183
|
+
)
|
184
|
+
raise RuntimeError(
|
185
|
+
f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
|
186
|
+
) from e
|
151
187
|
|
152
|
-
def _get_text_embedder(
|
153
|
-
self,
|
154
|
-
device: Optional[str] = None
|
155
|
-
) -> SentenceTransformersTextEmbedder:
|
188
|
+
def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
|
156
189
|
"""Creates the Haystack text embedder component (for queries)."""
|
157
|
-
model_name = self._embedding_model
|
158
|
-
logger.debug(
|
190
|
+
model_name = self._embedding_model # Use instance model
|
191
|
+
logger.debug(
|
192
|
+
f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
|
193
|
+
)
|
159
194
|
if not SentenceTransformersTextEmbedder:
|
160
195
|
raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
|
161
196
|
try:
|
162
197
|
embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
|
163
|
-
embedder.warm_up()
|
164
|
-
logger.info(
|
198
|
+
embedder.warm_up()
|
199
|
+
logger.info(
|
200
|
+
f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
|
201
|
+
)
|
165
202
|
return embedder
|
166
203
|
except Exception as e:
|
167
|
-
logger.error(
|
168
|
-
|
204
|
+
logger.error(
|
205
|
+
f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True
|
206
|
+
)
|
207
|
+
raise RuntimeError(
|
208
|
+
f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
|
209
|
+
) from e
|
169
210
|
|
170
211
|
def _delete_chroma_collection(self) -> bool:
|
171
212
|
"""Internal helper to delete the ChromaDB collection managed by this service."""
|
172
213
|
if not CHROMADB_AVAILABLE:
|
173
|
-
logger.error(
|
214
|
+
logger.error(
|
215
|
+
"Cannot delete ChromaDB collection because 'chromadb' library is not installed."
|
216
|
+
)
|
174
217
|
raise ImportError("'chromadb' library required for collection deletion.")
|
175
218
|
if not self._persist:
|
176
|
-
|
177
|
-
|
219
|
+
logger.warning(
|
220
|
+
"Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring."
|
221
|
+
)
|
222
|
+
return False # Cannot delete if not persistent
|
178
223
|
try:
|
179
|
-
collection_name_to_delete = self.collection_name
|
180
|
-
logger.warning(
|
224
|
+
collection_name_to_delete = self.collection_name # Use instance collection name
|
225
|
+
logger.warning(
|
226
|
+
f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'."
|
227
|
+
)
|
181
228
|
chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
|
182
229
|
try:
|
183
230
|
chroma_client.delete_collection(name=collection_name_to_delete)
|
184
|
-
logger.info(
|
185
|
-
|
231
|
+
logger.info(
|
232
|
+
f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'."
|
233
|
+
)
|
234
|
+
self._chroma_store = None # Reset lazy-loaded store
|
186
235
|
return True
|
187
236
|
except chromadb.errors.InvalidCollectionException:
|
188
|
-
logger.info(
|
189
|
-
|
237
|
+
logger.info(
|
238
|
+
f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed."
|
239
|
+
)
|
240
|
+
return True # Deletion is effectively successful
|
190
241
|
finally:
|
191
|
-
pass
|
242
|
+
pass # Cleanup if needed
|
192
243
|
except ImportError as ie:
|
193
244
|
raise ie
|
194
245
|
except Exception as e:
|
195
|
-
logger.error(
|
246
|
+
logger.error(
|
247
|
+
f"Error during ChromaDB collection deletion '{self.collection_name}': {e}",
|
248
|
+
exc_info=True,
|
249
|
+
)
|
196
250
|
# Don't raise here, let index() decide based on force_reindex
|
197
251
|
return False
|
198
252
|
|
@@ -200,37 +254,43 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
200
254
|
|
201
255
|
def index(
|
202
256
|
self,
|
203
|
-
documents: Iterable[Indexable],
|
257
|
+
documents: Iterable[Indexable], # Accept Indexable objects
|
204
258
|
embedder_device: Optional[str] = None,
|
205
259
|
force_reindex: bool = False,
|
206
260
|
) -> None:
|
207
261
|
# Need to consume the iterable to log count, or log differently
|
208
262
|
# Let's convert to list for now, assuming size isn't prohibitive
|
209
263
|
indexable_list = list(documents)
|
210
|
-
logger.info(
|
211
|
-
|
264
|
+
logger.info(
|
265
|
+
f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
|
266
|
+
)
|
267
|
+
|
212
268
|
if not indexable_list:
|
213
|
-
|
214
|
-
|
269
|
+
logger.warning("No documents provided for indexing. Skipping.")
|
270
|
+
return
|
215
271
|
|
216
272
|
# --- 1. Handle Reindexing (Deletion before store/embedder init) ---
|
217
273
|
if force_reindex:
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
274
|
+
logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
|
275
|
+
if self._persist:
|
276
|
+
# Attempt deletion, raises ImportError if chromadb missing
|
277
|
+
deleted = self._delete_chroma_collection() # Uses self.collection_name
|
278
|
+
if not deleted:
|
279
|
+
# If deletion failed for other reasons, log and continue cautiously
|
280
|
+
logger.warning(
|
281
|
+
"Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
|
282
|
+
)
|
283
|
+
else:
|
284
|
+
# For InMemory, force_reindex means we want a fresh store instance.
|
285
|
+
# Re-initialize the instance's in-memory store
|
286
|
+
logger.info(
|
287
|
+
f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'."
|
288
|
+
)
|
289
|
+
self._in_memory_store = InMemoryDocumentStore() # Create a new one
|
230
290
|
|
231
291
|
# REMOVED try...except around store retrieval
|
232
292
|
# Let store initialization errors propagate directly
|
233
|
-
store = self._get_store()
|
293
|
+
store = self._get_store() # No argument needed
|
234
294
|
|
235
295
|
# --- 3. Create Embedder ---
|
236
296
|
# Errors during embedder creation will propagate from the helper
|
@@ -243,42 +303,55 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
243
303
|
for item in indexable_list:
|
244
304
|
doc_id = item.get_id()
|
245
305
|
metadata = item.get_metadata()
|
246
|
-
content_obj = item.get_content()
|
306
|
+
content_obj = item.get_content() # This might be Page, Region, etc.
|
247
307
|
|
248
308
|
# Determine content based on embedder type and content object
|
249
309
|
# For now, assume text content is needed and try to extract it
|
250
310
|
content_text = ""
|
251
311
|
if isinstance(content_obj, str):
|
252
|
-
|
253
|
-
|
254
|
-
elif hasattr(content_obj,
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
312
|
+
# If get_content() already returned text
|
313
|
+
content_text = content_obj
|
314
|
+
elif hasattr(content_obj, "extract_text") and callable(
|
315
|
+
getattr(content_obj, "extract_text")
|
316
|
+
):
|
317
|
+
# If content object has extract_text (like Page or Region)
|
318
|
+
try:
|
319
|
+
content_text = content_obj.extract_text()
|
320
|
+
if not isinstance(content_text, str):
|
321
|
+
logger.warning(
|
322
|
+
f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str()."
|
323
|
+
)
|
324
|
+
content_text = str(content_obj)
|
325
|
+
except Exception as extraction_error:
|
326
|
+
logger.error(
|
327
|
+
f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().",
|
328
|
+
exc_info=False,
|
329
|
+
)
|
330
|
+
content_text = str(content_obj)
|
264
331
|
else:
|
265
|
-
|
266
|
-
|
267
|
-
|
332
|
+
# Attempt to convert to string as fallback if no obvious text method
|
333
|
+
logger.warning(
|
334
|
+
f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
|
335
|
+
)
|
336
|
+
content_text = str(content_obj)
|
268
337
|
|
269
338
|
# Construct HaystackDocument using data from Indexable protocol methods
|
270
339
|
haystack_doc = HaystackDocument(
|
271
|
-
id=doc_id,
|
340
|
+
id=doc_id, # Use ID from get_id()
|
272
341
|
content=content_text,
|
273
|
-
meta=metadata
|
342
|
+
meta=metadata, # Use metadata from get_metadata()
|
274
343
|
)
|
275
344
|
haystack_docs_to_embed.append(haystack_doc)
|
276
345
|
|
277
346
|
if not haystack_docs_to_embed:
|
278
|
-
|
279
|
-
|
347
|
+
logger.warning(
|
348
|
+
"No Haystack documents were prepared. Check conversion logic and input data."
|
349
|
+
)
|
350
|
+
return
|
280
351
|
|
281
|
-
logger.info(
|
352
|
+
logger.info(
|
353
|
+
f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
|
354
|
+
)
|
282
355
|
try:
|
283
356
|
# Embed the documents
|
284
357
|
embedding_results = embedder.run(documents=haystack_docs_to_embed)
|
@@ -291,33 +364,42 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
291
364
|
error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
|
292
365
|
if self._persist:
|
293
366
|
error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
|
294
|
-
error_msg +=
|
367
|
+
error_msg += (
|
368
|
+
"Try deleting the persistent storage directory or using force_reindex=True."
|
369
|
+
)
|
295
370
|
else:
|
296
|
-
|
371
|
+
error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
|
297
372
|
logger.error(error_msg, exc_info=True)
|
298
373
|
raise IndexConfigurationError(error_msg) from dim_error
|
299
374
|
# REMOVED broad except Exception for embedding errors. Let them propagate.
|
300
375
|
|
301
376
|
# --- 5. Write Embedded Documents to Store ---
|
302
|
-
logger.info(
|
377
|
+
logger.info(
|
378
|
+
f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'..."
|
379
|
+
)
|
303
380
|
# REMOVED try...except around store writing. Let errors propagate.
|
304
381
|
write_result = store.write_documents(
|
305
|
-
|
306
|
-
|
382
|
+
documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE # Or configure as needed
|
383
|
+
)
|
384
|
+
logger.info(
|
385
|
+
f"Successfully wrote {write_result} documents to store '{self.collection_name}'."
|
307
386
|
)
|
308
|
-
logger.info(f"Successfully wrote {write_result} documents to store '{self.collection_name}'.")
|
309
387
|
# --- Add explicit count check after writing ---
|
310
|
-
logger.info(
|
388
|
+
logger.info(
|
389
|
+
f"Store '{self.collection_name}' document count after write: {store.count_documents()}"
|
390
|
+
)
|
311
391
|
# --- End count check ---
|
312
392
|
|
313
393
|
def search(
|
314
394
|
self,
|
315
|
-
query: Any,
|
395
|
+
query: Any, # Changed from Union[str, Path, Image.Image] to Any
|
316
396
|
options: BaseSearchOptions,
|
317
397
|
) -> List[Dict[str, Any]]:
|
318
|
-
logger.info(
|
398
|
+
logger.info(
|
399
|
+
f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}"
|
400
|
+
)
|
319
401
|
|
320
|
-
store = self._get_store()
|
402
|
+
store = self._get_store() # Let errors propagate
|
321
403
|
|
322
404
|
# --- 1. Handle Query Type and Embedding ---
|
323
405
|
# This implementation currently only supports text query embedding.
|
@@ -325,44 +407,63 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
325
407
|
query_embedding = None
|
326
408
|
query_text = ""
|
327
409
|
if isinstance(query, (str, os.PathLike)):
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
410
|
+
if isinstance(query, os.PathLike):
|
411
|
+
logger.warning(
|
412
|
+
"Image path query received, but multimodal search not fully implemented. Treating as text path string."
|
413
|
+
)
|
414
|
+
query_text = str(query)
|
415
|
+
else:
|
416
|
+
query_text = query
|
417
|
+
|
418
|
+
text_embedder = self._get_text_embedder()
|
419
|
+
embedding_result = text_embedder.run(text=query_text)
|
420
|
+
query_embedding = embedding_result["embedding"]
|
421
|
+
if not query_embedding:
|
422
|
+
raise ValueError("Text embedder did not return an embedding for the query.")
|
423
|
+
logger.debug(
|
424
|
+
f"Successfully generated query text embedding (dim: {len(query_embedding)})."
|
425
|
+
)
|
340
426
|
|
341
427
|
elif isinstance(query, Image.Image):
|
342
|
-
|
343
|
-
|
428
|
+
logger.error(
|
429
|
+
"Multimodal query (PIL Image) is not yet supported by this service implementation."
|
430
|
+
)
|
431
|
+
raise NotImplementedError(
|
432
|
+
"Search with PIL Image queries is not implemented in HaystackSearchService."
|
433
|
+
)
|
344
434
|
# Check if query is Indexable and try extracting text?
|
345
|
-
elif hasattr(query,
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
435
|
+
elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
|
436
|
+
logger.debug(
|
437
|
+
f"Query type {type(query).__name__} has extract_text. Extracting text for search."
|
438
|
+
)
|
439
|
+
try:
|
440
|
+
query_text = query.extract_text()
|
441
|
+
if not query_text or not query_text.strip():
|
442
|
+
logger.warning(
|
443
|
+
f"Query object {type(query).__name__} provided empty text. Returning no results."
|
444
|
+
)
|
445
|
+
return []
|
446
|
+
# Embed the extracted text
|
447
|
+
text_embedder = self._get_text_embedder()
|
448
|
+
embedding_result = text_embedder.run(text=query_text)
|
449
|
+
query_embedding = embedding_result["embedding"]
|
450
|
+
if not query_embedding:
|
451
|
+
raise ValueError(
|
452
|
+
f"Text embedder did not return an embedding for text extracted from {type(query).__name__}."
|
453
|
+
)
|
454
|
+
logger.debug(
|
455
|
+
f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)})."
|
456
|
+
)
|
457
|
+
except Exception as e:
|
458
|
+
logger.error(
|
459
|
+
f"Failed to extract or embed text from query object {type(query).__name__}: {e}",
|
460
|
+
exc_info=True,
|
461
|
+
)
|
462
|
+
raise RuntimeError("Query text extraction or embedding failed.") from e
|
362
463
|
|
363
464
|
else:
|
364
|
-
|
365
|
-
|
465
|
+
# Raise specific error for unsupported types by this implementation
|
466
|
+
raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
|
366
467
|
|
367
468
|
# --- 2. Select Retriever based on Store Type ---
|
368
469
|
retriever = None
|
@@ -371,10 +472,10 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
371
472
|
raise ImportError("ChromaEmbeddingRetriever is required but not available.")
|
372
473
|
retriever = ChromaEmbeddingRetriever(document_store=store)
|
373
474
|
elif isinstance(store, InMemoryDocumentStore):
|
374
|
-
|
475
|
+
retriever = InMemoryEmbeddingRetriever(document_store=store)
|
375
476
|
else:
|
376
|
-
|
377
|
-
|
477
|
+
# Raise specific error for unsupported store
|
478
|
+
raise TypeError(f"Cannot perform search with store type {type(store)}.")
|
378
479
|
|
379
480
|
# --- 3. Build Retrieval Pipeline ---
|
380
481
|
pipeline = Pipeline()
|
@@ -385,13 +486,10 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
385
486
|
# --- 4. Prepare Filters (remains the same) ---
|
386
487
|
haystack_filters = options.filters
|
387
488
|
if haystack_filters:
|
388
|
-
|
489
|
+
logger.debug(f"Applying filters: {haystack_filters}")
|
389
490
|
|
390
491
|
# --- 5. Prepare Retriever Input Data (Dynamically) ---
|
391
|
-
retriever_input_data = {
|
392
|
-
"filters": haystack_filters,
|
393
|
-
"top_k": options.top_k
|
394
|
-
}
|
492
|
+
retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
|
395
493
|
# Both InMemoryEmbeddingRetriever and ChromaEmbeddingRetriever expect 'query_embedding'
|
396
494
|
retriever_input_data["query_embedding"] = query_embedding
|
397
495
|
logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
|
@@ -415,23 +513,25 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
415
513
|
meta_with_hash = doc.meta
|
416
514
|
# No need to explicitly add hash here if Haystack store preserves it
|
417
515
|
result_dict = {
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
516
|
+
"content_snippet": doc.content[:200] if doc.content else "",
|
517
|
+
"score": doc.score if doc.score is not None else 0.0,
|
518
|
+
"page_number": meta_with_hash.get("page_number", None),
|
519
|
+
"pdf_path": meta_with_hash.get("pdf_path", None),
|
520
|
+
"metadata": meta_with_hash, # Pass full metadata
|
521
|
+
# "_haystack_document": doc # Optionally include full object
|
424
522
|
}
|
425
523
|
final_results.append(result_dict)
|
426
524
|
return final_results
|
427
525
|
else:
|
428
|
-
|
429
|
-
|
526
|
+
logger.warning("Pipeline result did not contain expected retriever output.")
|
527
|
+
return []
|
430
528
|
|
431
529
|
except FileNotFoundError:
|
432
|
-
|
433
|
-
|
434
|
-
|
530
|
+
# Keep specific catch for collection not found during retrieval
|
531
|
+
logger.error(
|
532
|
+
f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'."
|
533
|
+
)
|
534
|
+
raise # Re-raise the specific FileNotFoundError
|
435
535
|
# REMOVED broad except Exception for pipeline execution. Let errors propagate.
|
436
536
|
|
437
537
|
def delete_index(
|
@@ -449,9 +549,11 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
449
549
|
return self._delete_chroma_collection()
|
450
550
|
else:
|
451
551
|
# For InMemory, "deleting" means re-initializing the store
|
452
|
-
logger.info(
|
552
|
+
logger.info(
|
553
|
+
f"Re-initializing InMemory store for '{self.collection_name}' as deletion request."
|
554
|
+
)
|
453
555
|
self._in_memory_store = InMemoryDocumentStore()
|
454
|
-
return True
|
556
|
+
return True # Considered successful
|
455
557
|
|
456
558
|
def index_exists(
|
457
559
|
self,
|
@@ -462,59 +564,80 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
462
564
|
For InMemory, it checks if the internal store object exists and has documents.
|
463
565
|
"""
|
464
566
|
logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
|
465
|
-
store = self._get_store()
|
567
|
+
store = self._get_store() # Get the store instance
|
466
568
|
try:
|
467
569
|
count = store.count_documents()
|
468
570
|
exists = count > 0
|
469
|
-
logger.debug(
|
571
|
+
logger.debug(
|
572
|
+
f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}"
|
573
|
+
)
|
470
574
|
return exists
|
471
575
|
except Exception as e:
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
576
|
+
# Catch errors during count_documents (e.g., connection error for persistent stores)
|
577
|
+
logger.warning(
|
578
|
+
f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}",
|
579
|
+
exc_info=False,
|
580
|
+
)
|
581
|
+
# Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
|
582
|
+
# Assume not exists if count fails
|
583
|
+
return False
|
477
584
|
|
478
585
|
# --- Sync Methods Implementation ---
|
479
586
|
|
480
587
|
def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
|
481
588
|
"""Retrieves documents, required for sync.
|
482
|
-
|
483
|
-
|
589
|
+
NOTE: Haystack's filter_documents is the closest match.
|
590
|
+
Fetches all docs if filters=None.
|
484
591
|
"""
|
485
|
-
logger.debug(
|
592
|
+
logger.debug(
|
593
|
+
f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})..."
|
594
|
+
)
|
486
595
|
store = self._get_store()
|
487
596
|
try:
|
488
597
|
# Use filter_documents with no filters to get all
|
489
598
|
# This might be inefficient for very large stores.
|
490
|
-
haystack_docs = store.filter_documents(
|
599
|
+
haystack_docs = store.filter_documents(
|
600
|
+
filters=kwargs.get("filters")
|
601
|
+
) # Pass filters if provided via kwargs
|
491
602
|
logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
|
492
603
|
# Convert to simple dicts
|
493
604
|
results = []
|
494
605
|
for doc in haystack_docs:
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
606
|
+
doc_dict = {"id": doc.id} # ID is essential
|
607
|
+
if include_metadata:
|
608
|
+
# Ensure content_hash is included if it exists in meta
|
609
|
+
doc_dict["meta"] = doc.meta
|
610
|
+
# Optionally include content? Protocol doesn't require it.
|
611
|
+
# doc_dict["content"] = doc.content
|
612
|
+
results.append(doc_dict)
|
502
613
|
return results
|
503
614
|
except Exception as e:
|
504
|
-
logger.error(
|
505
|
-
|
506
|
-
|
615
|
+
logger.error(
|
616
|
+
f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True
|
617
|
+
)
|
618
|
+
raise RuntimeError(
|
619
|
+
f"Failed to list documents from store '{self.collection_name}'."
|
620
|
+
) from e
|
507
621
|
|
508
622
|
def delete_documents(self, ids: List[str]) -> None:
|
509
623
|
"""Deletes documents by ID, required for sync."""
|
510
624
|
if not ids:
|
511
625
|
logger.debug("No document IDs provided for deletion. Skipping.")
|
512
626
|
return
|
513
|
-
logger.warning(
|
627
|
+
logger.warning(
|
628
|
+
f"Request to delete {len(ids)} documents from collection '{self.collection_name}'."
|
629
|
+
)
|
514
630
|
store = self._get_store()
|
515
631
|
try:
|
516
632
|
store.delete_documents(ids=ids)
|
517
|
-
logger.info(
|
633
|
+
logger.info(
|
634
|
+
f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}"
|
635
|
+
)
|
518
636
|
except Exception as e:
|
519
|
-
logger.error(
|
520
|
-
|
637
|
+
logger.error(
|
638
|
+
f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}",
|
639
|
+
exc_info=True,
|
640
|
+
)
|
641
|
+
raise RuntimeError(
|
642
|
+
f"Failed to delete documents from store '{self.collection_name}'."
|
643
|
+
) from e
|