natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,19 @@
|
|
1
1
|
"""Implementation of the SearchServiceProtocol using Haystack components."""
|
2
2
|
|
3
|
-
import copy
|
4
3
|
import logging
|
5
4
|
import os
|
5
|
+
import shutil
|
6
6
|
from pathlib import Path
|
7
7
|
from typing import Any, Dict, Iterable, List, Optional, Union
|
8
8
|
|
9
9
|
from PIL import Image
|
10
10
|
|
11
|
+
# Import sentence-transformers for dimension calculation
|
12
|
+
try:
|
13
|
+
from sentence_transformers import SentenceTransformer
|
14
|
+
except ImportError:
|
15
|
+
SentenceTransformer = None
|
16
|
+
|
11
17
|
# --- Haystack Imports ---
|
12
18
|
try:
|
13
19
|
import haystack
|
@@ -17,15 +23,23 @@ try:
|
|
17
23
|
SentenceTransformersTextEmbedder,
|
18
24
|
)
|
19
25
|
|
20
|
-
# Import
|
21
|
-
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
26
|
+
# Import InMemory Store & Retriever unconditionally
|
27
|
+
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
22
28
|
from haystack.dataclasses import Document as HaystackDocument
|
23
29
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
24
30
|
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
31
|
+
|
32
|
+
# Conditional LanceDB Imports
|
33
|
+
try:
|
34
|
+
from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
|
35
|
+
|
36
|
+
LANCEDB_HAYSTACK_AVAILABLE = True
|
37
|
+
except ImportError:
|
38
|
+
LanceDBDocumentStore = None
|
39
|
+
LanceDBEmbeddingRetriever = None
|
40
|
+
LANCEDB_HAYSTACK_AVAILABLE = False
|
41
|
+
|
42
|
+
# Removed Chroma Imports
|
29
43
|
|
30
44
|
# Need Ranker if used
|
31
45
|
try:
|
@@ -33,36 +47,35 @@ try:
|
|
33
47
|
except ImportError:
|
34
48
|
CohereRanker = None
|
35
49
|
|
36
|
-
# Don't define here, it's imported later
|
37
50
|
except ImportError:
|
38
51
|
# Set flags/placeholders if Haystack isn't installed
|
39
|
-
# Don't define here, it's imported later
|
40
52
|
DocumentStore = object
|
41
53
|
HaystackDocument = Dict
|
42
|
-
ChromaDocumentStore = None
|
43
54
|
InMemoryDocumentStore = None
|
55
|
+
LanceDBDocumentStore = None
|
44
56
|
SentenceTransformersDocumentEmbedder = None
|
45
57
|
SentenceTransformersTextEmbedder = None
|
46
58
|
InMemoryEmbeddingRetriever = None
|
47
|
-
|
59
|
+
LanceDBEmbeddingRetriever = None
|
48
60
|
CohereRanker = None
|
49
61
|
Pipeline = None
|
50
62
|
DuplicatePolicy = None
|
63
|
+
LANCEDB_HAYSTACK_AVAILABLE = False
|
51
64
|
|
52
|
-
#
|
65
|
+
# LanceDB Client Import (for management)
|
53
66
|
try:
|
54
|
-
import
|
67
|
+
import lancedb
|
55
68
|
|
56
|
-
|
69
|
+
LANCEDB_CLIENT_AVAILABLE = True
|
57
70
|
except ImportError:
|
58
|
-
|
59
|
-
|
71
|
+
lancedb = None
|
72
|
+
LANCEDB_CLIENT_AVAILABLE = False
|
60
73
|
|
61
|
-
|
74
|
+
# Removed ChromaDB Client Import
|
75
|
+
|
76
|
+
from .haystack_utils import HAS_HAYSTACK_EXTRAS
|
62
77
|
from .search_options import (
|
63
78
|
BaseSearchOptions,
|
64
|
-
MultiModalSearchOptions,
|
65
|
-
SearchOptions,
|
66
79
|
TextSearchOptions,
|
67
80
|
)
|
68
81
|
|
@@ -70,11 +83,9 @@ from .search_options import (
|
|
70
83
|
from .search_service_protocol import (
|
71
84
|
Indexable,
|
72
85
|
IndexConfigurationError,
|
73
|
-
IndexExistsError,
|
74
86
|
SearchServiceProtocol,
|
75
87
|
)
|
76
88
|
|
77
|
-
# --- Logging ---
|
78
89
|
logger = logging.getLogger(__name__)
|
79
90
|
|
80
91
|
# --- Default Configuration Values ---
|
@@ -86,74 +97,129 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
86
97
|
"""
|
87
98
|
Haystack-based implementation of the search service protocol.
|
88
99
|
|
89
|
-
Manages
|
100
|
+
Manages LanceDB (persistent) or InMemory (non-persistent) DocumentStores
|
90
101
|
and uses Haystack components for embedding and retrieval.
|
91
|
-
A single instance of this service is tied to a specific
|
102
|
+
A single instance of this service is tied to a specific table name (LanceDB)
|
103
|
+
or implicitly managed (InMemory).
|
92
104
|
"""
|
93
105
|
|
94
106
|
def __init__(
|
95
107
|
self,
|
96
|
-
|
97
|
-
persist: bool = False,
|
98
|
-
|
99
|
-
embedding_model: str = DEFAULT_EMBEDDING_MODEL,
|
108
|
+
table_name: str,
|
109
|
+
persist: bool = False,
|
110
|
+
uri: str = DEFAULT_PERSIST_PATH,
|
111
|
+
embedding_model: str = DEFAULT_EMBEDDING_MODEL,
|
100
112
|
):
|
101
113
|
"""
|
102
|
-
Initialize the service for a specific
|
114
|
+
Initialize the service for a specific LanceDB table or an InMemory store.
|
103
115
|
|
104
116
|
Args:
|
105
|
-
|
106
|
-
persist: If True, this service instance manages persistent
|
107
|
-
If False, it manages transient InMemory
|
108
|
-
|
117
|
+
table_name: The name of the LanceDB table (if persist=True).
|
118
|
+
persist: If True, this service instance manages a persistent LanceDB store.
|
119
|
+
If False, it manages a transient InMemory store.
|
120
|
+
uri: Path/URI for the LanceDB database directory (if persist=True).
|
109
121
|
embedding_model: The embedding model this service instance will use.
|
122
|
+
Required for LanceDB to know embedding dimensions.
|
110
123
|
"""
|
111
124
|
if not HAS_HAYSTACK_EXTRAS:
|
112
125
|
raise ImportError(
|
113
126
|
"HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
|
114
127
|
)
|
115
128
|
|
116
|
-
self.
|
117
|
-
self._persist = persist
|
118
|
-
self.
|
119
|
-
self._embedding_model = embedding_model
|
129
|
+
self.table_name = table_name
|
130
|
+
self._persist = persist
|
131
|
+
self._uri = uri
|
132
|
+
self._embedding_model = embedding_model
|
133
|
+
self._embedding_dims: Optional[int] = None
|
120
134
|
|
121
|
-
#
|
122
|
-
self._in_memory_store: Optional[InMemoryDocumentStore] =
|
123
|
-
|
124
|
-
)
|
125
|
-
self._chroma_store: Optional[ChromaDocumentStore] = None # Lazy load
|
135
|
+
# Store instances (lazy loaded)
|
136
|
+
self._in_memory_store: Optional[InMemoryDocumentStore] = None
|
137
|
+
self._lancedb_store: Optional[LanceDBDocumentStore] = None
|
126
138
|
|
127
|
-
|
128
|
-
|
129
|
-
|
139
|
+
# Eagerly create InMemoryStore if not persisting
|
140
|
+
if not self._persist:
|
141
|
+
if not InMemoryDocumentStore:
|
142
|
+
raise ImportError(
|
143
|
+
"InMemoryDocumentStore not available. Cannot create non-persistent service."
|
144
|
+
)
|
145
|
+
self._in_memory_store = InMemoryDocumentStore()
|
146
|
+
logger.info(
|
147
|
+
f"HaystackSearchService initialized for InMemory store (table_name '{self.table_name}' ignored). Model: '{self._embedding_model}'"
|
148
|
+
)
|
149
|
+
else:
|
150
|
+
# Check LanceDB availability if persisting
|
151
|
+
if not LANCEDB_HAYSTACK_AVAILABLE:
|
152
|
+
raise ImportError(
|
153
|
+
"LanceDB persistent store requires lancedb-haystack. Install with: pip install lancedb-haystack"
|
154
|
+
)
|
155
|
+
if not SentenceTransformer:
|
156
|
+
raise ImportError(
|
157
|
+
"LanceDB persistent store requires sentence-transformers to determine embedding dimensions. Install with: pip install sentence-transformers"
|
158
|
+
)
|
159
|
+
# Calculate embedding dimensions needed for LanceDB initialization
|
160
|
+
self._calculate_embedding_dims()
|
161
|
+
logger.info(
|
162
|
+
f"HaystackSearchService initialized for LanceDB table='{self.table_name}' at uri='{self._uri}'. Model: '{self._embedding_model}', Dims: {self._embedding_dims}"
|
163
|
+
)
|
130
164
|
|
131
|
-
# --- Internal Helper Methods ---
|
165
|
+
# --- Internal Helper Methods ---
|
132
166
|
|
133
|
-
def
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
167
|
+
def _calculate_embedding_dims(self) -> None:
|
168
|
+
"""Calculates and stores embedding dimensions from the model name."""
|
169
|
+
if self._embedding_dims is None:
|
170
|
+
if not SentenceTransformer:
|
171
|
+
raise ImportError(
|
172
|
+
"sentence-transformers library is required to determine embedding dimensions."
|
173
|
+
)
|
174
|
+
try:
|
175
|
+
model = SentenceTransformer(self._embedding_model)
|
176
|
+
dims = model.get_sentence_embedding_dimension()
|
177
|
+
if not dims:
|
178
|
+
raise ValueError(
|
179
|
+
f"Could not determine embedding dimension for model: {self._embedding_model}"
|
180
|
+
)
|
181
|
+
self._embedding_dims = dims
|
182
|
+
logger.debug(
|
183
|
+
f"Determined embedding dimension: {self._embedding_dims} for model '{self._embedding_model}'"
|
184
|
+
)
|
185
|
+
except Exception as e:
|
186
|
+
logger.error(
|
187
|
+
f"Failed to load SentenceTransformer model '{self._embedding_model}' to get dimensions: {e}",
|
188
|
+
exc_info=True,
|
189
|
+
)
|
190
|
+
raise RuntimeError(
|
191
|
+
f"Failed to determine embedding dimension for model '{self._embedding_model}'."
|
192
|
+
) from e
|
193
|
+
|
194
|
+
def _get_store(self) -> DocumentStore:
|
195
|
+
"""Gets or creates the appropriate Haystack DocumentStore instance."""
|
138
196
|
if self._persist:
|
139
|
-
if
|
140
|
-
|
197
|
+
if not LanceDBDocumentStore:
|
198
|
+
raise ImportError("LanceDBDocumentStore not available.")
|
199
|
+
if self._lancedb_store is None:
|
141
200
|
logger.debug(
|
142
|
-
f"Initializing
|
201
|
+
f"Initializing LanceDBDocumentStore for table '{self.table_name}' at uri '{self._uri}'."
|
143
202
|
)
|
144
|
-
self.
|
145
|
-
|
146
|
-
|
203
|
+
if self._embedding_dims is None:
|
204
|
+
logger.warning(
|
205
|
+
"Embedding dimensions not calculated before getting store. Calculating now."
|
206
|
+
)
|
207
|
+
self._calculate_embedding_dims()
|
208
|
+
|
209
|
+
self._lancedb_store = LanceDBDocumentStore(
|
210
|
+
database=self._uri,
|
211
|
+
table_name=self.table_name,
|
212
|
+
embedding_dims=self._embedding_dims,
|
147
213
|
)
|
148
|
-
|
149
|
-
|
150
|
-
# Return the instance's InMemory store
|
151
|
-
if (
|
152
|
-
self._in_memory_store is None
|
153
|
-
): # Should have been created in __init__ if persist=False
|
154
|
-
logger.warning(
|
155
|
-
f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now."
|
214
|
+
logger.info(
|
215
|
+
f"Initialized LanceDBDocumentStore for table '{self.table_name}' (Dims: {self._embedding_dims})"
|
156
216
|
)
|
217
|
+
return self._lancedb_store
|
218
|
+
else:
|
219
|
+
if self._in_memory_store is None:
|
220
|
+
logger.warning("In-memory store was not initialized. Creating now.")
|
221
|
+
if not InMemoryDocumentStore:
|
222
|
+
raise ImportError("InMemoryDocumentStore not available.")
|
157
223
|
self._in_memory_store = InMemoryDocumentStore()
|
158
224
|
return self._in_memory_store
|
159
225
|
|
@@ -161,7 +227,7 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
161
227
|
self, device: Optional[str] = None
|
162
228
|
) -> SentenceTransformersDocumentEmbedder:
|
163
229
|
"""Creates the Haystack document embedder component."""
|
164
|
-
model_name = self._embedding_model
|
230
|
+
model_name = self._embedding_model
|
165
231
|
logger.debug(
|
166
232
|
f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
|
167
233
|
)
|
@@ -187,7 +253,7 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
187
253
|
|
188
254
|
def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
|
189
255
|
"""Creates the Haystack text embedder component (for queries)."""
|
190
|
-
model_name = self._embedding_model
|
256
|
+
model_name = self._embedding_model
|
191
257
|
logger.debug(
|
192
258
|
f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
|
193
259
|
)
|
@@ -208,113 +274,97 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
208
274
|
f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
|
209
275
|
) from e
|
210
276
|
|
211
|
-
def
|
212
|
-
"""Internal helper to delete the
|
213
|
-
if not CHROMADB_AVAILABLE:
|
214
|
-
logger.error(
|
215
|
-
"Cannot delete ChromaDB collection because 'chromadb' library is not installed."
|
216
|
-
)
|
217
|
-
raise ImportError("'chromadb' library required for collection deletion.")
|
277
|
+
def _delete_lancedb_table(self) -> bool:
|
278
|
+
"""Internal helper to delete the LanceDB table managed by this service."""
|
218
279
|
if not self._persist:
|
219
280
|
logger.warning(
|
220
|
-
"Attempted to delete
|
281
|
+
"Attempted to delete LanceDB table for a non-persistent service instance. Ignoring."
|
221
282
|
)
|
222
|
-
return False
|
283
|
+
return False
|
284
|
+
|
285
|
+
if not LANCEDB_CLIENT_AVAILABLE:
|
286
|
+
logger.error("Cannot delete LanceDB table because 'lancedb' library is not installed.")
|
287
|
+
raise ImportError("'lancedb' library required for table deletion.")
|
288
|
+
|
289
|
+
table_name_to_delete = self.table_name
|
290
|
+
db_uri = self._uri
|
291
|
+
logger.warning(
|
292
|
+
f"Attempting to delete existing LanceDB table '{table_name_to_delete}' at uri '{db_uri}'."
|
293
|
+
)
|
223
294
|
try:
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
|
229
|
-
try:
|
230
|
-
chroma_client.delete_collection(name=collection_name_to_delete)
|
295
|
+
db = lancedb.connect(db_uri)
|
296
|
+
table_names = db.table_names()
|
297
|
+
if table_name_to_delete in table_names:
|
298
|
+
db.drop_table(table_name_to_delete)
|
231
299
|
logger.info(
|
232
|
-
f"Successfully deleted existing
|
300
|
+
f"Successfully deleted existing LanceDB table '{table_name_to_delete}'."
|
233
301
|
)
|
234
|
-
|
235
|
-
return True
|
236
|
-
except chromadb.errors.InvalidCollectionException:
|
302
|
+
else:
|
237
303
|
logger.info(
|
238
|
-
f"
|
304
|
+
f"LanceDB table '{table_name_to_delete}' did not exist. No deletion needed."
|
239
305
|
)
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
except ImportError as ie:
|
244
|
-
raise ie
|
306
|
+
|
307
|
+
self._lancedb_store = None
|
308
|
+
return True
|
245
309
|
except Exception as e:
|
246
310
|
logger.error(
|
247
|
-
f"Error during
|
311
|
+
f"Error during LanceDB table deletion '{table_name_to_delete}' at '{db_uri}': {e}",
|
248
312
|
exc_info=True,
|
249
313
|
)
|
250
|
-
# Don't raise here, let index() decide based on force_reindex
|
251
314
|
return False
|
252
315
|
|
253
|
-
# --- Protocol Methods Implementation ---
|
316
|
+
# --- Protocol Methods Implementation ---
|
254
317
|
|
255
318
|
def index(
|
256
319
|
self,
|
257
|
-
documents: Iterable[Indexable],
|
320
|
+
documents: Iterable[Indexable],
|
258
321
|
embedder_device: Optional[str] = None,
|
259
322
|
force_reindex: bool = False,
|
260
323
|
) -> None:
|
261
|
-
# Need to consume the iterable to log count, or log differently
|
262
|
-
# Let's convert to list for now, assuming size isn't prohibitive
|
263
324
|
indexable_list = list(documents)
|
264
325
|
logger.info(
|
265
|
-
f"Index request for
|
326
|
+
f"Index request for table='{self.table_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
|
266
327
|
)
|
267
328
|
|
268
329
|
if not indexable_list:
|
269
330
|
logger.warning("No documents provided for indexing. Skipping.")
|
270
331
|
return
|
271
332
|
|
272
|
-
#
|
333
|
+
# Handle Reindexing
|
273
334
|
if force_reindex:
|
274
|
-
logger.info(f"Force reindex requested for
|
335
|
+
logger.info(f"Force reindex requested for table '{self.table_name}'.")
|
275
336
|
if self._persist:
|
276
|
-
|
277
|
-
deleted = self._delete_chroma_collection() # Uses self.collection_name
|
337
|
+
deleted = self._delete_lancedb_table()
|
278
338
|
if not deleted:
|
279
|
-
# If deletion failed for other reasons, log and continue cautiously
|
280
339
|
logger.warning(
|
281
|
-
"
|
340
|
+
"LanceDB table deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
|
282
341
|
)
|
283
342
|
else:
|
284
|
-
# For InMemory,
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
)
|
289
|
-
self._in_memory_store = InMemoryDocumentStore() # Create a new one
|
343
|
+
# For InMemory, re-initialize the instance's store
|
344
|
+
logger.info(f"force_reindex=True: Re-initializing InMemory store.")
|
345
|
+
if not InMemoryDocumentStore:
|
346
|
+
raise ImportError("InMemoryDocumentStore not available.")
|
347
|
+
self._in_memory_store = InMemoryDocumentStore()
|
290
348
|
|
291
|
-
#
|
292
|
-
|
293
|
-
store = self._get_store() # No argument needed
|
349
|
+
# Get Store
|
350
|
+
store = self._get_store()
|
294
351
|
|
295
|
-
#
|
296
|
-
# Errors during embedder creation will propagate from the helper
|
352
|
+
# Create Embedder
|
297
353
|
embedder = self._get_document_embedder(embedder_device)
|
298
354
|
|
299
|
-
#
|
355
|
+
# Convert Indexable to Haystack Docs & Embed
|
300
356
|
haystack_docs_to_embed: List[HaystackDocument] = []
|
301
357
|
logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
|
302
|
-
# Consume Indexable items using the protocol methods
|
303
358
|
for item in indexable_list:
|
304
359
|
doc_id = item.get_id()
|
305
360
|
metadata = item.get_metadata()
|
306
|
-
content_obj = item.get_content()
|
307
|
-
|
308
|
-
# Determine content based on embedder type and content object
|
309
|
-
# For now, assume text content is needed and try to extract it
|
361
|
+
content_obj = item.get_content()
|
310
362
|
content_text = ""
|
311
363
|
if isinstance(content_obj, str):
|
312
|
-
# If get_content() already returned text
|
313
364
|
content_text = content_obj
|
314
365
|
elif hasattr(content_obj, "extract_text") and callable(
|
315
366
|
getattr(content_obj, "extract_text")
|
316
367
|
):
|
317
|
-
# If content object has extract_text (like Page or Region)
|
318
368
|
try:
|
319
369
|
content_text = content_obj.extract_text()
|
320
370
|
if not isinstance(content_text, str):
|
@@ -329,18 +379,12 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
329
379
|
)
|
330
380
|
content_text = str(content_obj)
|
331
381
|
else:
|
332
|
-
# Attempt to convert to string as fallback if no obvious text method
|
333
382
|
logger.warning(
|
334
383
|
f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
|
335
384
|
)
|
336
385
|
content_text = str(content_obj)
|
337
386
|
|
338
|
-
|
339
|
-
haystack_doc = HaystackDocument(
|
340
|
-
id=doc_id, # Use ID from get_id()
|
341
|
-
content=content_text,
|
342
|
-
meta=metadata, # Use metadata from get_metadata()
|
343
|
-
)
|
387
|
+
haystack_doc = HaystackDocument(id=doc_id, content=content_text, meta=metadata)
|
344
388
|
haystack_docs_to_embed.append(haystack_doc)
|
345
389
|
|
346
390
|
if not haystack_docs_to_embed:
|
@@ -353,68 +397,57 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
353
397
|
f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
|
354
398
|
)
|
355
399
|
try:
|
356
|
-
# Embed the documents
|
357
400
|
embedding_results = embedder.run(documents=haystack_docs_to_embed)
|
358
401
|
embedded_docs = embedding_results["documents"]
|
359
402
|
logger.info(f"Successfully embedded {len(embedded_docs)} documents.")
|
360
403
|
|
361
404
|
except haystack.errors.dimensionality_mismatch.InvalidDimensionError as dim_error:
|
362
|
-
|
363
|
-
|
364
|
-
|
405
|
+
error_msg = (
|
406
|
+
f"Indexing failed for table '{self.table_name}'. Dimension mismatch: {dim_error}. "
|
407
|
+
)
|
408
|
+
error_msg += f"Ensure the embedding model ('{self._embedding_model}', Dim: {self._embedding_dims}) matches the expected dimension of the store. "
|
365
409
|
if self._persist:
|
366
|
-
error_msg += f"If the
|
367
|
-
error_msg += (
|
368
|
-
"Try deleting the persistent storage directory or using force_reindex=True."
|
369
|
-
)
|
410
|
+
error_msg += f"If the table already exists at '{self._uri}', it might have been created with a different model/dimension. "
|
411
|
+
error_msg += f"Try deleting the LanceDB table directory ('{os.path.join(self._uri, self.table_name + '.lance')}') or using force_reindex=True."
|
370
412
|
else:
|
371
413
|
error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
|
372
414
|
logger.error(error_msg, exc_info=True)
|
373
415
|
raise IndexConfigurationError(error_msg) from dim_error
|
374
|
-
# REMOVED broad except Exception for embedding errors. Let them propagate.
|
375
416
|
|
376
|
-
#
|
417
|
+
# Write Embedded Documents to Store
|
377
418
|
logger.info(
|
378
|
-
f"Writing {len(embedded_docs)} embedded documents to store '{self.
|
419
|
+
f"Writing {len(embedded_docs)} embedded documents to store (Table/Type: '{self.table_name if self._persist else 'InMemory'}')..."
|
379
420
|
)
|
380
|
-
# REMOVED try...except around store writing. Let errors propagate.
|
381
421
|
write_result = store.write_documents(
|
382
|
-
documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE
|
383
|
-
)
|
384
|
-
logger.info(
|
385
|
-
f"Successfully wrote {write_result} documents to store '{self.collection_name}'."
|
422
|
+
documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE
|
386
423
|
)
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
424
|
+
logger.info(f"Successfully wrote {write_result} documents to store.")
|
425
|
+
try:
|
426
|
+
count = store.count_documents()
|
427
|
+
logger.info(f"Store document count after write: {count}")
|
428
|
+
except Exception as count_error:
|
429
|
+
logger.warning(f"Could not get document count after write: {count_error}")
|
392
430
|
|
393
431
|
def search(
|
394
432
|
self,
|
395
|
-
query: Any,
|
433
|
+
query: Any,
|
396
434
|
options: BaseSearchOptions,
|
397
435
|
) -> List[Dict[str, Any]]:
|
398
436
|
logger.info(
|
399
|
-
f"Search request for
|
437
|
+
f"Search request for table/store='{self.table_name if self._persist else 'InMemory'}', query_type={type(query).__name__}, options={options}"
|
400
438
|
)
|
401
439
|
|
402
|
-
store = self._get_store()
|
440
|
+
store = self._get_store()
|
403
441
|
|
404
|
-
#
|
405
|
-
# This implementation currently only supports text query embedding.
|
406
|
-
# TODO: Refactor or extend for multimodal queries based on service capabilities/options.
|
442
|
+
# Handle Query Type and Embedding
|
407
443
|
query_embedding = None
|
408
444
|
query_text = ""
|
409
445
|
if isinstance(query, (str, os.PathLike)):
|
410
446
|
if isinstance(query, os.PathLike):
|
411
|
-
logger.warning(
|
412
|
-
"Image path query received, but multimodal search not fully implemented. Treating as text path string."
|
413
|
-
)
|
447
|
+
logger.warning("Image path query received, treating as text path string.")
|
414
448
|
query_text = str(query)
|
415
449
|
else:
|
416
450
|
query_text = query
|
417
|
-
|
418
451
|
text_embedder = self._get_text_embedder()
|
419
452
|
embedding_result = text_embedder.run(text=query_text)
|
420
453
|
query_embedding = embedding_result["embedding"]
|
@@ -423,19 +456,11 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
423
456
|
logger.debug(
|
424
457
|
f"Successfully generated query text embedding (dim: {len(query_embedding)})."
|
425
458
|
)
|
426
|
-
|
427
459
|
elif isinstance(query, Image.Image):
|
428
|
-
logger.error(
|
429
|
-
|
430
|
-
)
|
431
|
-
raise NotImplementedError(
|
432
|
-
"Search with PIL Image queries is not implemented in HaystackSearchService."
|
433
|
-
)
|
434
|
-
# Check if query is Indexable and try extracting text?
|
460
|
+
logger.error("Multimodal query (PIL Image) is not yet supported.")
|
461
|
+
raise NotImplementedError("Search with PIL Image queries is not implemented.")
|
435
462
|
elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
|
436
|
-
logger.debug(
|
437
|
-
f"Query type {type(query).__name__} has extract_text. Extracting text for search."
|
438
|
-
)
|
463
|
+
logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text.")
|
439
464
|
try:
|
440
465
|
query_text = query.extract_text()
|
441
466
|
if not query_text or not query_text.strip():
|
@@ -443,82 +468,121 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
443
468
|
f"Query object {type(query).__name__} provided empty text. Returning no results."
|
444
469
|
)
|
445
470
|
return []
|
446
|
-
# Embed the extracted text
|
447
471
|
text_embedder = self._get_text_embedder()
|
448
472
|
embedding_result = text_embedder.run(text=query_text)
|
449
473
|
query_embedding = embedding_result["embedding"]
|
450
474
|
if not query_embedding:
|
451
475
|
raise ValueError(
|
452
|
-
f"Text embedder did not return
|
476
|
+
f"Text embedder did not return embedding for text from {type(query).__name__}."
|
453
477
|
)
|
454
478
|
logger.debug(
|
455
|
-
f"
|
479
|
+
f"Generated query embedding from extracted text (dim: {len(query_embedding)})."
|
456
480
|
)
|
457
481
|
except Exception as e:
|
458
482
|
logger.error(
|
459
|
-
f"Failed to extract
|
483
|
+
f"Failed to extract/embed text from query object {type(query).__name__}: {e}",
|
460
484
|
exc_info=True,
|
461
485
|
)
|
462
486
|
raise RuntimeError("Query text extraction or embedding failed.") from e
|
463
|
-
|
464
487
|
else:
|
465
|
-
# Raise specific error for unsupported types by this implementation
|
466
488
|
raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
|
467
489
|
|
468
|
-
#
|
490
|
+
# Select Retriever based on Store Type
|
469
491
|
retriever = None
|
470
|
-
if isinstance
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
492
|
+
# Check if LanceDB is available *before* checking isinstance
|
493
|
+
if (
|
494
|
+
LANCEDB_HAYSTACK_AVAILABLE
|
495
|
+
and LanceDBDocumentStore
|
496
|
+
and isinstance(store, LanceDBDocumentStore)
|
497
|
+
):
|
498
|
+
if not LanceDBEmbeddingRetriever:
|
499
|
+
raise ImportError("LanceDBEmbeddingRetriever is required but not available.")
|
500
|
+
retriever = LanceDBEmbeddingRetriever(document_store=store)
|
501
|
+
# Check if InMemory is available *before* checking isinstance
|
502
|
+
elif (
|
503
|
+
InMemoryDocumentStore
|
504
|
+
and InMemoryEmbeddingRetriever
|
505
|
+
and isinstance(store, InMemoryDocumentStore)
|
506
|
+
):
|
507
|
+
# No separate HAS_INMEMORY flag, check if classes are not None
|
475
508
|
retriever = InMemoryEmbeddingRetriever(document_store=store)
|
476
509
|
else:
|
477
|
-
#
|
478
|
-
|
510
|
+
# Improved error message if store type is unexpected
|
511
|
+
store_type_name = type(store).__name__
|
512
|
+
available_integrations = []
|
513
|
+
if LANCEDB_HAYSTACK_AVAILABLE and LanceDBDocumentStore:
|
514
|
+
available_integrations.append("LanceDB")
|
515
|
+
if InMemoryDocumentStore:
|
516
|
+
available_integrations.append("InMemory")
|
517
|
+
|
518
|
+
if not available_integrations:
|
519
|
+
raise TypeError(
|
520
|
+
f"Cannot perform search: No supported document store integrations (LanceDB, InMemory) seem to be available. "
|
521
|
+
f"Check Haystack installation."
|
522
|
+
)
|
523
|
+
# Check if the store type matches one of the available integrations' expected types
|
524
|
+
elif (
|
525
|
+
LANCEDB_HAYSTACK_AVAILABLE
|
526
|
+
and LanceDBDocumentStore
|
527
|
+
and isinstance(store, LanceDBDocumentStore)
|
528
|
+
) or (InMemoryDocumentStore and isinstance(store, InMemoryDocumentStore)):
|
529
|
+
# This case implies the retriever class (e.g., LanceDBEmbeddingRetriever) might be missing
|
530
|
+
missing_retriever = ""
|
531
|
+
if isinstance(store, LanceDBDocumentStore):
|
532
|
+
missing_retriever = "LanceDBEmbeddingRetriever"
|
533
|
+
if isinstance(store, InMemoryDocumentStore):
|
534
|
+
missing_retriever = "InMemoryEmbeddingRetriever"
|
535
|
+
raise ImportError(
|
536
|
+
f"Store type '{store_type_name}' is supported, but its retriever component '{missing_retriever}' failed to import or is unavailable."
|
537
|
+
)
|
538
|
+
else: # Store type doesn't match any known/available store type
|
539
|
+
raise TypeError(
|
540
|
+
f"Cannot perform search with unexpected store type '{store_type_name}'. "
|
541
|
+
f"Available integrations: {', '.join(available_integrations)}."
|
542
|
+
)
|
479
543
|
|
480
|
-
#
|
544
|
+
# This check remains as a final safeguard, though the logic above should catch most issues
|
545
|
+
if not retriever:
|
546
|
+
raise RuntimeError(
|
547
|
+
f"Failed to select a suitable retriever for store type {type(store).__name__}. Please check dependencies and integration availability."
|
548
|
+
)
|
549
|
+
|
550
|
+
logger.debug(f"Selected retriever: {type(retriever).__name__}")
|
551
|
+
|
552
|
+
# Build Retrieval Pipeline
|
481
553
|
pipeline = Pipeline()
|
482
554
|
pipeline.add_component("retriever", retriever)
|
483
|
-
# Add Ranker logic (remains the same)
|
484
|
-
# ... (ranker setup if needed)
|
485
555
|
|
486
|
-
#
|
556
|
+
# Prepare Filters
|
487
557
|
haystack_filters = options.filters
|
488
558
|
if haystack_filters:
|
489
559
|
logger.debug(f"Applying filters: {haystack_filters}")
|
490
560
|
|
491
|
-
#
|
561
|
+
# Prepare Retriever Input Data
|
492
562
|
retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
|
493
|
-
# Both InMemoryEmbeddingRetriever and ChromaEmbeddingRetriever expect 'query_embedding'
|
494
563
|
retriever_input_data["query_embedding"] = query_embedding
|
495
564
|
logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
|
496
565
|
|
497
|
-
#
|
566
|
+
# Run Retrieval
|
498
567
|
try:
|
499
|
-
logger.info(
|
500
|
-
|
501
|
-
data={"retriever": retriever_input_data}
|
502
|
-
# ... (ranker data if needed)
|
568
|
+
logger.info(
|
569
|
+
f"Running retrieval pipeline for table/store '{self.table_name if self._persist else 'InMemory'}'..."
|
503
570
|
)
|
571
|
+
result = pipeline.run(data={"retriever": retriever_input_data})
|
504
572
|
|
505
|
-
#
|
573
|
+
# Format Results
|
506
574
|
if "retriever" in result and "documents" in result["retriever"]:
|
507
575
|
retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
|
508
576
|
logger.info(f"Retrieved {len(retrieved_docs)} documents.")
|
509
|
-
# Format results (remains the same)
|
510
577
|
final_results = []
|
511
578
|
for doc in retrieved_docs:
|
512
|
-
# Include content_hash in returned metadata if present
|
513
579
|
meta_with_hash = doc.meta
|
514
|
-
# No need to explicitly add hash here if Haystack store preserves it
|
515
580
|
result_dict = {
|
516
581
|
"content_snippet": doc.content[:200] if doc.content else "",
|
517
582
|
"score": doc.score if doc.score is not None else 0.0,
|
518
583
|
"page_number": meta_with_hash.get("page_number", None),
|
519
584
|
"pdf_path": meta_with_hash.get("pdf_path", None),
|
520
|
-
"metadata": meta_with_hash,
|
521
|
-
# "_haystack_document": doc # Optionally include full object
|
585
|
+
"metadata": meta_with_hash,
|
522
586
|
}
|
523
587
|
final_results.append(result_dict)
|
524
588
|
return final_results
|
@@ -527,117 +591,97 @@ class HaystackSearchService(SearchServiceProtocol):
|
|
527
591
|
return []
|
528
592
|
|
529
593
|
except FileNotFoundError:
|
530
|
-
# Keep specific catch for collection not found during retrieval
|
531
594
|
logger.error(
|
532
|
-
f"Search failed:
|
595
|
+
f"Search failed: Could not access path for table/store '{self.table_name if self._persist else 'InMemory'}' (URI: '{self._uri if self._persist else 'N/A'}')."
|
533
596
|
)
|
534
|
-
raise
|
535
|
-
# REMOVED broad except Exception for pipeline execution. Let errors propagate.
|
597
|
+
raise
|
536
598
|
|
537
|
-
def delete_index(
|
538
|
-
self,
|
539
|
-
) -> bool:
|
599
|
+
def delete_index(self) -> bool:
|
540
600
|
"""
|
541
|
-
Deletes the entire
|
601
|
+
Deletes the entire LanceDB table or resets the InMemory store.
|
542
602
|
|
543
603
|
Returns:
|
544
|
-
True if deletion was successful or
|
604
|
+
True if deletion was successful or table/store didn't exist, False otherwise.
|
545
605
|
"""
|
546
|
-
logger.warning(f"Request to delete index for collection '{self.collection_name}'.")
|
547
606
|
if self._persist:
|
548
|
-
|
549
|
-
|
550
|
-
else:
|
551
|
-
# For InMemory, "deleting" means re-initializing the store
|
552
|
-
logger.info(
|
553
|
-
f"Re-initializing InMemory store for '{self.collection_name}' as deletion request."
|
607
|
+
logger.warning(
|
608
|
+
f"Request to delete LanceDB table '{self.table_name}' at uri '{self._uri}'."
|
554
609
|
)
|
610
|
+
return self._delete_lancedb_table()
|
611
|
+
else:
|
612
|
+
logger.info("Request to delete InMemory store (re-initializing).)")
|
613
|
+
if not InMemoryDocumentStore:
|
614
|
+
raise ImportError("InMemoryDocumentStore not available.")
|
555
615
|
self._in_memory_store = InMemoryDocumentStore()
|
556
|
-
return True
|
616
|
+
return True
|
557
617
|
|
558
|
-
def index_exists(
|
559
|
-
self,
|
560
|
-
) -> bool:
|
618
|
+
def index_exists(self) -> bool:
|
561
619
|
"""
|
562
|
-
Checks if the
|
563
|
-
NOTE: For
|
564
|
-
|
620
|
+
Checks if the LanceDB table or InMemory store exists and has documents.
|
621
|
+
NOTE: For LanceDB, this tries to count documents, implicitly checking connection/table existence.
|
622
|
+
For InMemory, it checks if the internal store object exists and has documents.
|
565
623
|
"""
|
566
|
-
|
567
|
-
|
624
|
+
store_name = self.table_name if self._persist else "InMemory"
|
625
|
+
logger.debug(
|
626
|
+
f"Checking existence of index for '{store_name}'. URI: '{self._uri if self._persist else 'N/A'}'"
|
627
|
+
)
|
568
628
|
try:
|
629
|
+
store = self._get_store()
|
569
630
|
count = store.count_documents()
|
570
631
|
exists = count > 0
|
571
632
|
logger.debug(
|
572
|
-
f"Store type {type(store).__name__} for '{
|
633
|
+
f"Store type {type(store).__name__} for '{store_name}' exists and has {count} documents: {exists}"
|
573
634
|
)
|
574
635
|
return exists
|
636
|
+
except ImportError as ie:
|
637
|
+
logger.error(f"Import error checking index existence for '{store_name}': {ie}")
|
638
|
+
return False
|
575
639
|
except Exception as e:
|
576
|
-
# Catch errors during count_documents (e.g., connection error for persistent stores)
|
577
640
|
logger.warning(
|
578
|
-
f"Could not count documents in store for
|
641
|
+
f"Could not confirm existence or count documents in store for '{store_name}': {e}",
|
579
642
|
exc_info=False,
|
580
643
|
)
|
581
|
-
# Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
|
582
|
-
# Assume not exists if count fails
|
583
644
|
return False
|
584
645
|
|
585
646
|
# --- Sync Methods Implementation ---
|
586
647
|
|
587
648
|
def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
|
588
|
-
"""Retrieves documents, required for sync.
|
589
|
-
|
590
|
-
Fetches all docs if filters=None.
|
591
|
-
"""
|
649
|
+
"""Retrieves documents, required for sync."""
|
650
|
+
store_name = self.table_name if self._persist else "InMemory"
|
592
651
|
logger.debug(
|
593
|
-
f"Listing documents for
|
652
|
+
f"Listing documents for '{store_name}' (include_metadata={include_metadata})..."
|
594
653
|
)
|
595
654
|
store = self._get_store()
|
596
655
|
try:
|
597
|
-
|
598
|
-
|
599
|
-
haystack_docs = store.filter_documents(
|
600
|
-
filters=kwargs.get("filters")
|
601
|
-
) # Pass filters if provided via kwargs
|
602
|
-
logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
|
603
|
-
# Convert to simple dicts
|
656
|
+
haystack_docs = store.filter_documents(filters=kwargs.get("filters"))
|
657
|
+
logger.info(f"Retrieved {len(haystack_docs)} documents from store '{store_name}'.")
|
604
658
|
results = []
|
605
659
|
for doc in haystack_docs:
|
606
|
-
doc_dict = {"id": doc.id}
|
660
|
+
doc_dict = {"id": doc.id}
|
607
661
|
if include_metadata:
|
608
|
-
# Ensure content_hash is included if it exists in meta
|
609
662
|
doc_dict["meta"] = doc.meta
|
610
|
-
# Optionally include content? Protocol doesn't require it.
|
611
|
-
# doc_dict["content"] = doc.content
|
612
663
|
results.append(doc_dict)
|
613
664
|
return results
|
614
665
|
except Exception as e:
|
615
|
-
logger.error(
|
616
|
-
|
617
|
-
)
|
618
|
-
raise RuntimeError(
|
619
|
-
f"Failed to list documents from store '{self.collection_name}'."
|
620
|
-
) from e
|
666
|
+
logger.error(f"Failed to list documents from store '{store_name}': {e}", exc_info=True)
|
667
|
+
raise RuntimeError(f"Failed to list documents from store '{store_name}'.") from e
|
621
668
|
|
622
669
|
def delete_documents(self, ids: List[str]) -> None:
|
623
670
|
"""Deletes documents by ID, required for sync."""
|
671
|
+
store_name = self.table_name if self._persist else "InMemory"
|
624
672
|
if not ids:
|
625
|
-
logger.debug("No document IDs provided for deletion. Skipping.")
|
673
|
+
logger.debug(f"No document IDs provided for deletion from '{store_name}'. Skipping.")
|
626
674
|
return
|
627
|
-
logger.warning(
|
628
|
-
f"Request to delete {len(ids)} documents from collection '{self.collection_name}'."
|
629
|
-
)
|
675
|
+
logger.warning(f"Request to delete {len(ids)} documents from '{store_name}'.")
|
630
676
|
store = self._get_store()
|
631
677
|
try:
|
632
678
|
store.delete_documents(ids=ids)
|
633
679
|
logger.info(
|
634
|
-
f"Successfully
|
680
|
+
f"Successfully requested deletion of {len(ids)} documents from '{store_name}'. Store count now: {store.count_documents()}"
|
635
681
|
)
|
636
682
|
except Exception as e:
|
637
683
|
logger.error(
|
638
|
-
f"Failed to delete documents with IDs {ids} from store '{
|
684
|
+
f"Failed to delete documents with IDs {ids} from store '{store_name}': {e}",
|
639
685
|
exc_info=True,
|
640
686
|
)
|
641
|
-
raise RuntimeError(
|
642
|
-
f"Failed to delete documents from store '{self.collection_name}'."
|
643
|
-
) from e
|
687
|
+
raise RuntimeError(f"Failed to delete documents from store '{store_name}'.") from e
|