natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/search/searchable_mixin.py (+261 -176)

@@ -1,34 +1,48 @@
+import hashlib  # For hashing content
 import logging
-from typing import Optional, List, Dict, Any, Iterable, TYPE_CHECKING, Union, Type, Generator
 from abc import ABC, abstractmethod
-import hashlib  # For hashing content
+from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
 
 # Now import the flag from the canonical source - this import should always work
 from .haystack_utils import HAS_HAYSTACK_EXTRAS
+
 DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
 
 # Avoid runtime import errors if extras not installed
 try:
     # Import protocols and options first
+    from . import get_search_service
+    from .search_options import SearchOptions, TextSearchOptions
     from .search_service_protocol import (
-        SearchServiceProtocol, Indexable,
-        IndexConfigurationError, IndexExistsError
+        Indexable,
+        IndexConfigurationError,
+        IndexExistsError,
+        SearchServiceProtocol,
     )
-
-
-
-    if TYPE_CHECKING:  # Keep type hints working
-        from natural_pdf.elements.region import Region  # Example indexable type
+
+    if TYPE_CHECKING:  # Keep type hints working
+        from natural_pdf.elements.region import Region  # Example indexable type
 except ImportError:
     # Define dummies if extras missing
-    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = object, object, RuntimeError, RuntimeError
+    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = (
+        object,
+        object,
+        RuntimeError,
+        RuntimeError,
+    )
     SearchOptions, TextSearchOptions = object, object
     DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
-
-
+
+    def get_search_service(**kwargs):
+        raise ImportError("Search dependencies missing.")
+
+    class Region:
+        pass  # Dummy for type hint
+
 
 logger = logging.getLogger(__name__)
 
+
 class SearchableMixin(ABC):
     """
     Mixin class providing search functionality (initialization, indexing, searching, syncing).
@@ -36,6 +50,7 @@ class SearchableMixin(ABC):
     Requires the inheriting class to implement `get_indexable_items`.
     Assumes the inheriting class has a `_search_service` attribute initialized to None.
     """
+
     # Ensure inheriting class initializes this
     _search_service: Optional[SearchServiceProtocol] = None
 
@@ -53,12 +68,12 @@ class SearchableMixin(ABC):
         *,
         persist: Optional[bool] = None,
         collection_name: Optional[str] = None,
-        embedding_model: Optional[str] = None,
-        index: bool = False,
+        embedding_model: Optional[str] = None,  # Allow overriding embedding model
+        index: bool = False,  # Changed from index_now
         force_reindex: bool = False,
         embedder_device: Optional[str] = None,
-        **kwargs
-    ) -> "SearchableMixin":
+        **kwargs,  # Pass other args to get_search_service
+    ) -> "SearchableMixin":  # Return self for chaining
         """
         Initializes and configures the search service for this instance.
 
@@ -90,7 +105,9 @@ class SearchableMixin(ABC):
         """
         if service:
             # Attach provided service
-            logger.info(f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}').")
+            logger.info(
+                f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
+            )
             # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
             self._search_service = service
         else:
@@ -101,24 +118,45 @@ class SearchableMixin(ABC):
                 raise ValueError("A collection_name must be provided when persist=True.")
             elif not effective_persist and not effective_collection_name:
                 effective_collection_name = DEFAULT_SEARCH_COLLECTION_NAME
-                logger.info(f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service.")
+                logger.info(
+                    f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service."
+                )
 
-            logger.info(f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}")
+            logger.info(
+                f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
+            )
             try:
-                service_args = {"collection_name": effective_collection_name, "persist": effective_persist, **kwargs}
-                if embedding_model: service_args["embedding_model"] = embedding_model
+                service_args = {
+                    "collection_name": effective_collection_name,
+                    "persist": effective_persist,
+                    **kwargs,
+                }
+                if embedding_model:
+                    service_args["embedding_model"] = embedding_model
                 self._search_service = get_search_service(**service_args)
+            except ImportError as ie:  # Catch the specific ImportError first
+                logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
+                raise ie  # Re-raise the original ImportError
             except Exception as e:
-                logger.error(f"Failed to create SearchService due to unexpected error: {e}", exc_info=True)
-                raise RuntimeError(
-                    "Could not create SearchService instance due to an unexpected error."
-                ) from e
+                logger.error(
+                    f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
+                )
+                # Keep the RuntimeError for other unexpected creation errors
+                raise RuntimeError(
+                    "Could not create SearchService instance due to an unexpected error."
+                ) from e
+
+        # --- Optional Immediate Indexing (with safety check for persistent) ---
         if index:
-            if not self._search_service:
-                raise RuntimeError("Cannot index: Search service not available after initialization attempt.")
+            if not self._search_service:  # Should not happen if logic above is correct
+                raise RuntimeError(
+                    "Cannot index: Search service not available after initialization attempt."
+                )
 
-            is_persistent = getattr(self._search_service, "_persist", False)  # Check if service is persistent
-            collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+            is_persistent = getattr(
+                self._search_service, "_persist", False
+            )  # Check if service is persistent
+            collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
 
             if is_persistent and not force_reindex:
                 # Check existence only if persistent and not forcing reindex
@@ -131,14 +169,20 @@ class SearchableMixin(ABC):
                     )
                 else:
                     # Index doesn't exist, safe to proceed
-                    logger.info(f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing.")
+                    logger.info(
+                        f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing."
+                    )
             elif is_persistent and force_reindex:
-                logger.warning(f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted.")
+                logger.warning(
+                    f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
+                )
             # else: # Not persistent, safe to proceed without existence check
             #     logger.debug("Proceeding with index=True for non-persistent index.")
 
             # Proceed with indexing if checks passed or not applicable
-            logger.info(f"index=True: Proceeding to index collection immediately after search initialization.")
+            logger.info(
+                f"index=True: Proceeding to index collection immediately after search initialization."
+            )
             self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
 
         return self
@@ -148,42 +192,53 @@ class SearchableMixin(ABC):
         if not self._search_service:
             raise RuntimeError("Search service not initialized. Call init_search first.")
 
-        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
-        logger.info(f"Starting internal indexing process into SearchService collection '{collection_name}'...")
+        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Starting internal indexing process into SearchService collection '{collection_name}'..."
+        )
 
         # Use the abstract method to get items
         try:
-            indexable_items = list(self.get_indexable_items())
+            indexable_items = list(self.get_indexable_items())  # Consume iterator
         except Exception as e:
-            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
 
         if not indexable_items:
-            logger.warning("No indexable items provided by get_indexable_items(). Skipping index call.")
+            logger.warning(
+                "No indexable items provided by get_indexable_items(). Skipping index call."
+            )
             return
 
         logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
         try:
-            logger.debug(f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex}).")
-            self._search_service.index(
-                documents=indexable_items,
-                embedder_device=embedder_device,
-                force_reindex=force_reindex,
-            )
-            logger.info(f"Successfully completed indexing into SearchService collection '{collection_name}'.")
+            logger.debug(
+                f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+            )
+            self._search_service.index(
+                documents=indexable_items,
+                embedder_device=embedder_device,
+                force_reindex=force_reindex,
+            )
+            logger.info(
+                f"Successfully completed indexing into SearchService collection '{collection_name}'."
+            )
         except IndexConfigurationError as ice:
-            logger.error(f"Indexing failed due to configuration error in collection '{collection_name}': {ice}", exc_info=True)
-            raise  # Re-raise specific error
-        except Exception as e:
-            # Catch other indexing errors from the service
-            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+            logger.error(
+                f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
+                exc_info=True,
+            )
+            raise  # Re-raise specific error
+        except Exception as e:  # Catch other indexing errors from the service
+            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
+            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
 
     def index_for_search(
-        self,
+        self,
+        *,  # Make args keyword-only
         embedder_device: Optional[str] = None,
         force_reindex: bool = False,
-    ) -> "SearchableMixin":
+    ) -> "SearchableMixin":
         """
         Ensures the search service is initialized (using default if needed)
         and indexes the items provided by `get_indexable_items`.
@@ -201,8 +256,10 @@ class SearchableMixin(ABC):
         """
         # --- Ensure Service is Initialized (Use Default if Needed) ---
         if not self._search_service:
-            logger.info("Search service not initialized prior to index_for_search. Initializing default in-memory service.")
-            self.init_search()  # Call init with defaults
+            logger.info(
+                "Search service not initialized prior to index_for_search. Initializing default in-memory service."
+            )
+            self.init_search()  # Call init with defaults
 
         # --- Perform Indexing ---
         self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
@@ -210,10 +267,10 @@ class SearchableMixin(ABC):
 
     def find_relevant(
         self,
-        query: Any,
-        *,
+        query: Any,  # Query type depends on service capabilities
+        *,  # Make options/service keyword-only
         options: Optional[SearchOptions] = None,
-        search_service: Optional[SearchServiceProtocol] = None
+        search_service: Optional[SearchServiceProtocol] = None,  # Allow override
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant items using the configured or provided search service.
@@ -240,8 +297,10 @@ class SearchableMixin(ABC):
                 "or provide an explicit 'search_service' instance to find_relevant()."
             )
 
-        collection_name = getattr(effective_service, "collection_name", "<Unknown>")
-        logger.info(f"Searching collection '{collection_name}' via {type(effective_service).__name__}...")
+        collection_name = getattr(effective_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
+        )
 
         # --- Prepare Query and Options ---
         query_input = query
@@ -263,11 +322,15 @@ class SearchableMixin(ABC):
                 query=query_input,
                 options=effective_options,
             )
-            logger.info(f"SearchService returned {len(results)} results from collection '{collection_name}'.")
+            logger.info(
+                f"SearchService returned {len(results)} results from collection '{collection_name}'."
+            )
             return results
         except FileNotFoundError as fnf:
-            logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
-            raise  # Re-raise specific error
+            logger.error(
+                f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
+            )
+            raise  # Re-raise specific error
         except Exception as e:
             logger.error(f"Search failed for collection '{collection_name}': {e}", exc_info=True)
             # Consider wrapping in a SearchError?
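
Note the keyword-only `search_service` parameter above, which lets one service instance be supplied per call instead of using the instance's configured one. A sketch, assuming only the `get_search_service` factory kwargs that `init_search` itself forwards (`collection_name`, `persist`):

```python
from natural_pdf.search import get_search_service

# One shared service, overriding whatever the collection was configured with:
shared = get_search_service(collection_name="my_reports", persist=True)
hits = docs.find_relevant("minimum wage violations", search_service=shared)
for hit in hits:
    # Each hit is a plain Dict[str, Any]; the exact keys depend on the
    # backing service implementation, not on the mixin.
    print(hit)
```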
@@ -276,11 +339,11 @@ class SearchableMixin(ABC):
     # --- NEW Sync Method ---
     def sync_index(
         self,
-        strategy: str = 'full',  # 'full' (add/update/delete) or 'upsert_only'
+        strategy: str = "full",  # 'full' (add/update/delete) or 'upsert_only'
         dry_run: bool = False,
-        batch_size: int = 100,
-        embedder_device: Optional[str] = None,
-        **kwargs: Any
+        batch_size: int = 100,  # For batching deletes/updates if needed
+        embedder_device: Optional[str] = None,  # Pass embedder device if needed for updates
+        **kwargs: Any,  # Allow passing extra args to get_search_service
     ) -> Dict[str, int]:
         """
         Synchronizes the search index with the current state of indexable items.
@@ -307,91 +370,107 @@ class SearchableMixin(ABC):
             RuntimeError: For backend errors during synchronization.
         """
         if not self._search_service:
-            raise RuntimeError("Search service not configured. Call init_search first.")
+            raise RuntimeError("Search service not configured. Call init_search first.")
 
-        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
-        logger.info(f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})...")
-        summary = {'added': 0, 'updated': 0, 'deleted': 0, 'skipped': 0}
+        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})..."
+        )
+        summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
 
         # --- Check Service Capabilities for 'full' sync ---
-        if strategy == 'full':
-            required_methods = ['list_documents', 'delete_documents']
-            missing_methods = [
-                m for m in required_methods if not hasattr(self._search_service, m)
-            ]
-            if missing_methods:
-                raise NotImplementedError(f"The configured search service ({type(self._search_service).__name__}) "
-                                          f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}")
+        if strategy == "full":
+            required_methods = ["list_documents", "delete_documents"]
+            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
+            if missing_methods:
+                raise NotImplementedError(
+                    f"The configured search service ({type(self._search_service).__name__}) "
+                    f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
                )
 
         # --- 1. Get Desired State (from current collection) ---
-        desired_state: Dict[str, Indexable] = {}
-        desired_hashes: Dict[str, Optional[str]] = {}
+        desired_state: Dict[str, Indexable] = {}  # {id: item}
+        desired_hashes: Dict[str, Optional[str]] = {}  # {id: hash or None}
         try:
-            for item in self.get_indexable_items():
-                item_id = item.get_id()
-                if not item_id:
-                    logger.warning(f"Skipping item with no ID: {item}")
-                    summary['skipped'] += 1
-                    continue
-                if item_id in desired_state:
-                    logger.warning(f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item.")
-                    summary['skipped'] += 1
-                    continue
-                desired_state[item_id] = item
-                # Try to get hash, store None if unavailable or fails
-                try:
-                    desired_hashes[item_id] = item.get_content_hash()
-                except (AttributeError, NotImplementedError):
-                    logger.debug(f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based.")
-                    desired_hashes[item_id] = None
-                except Exception as e:
-                    logger.warning(f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.", exc_info=False)
-                    desired_hashes[item_id] = None
+            for item in self.get_indexable_items():
+                item_id = item.get_id()
+                if not item_id:
+                    logger.warning(f"Skipping item with no ID: {item}")
+                    summary["skipped"] += 1
+                    continue
+                if item_id in desired_state:
+                    logger.warning(
+                        f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item."
+                    )
+                    summary["skipped"] += 1
+                    continue
+                desired_state[item_id] = item
+                # Try to get hash, store None if unavailable or fails
+                try:
+                    desired_hashes[item_id] = item.get_content_hash()
+                except (AttributeError, NotImplementedError):
+                    logger.debug(
+                        f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based."
+                    )
+                    desired_hashes[item_id] = None
+                except Exception as e:
+                    logger.warning(
+                        f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.",
+                        exc_info=False,
+                    )
+                    desired_hashes[item_id] = None
 
         except Exception as e:
-            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to get current indexable items.") from e
+            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to get current indexable items.") from e
 
         logger.info(f"Desired state contains {len(desired_state)} indexable items.")
 
         # --- 2. Handle Different Strategies ---
-        if strategy == 'upsert_only':
+        if strategy == "upsert_only":
             # Simple case: just index everything, let the service handle upserts
             items_to_index = list(desired_state.values())
-            summary['added'] = len(items_to_index)  # Approximate count
-            logger.info(f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting.")
+            summary["added"] = len(items_to_index)  # Approximate count
+            logger.info(
+                f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting."
+            )
             if not dry_run and items_to_index:
-                logger.debug("Calling service.index for upsert...")
-                # Call index directly, force_reindex=False implies upsert
-                self._search_service.index(
-                    documents=items_to_index,
-                    force_reindex=False,
-                    embedder_device=embedder_device
-                )
+                logger.debug("Calling service.index for upsert...")
+                # Call index directly, force_reindex=False implies upsert
+                self._search_service.index(
+                    documents=items_to_index, force_reindex=False, embedder_device=embedder_device
+                )
             elif dry_run:
                 logger.info("[Dry Run] Would index/upsert %d items.", len(items_to_index))
 
-        elif strategy == 'full':
+        elif strategy == "full":
             # Complex case: Add/Update/Delete
             # 2a. Get Current Index State
             try:
-                logger.debug("Listing documents currently in the index...")
-                # Assumes list_documents takes filters and include_metadata
-                current_docs = self._search_service.list_documents(include_metadata=True)  # Fetch all documents with metadata
-                current_state: Dict[str, Dict] = {}
-                duplicates = 0
-                for doc in current_docs:
-                    doc_id = doc.get('id')
-                    if not doc_id:
-                        continue  # Skip docs without ID from service
-                    if doc_id in current_state:
-                        duplicates += 1
-                    current_state[doc_id] = doc
-                logger.info(f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs).")
+                logger.debug("Listing documents currently in the index...")
+                # Assumes list_documents takes filters and include_metadata
+                # Fetch all documents with metadata
+                current_docs = self._search_service.list_documents(include_metadata=True)
+                current_state: Dict[str, Dict] = {}  # {id: {'meta': {...}, ...}}
+                duplicates = 0
+                for doc in current_docs:
+                    doc_id = doc.get("id")
+                    if not doc_id:
+                        continue  # Skip docs without ID from service
+                    if doc_id in current_state:
+                        duplicates += 1
+                    current_state[doc_id] = doc
+                logger.info(
+                    f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs)."
+                )
+                if duplicates > 0:
+                    logger.warning(
+                        f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison."
+                    )
 
             except Exception as e:
-                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
-                raise RuntimeError("Could not retrieve current index state for sync.") from e
+                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
+                raise RuntimeError("Could not retrieve current index state for sync.") from e
 
             # 2b. Compare States and Plan Actions
             ids_in_desired = set(desired_state.keys())
@@ -403,62 +482,68 @@ class SearchableMixin(ABC):
 
             items_to_update = []
             for item_id in ids_to_check_update:
-                desired_hash = desired_hashes.get(item_id)
-                current_meta = current_state[item_id].get('meta', {})
-                current_hash = current_meta.get('content_hash')  # Assuming hash stored in meta
-
-                # Check if hash exists and differs, or if hash is missing (force update)
-                if desired_hash is None or current_hash is None or desired_hash != current_hash:
-                    if desired_hash != current_hash:
-                        logger.debug(f"Content hash changed for ID {item_id}. Scheduling for update.")
-                    else:
-                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
-                    items_to_update.append(desired_state[item_id])
-                # Else: hashes match, no update needed
+                desired_hash = desired_hashes.get(item_id)
+                current_meta = current_state[item_id].get("meta", {})
+                current_hash = current_meta.get("content_hash")  # Assuming hash stored in meta
+
+                # Check if hash exists and differs, or if hash is missing (force update)
+                if desired_hash is None or current_hash is None or desired_hash != current_hash:
+                    if desired_hash != current_hash:
+                        logger.debug(
+                            f"Content hash changed for ID {item_id}. Scheduling for update."
+                        )
+                    else:
+                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
+                    items_to_update.append(desired_state[item_id])
+                # Else: hashes match, no update needed
 
             items_to_add = [desired_state[id_] for id_ in ids_to_add]
-            items_to_index = items_to_add + items_to_update  # Combine adds and updates for single index call
+            items_to_index = (
+                items_to_add + items_to_update
+            )  # Combine adds and updates for single index call
 
-            summary['added'] = len(items_to_add)
-            summary['updated'] = len(items_to_update)
-            summary['deleted'] = len(ids_to_delete)
+            summary["added"] = len(items_to_add)
+            summary["updated"] = len(items_to_update)
+            summary["deleted"] = len(ids_to_delete)
 
-            logger.info(f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}")
+            logger.info(
+                f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}"
+            )
 
             # 2c. Execute Actions (if not dry_run)
             if not dry_run:
-                # Execute Deletes
-                if ids_to_delete:
-                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
-                    try:
-                        # Assuming delete_documents takes list of IDs
-                        # Implement batching if needed
-                        self._search_service.delete_documents(ids=list(ids_to_delete))
-                        logger.info("Deletion successful.")
-                    except Exception as e:
-                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
-                        # Decide whether to continue or raise
-                        raise RuntimeError("Failed during deletion phase of sync.") from e
-
-                # Execute Adds/Updates
-                if items_to_index:
-                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
-                    try:
-                        # Upsert logic handled by service's index method with force_reindex=False
-                        self._search_service.index(
-                            documents=items_to_index,
-                            force_reindex=False,
-                            embedder_device=embedder_device
-                        )
-                        logger.info("Add/Update successful.")
-                    except Exception as e:
-                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
-                        raise RuntimeError("Failed during add/update phase of sync.") from e
-                logger.info("Sync actions completed.")
+                # Execute Deletes
+                if ids_to_delete:
+                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
+                    try:
+                        # Assuming delete_documents takes list of IDs
+                        # Implement batching if needed
+                        self._search_service.delete_documents(ids=list(ids_to_delete))
+                        logger.info("Deletion successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
+                        # Decide whether to continue or raise
+                        raise RuntimeError("Failed during deletion phase of sync.") from e
+
+                # Execute Adds/Updates
+                if items_to_index:
+                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
+                    try:
+                        # Upsert logic handled by service's index method with force_reindex=False
+                        self._search_service.index(
+                            documents=items_to_index,
+                            force_reindex=False,
+                            embedder_device=embedder_device,
+                        )
+                        logger.info("Add/Update successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
+                        raise RuntimeError("Failed during add/update phase of sync.") from e
+                logger.info("Sync actions completed.")
             else:
-                logger.info("[Dry Run] No changes applied to the index.")
+                logger.info("[Dry Run] No changes applied to the index.")
 
         else:
             raise ValueError(f"Unknown sync strategy: '{strategy}'. Use 'full' or 'upsert_only'.")
 
-        return summary
+        return summary
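
`sync_index` derives its add/update/delete plan from `get_id()` and `get_content_hash()` and returns the plan as counts, so a dry-run-first workflow falls out naturally. A final sketch against the hypothetical collection from the earlier examples:

```python
# With dry_run=True nothing is written; the returned dict is the plan.
plan = docs.sync_index(strategy="full", dry_run=True)
print(
    f"would add {plan['added']}, update {plan['updated']}, "
    f"delete {plan['deleted']} (skipped {plan['skipped']})"
)

# Apply for real. 'full' requires the service to expose list_documents and
# delete_documents, otherwise NotImplementedError is raised (see above).
summary = docs.sync_index(strategy="full")

# 'upsert_only' never deletes; it re-indexes the desired state with
# force_reindex=False and reports everything under 'added'.
docs.sync_index(strategy="upsert_only")
```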