natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/search/searchable_mixin.py
@@ -1,34 +1,48 @@
+import hashlib  # For hashing content
 import logging
-from typing import Optional, List, Dict, Any, Iterable, TYPE_CHECKING, Union, Type, Generator
 from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
 
 # Now import the flag from the canonical source - this import should always work
 from .haystack_utils import HAS_HAYSTACK_EXTRAS
+
 DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
 
 # Avoid runtime import errors if extras not installed
 try:
     # Import protocols and options first
+    from . import get_search_service
+    from .search_options import SearchOptions, TextSearchOptions
     from .search_service_protocol import (
-        SearchServiceProtocol, Indexable,
-        IndexConfigurationError, IndexExistsError
+        Indexable,
+        IndexConfigurationError,
+        IndexExistsError,
+        SearchServiceProtocol,
     )
-    from .search_options import SearchOptions, TextSearchOptions
-    from . import get_search_service
-
-    if TYPE_CHECKING: # Keep type hints working
-        from natural_pdf.elements.region import Region # Example indexable type
+
+    if TYPE_CHECKING:  # Keep type hints working
+        from natural_pdf.elements.region import Region  # Example indexable type
 except ImportError:
     # Define dummies if extras missing
-    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = object, object, RuntimeError, RuntimeError
+    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = (
+        object,
+        object,
+        RuntimeError,
+        RuntimeError,
+    )
    SearchOptions, TextSearchOptions = object, object
    DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
-    def get_search_service(**kwargs): raise ImportError("Search dependencies missing.")
-    class Region: pass # Dummy for type hint
+
+    def get_search_service(**kwargs):
+        raise ImportError("Search dependencies missing.")
+
+    class Region:
+        pass  # Dummy for type hint
+
 
 logger = logging.getLogger(__name__)
 
+
 class SearchableMixin(ABC):
     """
     Mixin class providing search functionality (initialization, indexing, searching, syncing).
@@ -36,6 +50,7 @@ class SearchableMixin(ABC):
     Requires the inheriting class to implement `get_indexable_items`.
     Assumes the inheriting class has a `_search_service` attribute initialized to None.
     """
+
    # Ensure inheriting class initializes this
    _search_service: Optional[SearchServiceProtocol] = None
 
@@ -53,12 +68,12 @@
         *,
         persist: Optional[bool] = None,
         collection_name: Optional[str] = None,
-        embedding_model: Optional[str] = None, # Allow overriding embedding model
-        index: bool = False, # Changed from index_now
+        embedding_model: Optional[str] = None,  # Allow overriding embedding model
+        index: bool = False,  # Changed from index_now
         force_reindex: bool = False,
         embedder_device: Optional[str] = None,
-        **kwargs # Pass other args to get_search_service
-    ) -> 'SearchableMixin': # Return self for chaining
+        **kwargs,  # Pass other args to get_search_service
+    ) -> "SearchableMixin":  # Return self for chaining
         """
         Initializes and configures the search service for this instance.
 
@@ -90,7 +105,9 @@
         """
         if service:
             # Attach provided service
-            logger.info(f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}').")
+            logger.info(
+                f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
+            )
             # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
             self._search_service = service
         else:
@@ -101,24 +118,45 @@
                 raise ValueError("A collection_name must be provided when persist=True.")
             elif not effective_persist and not effective_collection_name:
                 effective_collection_name = DEFAULT_SEARCH_COLLECTION_NAME
-                logger.info(f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service.")
+                logger.info(
+                    f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service."
+                )
 
-            logger.info(f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}")
+            logger.info(
+                f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
+            )
             try:
-                service_args = {"collection_name": effective_collection_name, "persist": effective_persist, **kwargs}
-                if embedding_model: service_args['embedding_model'] = embedding_model
+                service_args = {
+                    "collection_name": effective_collection_name,
+                    "persist": effective_persist,
+                    **kwargs,
+                }
+                if embedding_model:
+                    service_args["embedding_model"] = embedding_model
                 self._search_service = get_search_service(**service_args)
+            except ImportError as ie:  # Catch the specific ImportError first
+                logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
+                raise ie  # Re-raise the original ImportError
             except Exception as e:
-                logger.error(f"Failed to create SearchService: {e}", exc_info=True)
-                raise RuntimeError("Could not create SearchService instance.") from e
-
-        # --- Optional Immediate Indexing (with safety check for persistent) ---
+                logger.error(
+                    f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
+                )
+                # Keep the RuntimeError for other unexpected creation errors
+                raise RuntimeError(
+                    "Could not create SearchService instance due to an unexpected error."
+                ) from e
+
+        # --- Optional Immediate Indexing (with safety check for persistent) ---
         if index:
-            if not self._search_service: # Should not happen if logic above is correct
-                raise RuntimeError("Cannot index: Search service not available after initialization attempt.")
+            if not self._search_service:  # Should not happen if logic above is correct
+                raise RuntimeError(
+                    "Cannot index: Search service not available after initialization attempt."
+                )
 
-            is_persistent = getattr(self._search_service, '_persist', False) # Check if service is persistent
-            collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+            is_persistent = getattr(
+                self._search_service, "_persist", False
+            )  # Check if service is persistent
+            collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
 
             if is_persistent and not force_reindex:
                 # Check existence only if persistent and not forcing reindex
@@ -131,14 +169,20 @@
                     )
                 else:
                     # Index doesn't exist, safe to proceed
-                    logger.info(f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing.")
+                    logger.info(
+                        f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing."
+                    )
             elif is_persistent and force_reindex:
-                logger.warning(f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted.")
+                logger.warning(
+                    f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
+                )
             # else: # Not persistent, safe to proceed without existence check
             #     logger.debug("Proceeding with index=True for non-persistent index.")
 
             # Proceed with indexing if checks passed or not applicable
-            logger.info(f"index=True: Proceeding to index collection immediately after search initialization.")
+            logger.info(
+                f"index=True: Proceeding to index collection immediately after search initialization."
+            )
             self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
 
         return self
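From the caller's side, the reworked signature reads like this. A minimal sketch, assuming `PDFCollection` (the class in this package that mixes in `SearchableMixin`) is importable from the top-level package and the Haystack search extras are installed; the PDF paths are placeholders taken from this wheel's `pdfs/` folder and the collection name is hypothetical:

```python
from natural_pdf import PDFCollection

collection = PDFCollection(["pdfs/2014 Statistics.pdf", "pdfs/2019 Statistics.pdf"])

# Default in-memory service; index=True (renamed from index_now) indexes immediately.
collection.init_search(index=True)

# Persistent service: collection_name is now required, and indexing into an
# existing persistent collection raises IndexExistsError unless force_reindex=True.
collection.init_search(
    persist=True,
    collection_name="stats_reports",  # hypothetical name
    index=True,
    force_reindex=True,  # deletes and rebuilds existing data
)
```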
@@ -148,42 +192,53 @@
         if not self._search_service:
             raise RuntimeError("Search service not initialized. Call init_search first.")
 
-        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
-        logger.info(f"Starting internal indexing process into SearchService collection '{collection_name}'...")
+        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Starting internal indexing process into SearchService collection '{collection_name}'..."
+        )
 
         # Use the abstract method to get items
         try:
-            indexable_items = list(self.get_indexable_items()) # Consume iterator
+            indexable_items = list(self.get_indexable_items())  # Consume iterator
         except Exception as e:
-            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
 
         if not indexable_items:
-            logger.warning("No indexable items provided by get_indexable_items(). Skipping index call.")
+            logger.warning(
+                "No indexable items provided by get_indexable_items(). Skipping index call."
+            )
             return
 
         logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
         try:
-            logger.debug(f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex}).")
-            self._search_service.index(
-                documents=indexable_items,
-                embedder_device=embedder_device,
-                force_reindex=force_reindex,
-            )
-            logger.info(f"Successfully completed indexing into SearchService collection '{collection_name}'.")
+            logger.debug(
+                f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+            )
+            self._search_service.index(
+                documents=indexable_items,
+                embedder_device=embedder_device,
+                force_reindex=force_reindex,
+            )
+            logger.info(
+                f"Successfully completed indexing into SearchService collection '{collection_name}'."
+            )
         except IndexConfigurationError as ice:
-            logger.error(f"Indexing failed due to configuration error in collection '{collection_name}': {ice}", exc_info=True)
-            raise # Re-raise specific error
-        except Exception as e: # Catch other indexing errors from the service
-            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
-
+            logger.error(
+                f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
+                exc_info=True,
+            )
+            raise  # Re-raise specific error
+        except Exception as e:  # Catch other indexing errors from the service
+            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
+            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
 
     def index_for_search(
-        self, *, # Make args keyword-only
+        self,
+        *,  # Make args keyword-only
         embedder_device: Optional[str] = None,
         force_reindex: bool = False,
-    ) -> 'SearchableMixin':
+    ) -> "SearchableMixin":
         """
         Ensures the search service is initialized (using default if needed)
         and indexes the items provided by `get_indexable_items`.
@@ -201,8 +256,10 @@
         """
         # --- Ensure Service is Initialized (Use Default if Needed) ---
         if not self._search_service:
-            logger.info("Search service not initialized prior to index_for_search. Initializing default in-memory service.")
-            self.init_search() # Call init with defaults
+            logger.info(
+                "Search service not initialized prior to index_for_search. Initializing default in-memory service."
+            )
+            self.init_search()  # Call init with defaults
 
         # --- Perform Indexing ---
         self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
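In practice this means `index_for_search` can be the only call a user makes: it falls back to a default in-memory service when `init_search` was never called, then indexes whatever `get_indexable_items()` yields. A sketch under the same assumptions as the earlier example:

```python
# No prior init_search needed: a default in-memory service is created on demand,
# then all indexable items are indexed. Returns self, so calls can be chained.
collection.index_for_search(force_reindex=True)
```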
@@ -210,10 +267,10 @@
 
     def find_relevant(
         self,
-        query: Any, # Query type depends on service capabilities
-        *, # Make options/service keyword-only
+        query: Any,  # Query type depends on service capabilities
+        *,  # Make options/service keyword-only
         options: Optional[SearchOptions] = None,
-        search_service: Optional[SearchServiceProtocol] = None # Allow override
+        search_service: Optional[SearchServiceProtocol] = None,  # Allow override
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant items using the configured or provided search service.
@@ -240,8 +297,10 @@
                 "or provide an explicit 'search_service' instance to find_relevant()."
             )
 
-        collection_name = getattr(effective_service, 'collection_name', '<Unknown>')
-        logger.info(f"Searching collection '{collection_name}' via {type(effective_service).__name__}...")
+        collection_name = getattr(effective_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
+        )
 
         # --- Prepare Query and Options ---
         query_input = query
@@ -263,11 +322,15 @@
                 query=query_input,
                 options=effective_options,
             )
-            logger.info(f"SearchService returned {len(results)} results from collection '{collection_name}'.")
+            logger.info(
+                f"SearchService returned {len(results)} results from collection '{collection_name}'."
+            )
             return results
         except FileNotFoundError as fnf:
-            logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
-            raise # Re-raise specific error
+            logger.error(
+                f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
+            )
+            raise  # Re-raise specific error
         except Exception as e:
             logger.error(f"Search failed for collection '{collection_name}': {e}", exc_info=True)
             # Consider wrapping in a SearchError?
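Querying then goes through `find_relevant`. A sketch, assuming `TextSearchOptions` is re-exported from `natural_pdf.search` and exposes a `top_k` field; `find_relevant` returns plain dicts whose keys depend on the backing service, so the ones read below are illustrative:

```python
from natural_pdf.search import TextSearchOptions

hits = collection.find_relevant(
    "districts with declining enrollment",
    options=TextSearchOptions(top_k=5),  # top_k is an assumption; check SearchOptions fields
)
for hit in hits:  # each hit is a Dict[str, Any] produced by the service
    print(hit.get("score"), hit.get("id"))
```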
@@ -276,11 +339,11 @@
     # --- NEW Sync Method ---
     def sync_index(
         self,
-        strategy: str = 'full', # 'full' (add/update/delete) or 'upsert_only'
+        strategy: str = "full",  # 'full' (add/update/delete) or 'upsert_only'
         dry_run: bool = False,
-        batch_size: int = 100, # For batching deletes/updates if needed
-        embedder_device: Optional[str] = None, # Pass embedder device if needed for updates
-        **kwargs: Any # Allow passing extra args to get_search_service
+        batch_size: int = 100,  # For batching deletes/updates if needed
+        embedder_device: Optional[str] = None,  # Pass embedder device if needed for updates
+        **kwargs: Any,  # Allow passing extra args to get_search_service
     ) -> Dict[str, int]:
         """
         Synchronizes the search index with the current state of indexable items.
@@ -307,91 +370,107 @@
         RuntimeError: For backend errors during synchronization.
         """
         if not self._search_service:
-            raise RuntimeError("Search service not configured. Call init_search first.")
+            raise RuntimeError("Search service not configured. Call init_search first.")
 
-        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
-        logger.info(f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})...")
-        summary = {'added': 0, 'updated': 0, 'deleted': 0, 'skipped': 0}
+        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})..."
+        )
+        summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
 
         # --- Check Service Capabilities for 'full' sync ---
-        if strategy == 'full':
-            required_methods = ['list_documents', 'delete_documents']
-            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
-            if missing_methods:
-                raise NotImplementedError(
-                    f"The configured search service ({type(self._search_service).__name__}) "
-                    f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
-                )
+        if strategy == "full":
+            required_methods = ["list_documents", "delete_documents"]
+            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
+            if missing_methods:
+                raise NotImplementedError(
+                    f"The configured search service ({type(self._search_service).__name__}) "
+                    f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
+                )
 
         # --- 1. Get Desired State (from current collection) ---
-        desired_state: Dict[str, Indexable] = {} # {id: item}
-        desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
+        desired_state: Dict[str, Indexable] = {}  # {id: item}
+        desired_hashes: Dict[str, Optional[str]] = {}  # {id: hash or None}
         try:
-            for item in self.get_indexable_items():
-                item_id = item.get_id()
-                if not item_id:
-                    logger.warning(f"Skipping item with no ID: {item}")
-                    summary['skipped'] += 1
-                    continue
-                if item_id in desired_state:
-                    logger.warning(f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item.")
-                    summary['skipped'] += 1
-                    continue
-                desired_state[item_id] = item
-                # Try to get hash, store None if unavailable or fails
-                try:
-                    desired_hashes[item_id] = item.get_content_hash()
-                except (AttributeError, NotImplementedError):
-                    logger.debug(f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based.")
-                    desired_hashes[item_id] = None
-                except Exception as e:
-                    logger.warning(f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.", exc_info=False)
-                    desired_hashes[item_id] = None
+            for item in self.get_indexable_items():
+                item_id = item.get_id()
+                if not item_id:
+                    logger.warning(f"Skipping item with no ID: {item}")
+                    summary["skipped"] += 1
+                    continue
+                if item_id in desired_state:
+                    logger.warning(
+                        f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item."
+                    )
+                    summary["skipped"] += 1
+                    continue
+                desired_state[item_id] = item
+                # Try to get hash, store None if unavailable or fails
+                try:
+                    desired_hashes[item_id] = item.get_content_hash()
+                except (AttributeError, NotImplementedError):
+                    logger.debug(
+                        f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based."
+                    )
+                    desired_hashes[item_id] = None
+                except Exception as e:
+                    logger.warning(
+                        f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.",
+                        exc_info=False,
+                    )
+                    desired_hashes[item_id] = None
 
         except Exception as e:
-            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to get current indexable items.") from e
+            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to get current indexable items.") from e
 
         logger.info(f"Desired state contains {len(desired_state)} indexable items.")
 
         # --- 2. Handle Different Strategies ---
-        if strategy == 'upsert_only':
+        if strategy == "upsert_only":
             # Simple case: just index everything, let the service handle upserts
             items_to_index = list(desired_state.values())
-            summary['added'] = len(items_to_index) # Approximate count
-            logger.info(f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting.")
+            summary["added"] = len(items_to_index)  # Approximate count
+            logger.info(
+                f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting."
+            )
             if not dry_run and items_to_index:
-                logger.debug("Calling service.index for upsert...")
-                # Call index directly, force_reindex=False implies upsert
-                self._search_service.index(
-                    documents=items_to_index,
-                    force_reindex=False,
-                    embedder_device=embedder_device
-                )
+                logger.debug("Calling service.index for upsert...")
+                # Call index directly, force_reindex=False implies upsert
+                self._search_service.index(
+                    documents=items_to_index, force_reindex=False, embedder_device=embedder_device
+                )
             elif dry_run:
                 logger.info("[Dry Run] Would index/upsert %d items.", len(items_to_index))
 
-        elif strategy == 'full':
+        elif strategy == "full":
             # Complex case: Add/Update/Delete
             # 2a. Get Current Index State
             try:
-                logger.debug("Listing documents currently in the index...")
-                # Assumes list_documents takes filters and include_metadata
-                # Fetch all documents with metadata
-                current_docs = self._search_service.list_documents(include_metadata=True)
-                current_state: Dict[str, Dict] = {} # {id: {'meta': {...}, ...}}
-                duplicates = 0
-                for doc in current_docs:
-                    doc_id = doc.get('id')
-                    if not doc_id: continue # Skip docs without ID from service
-                    if doc_id in current_state: duplicates +=1
-                    current_state[doc_id] = doc
-                logger.info(f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs).")
-                if duplicates > 0: logger.warning(f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison.")
+                logger.debug("Listing documents currently in the index...")
+                # Assumes list_documents takes filters and include_metadata
+                # Fetch all documents with metadata
+                current_docs = self._search_service.list_documents(include_metadata=True)
+                current_state: Dict[str, Dict] = {}  # {id: {'meta': {...}, ...}}
+                duplicates = 0
+                for doc in current_docs:
+                    doc_id = doc.get("id")
+                    if not doc_id:
+                        continue  # Skip docs without ID from service
+                    if doc_id in current_state:
+                        duplicates += 1
+                    current_state[doc_id] = doc
+                logger.info(
+                    f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs)."
+                )
+                if duplicates > 0:
+                    logger.warning(
+                        f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison."
+                    )
 
             except Exception as e:
-                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
-                raise RuntimeError("Could not retrieve current index state for sync.") from e
+                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
+                raise RuntimeError("Could not retrieve current index state for sync.") from e
 
             # 2b. Compare States and Plan Actions
             ids_in_desired = set(desired_state.keys())
@@ -403,62 +482,68 @@
 
             items_to_update = []
             for item_id in ids_to_check_update:
-                desired_hash = desired_hashes.get(item_id)
-                current_meta = current_state[item_id].get('meta', {})
-                current_hash = current_meta.get('content_hash') # Assuming hash stored in meta
-
-                # Check if hash exists and differs, or if hash is missing (force update)
-                if desired_hash is None or current_hash is None or desired_hash != current_hash:
-                    if desired_hash != current_hash:
-                        logger.debug(f"Content hash changed for ID {item_id}. Scheduling for update.")
-                    else:
-                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
-                    items_to_update.append(desired_state[item_id])
-                # Else: hashes match, no update needed
+                desired_hash = desired_hashes.get(item_id)
+                current_meta = current_state[item_id].get("meta", {})
+                current_hash = current_meta.get("content_hash")  # Assuming hash stored in meta
+
+                # Check if hash exists and differs, or if hash is missing (force update)
+                if desired_hash is None or current_hash is None or desired_hash != current_hash:
+                    if desired_hash != current_hash:
+                        logger.debug(
+                            f"Content hash changed for ID {item_id}. Scheduling for update."
+                        )
+                    else:
+                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
+                    items_to_update.append(desired_state[item_id])
+                # Else: hashes match, no update needed
 
             items_to_add = [desired_state[id_] for id_ in ids_to_add]
-            items_to_index = items_to_add + items_to_update # Combine adds and updates for single index call
+            items_to_index = (
+                items_to_add + items_to_update
+            )  # Combine adds and updates for single index call
 
-            summary['added'] = len(items_to_add)
-            summary['updated'] = len(items_to_update)
-            summary['deleted'] = len(ids_to_delete)
+            summary["added"] = len(items_to_add)
+            summary["updated"] = len(items_to_update)
+            summary["deleted"] = len(ids_to_delete)
 
-            logger.info(f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}")
+            logger.info(
+                f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}"
+            )
 
             # 2c. Execute Actions (if not dry_run)
             if not dry_run:
-                # Execute Deletes
-                if ids_to_delete:
-                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
-                    try:
-                        # Assuming delete_documents takes list of IDs
-                        # Implement batching if needed
-                        self._search_service.delete_documents(ids=list(ids_to_delete))
-                        logger.info("Deletion successful.")
-                    except Exception as e:
-                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
-                        # Decide whether to continue or raise
-                        raise RuntimeError("Failed during deletion phase of sync.") from e
-
-                # Execute Adds/Updates
-                if items_to_index:
-                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
-                    try:
-                        # Upsert logic handled by service's index method with force_reindex=False
-                        self._search_service.index(
-                            documents=items_to_index,
-                            force_reindex=False,
-                            embedder_device=embedder_device
-                        )
-                        logger.info("Add/Update successful.")
-                    except Exception as e:
-                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
-                        raise RuntimeError("Failed during add/update phase of sync.") from e
-                logger.info("Sync actions completed.")
+                # Execute Deletes
+                if ids_to_delete:
+                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
+                    try:
+                        # Assuming delete_documents takes list of IDs
+                        # Implement batching if needed
+                        self._search_service.delete_documents(ids=list(ids_to_delete))
+                        logger.info("Deletion successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
+                        # Decide whether to continue or raise
+                        raise RuntimeError("Failed during deletion phase of sync.") from e
+
+                # Execute Adds/Updates
+                if items_to_index:
+                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
+                    try:
+                        # Upsert logic handled by service's index method with force_reindex=False
+                        self._search_service.index(
+                            documents=items_to_index,
+                            force_reindex=False,
+                            embedder_device=embedder_device,
+                        )
+                        logger.info("Add/Update successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
+                        raise RuntimeError("Failed during add/update phase of sync.") from e
+                logger.info("Sync actions completed.")
             else:
-                logger.info("[Dry Run] No changes applied to the index.")
+                logger.info("[Dry Run] No changes applied to the index.")
 
         else:
             raise ValueError(f"Unknown sync strategy: '{strategy}'. Use 'full' or 'upsert_only'.")
 
-        return summary
+        return summary
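Since `sync_index` is new in this release, a usage sketch helps; same hypothetical `collection` as above. Note that the `'full'` strategy requires the configured service to expose `list_documents` and `delete_documents`, otherwise `NotImplementedError` is raised; the counts in the comment are illustrative:

```python
# Preview the add/update/delete plan without touching the index.
plan = collection.sync_index(strategy="full", dry_run=True)
print(plan)  # e.g. {'added': 2, 'updated': 1, 'deleted': 0, 'skipped': 0}

# Apply it for real. 'upsert_only' skips the delete pass and simply
# re-indexes everything get_indexable_items() currently yields.
summary = collection.sync_index(strategy="full")
```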
natural_pdf/selectors/__init__.py
@@ -1,4 +1,5 @@
 """
 Selector module for natural-pdf.
 """
-from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+
+from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
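For reference, the two names re-exported here pair up: `parse_selector` parses a CSS-like selector string and `selector_to_filter_func` turns the parsed result into a predicate over elements. A sketch; the selector syntax matches the library's documented style, but the exact return shape of `parse_selector` and the element source in the comment are assumptions:

```python
from natural_pdf.selectors import parse_selector, selector_to_filter_func

parsed = parse_selector('text:contains("Summary")')   # parsed selector structure
is_summary = selector_to_filter_func(parsed)           # callable: element -> bool
# matches = [el for el in page_elements if is_summary(el)]  # hypothetical usage
```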