natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in the supported registries. It is provided for informational purposes only.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ natural_pdf/search/searchable_mixin.py
@@ -0,0 +1,464 @@
+import logging
+from typing import Optional, List, Dict, Any, Iterable, TYPE_CHECKING, Union, Type, Generator
+from abc import ABC, abstractmethod
+import hashlib  # For hashing content
+
+# Now import the flag from the canonical source - this import should always work
+from .haystack_utils import HAS_HAYSTACK_EXTRAS
+DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
+
+# Avoid runtime import errors if extras not installed
+try:
+    # Import protocols and options first
+    from .search_service_protocol import (
+        SearchServiceProtocol, Indexable,
+        IndexConfigurationError, IndexExistsError
+    )
+    from .search_options import SearchOptions, TextSearchOptions
+    from . import get_search_service
+
+    if TYPE_CHECKING:  # Keep type hints working
+        from natural_pdf.elements.region import Region  # Example indexable type
+except ImportError:
+    # Define dummies if extras missing
+    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = object, object, RuntimeError, RuntimeError
+    SearchOptions, TextSearchOptions = object, object
+    DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
+    def get_search_service(**kwargs): raise ImportError("Search dependencies missing.")
+    class Region: pass  # Dummy for type hint
+
+logger = logging.getLogger(__name__)
+
+class SearchableMixin(ABC):
+    """
+    Mixin class providing search functionality (initialization, indexing, searching, syncing).
+
+    Requires the inheriting class to implement `get_indexable_items`.
+    Assumes the inheriting class has a `_search_service` attribute initialized to None.
+    """
+    # Ensure inheriting class initializes this
+    _search_service: Optional[SearchServiceProtocol] = None
+
+    @abstractmethod
+    def get_indexable_items(self) -> Iterable[Indexable]:
+        """
+        Abstract method that must be implemented by the inheriting class.
+        Should yield or return an iterable of objects conforming to the Indexable protocol.
+        """
+        pass
+
+    def init_search(
+        self,
+        service: Optional[SearchServiceProtocol] = None,
+        *,
+        persist: Optional[bool] = None,
+        collection_name: Optional[str] = None,
+        embedding_model: Optional[str] = None,  # Allow overriding embedding model
+        index: bool = False,  # Changed from index_now
+        force_reindex: bool = False,
+        embedder_device: Optional[str] = None,
+        **kwargs  # Pass other args to get_search_service
+    ) -> 'SearchableMixin':  # Return self for chaining
+        """
+        Initializes and configures the search service for this instance.
+
+        Call this explicitly before `index_for_search`, `sync_index`, or `find_relevant`
+        if using non-default settings (e.g., persistence) or attaching an
+        existing service instance.
+
+        Args:
+            service: An optional pre-configured SearchServiceProtocol instance.
+                If provided, attaches this service directly, ignoring other
+                configuration arguments (persist, collection_name, etc.).
+            persist: If creating a new service (service=None), determines if it should
+                use persistent storage (True) or be in-memory (False/None).
+                Defaults to False.
+            collection_name: If creating a new service, the name for the index/collection.
+                Required if persist=True. Defaults to 'default_collection'
+                if persist=False.
+            embedding_model: If creating a new service, override the default embedding model.
+            index: If True, immediately indexes the collection's documents using the
+                configured service after setup. Calls `_perform_indexing`. Defaults to False.
+            force_reindex: If index=True, instructs the service to delete any existing
+                index before indexing. Defaults to False.
+            embedder_device: If index=True, optional device override for the embedder.
+            **kwargs: Additional keyword arguments passed to get_search_service when creating
+                a new service instance.
+
+        Returns:
+            Self for method chaining.
+        """
+        if service:
+            # Attach provided service
+            logger.info(f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}').")
+            # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
+            self._search_service = service
+        else:
+            # Create new service
+            effective_persist = persist if persist is not None else False
+            effective_collection_name = collection_name
+            if effective_persist and not effective_collection_name:
+                raise ValueError("A collection_name must be provided when persist=True.")
+            elif not effective_persist and not effective_collection_name:
+                effective_collection_name = DEFAULT_SEARCH_COLLECTION_NAME
+                logger.info(f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service.")
+
+            logger.info(f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}")
+            try:
+                service_args = {"collection_name": effective_collection_name, "persist": effective_persist, **kwargs}
+                if embedding_model: service_args['embedding_model'] = embedding_model
+                self._search_service = get_search_service(**service_args)
+            except Exception as e:
+                logger.error(f"Failed to create SearchService: {e}", exc_info=True)
+                raise RuntimeError("Could not create SearchService instance.") from e
+
+        # --- Optional Immediate Indexing (with safety check for persistent) ---
+        if index:
+            if not self._search_service:  # Should not happen if logic above is correct
+                raise RuntimeError("Cannot index: Search service not available after initialization attempt.")
+
+            is_persistent = getattr(self._search_service, '_persist', False)  # Check if service is persistent
+            collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+
+            if is_persistent and not force_reindex:
+                # Check existence only if persistent and not forcing reindex
+                if self._search_service.index_exists():
+                    # Raise safety error if index exists and force_reindex is not True
+                    raise IndexExistsError(
+                        f"Persistent index '{collection_name}' already exists. "
+                        f"To overwrite/re-index via init_search(index=True), explicitly set force_reindex=True. "
+                        f"Alternatively, use index_for_search() or sync_index() for more granular control."
+                    )
+                else:
+                    # Index doesn't exist, safe to proceed
+                    logger.info(f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing.")
+            elif is_persistent and force_reindex:
+                logger.warning(f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted.")
+            # else: # Not persistent, safe to proceed without existence check
+            #     logger.debug("Proceeding with index=True for non-persistent index.")
+
+            # Proceed with indexing if checks passed or not applicable
+            logger.info(f"index=True: Proceeding to index collection immediately after search initialization.")
+            self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
+
+        return self
+
+    def _perform_indexing(self, force_reindex: bool, embedder_device: Optional[str]):
+        """Internal helper containing the core indexing logic."""
+        if not self._search_service:
+            raise RuntimeError("Search service not initialized. Call init_search first.")
+
+        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+        logger.info(f"Starting internal indexing process into SearchService collection '{collection_name}'...")
+
+        # Use the abstract method to get items
+        try:
+            indexable_items = list(self.get_indexable_items())  # Consume iterator
+        except Exception as e:
+            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+
+        if not indexable_items:
+            logger.warning("No indexable items provided by get_indexable_items(). Skipping index call.")
+            return
+
+        logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
+        try:
+            logger.debug(f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex}).")
+            self._search_service.index(
+                documents=indexable_items,
+                embedder_device=embedder_device,
+                force_reindex=force_reindex,
+            )
+            logger.info(f"Successfully completed indexing into SearchService collection '{collection_name}'.")
+        except IndexConfigurationError as ice:
+            logger.error(f"Indexing failed due to configuration error in collection '{collection_name}': {ice}", exc_info=True)
+            raise  # Re-raise specific error
+        except Exception as e:  # Catch other indexing errors from the service
+            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
+            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+
+
+    def index_for_search(
+        self, *,  # Make args keyword-only
+        embedder_device: Optional[str] = None,
+        force_reindex: bool = False,
+    ) -> 'SearchableMixin':
+        """
+        Ensures the search service is initialized (using default if needed)
+        and indexes the items provided by `get_indexable_items`.
+
+        If the search service hasn't been configured via `init_search`, this
+        method will initialize the default in-memory service.
+
+        Args:
+            embedder_device: Optional device override for the embedder.
+            force_reindex: If True, instructs the service to delete any existing
+                index before indexing.
+
+        Returns:
+            Self for method chaining.
+        """
+        # --- Ensure Service is Initialized (Use Default if Needed) ---
+        if not self._search_service:
+            logger.info("Search service not initialized prior to index_for_search. Initializing default in-memory service.")
+            self.init_search()  # Call init with defaults
+
+        # --- Perform Indexing ---
+        self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
+        return self
+
+    def find_relevant(
+        self,
+        query: Any,  # Query type depends on service capabilities
+        *,  # Make options/service keyword-only
+        options: Optional[SearchOptions] = None,
+        search_service: Optional[SearchServiceProtocol] = None  # Allow override
+    ) -> List[Dict[str, Any]]:
+        """
+        Finds relevant items using the configured or provided search service.
+
+        Args:
+            query: The search query (text, image path, PIL Image, Region, etc.).
+                The SearchService implementation handles the specific query type.
+            options: Optional SearchOptions to configure the query (top_k, filters, etc.).
+            search_service: Optional specific SearchService instance to use for this query,
+                overriding the collection's configured service.
+
+        Returns:
+            A list of result dictionaries, sorted by relevance.
+
+        Raises:
+            RuntimeError: If no search service is configured or provided, or if search fails.
+            FileNotFoundError: If the collection managed by the service does not exist.
+        """
+        # --- Determine which Search Service to use ---
+        effective_service = search_service or self._search_service
+        if not effective_service:
+            raise RuntimeError(
+                "Search service not configured. Call init_search(...) or index_for_search() first, "
+                "or provide an explicit 'search_service' instance to find_relevant()."
+            )
+
+        collection_name = getattr(effective_service, 'collection_name', '<Unknown>')
+        logger.info(f"Searching collection '{collection_name}' via {type(effective_service).__name__}...")
+
+        # --- Prepare Query and Options ---
+        query_input = query
+        # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
+        # If we keep it here, it makes the mixin less generic.
+        # Let's assume the SearchService handles the query type appropriately for now.
+        # if isinstance(query, Region):
+        #     logger.debug("Query is a Region object. Extracting text.")
+        #     query_input = query.extract_text()
+        #     if not query_input or query_input.isspace():
+        #         logger.warning("Region provided for query has no extractable text.")
+        #         return []
+
+        effective_options = options if options is not None else TextSearchOptions()
+
+        # --- Call SearchService Search Method ---
+        try:
+            results = effective_service.search(
+                query=query_input,
+                options=effective_options,
+            )
+            logger.info(f"SearchService returned {len(results)} results from collection '{collection_name}'.")
+            return results
+        except FileNotFoundError as fnf:
+            logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
+            raise  # Re-raise specific error
+        except Exception as e:
+            logger.error(f"Search failed for collection '{collection_name}': {e}", exc_info=True)
+            # Consider wrapping in a SearchError?
+            raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e
+
+    # --- NEW Sync Method ---
+    def sync_index(
+        self,
+        strategy: str = 'full',  # 'full' (add/update/delete) or 'upsert_only'
+        dry_run: bool = False,
+        batch_size: int = 100,  # For batching deletes/updates if needed
+        embedder_device: Optional[str] = None,  # Pass embedder device if needed for updates
+        **kwargs: Any  # Allow passing extra args to get_search_service
+    ) -> Dict[str, int]:
+        """
+        Synchronizes the search index with the current state of indexable items.
+        Requires the configured search service to implement `list_documents`
+        and `delete_documents` for the 'full' strategy.
+        Requires `Indexable` items to implement `get_content_hash` for 'full' strategy
+        change detection (falls back to ID-based update if hash is missing).
+
+        Args:
+            strategy: 'full' (Default): Adds new, updates changed (based on hash),
+                and deletes items no longer present.
+                'upsert_only': Adds new items and updates existing ones (based on ID),
+                but does not delete missing items. (Effectively like force_reindex=False index)
+            dry_run: If True, calculates changes but does not modify the index.
+            batch_size: Hint for batching delete/update operations (service implementation specific).
+            embedder_device: Optional device for embedding during updates if needed by service.
+            **kwargs: Additional keyword arguments passed to get_search_service when creating
+                a new service instance.
+
+        Returns:
+            A dictionary summarizing the changes (e.g., {'added': N, 'updated': M, 'deleted': K, 'skipped': S}).
+
+        Raises:
+            RuntimeError: For backend errors during synchronization.
+        """
+        if not self._search_service:
+            raise RuntimeError("Search service not configured. Call init_search first.")
+
+        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+        logger.info(f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})...")
+        summary = {'added': 0, 'updated': 0, 'deleted': 0, 'skipped': 0}
+
+        # --- Check Service Capabilities for 'full' sync ---
+        if strategy == 'full':
+            required_methods = ['list_documents', 'delete_documents']
+            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
+            if missing_methods:
+                raise NotImplementedError(
+                    f"The configured search service ({type(self._search_service).__name__}) "
+                    f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
+                )
+
+        # --- 1. Get Desired State (from current collection) ---
+        desired_state: Dict[str, Indexable] = {}  # {id: item}
+        desired_hashes: Dict[str, Optional[str]] = {}  # {id: hash or None}
+        try:
+            for item in self.get_indexable_items():
+                item_id = item.get_id()
+                if not item_id:
+                    logger.warning(f"Skipping item with no ID: {item}")
+                    summary['skipped'] += 1
+                    continue
+                if item_id in desired_state:
+                    logger.warning(f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item.")
+                    summary['skipped'] += 1
+                    continue
+                desired_state[item_id] = item
+                # Try to get hash, store None if unavailable or fails
+                try:
+                    desired_hashes[item_id] = item.get_content_hash()
+                except (AttributeError, NotImplementedError):
+                    logger.debug(f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based.")
+                    desired_hashes[item_id] = None
+                except Exception as e:
+                    logger.warning(f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.", exc_info=False)
+                    desired_hashes[item_id] = None
+
+        except Exception as e:
+            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to get current indexable items.") from e
+
+        logger.info(f"Desired state contains {len(desired_state)} indexable items.")
+
+        # --- 2. Handle Different Strategies ---
+        if strategy == 'upsert_only':
+            # Simple case: just index everything, let the service handle upserts
+            items_to_index = list(desired_state.values())
+            summary['added'] = len(items_to_index)  # Approximate count
+            logger.info(f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting.")
+            if not dry_run and items_to_index:
+                logger.debug("Calling service.index for upsert...")
+                # Call index directly, force_reindex=False implies upsert
+                self._search_service.index(
+                    documents=items_to_index,
+                    force_reindex=False,
+                    embedder_device=embedder_device
+                )
+            elif dry_run:
+                logger.info("[Dry Run] Would index/upsert %d items.", len(items_to_index))
+
+        elif strategy == 'full':
+            # Complex case: Add/Update/Delete
+            # 2a. Get Current Index State
+            try:
+                logger.debug("Listing documents currently in the index...")
+                # Assumes list_documents takes filters and include_metadata
+                # Fetch all documents with metadata
+                current_docs = self._search_service.list_documents(include_metadata=True)
+                current_state: Dict[str, Dict] = {}  # {id: {'meta': {...}, ...}}
+                duplicates = 0
+                for doc in current_docs:
+                    doc_id = doc.get('id')
+                    if not doc_id: continue  # Skip docs without ID from service
+                    if doc_id in current_state: duplicates += 1
+                    current_state[doc_id] = doc
+                logger.info(f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs).")
+                if duplicates > 0: logger.warning(f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison.")
+
+            except Exception as e:
+                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
+                raise RuntimeError("Could not retrieve current index state for sync.") from e
+
+            # 2b. Compare States and Plan Actions
+            ids_in_desired = set(desired_state.keys())
+            ids_in_current = set(current_state.keys())
+
+            ids_to_add = ids_in_desired - ids_in_current
+            ids_to_delete = ids_in_current - ids_in_desired
+            ids_to_check_update = ids_in_desired.intersection(ids_in_current)
+
+            items_to_update = []
+            for item_id in ids_to_check_update:
+                desired_hash = desired_hashes.get(item_id)
+                current_meta = current_state[item_id].get('meta', {})
+                current_hash = current_meta.get('content_hash')  # Assuming hash stored in meta
+
+                # Check if hash exists and differs, or if hash is missing (force update)
+                if desired_hash is None or current_hash is None or desired_hash != current_hash:
+                    if desired_hash != current_hash:
+                        logger.debug(f"Content hash changed for ID {item_id}. Scheduling for update.")
+                    else:
+                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
+                    items_to_update.append(desired_state[item_id])
+                # Else: hashes match, no update needed
+
+            items_to_add = [desired_state[id_] for id_ in ids_to_add]
+            items_to_index = items_to_add + items_to_update  # Combine adds and updates for single index call
+
+            summary['added'] = len(items_to_add)
+            summary['updated'] = len(items_to_update)
+            summary['deleted'] = len(ids_to_delete)
+
+            logger.info(f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}")
+
+            # 2c. Execute Actions (if not dry_run)
+            if not dry_run:
+                # Execute Deletes
+                if ids_to_delete:
+                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
+                    try:
+                        # Assuming delete_documents takes list of IDs
+                        # Implement batching if needed
+                        self._search_service.delete_documents(ids=list(ids_to_delete))
+                        logger.info("Deletion successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
+                        # Decide whether to continue or raise
+                        raise RuntimeError("Failed during deletion phase of sync.") from e
+
+                # Execute Adds/Updates
+                if items_to_index:
+                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
+                    try:
+                        # Upsert logic handled by service's index method with force_reindex=False
+                        self._search_service.index(
+                            documents=items_to_index,
+                            force_reindex=False,
+                            embedder_device=embedder_device
+                        )
+                        logger.info("Add/Update successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
+                        raise RuntimeError("Failed during add/update phase of sync.") from e
+                logger.info("Sync actions completed.")
+            else:
+                logger.info("[Dry Run] No changes applied to the index.")
+
+        else:
+            raise ValueError(f"Unknown sync strategy: '{strategy}'. Use 'full' or 'upsert_only'.")
+
+        return summary
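The mixin above defines the release's search workflow: implement `get_indexable_items`, configure a service with `init_search`, populate it with `index_for_search` (or `init_search(index=True)`), query with `find_relevant`, and reconcile with `sync_index`. Below is a minimal sketch of an adopter. `Note` and `NoteCollection` are hypothetical, and it assumes the `haystack` extra is installed so `get_search_service` resolves:

```python
# Hypothetical adopter of SearchableMixin -- a sketch, not natural-pdf API.
import hashlib
from typing import Iterable

from natural_pdf.search.searchable_mixin import SearchableMixin


class Note:
    """Hypothetical item exposing the accessors the mixin relies on."""

    def __init__(self, note_id: str, text: str):
        self._id, self._text = note_id, text

    def get_id(self) -> str:
        return self._id

    def get_content_hash(self) -> str:
        # Drives change detection for sync_index(strategy='full')
        return hashlib.sha256(self._text.encode()).hexdigest()


class NoteCollection(SearchableMixin):
    def __init__(self, notes: Iterable[Note]):
        self._search_service = None  # the mixin expects this attribute
        self._notes = list(notes)

    def get_indexable_items(self) -> Iterable[Note]:
        return self._notes


notes = NoteCollection([Note("n1", "quarterly budget summary")])
notes.init_search(persist=False).index_for_search()      # in-memory index
hits = notes.find_relevant("budget")                     # list of result dicts
# Preview a reconciliation (needs list/delete support in the service):
plan = notes.sync_index(strategy="full", dry_run=True)   # e.g. {'added': 0, ...}
```

Whether the `Indexable` protocol expects exactly `get_id`/`get_content_hash` plus a content accessor is defined in `search_service_protocol.py` (also new in this release, not shown in this diff).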
--- /dev/null
+++ natural_pdf-0.1.3.dist-info/METADATA
@@ -0,0 +1,137 @@
+Metadata-Version: 2.4
+Name: natural-pdf
+Version: 0.1.3
+Summary: A more intuitive interface for working with PDFs
+Author-email: Jonathan Soma <jonathan.soma@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/jsoma/natural-pdf
+Project-URL: Repository, https://github.com/jsoma/natural-pdf
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pdfplumber>=0.7.0
+Requires-Dist: Pillow>=8.0.0
+Requires-Dist: colour>=0.1.5
+Requires-Dist: numpy>=1.20.0
+Requires-Dist: urllib3>=1.26.0
+Requires-Dist: torch>=2.0.0
+Requires-Dist: torchvision>=0.15.0
+Requires-Dist: transformers>=4.30.0
+Requires-Dist: huggingface_hub>=0.19.0
+Requires-Dist: ocrmypdf>=16.0.0
+Requires-Dist: pikepdf>=10.0.0
+Provides-Extra: interactive
+Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
+Provides-Extra: haystack
+Requires-Dist: haystack-ai>=2.0.0b5; extra == "haystack"
+Requires-Dist: chroma-haystack; extra == "haystack"
+Requires-Dist: sentence-transformers; extra == "haystack"
+Provides-Extra: easyocr
+Requires-Dist: easyocr; extra == "easyocr"
+Provides-Extra: paddle
+Requires-Dist: paddlepaddle; extra == "paddle"
+Requires-Dist: paddleocr; extra == "paddle"
+Provides-Extra: layout-yolo
+Requires-Dist: doclayout_yolo; extra == "layout-yolo"
+Provides-Extra: surya
+Requires-Dist: surya-ocr; extra == "surya"
+Provides-Extra: qa
+Provides-Extra: all
+Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
+Requires-Dist: easyocr; extra == "all"
+Requires-Dist: paddlepaddle; extra == "all"
+Requires-Dist: paddleocr; extra == "all"
+Requires-Dist: doclayout_yolo; extra == "all"
+Requires-Dist: surya-ocr; extra == "all"
+Requires-Dist: haystack-ai>=2.0.0b5; extra == "all"
+Requires-Dist: chroma-haystack; extra == "all"
+Requires-Dist: sentence-transformers; extra == "all"
+Dynamic: license-file
+
+# Natural PDF
+
+A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
+
+Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
+
+- [Complete documentation here](https://jsoma.github.io/natural-pdf)
+- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
+
+<div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
+
+## Installation
+
+```bash
+pip install natural-pdf
+```
+
+For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
+
+```bash
+# Example: Install with EasyOCR support
+pip install natural-pdf[easyocr]
+pip install natural-pdf[surya]
+pip install natural-pdf[paddle]
+
+# Example: Install with interactive viewer support
+pip install natural-pdf[interactive]
+
+# Example: Install with semantic search support (Haystack)
+pip install natural-pdf[haystack]
+
+# Install everything
+pip install natural-pdf[all]
+```
+
+See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
+
+## Quick Start
+
+```python
+from natural_pdf import PDF
+
+# Open a PDF
+pdf = PDF('document.pdf')
+page = pdf.pages[0]
+
+# Find elements using CSS-like selectors
+heading = page.find('text:contains("Summary"):bold')
+
+# Extract content below the heading
+content = heading.below().extract_text()
+print("Content below Summary:", content[:100] + "...")
+
+# Exclude headers/footers automatically (example)
+# You might define these based on common text or position
+page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
+page.add_exclusion(page.find_all('line')[-1].below())
+
+# Extract clean text from the page
+clean_text = page.extract_text()
+print("\nClean page text:", clean_text[:200] + "...")
+
+# Highlight the heading and view the page
+heading.highlight(color='red')
+page.to_image()
+```
+
+And as a fun bonus, `page.viewer()` will provide an interactive method to explore the PDF.
+
+## Key Features
+
+Natural PDF offers a range of features for working with PDFs:
+
+* **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
+* **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
+* **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
+* **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
+* **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
+* **Document QA:** Ask natural language questions about your document's content.
+* **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
+* **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
+
+## Learn More
+
+Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
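The headline addition in 0.1.3 is the semantic search stack shown in the hunks above; two hedged sketches of how it fits together follow. First, the search features are gated behind the `haystack` extra, and `searchable_mixin.py` imports an availability flag from `natural_pdf.search.haystack_utils` (described there as the canonical source, importable even without the extras), so a runtime capability check might look like:

```python
# Sketch: probe for the optional Haystack dependencies before using search.
# The flag name and module path come from the searchable_mixin.py hunk above.
from natural_pdf.search.haystack_utils import HAS_HAYSTACK_EXTRAS

if HAS_HAYSTACK_EXTRAS:
    print("Semantic search dependencies are available.")
else:
    print("Install them with: pip install 'natural-pdf[haystack]'")
```

Second, the README's Semantic Search bullet presumably maps onto `SearchableMixin`'s workflow. This sketch assumes `PDFCollection` (added in `natural_pdf/collections/pdf_collection.py` in this release) adopts the mixin and is importable from the package root; neither assumption is confirmed by the diff text shown here, so consult the documentation for the final API:

```python
# A sketch under stated assumptions, not a confirmed API: PDFCollection is
# assumed to mix in SearchableMixin and to accept a list of PDF paths.
from natural_pdf import PDFCollection

collection = PDFCollection(["report-2022.pdf", "report-2023.pdf"])  # hypothetical files
collection.init_search(persist=False, index=True)  # build an in-memory index

for result in collection.find_relevant("pension liabilities"):
    print(result)  # result dictionaries, sorted by relevance
```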