natural-pdf 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ import logging
2
+ from typing import Optional, List, Dict, Any, Iterable, TYPE_CHECKING, Union, Type, Generator
3
+ from abc import ABC, abstractmethod
4
+ import hashlib # For hashing content
5
+
6
+ # Now import the flag from the canonical source - this import should always work
7
+ from .haystack_utils import HAS_HAYSTACK_EXTRAS
8
# Default collection name used when an in-memory service is created without an
# explicit name. Defined unconditionally so it survives the import fallback below.
DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"

# Avoid runtime import errors if extras not installed
try:
    # Import protocols and options first
    from .search_service_protocol import (
        SearchServiceProtocol, Indexable,
        IndexConfigurationError, IndexExistsError
    )
    from .search_options import SearchOptions, TextSearchOptions
    from . import get_search_service

    if TYPE_CHECKING:  # Keep type hints working
        from natural_pdf.elements.region import Region  # Example indexable type
except ImportError:
    # Define dummies if extras missing so annotations and isinstance-style
    # references elsewhere in the module do not raise NameError.
    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = object, object, RuntimeError, RuntimeError
    SearchOptions, TextSearchOptions = object, object
    # NOTE: DEFAULT_SEARCH_COLLECTION_NAME is intentionally NOT re-assigned here;
    # it is already defined above and a failed import cannot unset it.

    def get_search_service(**kwargs):
        """Placeholder that fails loudly when search extras are not installed."""
        raise ImportError("Search dependencies missing.")

    class Region:
        """Dummy stand-in for natural_pdf.elements.region.Region (type hints only)."""
        pass

logger = logging.getLogger(__name__)
31
+
32
class SearchableMixin(ABC):
    """
    Mixin class providing search functionality (initialization, indexing, searching, syncing).

    Requires the inheriting class to implement `get_indexable_items`.
    Assumes the inheriting class has a `_search_service` attribute initialized to None.
    """
    # Ensure inheriting class initializes this
    _search_service: Optional[SearchServiceProtocol] = None

    @abstractmethod
    def get_indexable_items(self) -> Iterable[Indexable]:
        """
        Abstract method that must be implemented by the inheriting class.
        Should yield or return an iterable of objects conforming to the Indexable protocol.
        """
        pass

    def init_search(
        self,
        service: Optional[SearchServiceProtocol] = None,
        *,
        persist: Optional[bool] = None,
        collection_name: Optional[str] = None,
        embedding_model: Optional[str] = None, # Allow overriding embedding model
        index: bool = False, # Changed from index_now
        force_reindex: bool = False,
        embedder_device: Optional[str] = None,
        **kwargs # Pass other args to get_search_service
    ) -> 'SearchableMixin': # Return self for chaining
        """
        Initializes and configures the search service for this instance.

        Call this explicitly before `index_for_search`, `sync_index`, or `find_relevant`
        if using non-default settings (e.g., persistence) or attaching an
        existing service instance.

        Args:
            service: An optional pre-configured SearchServiceProtocol instance.
                     If provided, attaches this service directly, ignoring other
                     configuration arguments (persist, collection_name, etc.).
            persist: If creating a new service (service=None), determines if it should
                     use persistent storage (True) or be in-memory (False/None).
                     Defaults to False.
            collection_name: If creating a new service, the name for the index/collection.
                             Required if persist=True. Defaults to 'default_collection'
                             if persist=False.
            embedding_model: If creating a new service, override the default embedding model.
            index: If True, immediately indexes the collection's documents using the
                   configured service after setup. Calls `_perform_indexing`. Defaults to False.
            force_reindex: If index=True, instructs the service to delete any existing
                           index before indexing. Defaults to False.
            embedder_device: If index=True, optional device override for the embedder.
            **kwargs: Additional keyword arguments passed to get_search_service when creating
                      a new service instance.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If persist=True but no collection_name is given.
            RuntimeError: If a new SearchService instance cannot be created.
            IndexExistsError: If index=True against an existing persistent index
                              without force_reindex=True (safety guard).
        """
        if service:
            # Attach provided service
            logger.info(f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}').")
            # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
            self._search_service = service
        else:
            # Create new service
            effective_persist = persist if persist is not None else False
            effective_collection_name = collection_name
            if effective_persist and not effective_collection_name:
                raise ValueError("A collection_name must be provided when persist=True.")
            elif not effective_persist and not effective_collection_name:
                effective_collection_name = DEFAULT_SEARCH_COLLECTION_NAME
                logger.info(f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service.")

            logger.info(f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}")
            try:
                service_args = {"collection_name": effective_collection_name, "persist": effective_persist, **kwargs}
                # Only forward embedding_model when explicitly provided, so the
                # factory's own default remains in effect otherwise.
                if embedding_model: service_args['embedding_model'] = embedding_model
                self._search_service = get_search_service(**service_args)
            except Exception as e:
                logger.error(f"Failed to create SearchService: {e}", exc_info=True)
                raise RuntimeError("Could not create SearchService instance.") from e

        # --- Optional Immediate Indexing (with safety check for persistent) ---
        if index:
            if not self._search_service: # Should not happen if logic above is correct
                raise RuntimeError("Cannot index: Search service not available after initialization attempt.")

            # NOTE(review): relies on the service's private '_persist' attribute;
            # falls back to False (treated as in-memory) when absent.
            is_persistent = getattr(self._search_service, '_persist', False) # Check if service is persistent
            collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')

            if is_persistent and not force_reindex:
                # Check existence only if persistent and not forcing reindex
                if self._search_service.index_exists():
                    # Raise safety error if index exists and force_reindex is not True
                    raise IndexExistsError(
                        f"Persistent index '{collection_name}' already exists. "
                        f"To overwrite/re-index via init_search(index=True), explicitly set force_reindex=True. "
                        f"Alternatively, use index_for_search() or sync_index() for more granular control."
                    )
                else:
                    # Index doesn't exist, safe to proceed
                    logger.info(f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing.")
            elif is_persistent and force_reindex:
                logger.warning(f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted.")
            # else: # Not persistent, safe to proceed without existence check
            #     logger.debug("Proceeding with index=True for non-persistent index.")

            # Proceed with indexing if checks passed or not applicable
            logger.info(f"index=True: Proceeding to index collection immediately after search initialization.")
            self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)

        return self

    def _perform_indexing(self, force_reindex: bool, embedder_device: Optional[str]) -> None:
        """Internal helper containing the core indexing logic.

        Materializes `get_indexable_items()` and hands the items to the
        configured service's `index()` method. Raises RuntimeError if the
        service is missing, item retrieval fails, or indexing fails; re-raises
        IndexConfigurationError unchanged so callers can distinguish it.
        """
        if not self._search_service:
            raise RuntimeError("Search service not initialized. Call init_search first.")

        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
        logger.info(f"Starting internal indexing process into SearchService collection '{collection_name}'...")

        # Use the abstract method to get items
        try:
            indexable_items = list(self.get_indexable_items()) # Consume iterator
        except Exception as e:
            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e

        if not indexable_items:
            logger.warning("No indexable items provided by get_indexable_items(). Skipping index call.")
            return

        logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
        try:
            logger.debug(f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex}).")
            self._search_service.index(
                documents=indexable_items,
                embedder_device=embedder_device,
                force_reindex=force_reindex,
            )
            logger.info(f"Successfully completed indexing into SearchService collection '{collection_name}'.")
        except IndexConfigurationError as ice:
            logger.error(f"Indexing failed due to configuration error in collection '{collection_name}': {ice}", exc_info=True)
            raise # Re-raise specific error
        except Exception as e: # Catch other indexing errors from the service
            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e


    def index_for_search(
        self, *, # Make args keyword-only
        embedder_device: Optional[str] = None,
        force_reindex: bool = False,
    ) -> 'SearchableMixin':
        """
        Ensures the search service is initialized (using default if needed)
        and indexes the items provided by `get_indexable_items`.

        If the search service hasn't been configured via `init_search`, this
        method will initialize the default in-memory service.

        Args:
            embedder_device: Optional device override for the embedder.
            force_reindex: If True, instructs the service to delete any existing
                           index before indexing.

        Returns:
            Self for method chaining.
        """
        # --- Ensure Service is Initialized (Use Default if Needed) ---
        if not self._search_service:
            logger.info("Search service not initialized prior to index_for_search. Initializing default in-memory service.")
            self.init_search() # Call init with defaults

        # --- Perform Indexing ---
        self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
        return self

    def find_relevant(
        self,
        query: Any, # Query type depends on service capabilities
        *, # Make options/service keyword-only
        options: Optional[SearchOptions] = None,
        search_service: Optional[SearchServiceProtocol] = None # Allow override
    ) -> List[Dict[str, Any]]:
        """
        Finds relevant items using the configured or provided search service.

        Args:
            query: The search query (text, image path, PIL Image, Region, etc.).
                   The SearchService implementation handles the specific query type.
            options: Optional SearchOptions to configure the query (top_k, filters, etc.).
                     Defaults to a plain TextSearchOptions when omitted.
            search_service: Optional specific SearchService instance to use for this query,
                            overriding the collection's configured service.

        Returns:
            A list of result dictionaries, sorted by relevance.

        Raises:
            RuntimeError: If no search service is configured or provided, or if search fails.
            FileNotFoundError: If the collection managed by the service does not exist.
        """
        # --- Determine which Search Service to use ---
        effective_service = search_service or self._search_service
        if not effective_service:
            raise RuntimeError(
                "Search service not configured. Call init_search(...) or index_for_search() first, "
                "or provide an explicit 'search_service' instance to find_relevant()."
            )

        collection_name = getattr(effective_service, 'collection_name', '<Unknown>')
        logger.info(f"Searching collection '{collection_name}' via {type(effective_service).__name__}...")

        # --- Prepare Query and Options ---
        query_input = query
        # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
        # If we keep it here, it makes the mixin less generic.
        # Let's assume the SearchService handles the query type appropriately for now.
        # if isinstance(query, Region):
        #     logger.debug("Query is a Region object. Extracting text.")
        #     query_input = query.extract_text()
        #     if not query_input or query_input.isspace():
        #         logger.warning("Region provided for query has no extractable text.")
        #         return []

        effective_options = options if options is not None else TextSearchOptions()

        # --- Call SearchService Search Method ---
        try:
            results = effective_service.search(
                query=query_input,
                options=effective_options,
            )
            logger.info(f"SearchService returned {len(results)} results from collection '{collection_name}'.")
            return results
        except FileNotFoundError as fnf:
            logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
            raise # Re-raise specific error
        except Exception as e:
            logger.error(f"Search failed for collection '{collection_name}': {e}", exc_info=True)
            # Consider wrapping in a SearchError?
            raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e

    # --- NEW Sync Method ---
    def sync_index(
        self,
        strategy: str = 'full', # 'full' (add/update/delete) or 'upsert_only'
        dry_run: bool = False,
        batch_size: int = 100, # For batching deletes/updates if needed (currently a hint; not used directly here)
        embedder_device: Optional[str] = None, # Pass embedder device if needed for updates
        **kwargs: Any # Allow passing extra args to get_search_service
    ) -> Dict[str, int]:
        """
        Synchronizes the search index with the current state of indexable items.
        Requires the configured search service to implement `list_documents`
        and `delete_documents` for the 'full' strategy.
        Requires `Indexable` items to implement `get_content_hash` for 'full' strategy
        change detection (falls back to ID-based update if hash is missing).

        Args:
            strategy: 'full' (Default): Adds new, updates changed (based on hash),
                      and deletes items no longer present.
                      'upsert_only': Adds new items and updates existing ones (based on ID),
                      but does not delete missing items. (Effectively like force_reindex=False index)
            dry_run: If True, calculates changes but does not modify the index.
            batch_size: Hint for batching delete/update operations (service implementation specific).
            embedder_device: Optional device for embedding during updates if needed by service.
            **kwargs: Additional keyword arguments passed to get_search_service when creating
                      a new service instance.

        Returns:
            A dictionary summarizing the changes (e.g., {'added': N, 'updated': M, 'deleted': K, 'skipped': S}).

        Raises:
            RuntimeError: For backend errors during synchronization.
            NotImplementedError: If the service lacks methods required by 'full'.
            ValueError: If `strategy` is not 'full' or 'upsert_only'.
        """
        if not self._search_service:
            raise RuntimeError("Search service not configured. Call init_search first.")

        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
        logger.info(f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})...")
        summary = {'added': 0, 'updated': 0, 'deleted': 0, 'skipped': 0}

        # --- Check Service Capabilities for 'full' sync ---
        if strategy == 'full':
            required_methods = ['list_documents', 'delete_documents']
            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
            if missing_methods:
                raise NotImplementedError(
                    f"The configured search service ({type(self._search_service).__name__}) "
                    f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
                )

        # --- 1. Get Desired State (from current collection) ---
        desired_state: Dict[str, Indexable] = {} # {id: item}
        desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
        try:
            for item in self.get_indexable_items():
                item_id = item.get_id()
                if not item_id:
                    logger.warning(f"Skipping item with no ID: {item}")
                    summary['skipped'] += 1
                    continue
                if item_id in desired_state:
                    # First occurrence wins; later duplicates are skipped.
                    logger.warning(f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item.")
                    summary['skipped'] += 1
                    continue
                desired_state[item_id] = item
                # Try to get hash, store None if unavailable or fails
                try:
                    desired_hashes[item_id] = item.get_content_hash()
                except (AttributeError, NotImplementedError):
                    logger.debug(f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based.")
                    desired_hashes[item_id] = None
                except Exception as e:
                    logger.warning(f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.", exc_info=False)
                    desired_hashes[item_id] = None

        except Exception as e:
            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
            raise RuntimeError("Failed to get current indexable items.") from e

        logger.info(f"Desired state contains {len(desired_state)} indexable items.")

        # --- 2. Handle Different Strategies ---
        if strategy == 'upsert_only':
            # Simple case: just index everything, let the service handle upserts
            items_to_index = list(desired_state.values())
            summary['added'] = len(items_to_index) # Approximate count
            logger.info(f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting.")
            if not dry_run and items_to_index:
                logger.debug("Calling service.index for upsert...")
                # Call index directly, force_reindex=False implies upsert
                self._search_service.index(
                    documents=items_to_index,
                    force_reindex=False,
                    embedder_device=embedder_device
                )
            elif dry_run:
                logger.info("[Dry Run] Would index/upsert %d items.", len(items_to_index))

        elif strategy == 'full':
            # Complex case: Add/Update/Delete
            # 2a. Get Current Index State
            try:
                logger.debug("Listing documents currently in the index...")
                # Assumes list_documents takes filters and include_metadata
                # Fetch all documents with metadata
                current_docs = self._search_service.list_documents(include_metadata=True)
                current_state: Dict[str, Dict] = {} # {id: {'meta': {...}, ...}}
                duplicates = 0
                for doc in current_docs:
                    doc_id = doc.get('id')
                    if not doc_id: continue # Skip docs without ID from service
                    if doc_id in current_state: duplicates +=1
                    current_state[doc_id] = doc
                logger.info(f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs).")
                if duplicates > 0: logger.warning(f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison.")

            except Exception as e:
                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
                raise RuntimeError("Could not retrieve current index state for sync.") from e

            # 2b. Compare States and Plan Actions
            ids_in_desired = set(desired_state.keys())
            ids_in_current = set(current_state.keys())

            ids_to_add = ids_in_desired - ids_in_current
            ids_to_delete = ids_in_current - ids_in_desired
            ids_to_check_update = ids_in_desired.intersection(ids_in_current)

            items_to_update = []
            for item_id in ids_to_check_update:
                desired_hash = desired_hashes.get(item_id)
                current_meta = current_state[item_id].get('meta', {})
                current_hash = current_meta.get('content_hash') # Assuming hash stored in meta

                # Check if hash exists and differs, or if hash is missing (force update)
                if desired_hash is None or current_hash is None or desired_hash != current_hash:
                    if desired_hash != current_hash:
                        logger.debug(f"Content hash changed for ID {item_id}. Scheduling for update.")
                    else:
                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
                    items_to_update.append(desired_state[item_id])
                # Else: hashes match, no update needed

            items_to_add = [desired_state[id_] for id_ in ids_to_add]
            items_to_index = items_to_add + items_to_update # Combine adds and updates for single index call

            summary['added'] = len(items_to_add)
            summary['updated'] = len(items_to_update)
            summary['deleted'] = len(ids_to_delete)

            logger.info(f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}")

            # 2c. Execute Actions (if not dry_run)
            # NOTE: deletes run before adds/updates so a failed delete aborts
            # the sync without partially re-indexing.
            if not dry_run:
                # Execute Deletes
                if ids_to_delete:
                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
                    try:
                        # Assuming delete_documents takes list of IDs
                        # Implement batching if needed
                        self._search_service.delete_documents(ids=list(ids_to_delete))
                        logger.info("Deletion successful.")
                    except Exception as e:
                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
                        # Decide whether to continue or raise
                        raise RuntimeError("Failed during deletion phase of sync.") from e

                # Execute Adds/Updates
                if items_to_index:
                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
                    try:
                        # Upsert logic handled by service's index method with force_reindex=False
                        self._search_service.index(
                            documents=items_to_index,
                            force_reindex=False,
                            embedder_device=embedder_device
                        )
                        logger.info("Add/Update successful.")
                    except Exception as e:
                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
                        raise RuntimeError("Failed during add/update phase of sync.") from e
                logger.info("Sync actions completed.")
            else:
                logger.info("[Dry Run] No changes applied to the index.")

        else:
            raise ValueError(f"Unknown sync strategy: '{strategy}'. Use 'full' or 'upsert_only'.")

        return summary
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -20,8 +20,14 @@ Requires-Dist: torch>=2.0.0
20
20
  Requires-Dist: torchvision>=0.15.0
21
21
  Requires-Dist: transformers>=4.30.0
22
22
  Requires-Dist: huggingface_hub>=0.19.0
23
+ Requires-Dist: ocrmypdf>=16.0.0
24
+ Requires-Dist: pikepdf>=10.0.0
23
25
  Provides-Extra: interactive
24
26
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
27
+ Provides-Extra: haystack
28
+ Requires-Dist: haystack-ai>=2.0.0b5; extra == "haystack"
29
+ Requires-Dist: chroma-haystack; extra == "haystack"
30
+ Requires-Dist: sentence-transformers; extra == "haystack"
25
31
  Provides-Extra: easyocr
26
32
  Requires-Dist: easyocr; extra == "easyocr"
27
33
  Provides-Extra: paddle
@@ -39,6 +45,9 @@ Requires-Dist: paddlepaddle; extra == "all"
39
45
  Requires-Dist: paddleocr; extra == "all"
40
46
  Requires-Dist: doclayout_yolo; extra == "all"
41
47
  Requires-Dist: surya-ocr; extra == "all"
48
+ Requires-Dist: haystack-ai>=2.0.0b5; extra == "all"
49
+ Requires-Dist: chroma-haystack; extra == "all"
50
+ Requires-Dist: sentence-transformers; extra == "all"
42
51
  Dynamic: license-file
43
52
 
44
53
  # Natural PDF
@@ -69,6 +78,9 @@ pip install natural-pdf[paddle]
69
78
  # Example: Install with interactive viewer support
70
79
  pip install natural-pdf[interactive]
71
80
 
81
+ # Example: Install with semantic search support (Haystack)
82
+ pip install natural-pdf[haystack]
83
+
72
84
  # Install everything
73
85
  pip install natural-pdf[all]
74
86
  ```
@@ -117,6 +129,7 @@ Natural PDF offers a range of features for working with PDFs:
117
129
  * **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
118
130
  * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
119
131
  * **Document QA:** Ask natural language questions about your document's content.
132
+ * **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
120
133
  * **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
121
134
 
122
135
  ## Learn More
@@ -1,4 +1,4 @@
1
- natural_pdf/__init__.py,sha256=hsSosbPnvDRCfyYAL9bf1haVS6oBxLAl7cbKTWRTHkU,1784
1
+ natural_pdf/__init__.py,sha256=hdqbTG3SHtu8jPIL7su6TpEhEbNsL89pgktCXPMKWCI,2825
2
2
  natural_pdf/analyzers/__init__.py,sha256=BkSmEqw5J76C2fvYHF86EXQJQWWFNIvjSwRMwfW-Ht0,140
3
3
  natural_pdf/analyzers/text_options.py,sha256=9IGRoem1O2mc1ZNGiM5-VPRZ3c8LLwEk1B3is9UxMoE,2777
4
4
  natural_pdf/analyzers/text_structure.py,sha256=e4G6v0bD7ZJCdo6DcuDD3iZt8KAwBfALMduwZHGh0wI,12415
@@ -13,18 +13,21 @@ natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5
13
13
  natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
14
14
  natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
15
15
  natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
16
+ natural_pdf/collections/pdf_collection.py,sha256=Da8saWBTguxk16pNzMxCrFwatrWk_qrcG0RVPQybro8,12159
16
17
  natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
17
18
  natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
18
19
  natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
19
- natural_pdf/core/page.py,sha256=tnxG-5OhFVuFHt0p-a9YSLU-nXjA8fftg5ViQdH5sOU,68512
20
- natural_pdf/core/pdf.py,sha256=UzxVfVeCnhSN7rxdJresUj_UNFkcFkeaEjLvwZMJS-c,28532
20
+ natural_pdf/core/page.py,sha256=qhumZqmwHoBlGodiCvYE0z34Iu1WSs32V4_Iz_Sfaow,69350
21
+ natural_pdf/core/pdf.py,sha256=MLN-asJ_d5spmCjLz7SDp74t__vioszfKEFooBul7nU,41167
21
22
  natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
22
23
  natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
23
24
  natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
24
25
  natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
25
26
  natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
26
- natural_pdf/elements/region.py,sha256=sfYWLn1nii7o7lqY_fTyJN2fd__Cg_9euGsZDQUQffA,74242
27
+ natural_pdf/elements/region.py,sha256=5dXHYbbdO1QNgkD6b6I34ezHt-SHKx_aH1ubzbfMHQs,74370
27
28
  natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
29
+ natural_pdf/exporters/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
30
+ natural_pdf/exporters/searchable_pdf.py,sha256=PPkF64hFNNhPlZPuyJRvC_scAg3WCOiIvwgIP8nlZ9E,10225
28
31
  natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
29
32
  natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
30
33
  natural_pdf/ocr/engine_easyocr.py,sha256=6srZhXqlH3UpNWw5iFq7u4TS5HQsMSTWYuuWo3oYZp8,8273
@@ -34,6 +37,12 @@ natural_pdf/ocr/ocr_manager.py,sha256=mAyCntdAnrNv8TIvGYlGs40G2tDAdMQ_Jqb3owiPWW
34
37
  natural_pdf/ocr/ocr_options.py,sha256=A2CQV172id-90zMpPZWb8CD09ZP0BuQnnCZGEFP4SaQ,3787
35
38
  natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
36
39
  natural_pdf/qa/document_qa.py,sha256=QYKKor0RqUQcEdFEBEUdq7L0ktq1WSMfQ-ynTc64cPU,15926
40
+ natural_pdf/search/__init__.py,sha256=sYv7-XrSohUgE2UH8sFpGfl66SG092jZoNokZaDdxsY,4125
41
+ natural_pdf/search/haystack_search_service.py,sha256=qhvqVJMxz4-KTnQF0MPO7YLQxTlYe27PCgKJgYeAels,27580
42
+ natural_pdf/search/haystack_utils.py,sha256=BXU5yIEcFIWliSX44slMYLlUMfwCXEfve-ZYmVcEt3k,18773
43
+ natural_pdf/search/search_options.py,sha256=PrIGkvM9A9wpqaz6tDB-9hWiSp9fqhi8mf7FQl1qoGI,3510
44
+ natural_pdf/search/search_service_protocol.py,sha256=5EYzHFUoFvaYw3khnQNz1dsOHqTvBChekvk_qf2mu5w,6811
45
+ natural_pdf/search/searchable_mixin.py,sha256=QPtPSJHCP5n0Twp4uHKSns8J6HuvGjyipTNbB66JFLg,24896
37
46
  natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
38
47
  natural_pdf/selectors/parser.py,sha256=JK1zDVISACkUhzmzWfQMMW8hvsV422lRBFKgDBWOWC4,24108
39
48
  natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
@@ -45,8 +54,8 @@ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_
45
54
  natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
46
55
  natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
47
56
  natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
48
- natural_pdf-0.1.2.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
49
- natural_pdf-0.1.2.dist-info/METADATA,sha256=NQQGLJQVgbbxkyj4UZW-wkmdQLfDGzu7U-UswwiojGU,4453
50
- natural_pdf-0.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
- natural_pdf-0.1.2.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
52
- natural_pdf-0.1.2.dist-info/RECORD,,
57
+ natural_pdf-0.1.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
58
+ natural_pdf-0.1.3.dist-info/METADATA,sha256=kBSb1SueOGQFw97pvHBxlJYcuNwxAB-lInLKows0BEs,5069
59
+ natural_pdf-0.1.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
60
+ natural_pdf-0.1.3.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
61
+ natural_pdf-0.1.3.dist-info/RECORD,,