natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
+ import logging
+ from typing import Optional, List, Dict, Any, Iterable, TYPE_CHECKING, Union, Type, Generator
+ from abc import ABC, abstractmethod
+ import hashlib  # For hashing content
+
+ # Now import the flag from the canonical source - this import should always work
+ from .haystack_utils import HAS_HAYSTACK_EXTRAS
+ DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
+
+ # Avoid runtime import errors if extras not installed
+ try:
+     # Import protocols and options first
+     from .search_service_protocol import (
+         SearchServiceProtocol, Indexable,
+         IndexConfigurationError, IndexExistsError
+     )
+     from .search_options import SearchOptions, TextSearchOptions
+     from . import get_search_service
+
+     if TYPE_CHECKING:  # Keep type hints working
+         from natural_pdf.elements.region import Region  # Example indexable type
+ except ImportError:
+     # Define dummies if extras missing
+     SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = object, object, RuntimeError, RuntimeError
+     SearchOptions, TextSearchOptions = object, object
+     DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
+     def get_search_service(**kwargs): raise ImportError("Search dependencies missing.")
+     class Region: pass  # Dummy for type hint
+
+ logger = logging.getLogger(__name__)
+
+ class SearchableMixin(ABC):
+     """
+     Mixin class providing search functionality (initialization, indexing, searching, syncing).
+
+     Requires the inheriting class to implement `get_indexable_items`.
+     Assumes the inheriting class has a `_search_service` attribute initialized to None.
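+
+     Example (an illustrative sketch; ``MyPageCollection`` and its ``_pages``
+     attribute are hypothetical, not part of this module):
+
+         class MyPageCollection(SearchableMixin):
+             def __init__(self, pages):
+                 self._search_service = None  # required by the mixin
+                 self._pages = pages
+
+             def get_indexable_items(self):
+                 # Yield objects conforming to the Indexable protocol
+                 yield from self._pages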
+     """
+     # Ensure inheriting class initializes this
+     _search_service: Optional[SearchServiceProtocol] = None
+
+     @abstractmethod
+     def get_indexable_items(self) -> Iterable[Indexable]:
+         """
+         Abstract method that must be implemented by the inheriting class.
+         Should yield or return an iterable of objects conforming to the Indexable protocol.
+         """
+         pass
+
+     def init_search(
+         self,
+         service: Optional[SearchServiceProtocol] = None,
+         *,
+         persist: Optional[bool] = None,
+         collection_name: Optional[str] = None,
+         embedding_model: Optional[str] = None,  # Allow overriding embedding model
+         index: bool = False,  # Changed from index_now
+         force_reindex: bool = False,
+         embedder_device: Optional[str] = None,
+         **kwargs  # Pass other args to get_search_service
+     ) -> 'SearchableMixin':  # Return self for chaining
+         """
+         Initializes and configures the search service for this instance.
+
+         Call this explicitly before `index_for_search`, `sync_index`, or `find_relevant`
+         if using non-default settings (e.g., persistence) or attaching an
+         existing service instance.
+
+         Args:
+             service: An optional pre-configured SearchServiceProtocol instance.
+                 If provided, attaches this service directly, ignoring other
+                 configuration arguments (persist, collection_name, etc.).
+             persist: If creating a new service (service=None), determines if it should
+                 use persistent storage (True) or be in-memory (False/None).
+                 Defaults to False.
+             collection_name: If creating a new service, the name for the index/collection.
+                 Required if persist=True. Defaults to 'default_collection'
+                 if persist=False.
+             embedding_model: If creating a new service, override the default embedding model.
+             index: If True, immediately indexes the collection's documents using the
+                 configured service after setup. Calls `_perform_indexing`. Defaults to False.
+             force_reindex: If index=True, instructs the service to delete any existing
+                 index before indexing. Defaults to False.
+             embedder_device: If index=True, optional device override for the embedder.
+             **kwargs: Additional keyword arguments passed to get_search_service when creating
+                 a new service instance.
+
+         Returns:
+             Self for method chaining.
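+
+         Example (illustrative; `collection` stands for any object using this mixin,
+         and the argument values are hypothetical):
+
+             # Default in-memory service, indexing immediately:
+             collection.init_search(index=True)
+
+             # Persistent index; overwriting an existing one with index=True
+             # requires force_reindex=True:
+             collection.init_search(persist=True, collection_name="my_docs",
+                                    index=True, force_reindex=True)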
+         """
+         if service:
+             # Attach provided service
+             logger.info(f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}').")
+             # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
+             self._search_service = service
+         else:
+             # Create new service
+             effective_persist = persist if persist is not None else False
+             effective_collection_name = collection_name
+             if effective_persist and not effective_collection_name:
+                 raise ValueError("A collection_name must be provided when persist=True.")
+             elif not effective_persist and not effective_collection_name:
+                 effective_collection_name = DEFAULT_SEARCH_COLLECTION_NAME
+                 logger.info(f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service.")
+
+             logger.info(f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}")
+             try:
+                 service_args = {"collection_name": effective_collection_name, "persist": effective_persist, **kwargs}
+                 if embedding_model: service_args['embedding_model'] = embedding_model
+                 self._search_service = get_search_service(**service_args)
+             except Exception as e:
+                 logger.error(f"Failed to create SearchService: {e}", exc_info=True)
+                 raise RuntimeError("Could not create SearchService instance.") from e
+
+         # --- Optional Immediate Indexing (with safety check for persistent) ---
+         if index:
+             if not self._search_service:  # Should not happen if logic above is correct
+                 raise RuntimeError("Cannot index: Search service not available after initialization attempt.")
+
+             is_persistent = getattr(self._search_service, '_persist', False)  # Check if service is persistent
+             collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+
+             if is_persistent and not force_reindex:
+                 # Check existence only if persistent and not forcing reindex
+                 if self._search_service.index_exists():
+                     # Raise safety error if index exists and force_reindex is not True
+                     raise IndexExistsError(
+                         f"Persistent index '{collection_name}' already exists. "
+                         f"To overwrite/re-index via init_search(index=True), explicitly set force_reindex=True. "
+                         f"Alternatively, use index_for_search() or sync_index() for more granular control."
+                     )
+                 else:
+                     # Index doesn't exist, safe to proceed
+                     logger.info(f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing.")
+             elif is_persistent and force_reindex:
+                 logger.warning(f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted.")
+             # else: # Not persistent, safe to proceed without existence check
+             #     logger.debug("Proceeding with index=True for non-persistent index.")
+
+             # Proceed with indexing if checks passed or not applicable
+             logger.info(f"index=True: Proceeding to index collection immediately after search initialization.")
+             self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
+
+         return self
+
+     def _perform_indexing(self, force_reindex: bool, embedder_device: Optional[str]):
+         """Internal helper containing the core indexing logic."""
+         if not self._search_service:
+             raise RuntimeError("Search service not initialized. Call init_search first.")
+
+         collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+         logger.info(f"Starting internal indexing process into SearchService collection '{collection_name}'...")
+
+         # Use the abstract method to get items
+         try:
+             indexable_items = list(self.get_indexable_items())  # Consume iterator
+         except Exception as e:
+             logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
+             raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+
+         if not indexable_items:
+             logger.warning("No indexable items provided by get_indexable_items(). Skipping index call.")
+             return
+
+         logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
+         try:
+             logger.debug(f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex}).")
+             self._search_service.index(
+                 documents=indexable_items,
+                 embedder_device=embedder_device,
+                 force_reindex=force_reindex,
+             )
+             logger.info(f"Successfully completed indexing into SearchService collection '{collection_name}'.")
+         except IndexConfigurationError as ice:
+             logger.error(f"Indexing failed due to configuration error in collection '{collection_name}': {ice}", exc_info=True)
+             raise  # Re-raise specific error
+         except Exception as e:  # Catch other indexing errors from the service
+             logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
+             raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+
+
+     def index_for_search(
+         self, *,  # Make args keyword-only
+         embedder_device: Optional[str] = None,
+         force_reindex: bool = False,
+     ) -> 'SearchableMixin':
+         """
+         Ensures the search service is initialized (using the default if needed)
+         and indexes the items provided by `get_indexable_items`.
+
+         If the search service hasn't been configured via `init_search`, this
+         method will initialize the default in-memory service.
+
+         Args:
+             embedder_device: Optional device override for the embedder.
+             force_reindex: If True, instructs the service to delete any existing
+                 index before indexing.
+
+         Returns:
+             Self for method chaining.
+         """
+         # --- Ensure Service is Initialized (Use Default if Needed) ---
+         if not self._search_service:
+             logger.info("Search service not initialized prior to index_for_search. Initializing default in-memory service.")
+             self.init_search()  # Call init with defaults
+
+         # --- Perform Indexing ---
+         self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
+         return self
+
+     def find_relevant(
+         self,
+         query: Any,  # Query type depends on service capabilities
+         *,  # Make options/service keyword-only
+         options: Optional[SearchOptions] = None,
+         search_service: Optional[SearchServiceProtocol] = None  # Allow override
+     ) -> List[Dict[str, Any]]:
+         """
+         Finds relevant items using the configured or provided search service.
+
+         Args:
+             query: The search query (text, image path, PIL Image, Region, etc.).
+                 The SearchService implementation handles the specific query type.
+             options: Optional SearchOptions to configure the query (top_k, filters, etc.).
+             search_service: Optional specific SearchService instance to use for this query,
+                 overriding the collection's configured service.
+
+         Returns:
+             A list of result dictionaries, sorted by relevance.
+
+         Raises:
+             RuntimeError: If no search service is configured or provided, or if search fails.
+             FileNotFoundError: If the collection managed by the service does not exist.
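+
+         Example (illustrative; assumes `TextSearchOptions` accepts a `top_k`
+         argument, as suggested under Args; result fields depend on the service):
+
+             results = collection.find_relevant("total 2024 budget",
+                                                options=TextSearchOptions(top_k=5))
+             for hit in results:
+                 print(hit)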
+         """
+         # --- Determine which Search Service to use ---
+         effective_service = search_service or self._search_service
+         if not effective_service:
+             raise RuntimeError(
+                 "Search service not configured. Call init_search(...) or index_for_search() first, "
+                 "or provide an explicit 'search_service' instance to find_relevant()."
+             )
+
+         collection_name = getattr(effective_service, 'collection_name', '<Unknown>')
+         logger.info(f"Searching collection '{collection_name}' via {type(effective_service).__name__}...")
+
+         # --- Prepare Query and Options ---
+         query_input = query
+         # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
+         # If we keep it here, it makes the mixin less generic.
+         # Let's assume the SearchService handles the query type appropriately for now.
+         # if isinstance(query, Region):
+         #     logger.debug("Query is a Region object. Extracting text.")
+         #     query_input = query.extract_text()
+         #     if not query_input or query_input.isspace():
+         #         logger.warning("Region provided for query has no extractable text.")
+         #         return []
+
+         effective_options = options if options is not None else TextSearchOptions()
+
+         # --- Call SearchService Search Method ---
+         try:
+             results = effective_service.search(
+                 query=query_input,
+                 options=effective_options,
+             )
+             logger.info(f"SearchService returned {len(results)} results from collection '{collection_name}'.")
+             return results
+         except FileNotFoundError as fnf:
+             logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
+             raise  # Re-raise specific error
+         except Exception as e:
+             logger.error(f"Search failed for collection '{collection_name}': {e}", exc_info=True)
+             # Consider wrapping in a SearchError?
+             raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e
+
+     # --- NEW Sync Method ---
+     def sync_index(
+         self,
+         strategy: str = 'full',  # 'full' (add/update/delete) or 'upsert_only'
+         dry_run: bool = False,
+         batch_size: int = 100,  # For batching deletes/updates if needed
+         embedder_device: Optional[str] = None,  # Pass embedder device if needed for updates
+         **kwargs: Any  # Allow passing extra args to get_search_service
+     ) -> Dict[str, int]:
+         """
+         Synchronizes the search index with the current state of indexable items.
+         Requires the configured search service to implement `list_documents`
+         and `delete_documents` for the 'full' strategy.
+         Requires `Indexable` items to implement `get_content_hash` for 'full' strategy
+         change detection (falls back to ID-based updates if the hash is missing).
+
+         Args:
+             strategy: 'full' (default): Adds new items, updates changed ones (based on hash),
+                 and deletes items no longer present.
+                 'upsert_only': Adds new items and updates existing ones (based on ID),
+                 but does not delete missing items (effectively an index call with force_reindex=False).
+             dry_run: If True, calculates changes but does not modify the index.
+             batch_size: Hint for batching delete/update operations (service implementation specific).
+             embedder_device: Optional device for embedding during updates if needed by the service.
+             **kwargs: Additional keyword arguments passed to get_search_service when creating
+                 a new service instance.
+
+         Returns:
+             A dictionary summarizing the changes (e.g., {'added': N, 'updated': M, 'deleted': K, 'skipped': S}).
+
+         Raises:
+             RuntimeError: For backend errors during synchronization.
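+
+         Example (illustrative; `collection` is any object using this mixin):
+
+             # Preview the changes without modifying the index:
+             plan = collection.sync_index(strategy='full', dry_run=True)
+             print(plan)  # e.g. {'added': 2, 'updated': 1, 'deleted': 0, 'skipped': 0}
+
+             # Apply them:
+             collection.sync_index(strategy='full')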
+         """
+         if not self._search_service:
+             raise RuntimeError("Search service not configured. Call init_search first.")
+
+         collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
+         logger.info(f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})...")
+         summary = {'added': 0, 'updated': 0, 'deleted': 0, 'skipped': 0}
+
+         # --- Check Service Capabilities for 'full' sync ---
+         if strategy == 'full':
+             required_methods = ['list_documents', 'delete_documents']
+             missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
+             if missing_methods:
+                 raise NotImplementedError(
+                     f"The configured search service ({type(self._search_service).__name__}) "
+                     f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
+                 )
+
+         # --- 1. Get Desired State (from current collection) ---
+         desired_state: Dict[str, Indexable] = {}  # {id: item}
+         desired_hashes: Dict[str, Optional[str]] = {}  # {id: hash or None}
+         try:
+             for item in self.get_indexable_items():
+                 item_id = item.get_id()
+                 if not item_id:
+                     logger.warning(f"Skipping item with no ID: {item}")
+                     summary['skipped'] += 1
+                     continue
+                 if item_id in desired_state:
+                     logger.warning(f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item.")
+                     summary['skipped'] += 1
+                     continue
+                 desired_state[item_id] = item
+                 # Try to get hash, store None if unavailable or fails
+                 try:
+                     desired_hashes[item_id] = item.get_content_hash()
+                 except (AttributeError, NotImplementedError):
+                     logger.debug(f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based.")
+                     desired_hashes[item_id] = None
+                 except Exception as e:
+                     logger.warning(f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.", exc_info=False)
+                     desired_hashes[item_id] = None
+
+         except Exception as e:
+             logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
+             raise RuntimeError("Failed to get current indexable items.") from e
+
+         logger.info(f"Desired state contains {len(desired_state)} indexable items.")
+
+         # --- 2. Handle Different Strategies ---
+         if strategy == 'upsert_only':
+             # Simple case: just index everything, let the service handle upserts
+             items_to_index = list(desired_state.values())
+             summary['added'] = len(items_to_index)  # Approximate count
+             logger.info(f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting.")
+             if not dry_run and items_to_index:
+                 logger.debug("Calling service.index for upsert...")
+                 # Call index directly, force_reindex=False implies upsert
+                 self._search_service.index(
+                     documents=items_to_index,
+                     force_reindex=False,
+                     embedder_device=embedder_device
+                 )
+             elif dry_run:
+                 logger.info("[Dry Run] Would index/upsert %d items.", len(items_to_index))
+
+         elif strategy == 'full':
+             # Complex case: Add/Update/Delete
+             # 2a. Get Current Index State
+             try:
+                 logger.debug("Listing documents currently in the index...")
+                 # Assumes list_documents takes filters and include_metadata
+                 # Fetch all documents with metadata
+                 current_docs = self._search_service.list_documents(include_metadata=True)
+                 current_state: Dict[str, Dict] = {}  # {id: {'meta': {...}, ...}}
+                 duplicates = 0
+                 for doc in current_docs:
+                     doc_id = doc.get('id')
+                     if not doc_id: continue  # Skip docs without ID from service
+                     if doc_id in current_state: duplicates += 1
+                     current_state[doc_id] = doc
+                 logger.info(f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs).")
+                 if duplicates > 0: logger.warning(f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison.")
+
+             except Exception as e:
+                 logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
+                 raise RuntimeError("Could not retrieve current index state for sync.") from e
+
+             # 2b. Compare States and Plan Actions
+             ids_in_desired = set(desired_state.keys())
+             ids_in_current = set(current_state.keys())
+
+             ids_to_add = ids_in_desired - ids_in_current
+             ids_to_delete = ids_in_current - ids_in_desired
+             ids_to_check_update = ids_in_desired.intersection(ids_in_current)
+
+             items_to_update = []
+             for item_id in ids_to_check_update:
+                 desired_hash = desired_hashes.get(item_id)
+                 current_meta = current_state[item_id].get('meta', {})
+                 current_hash = current_meta.get('content_hash')  # Assuming hash stored in meta
+
+                 # Check if hash exists and differs, or if hash is missing (force update)
+                 if desired_hash is None or current_hash is None or desired_hash != current_hash:
+                     if desired_hash != current_hash:
+                         logger.debug(f"Content hash changed for ID {item_id}. Scheduling for update.")
+                     else:
+                         logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
+                     items_to_update.append(desired_state[item_id])
+                 # Else: hashes match, no update needed
+
+             items_to_add = [desired_state[id_] for id_ in ids_to_add]
+             items_to_index = items_to_add + items_to_update  # Combine adds and updates for single index call
+
+             summary['added'] = len(items_to_add)
+             summary['updated'] = len(items_to_update)
+             summary['deleted'] = len(ids_to_delete)
+
+             logger.info(f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}")
+
+             # 2c. Execute Actions (if not dry_run)
+             if not dry_run:
+                 # Execute Deletes
+                 if ids_to_delete:
+                     logger.info(f"Deleting {len(ids_to_delete)} items from index...")
+                     try:
+                         # Assuming delete_documents takes list of IDs
+                         # Implement batching if needed
+                         self._search_service.delete_documents(ids=list(ids_to_delete))
+                         logger.info("Deletion successful.")
+                     except Exception as e:
+                         logger.error(f"Failed to delete documents: {e}", exc_info=True)
+                         # Decide whether to continue or raise
+                         raise RuntimeError("Failed during deletion phase of sync.") from e
+
+                 # Execute Adds/Updates
+                 if items_to_index:
+                     logger.info(f"Indexing/Updating {len(items_to_index)} items...")
+                     try:
+                         # Upsert logic handled by service's index method with force_reindex=False
+                         self._search_service.index(
+                             documents=items_to_index,
+                             force_reindex=False,
+                             embedder_device=embedder_device
+                         )
+                         logger.info("Add/Update successful.")
+                     except Exception as e:
+                         logger.error(f"Failed to index/update documents: {e}", exc_info=True)
+                         raise RuntimeError("Failed during add/update phase of sync.") from e
+                 logger.info("Sync actions completed.")
+             else:
+                 logger.info("[Dry Run] No changes applied to the index.")
+
+         else:
+             raise ValueError(f"Unknown sync strategy: '{strategy}'. Use 'full' or 'upsert_only'.")
+
+         return summary
@@ -0,0 +1,137 @@
+ Metadata-Version: 2.4
+ Name: natural-pdf
+ Version: 0.1.3
+ Summary: A more intuitive interface for working with PDFs
+ Author-email: Jonathan Soma <jonathan.soma@gmail.com>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/jsoma/natural-pdf
+ Project-URL: Repository, https://github.com/jsoma/natural-pdf
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.7
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pdfplumber>=0.7.0
+ Requires-Dist: Pillow>=8.0.0
+ Requires-Dist: colour>=0.1.5
+ Requires-Dist: numpy>=1.20.0
+ Requires-Dist: urllib3>=1.26.0
+ Requires-Dist: torch>=2.0.0
+ Requires-Dist: torchvision>=0.15.0
+ Requires-Dist: transformers>=4.30.0
+ Requires-Dist: huggingface_hub>=0.19.0
+ Requires-Dist: ocrmypdf>=16.0.0
+ Requires-Dist: pikepdf>=10.0.0
+ Provides-Extra: interactive
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
+ Provides-Extra: haystack
+ Requires-Dist: haystack-ai>=2.0.0b5; extra == "haystack"
+ Requires-Dist: chroma-haystack; extra == "haystack"
+ Requires-Dist: sentence-transformers; extra == "haystack"
+ Provides-Extra: easyocr
+ Requires-Dist: easyocr; extra == "easyocr"
+ Provides-Extra: paddle
+ Requires-Dist: paddlepaddle; extra == "paddle"
+ Requires-Dist: paddleocr; extra == "paddle"
+ Provides-Extra: layout-yolo
+ Requires-Dist: doclayout_yolo; extra == "layout-yolo"
+ Provides-Extra: surya
+ Requires-Dist: surya-ocr; extra == "surya"
+ Provides-Extra: qa
+ Provides-Extra: all
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
+ Requires-Dist: easyocr; extra == "all"
+ Requires-Dist: paddlepaddle; extra == "all"
+ Requires-Dist: paddleocr; extra == "all"
+ Requires-Dist: doclayout_yolo; extra == "all"
+ Requires-Dist: surya-ocr; extra == "all"
+ Requires-Dist: haystack-ai>=2.0.0b5; extra == "all"
+ Requires-Dist: chroma-haystack; extra == "all"
+ Requires-Dist: sentence-transformers; extra == "all"
+ Dynamic: license-file
+
+ # Natural PDF
+
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
+
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
+
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
+ - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
+
+ <div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
+
+ ## Installation
+
+ ```bash
+ pip install natural-pdf
+ ```
+
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
+
+ ```bash
+ # Example: Install with support for a specific OCR engine
+ pip install natural-pdf[easyocr]
+ pip install natural-pdf[surya]
+ pip install natural-pdf[paddle]
+
+ # Example: Install with interactive viewer support
+ pip install natural-pdf[interactive]
+
+ # Example: Install with semantic search support (Haystack)
+ pip install natural-pdf[haystack]
+
+ # Install everything
+ pip install natural-pdf[all]
+ ```
+
+ See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
+
+ ## Quick Start
+
+ ```python
+ from natural_pdf import PDF
+
+ # Open a PDF
+ pdf = PDF('document.pdf')
+ page = pdf.pages[0]
+
+ # Find elements using CSS-like selectors
+ heading = page.find('text:contains("Summary"):bold')
+
+ # Extract content below the heading
+ content = heading.below().extract_text()
+ print("Content below Summary:", content[:100] + "...")
+
+ # Exclude headers/footers automatically (example)
+ # You might define these based on common text or position
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
+ page.add_exclusion(page.find_all('line')[-1].below())
+
+ # Extract clean text from the page
+ clean_text = page.extract_text()
+ print("\nClean page text:", clean_text[:200] + "...")
+
+ # Highlight the heading and view the page
+ heading.highlight(color='red')
+ page.to_image()
+ ```
+
+ As a fun bonus, `page.viewer()` provides an interactive widget for exploring the PDF.
+
+ ## Key Features
+
+ Natural PDF offers a range of features for working with PDFs:
+
+ * **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
+ * **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
+ * **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
+ * **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
+ * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
+ * **Document QA:** Ask natural language questions about your document's content.
+ * **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack (see the sketch below).
+ * **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
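+
+ A minimal sketch of the semantic search workflow (illustrative: it assumes the `haystack` extra is installed and that the object, here a `PDF`, exposes the `SearchableMixin` API added in this release; result fields depend on the configured search service):
+
+ ```python
+ from natural_pdf import PDF
+
+ pdf = PDF('document.pdf')
+ pdf.init_search(index=True)      # default in-memory index
+ results = pdf.find_relevant("payment terms")
+ for hit in results:
+     print(hit)
+ ```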
+
+ ## Learn More
+
+ Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).