natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +113 -22
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -7,6 +7,7 @@ import tempfile
|
|
7
7
|
import threading
|
8
8
|
import time
|
9
9
|
import urllib.request
|
10
|
+
import weakref
|
10
11
|
from pathlib import Path
|
11
12
|
from typing import (
|
12
13
|
TYPE_CHECKING,
|
@@ -23,9 +24,7 @@ from typing import (
|
|
23
24
|
)
|
24
25
|
|
25
26
|
import pdfplumber
|
26
|
-
from PIL import Image
|
27
27
|
from tqdm.auto import tqdm
|
28
|
-
import weakref
|
29
28
|
|
30
29
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
31
30
|
from natural_pdf.classification.manager import ClassificationError
|
@@ -73,11 +72,14 @@ except ImportError:
|
|
73
72
|
|
74
73
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
75
74
|
|
75
|
+
|
76
76
|
def _get_classification_manager_class():
|
77
77
|
"""Lazy import for ClassificationManager."""
|
78
78
|
from natural_pdf.classification.manager import ClassificationManager
|
79
|
+
|
79
80
|
return ClassificationManager
|
80
81
|
|
82
|
+
|
81
83
|
DEFAULT_MANAGERS = {
|
82
84
|
"classification": _get_classification_manager_class,
|
83
85
|
"structured_data": StructuredDataManager,
|
@@ -100,15 +102,45 @@ except ImportError:
|
|
100
102
|
# --- Lazy Page List Helper --- #
|
101
103
|
from collections.abc import Sequence
|
102
104
|
|
105
|
+
|
103
106
|
class _LazyPageList(Sequence):
|
104
107
|
"""A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
|
105
108
|
|
109
|
+
This class implements the Sequence protocol to provide list-like access to PDF pages
|
110
|
+
while minimizing memory usage. Pages are only created when accessed, and once created,
|
111
|
+
they are cached for subsequent access. This design allows efficient handling of large
|
112
|
+
PDF documents without loading all pages into memory immediately.
|
113
|
+
|
106
114
|
The sequence holds `None` placeholders until an index is accessed, at which point
|
107
|
-
a real `Page` object is created, cached, and returned.
|
108
|
-
also supported and will
|
115
|
+
a real `Page` object is created, cached, and returned. Slices and iteration are
|
116
|
+
also supported and will materialize pages on demand.
|
117
|
+
|
118
|
+
Attributes:
|
119
|
+
_parent_pdf: Reference to the parent PDF object.
|
120
|
+
_plumber_pdf: Underlying pdfplumber PDF object.
|
121
|
+
_font_attrs: Font attributes to use when creating pages.
|
122
|
+
_cache: List of cached Page objects (None until accessed).
|
123
|
+
_load_text: Whether to load text layer when creating pages.
|
124
|
+
|
125
|
+
Example:
|
126
|
+
```python
|
127
|
+
# Access is transparent - pages created on demand
|
128
|
+
pdf = npdf.PDF("document.pdf")
|
129
|
+
first_page = pdf.pages[0] # Creates Page object here
|
130
|
+
last_page = pdf.pages[-1] # Creates another Page object
|
131
|
+
|
132
|
+
# Slicing works too
|
133
|
+
first_three = pdf.pages[0:3] # Creates 3 Page objects
|
134
|
+
|
135
|
+
# Iteration creates all pages
|
136
|
+
for page in pdf.pages: # Each page created as needed
|
137
|
+
print(f"Page {page.index}")
|
138
|
+
```
|
109
139
|
"""
|
110
140
|
|
111
|
-
def __init__(
|
141
|
+
def __init__(
|
142
|
+
self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
|
143
|
+
):
|
112
144
|
self._parent_pdf = parent_pdf
|
113
145
|
self._plumber_pdf = plumber_pdf
|
114
146
|
self._font_attrs = font_attrs
|
@@ -124,7 +156,13 @@ class _LazyPageList(Sequence):
|
|
124
156
|
from natural_pdf.core.page import Page
|
125
157
|
|
126
158
|
plumber_page = self._plumber_pdf.pages[index]
|
127
|
-
cached = Page(
|
159
|
+
cached = Page(
|
160
|
+
plumber_page,
|
161
|
+
parent=self._parent_pdf,
|
162
|
+
index=index,
|
163
|
+
font_attrs=self._font_attrs,
|
164
|
+
load_text=self._load_text,
|
165
|
+
)
|
128
166
|
self._cache[index] = cached
|
129
167
|
return cached
|
130
168
|
|
@@ -153,14 +191,44 @@ class _LazyPageList(Sequence):
|
|
153
191
|
def __repr__(self) -> str: # pragma: no cover
|
154
192
|
return f"<_LazyPageList(len={len(self)})>"
|
155
193
|
|
194
|
+
|
156
195
|
# --- End Lazy Page List Helper --- #
|
157
196
|
|
197
|
+
|
158
198
|
class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
159
|
-
"""
|
160
|
-
Enhanced PDF wrapper built on top of pdfplumber.
|
199
|
+
"""Enhanced PDF wrapper built on top of pdfplumber.
|
161
200
|
|
162
201
|
This class provides a fluent interface for working with PDF documents,
|
163
|
-
with improved selection, navigation, and extraction capabilities.
|
202
|
+
with improved selection, navigation, and extraction capabilities. It integrates
|
203
|
+
OCR, layout analysis, and AI-powered data extraction features while maintaining
|
204
|
+
compatibility with the underlying pdfplumber API.
|
205
|
+
|
206
|
+
The PDF class supports loading from files, URLs, or streams, and provides
|
207
|
+
spatial navigation, element selection with CSS-like selectors, and advanced
|
208
|
+
document processing workflows including multi-page content flows.
|
209
|
+
|
210
|
+
Attributes:
|
211
|
+
pages: Lazy-loaded list of Page objects for document pages.
|
212
|
+
path: Resolved path to the PDF file or source identifier.
|
213
|
+
source_path: Original path, URL, or stream identifier provided during initialization.
|
214
|
+
highlighter: Service for rendering highlighted visualizations of document content.
|
215
|
+
|
216
|
+
Example:
|
217
|
+
Basic usage:
|
218
|
+
```python
|
219
|
+
import natural_pdf as npdf
|
220
|
+
|
221
|
+
pdf = npdf.PDF("document.pdf")
|
222
|
+
page = pdf.pages[0]
|
223
|
+
text_elements = page.find_all('text:contains("Summary")')
|
224
|
+
```
|
225
|
+
|
226
|
+
Advanced usage with OCR:
|
227
|
+
```python
|
228
|
+
pdf = npdf.PDF("scanned_document.pdf")
|
229
|
+
pdf.apply_ocr(engine="easyocr", resolution=144)
|
230
|
+
tables = pdf.pages[0].find_all('table')
|
231
|
+
```
|
164
232
|
"""
|
165
233
|
|
166
234
|
def __init__(
|
@@ -173,18 +241,48 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
173
241
|
auto_text_tolerance: bool = True,
|
174
242
|
text_layer: bool = True,
|
175
243
|
):
|
176
|
-
"""
|
177
|
-
Initialize the enhanced PDF object.
|
244
|
+
"""Initialize the enhanced PDF object.
|
178
245
|
|
179
246
|
Args:
|
180
|
-
path_or_url_or_stream: Path to the PDF file, a URL
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
247
|
+
path_or_url_or_stream: Path to the PDF file (str/Path), a URL (str),
|
248
|
+
or a file-like object (stream). URLs must start with 'http://' or 'https://'.
|
249
|
+
reading_order: If True, use natural reading order for text extraction.
|
250
|
+
Defaults to True.
|
251
|
+
font_attrs: List of font attributes for grouping characters into words.
|
252
|
+
Common attributes include ['fontname', 'size']. Defaults to None.
|
253
|
+
keep_spaces: If True, include spaces in word elements during text extraction.
|
254
|
+
Defaults to True.
|
255
|
+
text_tolerance: PDFplumber-style tolerance settings for text grouping.
|
256
|
+
Dictionary with keys like 'x_tolerance', 'y_tolerance'. Defaults to None.
|
257
|
+
auto_text_tolerance: If True, automatically scale text tolerance based on
|
258
|
+
font size and document characteristics. Defaults to True.
|
259
|
+
text_layer: If True, preserve existing text layer from the PDF. If False,
|
260
|
+
removes all existing text elements during initialization, useful for
|
261
|
+
OCR-only workflows. Defaults to True.
|
262
|
+
|
263
|
+
Raises:
|
264
|
+
TypeError: If path_or_url_or_stream is not a valid type.
|
265
|
+
IOError: If the PDF file cannot be opened or read.
|
266
|
+
ValueError: If URL download fails.
|
267
|
+
|
268
|
+
Example:
|
269
|
+
```python
|
270
|
+
# From file path
|
271
|
+
pdf = npdf.PDF("document.pdf")
|
272
|
+
|
273
|
+
# From URL
|
274
|
+
pdf = npdf.PDF("https://example.com/document.pdf")
|
275
|
+
|
276
|
+
# From stream
|
277
|
+
with open("document.pdf", "rb") as f:
|
278
|
+
pdf = npdf.PDF(f)
|
279
|
+
|
280
|
+
# With custom settings
|
281
|
+
pdf = npdf.PDF("document.pdf",
|
282
|
+
reading_order=False,
|
283
|
+
text_layer=False, # For OCR-only processing
|
284
|
+
font_attrs=['fontname', 'size', 'flags'])
|
285
|
+
```
|
188
286
|
"""
|
189
287
|
self._original_path_or_stream = path_or_url_or_stream
|
190
288
|
self._temp_file = None
|
@@ -262,7 +360,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
262
360
|
self._manager_registry = {}
|
263
361
|
|
264
362
|
# Lazily instantiate pages only when accessed
|
265
|
-
self._pages = _LazyPageList(
|
363
|
+
self._pages = _LazyPageList(
|
364
|
+
self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer
|
365
|
+
)
|
266
366
|
|
267
367
|
self._element_cache = {}
|
268
368
|
self._exclusions = []
|
@@ -272,13 +372,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
272
372
|
|
273
373
|
self._initialize_managers()
|
274
374
|
self._initialize_highlighter()
|
275
|
-
|
375
|
+
|
276
376
|
# Remove text layer if requested
|
277
377
|
if not self._text_layer:
|
278
378
|
logger.info("Removing text layer as requested (text_layer=False)")
|
279
379
|
# Text layer is not loaded when text_layer=False, so no need to remove
|
280
380
|
pass
|
281
|
-
|
381
|
+
|
282
382
|
# Analysis results accessed via self.analyses property (see below)
|
283
383
|
|
284
384
|
# --- Automatic cleanup when object is garbage-collected ---
|
@@ -315,7 +415,30 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
315
415
|
self._managers = {} # Will hold instantiated managers
|
316
416
|
|
317
417
|
def get_manager(self, key: str) -> Any:
|
318
|
-
"""Retrieve a manager instance by its key, instantiating it lazily if needed.
|
418
|
+
"""Retrieve a manager instance by its key, instantiating it lazily if needed.
|
419
|
+
|
420
|
+
Managers are specialized components that handle specific functionality like
|
421
|
+
classification, structured data extraction, or OCR processing. They are
|
422
|
+
instantiated on-demand to minimize memory usage and startup time.
|
423
|
+
|
424
|
+
Args:
|
425
|
+
key: The manager key to retrieve. Common keys include 'classification'
|
426
|
+
and 'structured_data'.
|
427
|
+
|
428
|
+
Returns:
|
429
|
+
The manager instance for the specified key.
|
430
|
+
|
431
|
+
Raises:
|
432
|
+
KeyError: If no manager is registered for the given key.
|
433
|
+
RuntimeError: If the manager failed to initialize.
|
434
|
+
|
435
|
+
Example:
|
436
|
+
```python
|
437
|
+
pdf = npdf.PDF("document.pdf")
|
438
|
+
classification_mgr = pdf.get_manager('classification')
|
439
|
+
structured_data_mgr = pdf.get_manager('structured_data')
|
440
|
+
```
|
441
|
+
"""
|
319
442
|
# Check if already instantiated
|
320
443
|
if key in self._managers:
|
321
444
|
manager_instance = self._managers[key]
|
@@ -351,12 +474,56 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
351
474
|
|
352
475
|
@property
|
353
476
|
def metadata(self) -> Dict[str, Any]:
|
354
|
-
"""Access metadata as a dictionary.
|
477
|
+
"""Access PDF metadata as a dictionary.
|
478
|
+
|
479
|
+
Returns document metadata such as title, author, creation date, and other
|
480
|
+
properties embedded in the PDF file. The exact keys available depend on
|
481
|
+
what metadata was included when the PDF was created.
|
482
|
+
|
483
|
+
Returns:
|
484
|
+
Dictionary containing PDF metadata. Common keys include 'Title',
|
485
|
+
'Author', 'Subject', 'Creator', 'Producer', 'CreationDate', and
|
486
|
+
'ModDate'. May be empty if no metadata is available.
|
487
|
+
|
488
|
+
Example:
|
489
|
+
```python
|
490
|
+
pdf = npdf.PDF("document.pdf")
|
491
|
+
print(pdf.metadata.get('Title', 'No title'))
|
492
|
+
print(f"Created: {pdf.metadata.get('CreationDate')}")
|
493
|
+
```
|
494
|
+
"""
|
355
495
|
return self._pdf.metadata
|
356
496
|
|
357
497
|
@property
|
358
498
|
def pages(self) -> "PageCollection":
|
359
|
-
"""Access pages as a PageCollection object.
|
499
|
+
"""Access pages as a PageCollection object.
|
500
|
+
|
501
|
+
Provides access to individual pages of the PDF document through a
|
502
|
+
collection interface that supports indexing, slicing, and iteration.
|
503
|
+
Pages are lazy-loaded to minimize memory usage.
|
504
|
+
|
505
|
+
Returns:
|
506
|
+
PageCollection object that provides list-like access to PDF pages.
|
507
|
+
|
508
|
+
Raises:
|
509
|
+
AttributeError: If PDF pages are not yet initialized.
|
510
|
+
|
511
|
+
Example:
|
512
|
+
```python
|
513
|
+
pdf = npdf.PDF("document.pdf")
|
514
|
+
|
515
|
+
# Access individual pages
|
516
|
+
first_page = pdf.pages[0]
|
517
|
+
last_page = pdf.pages[-1]
|
518
|
+
|
519
|
+
# Slice pages
|
520
|
+
first_three = pdf.pages[0:3]
|
521
|
+
|
522
|
+
# Iterate over pages
|
523
|
+
for page in pdf.pages:
|
524
|
+
print(f"Page {page.index} has {len(page.chars)} characters")
|
525
|
+
```
|
526
|
+
"""
|
360
527
|
from natural_pdf.elements.collections import PageCollection
|
361
528
|
|
362
529
|
if not hasattr(self, "_pages"):
|
@@ -364,11 +531,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
364
531
|
return PageCollection(self._pages)
|
365
532
|
|
366
533
|
def clear_exclusions(self) -> "PDF":
|
367
|
-
"""
|
368
|
-
|
534
|
+
"""Clear all exclusion functions from the PDF.
|
535
|
+
|
536
|
+
Removes all previously added exclusion functions that were used to filter
|
537
|
+
out unwanted content (like headers, footers, or administrative text) from
|
538
|
+
text extraction and analysis operations.
|
369
539
|
|
370
540
|
Returns:
|
371
|
-
Self for method chaining
|
541
|
+
Self for method chaining.
|
542
|
+
|
543
|
+
Raises:
|
544
|
+
AttributeError: If PDF pages are not yet initialized.
|
545
|
+
|
546
|
+
Example:
|
547
|
+
```python
|
548
|
+
pdf = npdf.PDF("document.pdf")
|
549
|
+
pdf.add_exclusion(lambda page: page.find('text:contains("CONFIDENTIAL")').above())
|
550
|
+
|
551
|
+
# Later, remove all exclusions
|
552
|
+
pdf.clear_exclusions()
|
553
|
+
```
|
372
554
|
"""
|
373
555
|
if not hasattr(self, "_pages"):
|
374
556
|
raise AttributeError("PDF pages not yet initialized.")
|
@@ -381,16 +563,46 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
381
563
|
def add_exclusion(
|
382
564
|
self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
|
383
565
|
) -> "PDF":
|
384
|
-
"""
|
385
|
-
|
566
|
+
"""Add an exclusion function to the PDF.
|
567
|
+
|
568
|
+
Exclusion functions define regions of each page that should be ignored during
|
569
|
+
text extraction and analysis operations. This is useful for filtering out headers,
|
570
|
+
footers, watermarks, or other administrative content that shouldn't be included
|
571
|
+
in the main document processing.
|
386
572
|
|
387
573
|
Args:
|
388
|
-
exclusion_func: A function that takes a Page and returns a Region
|
389
|
-
|
390
|
-
|
574
|
+
exclusion_func: A function that takes a Page object and returns a Region
|
575
|
+
to exclude from processing, or None if no exclusion should be applied
|
576
|
+
to that page. The function is called once per page.
|
577
|
+
label: Optional descriptive label for this exclusion rule, useful for
|
578
|
+
debugging and identification.
|
391
579
|
|
392
580
|
Returns:
|
393
|
-
Self for method chaining
|
581
|
+
Self for method chaining.
|
582
|
+
|
583
|
+
Raises:
|
584
|
+
AttributeError: If PDF pages are not yet initialized.
|
585
|
+
|
586
|
+
Example:
|
587
|
+
```python
|
588
|
+
pdf = npdf.PDF("document.pdf")
|
589
|
+
|
590
|
+
# Exclude headers (top 50 points of each page)
|
591
|
+
pdf.add_exclusion(
|
592
|
+
lambda page: page.region(0, 0, page.width, 50),
|
593
|
+
label="header_exclusion"
|
594
|
+
)
|
595
|
+
|
596
|
+
# Exclude any text containing "CONFIDENTIAL"
|
597
|
+
pdf.add_exclusion(
|
598
|
+
lambda page: page.find('text:contains("CONFIDENTIAL")').above(include_source=True)
|
599
|
+
if page.find('text:contains("CONFIDENTIAL")') else None,
|
600
|
+
label="confidential_exclusion"
|
601
|
+
)
|
602
|
+
|
603
|
+
# Chain multiple exclusions
|
604
|
+
pdf.add_exclusion(header_func).add_exclusion(footer_func)
|
605
|
+
```
|
394
606
|
"""
|
395
607
|
if not hasattr(self, "_pages"):
|
396
608
|
raise AttributeError("PDF pages not yet initialized.")
|
@@ -416,23 +628,74 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
416
628
|
options: Optional[Any] = None,
|
417
629
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
418
630
|
) -> "PDF":
|
419
|
-
"""
|
420
|
-
|
631
|
+
"""Apply OCR to specified pages of the PDF using batch processing.
|
632
|
+
|
633
|
+
Performs optical character recognition on the specified pages, converting
|
634
|
+
image-based text into searchable and extractable text elements. This method
|
635
|
+
supports multiple OCR engines and provides batch processing for efficiency.
|
421
636
|
|
422
637
|
Args:
|
423
|
-
engine: Name of the OCR engine
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
638
|
+
engine: Name of the OCR engine to use. Supported engines include
|
639
|
+
'easyocr' (default), 'surya', 'paddle', and 'doctr'. If None,
|
640
|
+
uses the global default from natural_pdf.options.ocr.engine.
|
641
|
+
languages: List of language codes for OCR recognition (e.g., ['en', 'es']).
|
642
|
+
If None, uses the global default from natural_pdf.options.ocr.languages.
|
643
|
+
min_confidence: Minimum confidence threshold (0.0-1.0) for accepting
|
644
|
+
OCR results. Text with lower confidence will be filtered out.
|
645
|
+
If None, uses the global default.
|
646
|
+
device: Device to run OCR on ('cpu', 'cuda', 'mps'). Engine-specific
|
647
|
+
availability varies. If None, uses engine defaults.
|
648
|
+
resolution: DPI resolution for rendering pages to images before OCR.
|
649
|
+
Higher values improve accuracy but increase processing time and memory.
|
650
|
+
Typical values: 150 (fast), 300 (balanced), 600 (high quality).
|
651
|
+
apply_exclusions: If True, mask excluded regions before OCR to prevent
|
652
|
+
processing of headers, footers, or other unwanted content.
|
653
|
+
detect_only: If True, only detect text bounding boxes without performing
|
654
|
+
character recognition. Useful for layout analysis workflows.
|
655
|
+
replace: If True, replace any existing OCR elements on the pages.
|
656
|
+
If False, append new OCR results to existing elements.
|
657
|
+
options: Engine-specific options object (e.g., EasyOCROptions, SuryaOptions).
|
658
|
+
Allows fine-tuning of engine behavior beyond common parameters.
|
659
|
+
pages: Page indices to process. Can be:
|
660
|
+
- None: Process all pages
|
661
|
+
- slice: Process a range of pages (e.g., slice(0, 10))
|
662
|
+
- Iterable[int]: Process specific page indices (e.g., [0, 2, 5])
|
433
663
|
|
434
664
|
Returns:
|
435
|
-
Self for method chaining
|
665
|
+
Self for method chaining.
|
666
|
+
|
667
|
+
Raises:
|
668
|
+
ValueError: If invalid page index is provided.
|
669
|
+
TypeError: If pages parameter has invalid type.
|
670
|
+
RuntimeError: If OCR engine is not available or fails.
|
671
|
+
|
672
|
+
Example:
|
673
|
+
```python
|
674
|
+
pdf = npdf.PDF("scanned_document.pdf")
|
675
|
+
|
676
|
+
# Basic OCR on all pages
|
677
|
+
pdf.apply_ocr()
|
678
|
+
|
679
|
+
# High-quality OCR with specific settings
|
680
|
+
pdf.apply_ocr(
|
681
|
+
engine='easyocr',
|
682
|
+
languages=['en', 'es'],
|
683
|
+
resolution=300,
|
684
|
+
min_confidence=0.8
|
685
|
+
)
|
686
|
+
|
687
|
+
# OCR specific pages only
|
688
|
+
pdf.apply_ocr(pages=[0, 1, 2]) # First 3 pages
|
689
|
+
pdf.apply_ocr(pages=slice(5, 10)) # Pages 5-9
|
690
|
+
|
691
|
+
# Detection-only workflow for layout analysis
|
692
|
+
pdf.apply_ocr(detect_only=True, resolution=150)
|
693
|
+
```
|
694
|
+
|
695
|
+
Note:
|
696
|
+
OCR processing can be time and memory intensive, especially at high
|
697
|
+
resolutions. Consider using exclusions to mask unwanted regions and
|
698
|
+
processing pages in batches for large documents.
|
436
699
|
"""
|
437
700
|
if not self._ocr_manager:
|
438
701
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
@@ -1025,10 +1288,47 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1025
1288
|
**kwargs,
|
1026
1289
|
) -> Dict[str, Any]:
|
1027
1290
|
"""
|
1028
|
-
Ask a question about the document content.
|
1291
|
+
Ask a single question about the document content.
|
1292
|
+
|
1293
|
+
Args:
|
1294
|
+
question: Question string to ask about the document
|
1295
|
+
mode: "extractive" to extract answer from document, "generative" to generate
|
1296
|
+
pages: Specific pages to query (default: all pages)
|
1297
|
+
min_confidence: Minimum confidence threshold for answers
|
1298
|
+
model: Optional model name for question answering
|
1299
|
+
**kwargs: Additional parameters passed to the QA engine
|
1300
|
+
|
1301
|
+
Returns:
|
1302
|
+
Dict containing: answer, confidence, found, page_num, source_elements, etc.
|
1303
|
+
"""
|
1304
|
+
# Delegate to ask_batch and return the first result
|
1305
|
+
results = self.ask_batch([question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs)
|
1306
|
+
return results[0] if results else {
|
1307
|
+
"answer": None,
|
1308
|
+
"confidence": 0.0,
|
1309
|
+
"found": False,
|
1310
|
+
"page_num": None,
|
1311
|
+
"source_elements": [],
|
1312
|
+
}
|
1313
|
+
|
1314
|
+
def ask_batch(
|
1315
|
+
self,
|
1316
|
+
questions: List[str],
|
1317
|
+
mode: str = "extractive",
|
1318
|
+
pages: Union[int, List[int], range] = None,
|
1319
|
+
min_confidence: float = 0.1,
|
1320
|
+
model: str = None,
|
1321
|
+
**kwargs,
|
1322
|
+
) -> List[Dict[str, Any]]:
|
1323
|
+
"""
|
1324
|
+
Ask multiple questions about the document content using batch processing.
|
1325
|
+
|
1326
|
+
This method processes multiple questions efficiently in a single batch,
|
1327
|
+
avoiding the multiprocessing resource accumulation that can occur with
|
1328
|
+
sequential individual question calls.
|
1029
1329
|
|
1030
1330
|
Args:
|
1031
|
-
|
1331
|
+
questions: List of question strings to ask about the document
|
1032
1332
|
mode: "extractive" to extract answer from document, "generative" to generate
|
1033
1333
|
pages: Specific pages to query (default: all pages)
|
1034
1334
|
min_confidence: Minimum confidence threshold for answers
|
@@ -1036,45 +1336,147 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1036
1336
|
**kwargs: Additional parameters passed to the QA engine
|
1037
1337
|
|
1038
1338
|
Returns:
|
1039
|
-
|
1040
|
-
A dictionary containing the answer, confidence, and other metadata
|
1339
|
+
List of Dicts, each containing: answer, confidence, found, page_num, source_elements, etc.
|
1041
1340
|
"""
|
1042
1341
|
from natural_pdf.qa import get_qa_engine
|
1043
1342
|
|
1343
|
+
if not questions:
|
1344
|
+
return []
|
1345
|
+
|
1346
|
+
if not isinstance(questions, list) or not all(isinstance(q, str) for q in questions):
|
1347
|
+
raise TypeError("'questions' must be a list of strings")
|
1348
|
+
|
1044
1349
|
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
1045
1350
|
|
1351
|
+
# Resolve target pages
|
1046
1352
|
if pages is None:
|
1047
|
-
target_pages =
|
1353
|
+
target_pages = self.pages
|
1048
1354
|
elif isinstance(pages, int):
|
1049
|
-
|
1355
|
+
if 0 <= pages < len(self.pages):
|
1356
|
+
target_pages = [self.pages[pages]]
|
1357
|
+
else:
|
1358
|
+
raise IndexError(f"Page index {pages} out of range (0-{len(self.pages)-1})")
|
1050
1359
|
elif isinstance(pages, (list, range)):
|
1051
|
-
target_pages =
|
1360
|
+
target_pages = []
|
1361
|
+
for page_idx in pages:
|
1362
|
+
if 0 <= page_idx < len(self.pages):
|
1363
|
+
target_pages.append(self.pages[page_idx])
|
1364
|
+
else:
|
1365
|
+
logger.warning(f"Page index {page_idx} out of range, skipping")
|
1052
1366
|
else:
|
1053
1367
|
raise ValueError(f"Invalid pages parameter: {pages}")
|
1054
1368
|
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1369
|
+
if not target_pages:
|
1370
|
+
logger.warning("No valid pages found for QA processing.")
|
1371
|
+
return [
|
1372
|
+
{
|
1373
|
+
"answer": None,
|
1374
|
+
"confidence": 0.0,
|
1375
|
+
"found": False,
|
1376
|
+
"page_num": None,
|
1377
|
+
"source_elements": [],
|
1378
|
+
}
|
1379
|
+
for _ in questions
|
1380
|
+
]
|
1062
1381
|
|
1063
|
-
|
1064
|
-
results.append(page_result)
|
1382
|
+
logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
|
1065
1383
|
|
1066
|
-
|
1384
|
+
# Collect all page images and metadata for batch processing
|
1385
|
+
page_images = []
|
1386
|
+
page_word_boxes = []
|
1387
|
+
page_metadata = []
|
1067
1388
|
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1389
|
+
for page in target_pages:
|
1390
|
+
# Get page image
|
1391
|
+
try:
|
1392
|
+
page_image = page.to_image(resolution=150, include_highlights=False)
|
1393
|
+
if page_image is None:
|
1394
|
+
logger.warning(f"Failed to render image for page {page.number}, skipping")
|
1395
|
+
continue
|
1396
|
+
|
1397
|
+
# Get text elements for word boxes
|
1398
|
+
elements = page.find_all("text")
|
1399
|
+
if not elements:
|
1400
|
+
logger.warning(f"No text elements found on page {page.number}")
|
1401
|
+
word_boxes = []
|
1402
|
+
else:
|
1403
|
+
word_boxes = qa_engine._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
|
1404
|
+
|
1405
|
+
page_images.append(page_image)
|
1406
|
+
page_word_boxes.append(word_boxes)
|
1407
|
+
page_metadata.append({
|
1408
|
+
"page_number": page.number,
|
1409
|
+
"page_object": page
|
1410
|
+
})
|
1411
|
+
|
1412
|
+
except Exception as e:
|
1413
|
+
logger.warning(f"Error processing page {page.number}: {e}")
|
1414
|
+
continue
|
1415
|
+
|
1416
|
+
if not page_images:
|
1417
|
+
logger.warning("No page images could be processed for QA.")
|
1418
|
+
return [
|
1419
|
+
{
|
1420
|
+
"answer": None,
|
1421
|
+
"confidence": 0.0,
|
1422
|
+
"found": False,
|
1423
|
+
"page_num": None,
|
1424
|
+
"source_elements": [],
|
1425
|
+
}
|
1426
|
+
for _ in questions
|
1427
|
+
]
|
1428
|
+
|
1429
|
+
# Process all questions against all pages in batch
|
1430
|
+
all_results = []
|
1431
|
+
|
1432
|
+
for question_text in questions:
|
1433
|
+
question_results = []
|
1434
|
+
|
1435
|
+
# Ask this question against each page (but in batch per page)
|
1436
|
+
for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
|
1437
|
+
try:
|
1438
|
+
# Use the DocumentQA batch interface
|
1439
|
+
page_result = qa_engine.ask(
|
1440
|
+
image=page_image,
|
1441
|
+
question=question_text,
|
1442
|
+
word_boxes=word_boxes,
|
1443
|
+
min_confidence=min_confidence,
|
1444
|
+
**kwargs
|
1445
|
+
)
|
1446
|
+
|
1447
|
+
if page_result and page_result.found:
|
1448
|
+
# Add page metadata to result
|
1449
|
+
page_result_dict = {
|
1450
|
+
"answer": page_result.answer,
|
1451
|
+
"confidence": page_result.confidence,
|
1452
|
+
"found": page_result.found,
|
1453
|
+
"page_num": page_meta["page_number"],
|
1454
|
+
"source_elements": getattr(page_result, 'source_elements', []),
|
1455
|
+
"start": getattr(page_result, 'start', -1),
|
1456
|
+
"end": getattr(page_result, 'end', -1),
|
1457
|
+
}
|
1458
|
+
question_results.append(page_result_dict)
|
1459
|
+
|
1460
|
+
except Exception as e:
|
1461
|
+
logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
|
1462
|
+
continue
|
1463
|
+
|
1464
|
+
# Sort results by confidence and take the best one for this question
|
1465
|
+
question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
1466
|
+
|
1467
|
+
if question_results:
|
1468
|
+
all_results.append(question_results[0])
|
1469
|
+
else:
|
1470
|
+
# No results found for this question
|
1471
|
+
all_results.append({
|
1472
|
+
"answer": None,
|
1473
|
+
"confidence": 0.0,
|
1474
|
+
"found": False,
|
1475
|
+
"page_num": None,
|
1476
|
+
"source_elements": [],
|
1477
|
+
})
|
1478
|
+
|
1479
|
+
return all_results
|
1078
1480
|
|
1079
1481
|
def search_within_index(
|
1080
1482
|
self,
|
@@ -1519,7 +1921,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1519
1921
|
|
1520
1922
|
if not manager or not manager.is_available():
|
1521
1923
|
from natural_pdf.classification.manager import is_classification_available
|
1522
|
-
|
1924
|
+
|
1523
1925
|
if not is_classification_available():
|
1524
1926
|
raise ImportError(
|
1525
1927
|
"Classification dependencies missing. "
|