natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,11 +1,12 @@
 import copy
+import io
 import logging
 import os
 import re
 import tempfile
-import urllib.request
-import time
 import threading
+import time
+import urllib.request
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -18,38 +19,35 @@ from typing import (
     Tuple,
     Type,
     Union,
+    overload,
 )
-from natural_pdf.utils.tqdm_utils import get_tqdm

 import pdfplumber
 from PIL import Image

 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+from natural_pdf.classification.manager import ClassificationError, ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
-from natural_pdf.core.page import Page
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
+from natural_pdf.export.mixin import ExportMixin
+from natural_pdf.extraction.manager import StructuredDataManager
+from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
-
-from natural_pdf.classification.manager import ClassificationManager
-from natural_pdf.classification.manager import ClassificationError
-from natural_pdf.classification.results import ClassificationResult
-from natural_pdf.extraction.manager import StructuredDataManager
-
 from natural_pdf.utils.locks import pdf_render_lock
-from natural_pdf.
-from natural_pdf.classification.mixin import ClassificationMixin
-from natural_pdf.extraction.mixin import ExtractionMixin
+from natural_pdf.utils.tqdm_utils import get_tqdm

 try:
     from typing import Any as TypingAny

-    from natural_pdf.search import TextSearchOptions
     from natural_pdf.search import (
         BaseSearchOptions,
         SearchOptions,
         SearchServiceProtocol,
+        TextSearchOptions,
         get_search_service,
     )
 except ImportError:
@@ -62,6 +60,7 @@ except ImportError:
         "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
     )

+
 logger = logging.getLogger("natural_pdf.core.pdf")
 tqdm = get_tqdm()

@@ -70,7 +69,22 @@ DEFAULT_MANAGERS = {
     "structured_data": StructuredDataManager,
 }

-class PDF(ExtractionMixin):
+# Deskew Imports (Conditional)
+import numpy as np
+from PIL import Image
+
+try:
+    import img2pdf
+    from deskew import determine_skew
+
+    DESKEW_AVAILABLE = True
+except ImportError:
+    DESKEW_AVAILABLE = False
+    img2pdf = None
+# End Deskew Imports
+
+
+class PDF(ExtractionMixin, ExportMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.

@@ -80,7 +94,7 @@ class PDF(ExtractionMixin):

     def __init__(
         self,
-        path_or_url,
+        path_or_url_or_stream,
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
@@ -89,54 +103,72 @@ class PDF(ExtractionMixin):
         Initialize the enhanced PDF object.

         Args:
-
+            path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
         """
-
-
-        self._original_path = path_or_url
+        self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self._is_stream = False
+        stream_to_open = None
+
+        if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
+            logger.info("Initializing PDF from in-memory stream.")
+            self._is_stream = True
+            self._resolved_path = None  # No resolved file path for streams
+            self.source_path = "<stream>"  # Identifier for source
+            self.path = self.source_path  # Use source identifier as path for streams
+            stream_to_open = path_or_url_or_stream
+        elif isinstance(path_or_url_or_stream, (str, Path)):
+            path_or_url = str(path_or_url_or_stream)
+            self.source_path = path_or_url  # Store original path/URL as source
+            is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
+
+            if is_url:
+                logger.info(f"Downloading PDF from URL: {path_or_url}")
+                try:
+                    # Use a context manager for the temporary file
+                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
+                        self._temp_file = temp_f  # Store reference if needed for cleanup
+                        with urllib.request.urlopen(path_or_url) as response:
+                            temp_f.write(response.read())
+                        temp_f.flush()
+                        self._resolved_path = temp_f.name
+                    logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
+                    stream_to_open = self._resolved_path
+                except Exception as e:
+                    if self._temp_file and hasattr(self._temp_file, "name"):
+                        try:
+                            os.unlink(self._temp_file.name)
+                        except:  # noqa E722
+                            pass
+                    logger.error(f"Failed to download PDF from URL: {e}")
+                    raise ValueError(f"Failed to download PDF from URL: {e}")
+            else:
+                self._resolved_path = str(Path(path_or_url).resolve())  # Resolve local paths
+                stream_to_open = self._resolved_path
+            self.path = self._resolved_path  # Use resolved path for file-based PDFs
         else:
-
+            raise TypeError(
+                f"Invalid input type: {type(path_or_url_or_stream)}. "
+                f"Expected path (str/Path), URL (str), or file-like object."
+            )

-        logger.info(f"
+        logger.info(f"Opening PDF source: {self.source_path}")
         logger.debug(
             f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
         )

         try:
-            self._pdf = pdfplumber.open(
+            self._pdf = pdfplumber.open(stream_to_open)
         except Exception as e:
             logger.error(f"Failed to open PDF: {e}", exc_info=True)
-            self.close()
-            raise IOError(f"Failed to open PDF
-
-        self._path = self._resolved_path
-        self.path = self._resolved_path
-        self.source_path = self._original_path
+            self.close()  # Attempt cleanup if opening fails
+            raise IOError(f"Failed to open PDF source: {self.source_path}") from e

+        # Store configuration used for initialization
         self._reading_order = reading_order
         self._config = {"keep_spaces": keep_spaces}
         self._font_attrs = font_attrs
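
Note on the constructor hunks above: PDF(...) now accepts a file-like object in
addition to a local path or an http(s) URL, and URL downloads are wrapped in a
proper temporary-file context manager. A minimal usage sketch (the filenames and
URL are illustrative; the top-level PDF import is the package's usual entry point):

    from io import BytesIO

    from natural_pdf import PDF

    pdf_from_path = PDF("report.pdf")                     # local path, as before
    pdf_from_url = PDF("https://example.com/report.pdf")  # downloaded to a temp file

    # New in 0.1.9: any object with a .read() method is opened directly
    with open("report.pdf", "rb") as f:
        pdf_from_stream = PDF(BytesIO(f.read()))
    print(pdf_from_stream.path)  # streams report "<stream>" as their path
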
@@ -144,9 +176,11 @@ class PDF(ExtractionMixin):
         self._ocr_manager = OCRManager() if OCRManager else None
         self._layout_manager = LayoutManager() if LayoutManager else None
         self.highlighter = HighlightingService(self)
-        self._classification_manager_instance = ClassificationManager()
+        # self._classification_manager_instance = ClassificationManager() # Removed this line
         self._manager_registry = {}

+        from natural_pdf.core.page import Page
+
         self._pages = [
             Page(p, parent=self, index=i, font_attrs=font_attrs)
             for i, p in enumerate(self._pdf.pages)
@@ -175,16 +209,20 @@ class PDF(ExtractionMixin):
     def get_manager(self, key: str) -> Any:
         """Retrieve a manager instance by its key."""
         if key not in self._managers:
-            raise KeyError(
-
+            raise KeyError(
+                f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
+            )
+
         manager_instance = self._managers.get(key)
-
+
         if manager_instance is None:
-
-
-
-
-
+            manager_class = DEFAULT_MANAGERS.get(key)
+            if manager_class:
+                raise RuntimeError(
+                    f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
+                )
+            else:
+                raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")

         return manager_instance

@@ -227,6 +265,7 @@ class PDF(ExtractionMixin):
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

         Args:
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             label: Optional label for this exclusion

@@ -259,11 +298,22 @@ class PDF(ExtractionMixin):
     ) -> "PDF":
         """
         Applies OCR to specified pages of the PDF using batch processing.
+        Applies OCR to specified pages of the PDF using batch processing.

         Args:
             engine: Name of the OCR engine
             languages: List of language codes
-            min_confidence: Minimum confidence threshold
+            min_confidence: Minimum confidence threshold
+            device: Device to run OCR on
+            resolution: DPI resolution for page images
+            apply_exclusions: Whether to mask excluded areas
+            detect_only: If True, only detect text boxes
+            replace: Whether to replace existing OCR elements
+            options: Engine-specific options
+            pages: Page indices to process or None for all pages
+            engine: Name of the OCR engine
+            languages: List of language codes
+            min_confidence: Minimum confidence threshold
             device: Device to run OCR on
             resolution: DPI resolution for page images
             apply_exclusions: Whether to mask excluded areas
@@ -274,6 +324,7 @@ class PDF(ExtractionMixin):

         Returns:
             Self for method chaining
+            Self for method chaining
         """
         if not self._ocr_manager:
             logger.error("OCRManager not available. Cannot apply OCR.")
@@ -281,7 +332,9 @@ class PDF(ExtractionMixin):

         thread_id = threading.current_thread().name
         logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
-
+
+        target_pages = []
+
         target_pages = []
         if pages is None:
             target_pages = self._pages
@@ -303,7 +356,7 @@ class PDF(ExtractionMixin):

         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
-
+
         final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
         logger.debug(f"Using OCR image resolution: {final_resolution} DPI")

@@ -312,7 +365,7 @@ class PDF(ExtractionMixin):
         logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
         failed_page_num = "unknown"
         render_start_time = time.monotonic()
-
+
         try:
             for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
                 failed_page_num = page.number
@@ -326,14 +379,21 @@ class PDF(ExtractionMixin):
                 if img is None:
                     logger.error(f" Failed to render page {page.number} to image.")
                     continue
+                    continue
                 images_pil.append(img)
                 page_image_map.append((page, img))
         except Exception as e:
+            logger.error(f"Failed to render pages for batch OCR: {e}")
             logger.error(f"Failed to render pages for batch OCR: {e}")
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
-
+
         render_end_time = time.monotonic()
-        logger.debug(
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )

         if not images_pil or not page_image_map:
             logger.error("No images were successfully rendered for batch OCR.")
@@ -344,16 +404,18 @@ class PDF(ExtractionMixin):
             "engine": engine,
             "languages": languages,
             "min_confidence": min_confidence,
+            "min_confidence": min_confidence,
             "device": device,
             "options": options,
             "detect_only": detect_only,
         }
         manager_args = {k: v for k, v in manager_args.items() if v is not None}

-        ocr_call_args = {k:v for k,v in manager_args.items() if k!=
+        ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
+        logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
         logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
         ocr_start_time = time.monotonic()
-
+
         try:
             batch_results = self._ocr_manager.apply_ocr(**manager_args)

@@ -365,24 +427,28 @@ class PDF(ExtractionMixin):
         except Exception as e:
             logger.error(f"Batch OCR processing failed: {e}")
             return self
-
+
         ocr_end_time = time.monotonic()
-        logger.debug(
+        logger.debug(
+            f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
+        )

         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
-
+
         for i, (page, img) in enumerate(page_image_map):
             results_for_page = batch_results[i]
             if not isinstance(results_for_page, list):
-                logger.warning(
+                logger.warning(
+                    f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
+                )
                 continue

             logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
             try:
                 if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
                     page._element_mgr.remove_ocr_elements()
-
+
                 img_scale_x = page.width / img.width if img.width > 0 else 1
                 img_scale_y = page.height / img.height if img.height > 0 else 1
                 elements = page._element_mgr.create_text_elements_from_ocr(
@@ -407,6 +473,7 @@ class PDF(ExtractionMixin):
         Add a region function to the PDF.

         Args:
+            region_func: A function that takes a Page and returns a Region, or None
             region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region

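
Note: the apply_ocr hunks above are mostly reformatting, and this release also
duplicates several docstring lines (visible as repeated "+" lines). The arguments
themselves are unchanged; a usage sketch built only from the documented
parameters (the engine name, filename, and page indices are illustrative values):

    pdf = PDF("scanned.pdf")
    pdf.apply_ocr(
        engine="easyocr",    # name of the OCR engine
        languages=["en"],    # list of language codes
        min_confidence=0.5,  # minimum confidence threshold
        resolution=150,      # DPI used when rendering pages for OCR
        pages=[0, 1],        # page indices to process; None means all pages
    )
    print(pdf.pages[0].extract_text())
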
@@ -425,126 +492,194 @@ class PDF(ExtractionMixin):
                     if region_instance and isinstance(region_instance, Region):
                         page.add_region(region_instance, name=name, source="named")
                     elif region_instance is not None:
-                        logger.warning(
+                        logger.warning(
+                            f"Region function did not return a valid Region for page {page.number}"
+                        )
                 except Exception as e:
                     logger.error(f"Error adding region for page {page.number}: {e}")

         return self

+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+
+    @overload
     def find(
-        self,
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> Optional[Any]:
         """
-        Find the first element matching the selector.
+        Find the first element matching the selector OR text content across all pages.
+
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-            selector: CSS-like selector string
-
-
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            Element object or None if not found
+            Element object or None if not found.
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")

-
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")
+
+        selector_obj = parse_selector(effective_selector)
         kwargs["regex"] = regex
         kwargs["case"] = case

-
-
-
-
+        # Search page by page
+        for page in self.pages:
+            # Note: _apply_selector is on Page, so we call find directly here
+            # We pass the constructed/validated effective_selector
+            element = page.find(
+                selector=effective_selector,  # Use the processed selector
+                apply_exclusions=apply_exclusions,
+                regex=regex,  # Pass down flags
+                case=case,
+                **kwargs,
+            )
+            if element:
+                return element
+        return None  # Not found on any page

+    @overload
     def find_all(
-        self,
-
-
-
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...

-
-
-
-
-
-
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...

-
-
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-
-            raise AttributeError("PDF pages not yet initialized.")
+        Find all elements matching the selector OR text content across all pages.

-
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-
-        results = self._apply_selector(
-            selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
-        )
-        return results
-
-    def _apply_selector(
-        self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
-    ) -> ElementCollection:
-        """
-        Apply selector to PDF elements across all pages.
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-
-
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            ElementCollection
+            ElementCollection with matching elements.
         """
-
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")

-
-
-
-
-
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")

-
-
-
-        return ElementCollection([])
+        # Instead of parsing here, let each page parse and apply
+        # This avoids parsing the same selector multiple times if not needed
+        # selector_obj = parse_selector(effective_selector)

-
-
-        if 0 <= page_idx < len(self._pages):
-            page = self._pages[page_idx]
-            page_elements_collection = page._apply_selector(
-                selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
-            )
-            if page_elements_collection:
-                page_elements = page_elements_collection.elements
-                all_elements.extend(page_elements)
-                if first_only and page_elements:
-                    break
-        else:
-            logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
+        # kwargs["regex"] = regex # Removed: Already passed explicitly
+        # kwargs["case"] = case # Removed: Already passed explicitly

-
+        all_elements = []
+        for page in self.pages:
+            # Call page.find_all with the effective selector and flags
+            page_elements = page.find_all(
+                selector=effective_selector,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
+            if page_elements:
+                all_elements.extend(page_elements.elements)

-
-        if all(
-            hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
-            for el in combined.elements
-        ):
-            combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
-        else:
-            try:
-                combined.sort(key=lambda el: el.page.index)
-            except AttributeError:
-                logger.warning("Cannot sort elements in document order: Missing required attributes.")
+        from natural_pdf.elements.collections import ElementCollection

-        return
+        return ElementCollection(all_elements)

     def extract_text(
         self,
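
Note: the hunk above replaces the old _apply_selector indirection with per-page
delegation and adds a text= keyword shortcut to both find and find_all:
find(text="...") expands to a text:contains(...) selector, and passing both
selector and text raises a ValueError. A sketch (search strings are illustrative):

    # Equivalent calls: the text shortcut builds the selector internally
    el = pdf.find(text="Total")
    el = pdf.find('text:contains("Total")')

    # Case-insensitive regex across all pages; returns an ElementCollection
    matches = pdf.find_all(text=r"total\s+\d+", regex=True, case=False)
    for m in matches:
        print(m.page.number, m.extract_text())

    # pdf.find(selector="text", text="Total")  # ValueError: one or the other
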
@@ -562,6 +697,9 @@ class PDF(ExtractionMixin):
             preserve_whitespace: Whether to keep blank characters
             use_exclusions: Whether to apply exclusion regions
             debug_exclusions: Whether to output detailed debugging for exclusions
+            preserve_whitespace: Whether to keep blank characters
+            use_exclusions: Whether to apply exclusion regions
+            debug_exclusions: Whether to output detailed debugging for exclusions
             **kwargs: Additional extraction parameters

         Returns:
@@ -610,22 +748,22 @@ class PDF(ExtractionMixin):
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-
+
         logger.warning("PDF.extract_tables is not fully implemented yet.")
         all_tables = []
-
+
         for page in self.pages:
             if hasattr(page, "extract_tables"):
                 all_tables.extend(page.extract_tables(**kwargs))
             else:
                 logger.debug(f"Page {page.number} does not have extract_tables method.")
-
+
         if selector:
             logger.warning("Filtering extracted tables by selector is not implemented.")
-
+
         if merge_across_pages:
             logger.warning("Merging tables across pages is not implemented.")
-
+
         return all_tables

     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -638,6 +776,9 @@ class PDF(ExtractionMixin):
             output_path: Path to save the searchable PDF
             dpi: Resolution for rendering and OCR overlay
             **kwargs: Additional keyword arguments passed to the exporter
+            output_path: Path to save the searchable PDF
+            dpi: Resolution for rendering and OCR overlay
+            **kwargs: Additional keyword arguments passed to the exporter
         """
         from natural_pdf.exporters.searchable_pdf import create_searchable_pdf

@@ -667,6 +808,7 @@ class PDF(ExtractionMixin):

         Returns:
             A dictionary containing the answer, confidence, and other metadata
+            A dictionary containing the answer, confidence, and other metadata
         """
         from natural_pdf.qa import get_qa_engine

@@ -713,14 +855,19 @@ class PDF(ExtractionMixin):
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant documents from this PDF within a search index.
+        Finds relevant documents from this PDF within a search index.

         Args:
             query: The search query (text, image path, PIL Image, Region)
             search_service: A pre-configured SearchService instance
             options: Optional SearchOptions to configure the query
+            query: The search query (text, image path, PIL Image, Region)
+            search_service: A pre-configured SearchService instance
+            options: Optional SearchOptions to configure the query

         Returns:
             A list of result dictionaries, sorted by relevance
+            A list of result dictionaries, sorted by relevance

         Raises:
             ImportError: If search dependencies are not installed
@@ -728,12 +875,19 @@ class PDF(ExtractionMixin):
             TypeError: If search_service does not conform to the protocol
             FileNotFoundError: If the collection managed by the service does not exist
             RuntimeError: For other search failures
+            ImportError: If search dependencies are not installed
+            ValueError: If search_service is None
+            TypeError: If search_service does not conform to the protocol
+            FileNotFoundError: If the collection managed by the service does not exist
+            RuntimeError: For other search failures
         """
         if not search_service:
             raise ValueError("A configured SearchServiceProtocol instance must be provided.")

         collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
-        logger.info(
+        logger.info(
+            f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
+        )

         service = search_service

@@ -743,12 +897,15 @@ class PDF(ExtractionMixin):
         if isinstance(query, Region):
             logger.debug("Query is a Region object. Extracting text.")
             if not isinstance(effective_options, TextSearchOptions):
-                logger.warning(
+                logger.warning(
+                    "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
+                )
             query_input = query.extract_text()
             if not query_input or query_input.isspace():
                 logger.error("Region has no extractable text for query.")
                 return []

+        # Add filter to scope search to THIS PDF
         # Add filter to scope search to THIS PDF
         pdf_scope_filter = {
             "field": "pdf_path",
@@ -760,7 +917,10 @@ class PDF(ExtractionMixin):
         # Combine with existing filters in options (if any)
         if effective_options.filters:
             logger.debug(f"Combining PDF scope filter with existing filters")
-            if
+            if (
+                isinstance(effective_options.filters, dict)
+                and effective_options.filters.get("operator") == "AND"
+            ):
                 effective_options.filters["conditions"].append(pdf_scope_filter)
             elif isinstance(effective_options.filters, list):
                 effective_options.filters = {
@@ -773,7 +933,9 @@ class PDF(ExtractionMixin):
                     "conditions": [effective_options.filters, pdf_scope_filter],
                 }
             else:
-                logger.warning(
+                logger.warning(
+                    f"Unsupported format for existing filters. Overwriting with PDF scope filter."
+                )
                 effective_options.filters = pdf_scope_filter
         else:
             effective_options.filters = pdf_scope_filter
@@ -790,26 +952,40 @@ class PDF(ExtractionMixin):
         except FileNotFoundError as fnf:
             logger.error(f"Search failed: Collection not found. Error: {fnf}")
             raise
+            logger.error(f"Search failed: Collection not found. Error: {fnf}")
+            raise
         except Exception as e:
             logger.error(f"SearchService search failed: {e}")
             raise RuntimeError(f"Search within index failed. See logs for details.") from e
+            logger.error(f"SearchService search failed: {e}")
+            raise RuntimeError(f"Search within index failed. See logs for details.") from e

     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
         """
         Exports OCR results from this PDF into a correction task package.
+        Exports OCR results from this PDF into a correction task package.

         Args:
+            output_zip_path: The path to save the output zip file
             output_zip_path: The path to save the output zip file
             **kwargs: Additional arguments passed to create_correction_task_package
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
+
             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
         except ImportError:
-            logger.error(
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
         except Exception as e:
             logger.error(f"Failed to export correction task: {e}")
             raise
+            logger.error(f"Failed to export correction task: {e}")
+            raise

     def correct_ocr(
         self,
@@ -820,17 +996,23 @@ class PDF(ExtractionMixin):
     ) -> "PDF":
         """
         Applies corrections to OCR text elements using a callback function.
+        Applies corrections to OCR text elements using a callback function.

         Args:
+            correction_callback: Function that takes an element and returns corrected text or None
             correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
             max_workers: Maximum number of threads to use for parallel execution
             progress_callback: Optional callback function for progress updates
+            max_workers: Maximum number of threads to use for parallel execution
+            progress_callback: Optional callback function for progress updates

         Returns:
             Self for method chaining
+            Self for method chaining
         """
         target_page_indices = []
+        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -843,14 +1025,17 @@ class PDF(ExtractionMixin):
                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
                 raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
+                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")

         if not target_page_indices:
             logger.warning("No pages selected for OCR correction.")
             return self

         logger.info(f"Starting OCR correction for pages: {target_page_indices}")
+        logger.info(f"Starting OCR correction for pages: {target_page_indices}")

         for page_idx in target_page_indices:
             page = self._pages[page_idx]
@@ -862,7 +1047,9 @@ class PDF(ExtractionMixin):
                 )
             except Exception as e:
                 logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")

+        logger.info("OCR correction process finished.")
         logger.info("OCR correction process finished.")
         return self

@@ -872,15 +1059,16 @@ class PDF(ExtractionMixin):
             return 0
         return len(self._pages)

-    def __getitem__(self, key) -> Union[Page, "PageCollection"]:
+    def __getitem__(self, key) -> Union["Page", "PageCollection"]:
         """Access pages by index or slice."""
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not initialized yet.")
-
+
         if isinstance(key, slice):
             from natural_pdf.elements.collections import PageCollection
+
             return PageCollection(self._pages[key])
-
+
         if isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
@@ -905,13 +1093,12 @@ class PDF(ExtractionMixin):
             try:
                 if hasattr(self._temp_file, "name") and self._temp_file.name:
                     temp_file_path = self._temp_file.name
-                    if
+                    # Only unlink if it exists and _is_stream is False (meaning WE created it)
+                    if not self._is_stream and os.path.exists(temp_file_path):
                         os.unlink(temp_file_path)
                         logger.debug(f"Removed temporary PDF file: {temp_file_path}")
             except Exception as e:
                 logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
-            finally:
-                self._temp_file = None

     def __enter__(self):
         """Context manager entry."""
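
Note: two methods touched above have fully visible signatures. save_searchable
writes an OCR'd, searchable copy to disk, and correct_ocr maps a callback over
OCR text elements. A sketch (the callback body, and the assumption that OCR
elements expose a .text attribute, are illustrative rather than confirmed by
this diff):

    # Burn recognized text into a searchable copy
    pdf.save_searchable("searchable_output.pdf", dpi=300)

    # Patch recognized text in place; returning None keeps an element unchanged
    def fix_common_confusions(element):
        corrected = (element.text or "").replace("0ffice", "Office")
        return corrected if corrected != element.text else None

    pdf.correct_ocr(fix_common_confusions, max_workers=4)
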
@@ -922,9 +1109,136 @@ class PDF(ExtractionMixin):
         self.close()

     def get_id(self) -> str:
+        """Get unique identifier for this PDF."""
         """Get unique identifier for this PDF."""
         return self.path

+    # --- Deskew Method --- #
+
+    def deskew(
+        self,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the
+        specified pages from the original PDF.
+
+        This method renders each selected page, detects and corrects skew using the 'deskew'
+        library, and then combines the resulting images into a new PDF using 'img2pdf'.
+        The new PDF object is returned directly.
+
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+
+        Args:
+            pages: Page indices/slice to include (0-based). If None, processes all pages.
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                already contains processed elements (text, OCR, regions) to
+                prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+        Returns:
+            A new PDF object representing the deskewed document.
+
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
+            ValueError: If `force_overwrite` is False and target pages contain elements.
+            FileNotFoundError: If the source PDF cannot be read (if file-based).
+            IOError: If creating the in-memory PDF fails.
+            RuntimeError: If rendering or deskewing individual pages fails.
+        """
+        if not DESKEW_AVAILABLE:
+            raise ImportError(
+                "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
+            )
+
+        target_pages = self._get_target_pages(pages)  # Use helper to resolve pages
+
+        # --- Safety Check --- #
+        if not force_overwrite:
+            for page in target_pages:
+                # Check if the element manager has been initialized and contains any elements
+                if (
+                    hasattr(page, "_element_mgr")
+                    and page._element_mgr
+                    and page._element_mgr.has_elements()
+                ):
+                    raise ValueError(
+                        f"Page {page.number} contains existing elements (text, OCR, etc.). "
+                        f"Deskewing creates an image-only PDF, discarding these elements. "
+                        f"Set force_overwrite=True to proceed."
+                    )
+
+        # --- Process Pages --- #
+        deskewed_images_bytes = []
+        logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
+
+        # Use tqdm via get_tqdm
+        for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
+            try:
+                # Use page.deskew to get the corrected PIL image
+                # Pass down resolutions and kwargs
+                deskewed_img = page.deskew(
+                    resolution=resolution,
+                    angle=None,  # Let page.deskew handle detection/caching
+                    detection_resolution=detection_resolution,
+                    **deskew_kwargs,
+                )
+
+                if not deskewed_img:
+                    logger.warning(
+                        f"Page {page.number}: Failed to generate deskewed image, skipping."
+                    )
+                    continue
+
+                # Convert image to bytes for img2pdf (use PNG for lossless quality)
+                with io.BytesIO() as buf:
+                    deskewed_img.save(buf, format="PNG")
+                    deskewed_images_bytes.append(buf.getvalue())
+
+            except Exception as e:
+                logger.error(
+                    f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
+                )
+                # Option: Raise a runtime error, or continue and skip the page?
+                # Raising makes the whole operation fail if one page fails.
+                raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
+
+        # --- Create PDF --- #
+        if not deskewed_images_bytes:
+            raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
+
+        logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
+        try:
+            # Use img2pdf to combine image bytes into PDF bytes
+            pdf_bytes = img2pdf.convert(deskewed_images_bytes)
+
+            # Wrap bytes in a stream
+            pdf_stream = io.BytesIO(pdf_bytes)
+
+            # Create a new PDF object from the stream using original config
+            logger.info("Creating new PDF object from deskewed stream...")
+            new_pdf = PDF(
+                pdf_stream,
+                reading_order=self._reading_order,
+                font_attrs=self._font_attrs,
+                keep_spaces=self._config.get("keep_spaces", True),
+            )
+            return new_pdf
+        except Exception as e:
+            logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
+            raise IOError("Failed to create deskewed PDF object from image stream.") from e
+
+    # --- End Deskew Method --- #
+
     # --- Classification Methods --- #

     def classify_pages(
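
Note: the new deskew method above renders each page, straightens it with the
'deskew' library, rebuilds an image-only PDF in memory with img2pdf, and feeds
the resulting stream back through the new stream-aware constructor. A sketch
(the filename is illustrative; requires pip install "natural-pdf[deskew]"):

    crooked = PDF("crooked_scan.pdf")
    straight = crooked.deskew(resolution=300)  # a new, image-only PDF object

    # Text and OCR elements are discarded by design, so re-run OCR afterwards
    straight.apply_ocr(languages=["en"])
    print(straight.pages[0].extract_text())
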
@@ -954,19 +1268,20 @@ class PDF(ExtractionMixin):
             raise ValueError("Categories list cannot be empty.")

         try:
-            manager = self.get_manager(
+            manager = self.get_manager("classification")
         except (ValueError, RuntimeError) as e:
             raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e

         if not manager or not manager.is_available():
             try:
                 from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
+
                 if not _CLASSIFICATION_AVAILABLE:
                     raise ImportError("Classification dependencies missing.")
             except ImportError:
                 raise ImportError(
                     "Classification dependencies missing. "
-
+                    'Install with: pip install "natural-pdf[classification]"'
                 )
             raise ClassificationError("ClassificationManager not available.")

@@ -990,12 +1305,14 @@ class PDF(ExtractionMixin):
             return self

         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
-        logger.info(
+        logger.info(
+            f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
+        )

         page_contents = []
         pages_to_classify = []
         logger.debug(f"Gathering content for {len(target_pages)} pages...")
-
+
         for page in target_pages:
             try:
                 content = page._get_classification_content(model_type=inferred_using, **kwargs)
@@ -1009,7 +1326,7 @@ class PDF(ExtractionMixin):
         if not page_contents:
             logger.warning("No content could be gathered for batch classification.")
             return self
-
+
         logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")

         try:
@@ -1025,17 +1342,23 @@ class PDF(ExtractionMixin):
             raise ClassificationError(f"Batch classification failed: {e}") from e

         if len(batch_results) != len(pages_to_classify):
-            logger.error(
+            logger.error(
+                f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
+            )
             return self

-        logger.debug(
+        logger.debug(
+            f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
+        )
         for page, result_obj in zip(pages_to_classify, batch_results):
             try:
-                if not hasattr(page,
+                if not hasattr(page, "analyses") or page.analyses is None:
                     page.analyses = {}
                 page.analyses[analysis_key] = result_obj
             except Exception as e:
-                logger.warning(
+                logger.warning(
+                    f"Failed to store classification results for page {page.number}: {e}"
+                )

         logger.info(f"Finished classifying PDF pages.")
         return self
@@ -1043,7 +1366,7 @@ class PDF(ExtractionMixin):
     # --- End Classification Methods --- #

     # --- Extraction Support --- #
-    def _get_extraction_content(self, using: str =
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
         """
         Retrieves the content for the entire PDF.

@@ -1056,28 +1379,28 @@ class PDF(ExtractionMixin):
             List[PIL.Image.Image]: List of page images if using='vision'
             None: If content cannot be retrieved
         """
-        if using ==
+        if using == "text":
             try:
-                layout = kwargs.pop(
+                layout = kwargs.pop("layout", True)
                 return self.extract_text(layout=layout, **kwargs)
             except Exception as e:
                 logger.error(f"Error extracting text from PDF: {e}")
                 return None
-        elif using ==
+        elif using == "vision":
             page_images = []
             logger.info(f"Rendering {len(self.pages)} pages to images...")
-
-            resolution = kwargs.pop(
-            include_highlights = kwargs.pop(
-            labels = kwargs.pop(
-
+
+            resolution = kwargs.pop("resolution", 72)
+            include_highlights = kwargs.pop("include_highlights", False)
+            labels = kwargs.pop("labels", False)
+
             try:
                 for page in tqdm(self.pages, desc="Rendering Pages"):
                     img = page.to_image(
                         resolution=resolution,
                         include_highlights=include_highlights,
                         labels=labels,
-                        **kwargs
+                        **kwargs,
                     )
                     if img:
                         page_images.append(img)
@@ -1093,4 +1416,124 @@ class PDF(ExtractionMixin):
         else:
             logger.error(f"Unsupported value for 'using': {using}")
             return None
+
     # --- End Extraction Support --- #
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the PDF.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not hasattr(self, "_pages") or not self._pages:
+            logger.warning(f"No pages found in PDF {self.path}")
+            return []
+
+        all_data = []
+
+        for page in tqdm(self._pages, desc="Gathering page data", leave=False):
+            # Basic page information
+            page_data = {
+                "pdf_path": self.path,
+                "page_number": page.number,
+                "page_index": page.index,
+            }
+
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+
+            # Add analyses data
+            for key in analysis_keys:
+                if not hasattr(page, "analyses") or not page.analyses:
+                    raise ValueError(f"Page {page.number} does not have analyses data")
+
+                if key not in page.analyses:
+                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                # Get the analysis result
+                analysis_result = page.analyses[key]
+
+                # If the result has a to_dict method, use it
+                if hasattr(analysis_result, "to_dict"):
+                    analysis_data = analysis_result.to_dict()
+                else:
+                    # Otherwise, use the result directly if it's dict-like
+                    try:
+                        analysis_data = dict(analysis_result)
+                    except (TypeError, ValueError):
+                        # Last resort: convert to string
+                        analysis_data = {"raw_result": str(analysis_result)}
+
+                # Add analysis data to page data with the key as prefix
+                for k, v in analysis_data.items():
+                    page_data[f"{key}.{k}"] = v
+
+            all_data.append(page_data)
+
+        return all_data
+
+    def _get_target_pages(
+        self, pages: Optional[Union[Iterable[int], range, slice]] = None
+    ) -> List["Page"]:
+        """
+        Helper method to get a list of Page objects based on the input pages.
+
+        Args:
+            pages: Page indices, slice, or None for all pages
+
+        Returns:
+            List of Page objects
+        """
+        if pages is None:
+            return self._pages
+        elif isinstance(pages, slice):
+            return self._pages[pages]
+        elif hasattr(pages, "__iter__"):
+            try:
+                return [self._pages[i] for i in pages]
+            except IndexError:
+                raise ValueError("Invalid page index provided in 'pages' iterable.")
+            except TypeError:
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")