natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +241 -158
- natural_pdf/classification/mixin.py +52 -38
- natural_pdf/classification/results.py +71 -45
- natural_pdf/collections/mixins.py +85 -20
- natural_pdf/collections/pdf_collection.py +245 -100
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +694 -195
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +610 -134
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
- natural_pdf-0.1.10.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,11 +1,12 @@
 import copy
+import io
 import logging
 import os
 import re
 import tempfile
-import urllib.request
-import time
 import threading
+import time
+import urllib.request
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -18,38 +19,35 @@ from typing import (
     Tuple,
     Type,
     Union,
+    overload,
 )
-from natural_pdf.utils.tqdm_utils import get_tqdm

 import pdfplumber
 from PIL import Image

 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+from natural_pdf.classification.manager import ClassificationError, ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
-from natural_pdf.
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
+from natural_pdf.export.mixin import ExportMixin
+from natural_pdf.extraction.manager import StructuredDataManager
+from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
-
-from natural_pdf.classification.manager import ClassificationManager
-from natural_pdf.classification.manager import ClassificationError
-from natural_pdf.classification.results import ClassificationResult
-from natural_pdf.extraction.manager import StructuredDataManager
-
 from natural_pdf.utils.locks import pdf_render_lock
-from natural_pdf.
-from natural_pdf.classification.mixin import ClassificationMixin
-from natural_pdf.extraction.mixin import ExtractionMixin
+from natural_pdf.utils.tqdm_utils import get_tqdm

 try:
     from typing import Any as TypingAny

-    from natural_pdf.search import TextSearchOptions
     from natural_pdf.search import (
         BaseSearchOptions,
         SearchOptions,
         SearchServiceProtocol,
+        TextSearchOptions,
         get_search_service,
     )
 except ImportError:
@@ -62,6 +60,7 @@ except ImportError:
         "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
     )

+
 logger = logging.getLogger("natural_pdf.core.pdf")
 tqdm = get_tqdm()

@@ -70,7 +69,22 @@ DEFAULT_MANAGERS = {
     "structured_data": StructuredDataManager,
 }

-class PDF(ExtractionMixin):
+# Deskew Imports (Conditional)
+import numpy as np
+from PIL import Image
+
+try:
+    import img2pdf
+    from deskew import determine_skew
+
+    DESKEW_AVAILABLE = True
+except ImportError:
+    DESKEW_AVAILABLE = False
+    img2pdf = None
+# End Deskew Imports
+
+
+class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.

@@ -80,7 +94,7 @@ class PDF(ExtractionMixin):

     def __init__(
         self,
-        path_or_url,
+        path_or_url_or_stream,
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
@@ -89,54 +103,72 @@ class PDF(ExtractionMixin):
         Initialize the enhanced PDF object.

         Args:
-
+            path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
         """
-
-
-        self._original_path = path_or_url
+        self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self._is_stream = False
+        stream_to_open = None
+
+        if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
+            logger.info("Initializing PDF from in-memory stream.")
+            self._is_stream = True
+            self._resolved_path = None  # No resolved file path for streams
+            self.source_path = "<stream>"  # Identifier for source
+            self.path = self.source_path  # Use source identifier as path for streams
+            stream_to_open = path_or_url_or_stream
+        elif isinstance(path_or_url_or_stream, (str, Path)):
+            path_or_url = str(path_or_url_or_stream)
+            self.source_path = path_or_url  # Store original path/URL as source
+            is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
+
+            if is_url:
+                logger.info(f"Downloading PDF from URL: {path_or_url}")
+                try:
+                    # Use a context manager for the temporary file
+                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
+                        self._temp_file = temp_f  # Store reference if needed for cleanup
+                        with urllib.request.urlopen(path_or_url) as response:
+                            temp_f.write(response.read())
+                        temp_f.flush()
+                        self._resolved_path = temp_f.name
+                    logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
+                    stream_to_open = self._resolved_path
+                except Exception as e:
+                    if self._temp_file and hasattr(self._temp_file, "name"):
+                        try:
+                            os.unlink(self._temp_file.name)
+                        except:  # noqa E722
+                            pass
+                    logger.error(f"Failed to download PDF from URL: {e}")
+                    raise ValueError(f"Failed to download PDF from URL: {e}")
+            else:
+                self._resolved_path = str(Path(path_or_url).resolve())  # Resolve local paths
+                stream_to_open = self._resolved_path
+            self.path = self._resolved_path  # Use resolved path for file-based PDFs
         else:
-
+            raise TypeError(
+                f"Invalid input type: {type(path_or_url_or_stream)}. "
+                f"Expected path (str/Path), URL (str), or file-like object."
+            )

-        logger.info(f"
+        logger.info(f"Opening PDF source: {self.source_path}")
         logger.debug(
             f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
         )

         try:
-            self._pdf = pdfplumber.open(
+            self._pdf = pdfplumber.open(stream_to_open)
         except Exception as e:
             logger.error(f"Failed to open PDF: {e}", exc_info=True)
-            self.close()
-            raise IOError(f"Failed to open PDF
-
-        self._path = self._resolved_path
-        self.path = self._resolved_path
-        self.source_path = self._original_path
+            self.close()  # Attempt cleanup if opening fails
+            raise IOError(f"Failed to open PDF source: {self.source_path}") from e

+        # Store configuration used for initialization
         self._reading_order = reading_order
         self._config = {"keep_spaces": keep_spaces}
         self._font_attrs = font_attrs
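
The `__init__` hunk above is the heart of 0.1.10's new input handling: anything with a `.read()` attribute goes straight to `pdfplumber.open`, URLs are downloaded to a temporary file first, and plain paths are resolved. A minimal sketch of the three resulting call patterns; the `natural_pdf` top-level import and the file names are assumptions for illustration, not shown in this diff:

```python
import io

from natural_pdf import PDF  # assumed public import path

# Local path: resolved via Path(...).resolve() per the hunk above
pdf_local = PDF("report.pdf")

# URL: downloaded into a NamedTemporaryFile, then opened from disk
pdf_remote = PDF("https://example.com/report.pdf")

# In-memory stream: any object exposing .read()
with open("report.pdf", "rb") as fh:
    pdf_stream = PDF(io.BytesIO(fh.read()))

# Streams get a placeholder identifier instead of a filesystem path
assert pdf_stream.path == "<stream>"
```
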
@@ -144,9 +176,11 @@ class PDF(ExtractionMixin):
         self._ocr_manager = OCRManager() if OCRManager else None
         self._layout_manager = LayoutManager() if LayoutManager else None
         self.highlighter = HighlightingService(self)
-        self._classification_manager_instance = ClassificationManager()
+        # self._classification_manager_instance = ClassificationManager() # Removed this line
         self._manager_registry = {}

+        from natural_pdf.core.page import Page
+
         self._pages = [
             Page(p, parent=self, index=i, font_attrs=font_attrs)
             for i, p in enumerate(self._pdf.pages)
@@ -160,6 +194,7 @@ class PDF(ExtractionMixin):

         self._initialize_managers()
         self._initialize_highlighter()
+        self.analyses: Dict[str, Any] = {}

     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -175,16 +210,20 @@ class PDF(ExtractionMixin):
     def get_manager(self, key: str) -> Any:
         """Retrieve a manager instance by its key."""
         if key not in self._managers:
-            raise KeyError(
-
+            raise KeyError(
+                f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
+            )
+
         manager_instance = self._managers.get(key)
-
+
         if manager_instance is None:
-
-
-
-
-
+            manager_class = DEFAULT_MANAGERS.get(key)
+            if manager_class:
+                raise RuntimeError(
+                    f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
+                )
+            else:
+                raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")

         return manager_instance

@@ -227,6 +266,7 @@ class PDF(ExtractionMixin):
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

         Args:
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             label: Optional label for this exclusion

@@ -259,11 +299,22 @@ class PDF(ExtractionMixin):
     ) -> "PDF":
         """
         Applies OCR to specified pages of the PDF using batch processing.
+        Applies OCR to specified pages of the PDF using batch processing.

         Args:
             engine: Name of the OCR engine
             languages: List of language codes
-            min_confidence: Minimum confidence threshold
+            min_confidence: Minimum confidence threshold
+            device: Device to run OCR on
+            resolution: DPI resolution for page images
+            apply_exclusions: Whether to mask excluded areas
+            detect_only: If True, only detect text boxes
+            replace: Whether to replace existing OCR elements
+            options: Engine-specific options
+            pages: Page indices to process or None for all pages
+            engine: Name of the OCR engine
+            languages: List of language codes
+            min_confidence: Minimum confidence threshold
             device: Device to run OCR on
             resolution: DPI resolution for page images
             apply_exclusions: Whether to mask excluded areas
@@ -274,6 +325,7 @@ class PDF(ExtractionMixin):

         Returns:
             Self for method chaining
+            Self for method chaining
         """
         if not self._ocr_manager:
             logger.error("OCRManager not available. Cannot apply OCR.")
@@ -281,7 +333,9 @@ class PDF(ExtractionMixin):

         thread_id = threading.current_thread().name
         logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
-
+
+        target_pages = []
+
         target_pages = []
         if pages is None:
             target_pages = self._pages
@@ -303,7 +357,7 @@ class PDF(ExtractionMixin):

         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
-
+
         final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
         logger.debug(f"Using OCR image resolution: {final_resolution} DPI")

@@ -312,7 +366,7 @@ class PDF(ExtractionMixin):
         logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
         failed_page_num = "unknown"
         render_start_time = time.monotonic()
-
+
         try:
             for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
                 failed_page_num = page.number
@@ -326,14 +380,21 @@ class PDF(ExtractionMixin):
                 if img is None:
                     logger.error(f" Failed to render page {page.number} to image.")
                     continue
+                    continue
                 images_pil.append(img)
                 page_image_map.append((page, img))
         except Exception as e:
+            logger.error(f"Failed to render pages for batch OCR: {e}")
             logger.error(f"Failed to render pages for batch OCR: {e}")
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
-
+
         render_end_time = time.monotonic()
-        logger.debug(
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )

         if not images_pil or not page_image_map:
             logger.error("No images were successfully rendered for batch OCR.")
@@ -344,16 +405,18 @@ class PDF(ExtractionMixin):
             "engine": engine,
             "languages": languages,
             "min_confidence": min_confidence,
+            "min_confidence": min_confidence,
             "device": device,
             "options": options,
             "detect_only": detect_only,
         }
         manager_args = {k: v for k, v in manager_args.items() if v is not None}

-        ocr_call_args = {k:v for k,v in manager_args.items() if k!=
+        ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
+        logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
         logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
         ocr_start_time = time.monotonic()
-
+
         try:
             batch_results = self._ocr_manager.apply_ocr(**manager_args)

@@ -365,24 +428,28 @@ class PDF(ExtractionMixin):
         except Exception as e:
             logger.error(f"Batch OCR processing failed: {e}")
             return self
-
+
         ocr_end_time = time.monotonic()
-        logger.debug(
+        logger.debug(
+            f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
+        )

         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
-
+
         for i, (page, img) in enumerate(page_image_map):
             results_for_page = batch_results[i]
             if not isinstance(results_for_page, list):
-                logger.warning(
+                logger.warning(
+                    f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
+                )
                 continue

             logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
             try:
                 if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
                     page._element_mgr.remove_ocr_elements()
-
+
                 img_scale_x = page.width / img.width if img.width > 0 else 1
                 img_scale_y = page.height / img.height if img.height > 0 else 1
                 elements = page._element_mgr.create_text_elements_from_ocr(
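
Taken together, the `apply_ocr` hunks render the target pages, hand the PIL images to the OCR manager in a single batch, then scale the results back into page coordinates and attach them per page. A hedged usage sketch matching the signature visible in this diff; the engine key and file name are illustrative:

```python
pdf = PDF("needs-ocr.pdf")  # file name borrowed from the removed pdfs/ samples

pdf.apply_ocr(
    engine="easyocr",   # illustrative engine name, resolved by OCRManager
    languages=["en"],
    min_confidence=0.5,
    resolution=150,     # falls back to _config["resolution"] or 150 per the hunks above
    detect_only=False,  # True would record boxes without recognized text
    replace=True,       # existing OCR elements are removed first
)

# apply_ocr returns self, so extraction can chain directly
print(pdf.extract_text()[:200])
```
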
@@ -407,6 +474,7 @@ class PDF(ExtractionMixin):
         Add a region function to the PDF.

         Args:
+            region_func: A function that takes a Page and returns a Region, or None
             region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region

@@ -425,126 +493,194 @@ class PDF(ExtractionMixin):
                 if region_instance and isinstance(region_instance, Region):
                     page.add_region(region_instance, name=name, source="named")
                 elif region_instance is not None:
-                    logger.warning(
+                    logger.warning(
+                        f"Region function did not return a valid Region for page {page.number}"
+                    )
             except Exception as e:
                 logger.error(f"Error adding region for page {page.number}: {e}")

         return self

+    @overload
     def find(
-        self,
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> Optional[Any]:
         """
-        Find the first element matching the selector.
+        Find the first element matching the selector OR text content across all pages.
+
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-            selector: CSS-like selector string
-
-
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            Element object or None if not found
+            Element object or None if not found.
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")

-
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")
+
+        selector_obj = parse_selector(effective_selector)
         kwargs["regex"] = regex
         kwargs["case"] = case

-
-
-
-
+        # Search page by page
+        for page in self.pages:
+            # Note: _apply_selector is on Page, so we call find directly here
+            # We pass the constructed/validated effective_selector
+            element = page.find(
+                selector=effective_selector,  # Use the processed selector
+                apply_exclusions=apply_exclusions,
+                regex=regex,  # Pass down flags
+                case=case,
+                **kwargs,
+            )
+            if element:
+                return element
+        return None  # Not found on any page

+    @overload
     def find_all(
-        self,
-
-
-
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...

-
-
-
-
-
-
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...

-
-
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-
-        raise AttributeError("PDF pages not yet initialized.")
-
-        selector_obj = parse_selector(selector)
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-
-        results = self._apply_selector(
-            selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
-        )
-        return results
+        Find all elements matching the selector OR text content across all pages.

-
-        self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
-    ) -> ElementCollection:
-        """
-        Apply selector to PDF elements across all pages.
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-
-
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            ElementCollection
+            ElementCollection with matching elements.
         """
-
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")

-
-
-
-
-
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")

-
-
-
-        return ElementCollection([])
+        # Instead of parsing here, let each page parse and apply
+        # This avoids parsing the same selector multiple times if not needed
+        # selector_obj = parse_selector(effective_selector)

-
-
-        if 0 <= page_idx < len(self._pages):
-            page = self._pages[page_idx]
-            page_elements_collection = page._apply_selector(
-                selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
-            )
-            if page_elements_collection:
-                page_elements = page_elements_collection.elements
-                all_elements.extend(page_elements)
-                if first_only and page_elements:
-                    break
-        else:
-            logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
+        # kwargs["regex"] = regex  # Removed: Already passed explicitly
+        # kwargs["case"] = case  # Removed: Already passed explicitly

-
+        all_elements = []
+        for page in self.pages:
+            # Call page.find_all with the effective selector and flags
+            page_elements = page.find_all(
+                selector=effective_selector,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
+            if page_elements:
+                all_elements.extend(page_elements.elements)

-
-        if all(
-            hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
-            for el in combined.elements
-        ):
-            combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
-        else:
-            try:
-                combined.sort(key=lambda el: el.page.index)
-            except AttributeError:
-                logger.warning("Cannot sort elements in document order: Missing required attributes.")
+        from natural_pdf.elements.collections import ElementCollection

-        return
+        return ElementCollection(all_elements)

     def extract_text(
         self,
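
The large hunk above replaces the old `_apply_selector` plumbing with `@overload`-typed signatures and a `text=` shortcut that is rewritten to a `text:contains(...)` selector before being delegated to each page. A sketch of the resulting call patterns; the searched strings are illustrative:

```python
# Selector form, as before
heading = pdf.find('text:contains("Summary")')

# New keyword-only shortcut; internally becomes text:contains("Summary")
heading = pdf.find(text="Summary", case=False)

# find_all gathers the per-page matches into one ElementCollection
totals = pdf.find_all(text="Total", apply_exclusions=True)
for el in totals:
    print(el.page.number)

# Supplying both arguments raises
# ValueError: Provide either 'selector' or 'text', not both.
```
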
@@ -562,6 +698,9 @@ class PDF(ExtractionMixin):
             preserve_whitespace: Whether to keep blank characters
             use_exclusions: Whether to apply exclusion regions
             debug_exclusions: Whether to output detailed debugging for exclusions
+            preserve_whitespace: Whether to keep blank characters
+            use_exclusions: Whether to apply exclusion regions
+            debug_exclusions: Whether to output detailed debugging for exclusions
             **kwargs: Additional extraction parameters

         Returns:
@@ -610,22 +749,22 @@ class PDF(ExtractionMixin):
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-
+
         logger.warning("PDF.extract_tables is not fully implemented yet.")
         all_tables = []
-
+
         for page in self.pages:
             if hasattr(page, "extract_tables"):
                 all_tables.extend(page.extract_tables(**kwargs))
             else:
                 logger.debug(f"Page {page.number} does not have extract_tables method.")
-
+
         if selector:
             logger.warning("Filtering extracted tables by selector is not implemented.")
-
+
         if merge_across_pages:
             logger.warning("Merging tables across pages is not implemented.")
-
+
         return all_tables

     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -638,6 +777,9 @@ class PDF(ExtractionMixin):
             output_path: Path to save the searchable PDF
             dpi: Resolution for rendering and OCR overlay
             **kwargs: Additional keyword arguments passed to the exporter
+            output_path: Path to save the searchable PDF
+            dpi: Resolution for rendering and OCR overlay
+            **kwargs: Additional keyword arguments passed to the exporter
         """
         from natural_pdf.exporters.searchable_pdf import create_searchable_pdf

@@ -667,6 +809,7 @@ class PDF(ExtractionMixin):

         Returns:
             A dictionary containing the answer, confidence, and other metadata
+            A dictionary containing the answer, confidence, and other metadata
         """
         from natural_pdf.qa import get_qa_engine

@@ -713,14 +856,19 @@ class PDF(ExtractionMixin):
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant documents from this PDF within a search index.
+        Finds relevant documents from this PDF within a search index.

         Args:
             query: The search query (text, image path, PIL Image, Region)
             search_service: A pre-configured SearchService instance
             options: Optional SearchOptions to configure the query
+            query: The search query (text, image path, PIL Image, Region)
+            search_service: A pre-configured SearchService instance
+            options: Optional SearchOptions to configure the query

         Returns:
             A list of result dictionaries, sorted by relevance
+            A list of result dictionaries, sorted by relevance

         Raises:
             ImportError: If search dependencies are not installed
@@ -728,12 +876,19 @@ class PDF(ExtractionMixin):
             TypeError: If search_service does not conform to the protocol
             FileNotFoundError: If the collection managed by the service does not exist
             RuntimeError: For other search failures
+            ImportError: If search dependencies are not installed
+            ValueError: If search_service is None
+            TypeError: If search_service does not conform to the protocol
+            FileNotFoundError: If the collection managed by the service does not exist
+            RuntimeError: For other search failures
         """
         if not search_service:
             raise ValueError("A configured SearchServiceProtocol instance must be provided.")

         collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
-        logger.info(
+        logger.info(
+            f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
+        )

         service = search_service

@@ -743,12 +898,15 @@ class PDF(ExtractionMixin):
         if isinstance(query, Region):
             logger.debug("Query is a Region object. Extracting text.")
             if not isinstance(effective_options, TextSearchOptions):
-                logger.warning(
+                logger.warning(
+                    "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
+                )
             query_input = query.extract_text()
             if not query_input or query_input.isspace():
                 logger.error("Region has no extractable text for query.")
                 return []

+        # Add filter to scope search to THIS PDF
         # Add filter to scope search to THIS PDF
         pdf_scope_filter = {
             "field": "pdf_path",
@@ -760,7 +918,10 @@ class PDF(ExtractionMixin):
         # Combine with existing filters in options (if any)
         if effective_options.filters:
             logger.debug(f"Combining PDF scope filter with existing filters")
-            if
+            if (
+                isinstance(effective_options.filters, dict)
+                and effective_options.filters.get("operator") == "AND"
+            ):
                 effective_options.filters["conditions"].append(pdf_scope_filter)
             elif isinstance(effective_options.filters, list):
                 effective_options.filters = {
@@ -773,7 +934,9 @@ class PDF(ExtractionMixin):
                     "conditions": [effective_options.filters, pdf_scope_filter],
                 }
             else:
-                logger.warning(
+                logger.warning(
+                    f"Unsupported format for existing filters. Overwriting with PDF scope filter."
+                )
                 effective_options.filters = pdf_scope_filter
         else:
             effective_options.filters = pdf_scope_filter
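
The two hunks above define how `find_relevant` pins a query to the current document: an existing AND-style filter dict gets the `pdf_path` condition appended, a list is wrapped together with it, and any other shape is overwritten after a warning. A standalone sketch of that merge; the `operator`/`conditions` keys appear in the diff, while the list-wrapping dict's remaining keys and the scope filter's other fields sit outside the visible hunks and are assumed here:

```python
pdf_scope_filter = {"field": "pdf_path"}  # remaining keys not shown in these hunks


def scope_to_pdf(filters):
    """Mirror the merge branches shown in the hunks above."""
    if isinstance(filters, dict) and filters.get("operator") == "AND":
        filters["conditions"].append(pdf_scope_filter)  # extend an existing AND dict
        return filters
    if isinstance(filters, list):
        # wrap the old filters plus the scope condition in a new AND dict (shape assumed)
        return {"operator": "AND", "conditions": [filters, pdf_scope_filter]}
    # unsupported shape: the diff logs a warning and overwrites
    return pdf_scope_filter


print(scope_to_pdf({"operator": "AND", "conditions": []}))
print(scope_to_pdf([{"field": "page_number"}]))
```
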
@@ -790,26 +953,40 @@ class PDF(ExtractionMixin):
         except FileNotFoundError as fnf:
             logger.error(f"Search failed: Collection not found. Error: {fnf}")
             raise
+            logger.error(f"Search failed: Collection not found. Error: {fnf}")
+            raise
         except Exception as e:
             logger.error(f"SearchService search failed: {e}")
             raise RuntimeError(f"Search within index failed. See logs for details.") from e
+            logger.error(f"SearchService search failed: {e}")
+            raise RuntimeError(f"Search within index failed. See logs for details.") from e

     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
         """
         Exports OCR results from this PDF into a correction task package.
+        Exports OCR results from this PDF into a correction task package.

         Args:
+            output_zip_path: The path to save the output zip file
             output_zip_path: The path to save the output zip file
             **kwargs: Additional arguments passed to create_correction_task_package
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
+
             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
         except ImportError:
-            logger.error(
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
         except Exception as e:
             logger.error(f"Failed to export correction task: {e}")
             raise
+            logger.error(f"Failed to export correction task: {e}")
+            raise

     def correct_ocr(
         self,
@@ -820,17 +997,23 @@ class PDF(ExtractionMixin):
     ) -> "PDF":
         """
         Applies corrections to OCR text elements using a callback function.
+        Applies corrections to OCR text elements using a callback function.

         Args:
+            correction_callback: Function that takes an element and returns corrected text or None
             correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
             max_workers: Maximum number of threads to use for parallel execution
             progress_callback: Optional callback function for progress updates
+            max_workers: Maximum number of threads to use for parallel execution
+            progress_callback: Optional callback function for progress updates

         Returns:
             Self for method chaining
+            Self for method chaining
         """
         target_page_indices = []
+        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -843,14 +1026,17 @@ class PDF(ExtractionMixin):
                     raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
                 raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
+                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")

         if not target_page_indices:
             logger.warning("No pages selected for OCR correction.")
             return self

         logger.info(f"Starting OCR correction for pages: {target_page_indices}")
+        logger.info(f"Starting OCR correction for pages: {target_page_indices}")

         for page_idx in target_page_indices:
             page = self._pages[page_idx]
@@ -862,7 +1048,9 @@ class PDF(ExtractionMixin):
                 )
             except Exception as e:
                 logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")

+        logger.info("OCR correction process finished.")
         logger.info("OCR correction process finished.")
         return self

@@ -872,15 +1060,16 @@ class PDF(ExtractionMixin):
             return 0
         return len(self._pages)

-    def __getitem__(self, key) -> Union[Page, "PageCollection"]:
+    def __getitem__(self, key) -> Union["Page", "PageCollection"]:
         """Access pages by index or slice."""
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not initialized yet.")
-
+
         if isinstance(key, slice):
             from natural_pdf.elements.collections import PageCollection
+
             return PageCollection(self._pages[key])
-
+
         if isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
@@ -905,13 +1094,12 @@ class PDF(ExtractionMixin):
         try:
             if hasattr(self._temp_file, "name") and self._temp_file.name:
                 temp_file_path = self._temp_file.name
-                if
+                # Only unlink if it exists and _is_stream is False (meaning WE created it)
+                if not self._is_stream and os.path.exists(temp_file_path):
                     os.unlink(temp_file_path)
                     logger.debug(f"Removed temporary PDF file: {temp_file_path}")
         except Exception as e:
             logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
-        finally:
-            self._temp_file = None

     def __enter__(self):
         """Context manager entry."""
@@ -922,14 +1110,141 @@ class PDF(ExtractionMixin):
         self.close()

     def get_id(self) -> str:
+        """Get unique identifier for this PDF."""
         """Get unique identifier for this PDF."""
         return self.path

+    # --- Deskew Method --- #
+
+    def deskew(
+        self,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the
+        specified pages from the original PDF.
+
+        This method renders each selected page, detects and corrects skew using the 'deskew'
+        library, and then combines the resulting images into a new PDF using 'img2pdf'.
+        The new PDF object is returned directly.
+
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+
+        Args:
+            pages: Page indices/slice to include (0-based). If None, processes all pages.
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                already contains processed elements (text, OCR, regions) to
+                prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+        Returns:
+            A new PDF object representing the deskewed document.
+
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
+            ValueError: If `force_overwrite` is False and target pages contain elements.
+            FileNotFoundError: If the source PDF cannot be read (if file-based).
+            IOError: If creating the in-memory PDF fails.
+            RuntimeError: If rendering or deskewing individual pages fails.
+        """
+        if not DESKEW_AVAILABLE:
+            raise ImportError(
+                "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
+            )
+
+        target_pages = self._get_target_pages(pages)  # Use helper to resolve pages
+
+        # --- Safety Check --- #
+        if not force_overwrite:
+            for page in target_pages:
+                # Check if the element manager has been initialized and contains any elements
+                if (
+                    hasattr(page, "_element_mgr")
+                    and page._element_mgr
+                    and page._element_mgr.has_elements()
+                ):
+                    raise ValueError(
+                        f"Page {page.number} contains existing elements (text, OCR, etc.). "
+                        f"Deskewing creates an image-only PDF, discarding these elements. "
+                        f"Set force_overwrite=True to proceed."
+                    )
+
+        # --- Process Pages --- #
+        deskewed_images_bytes = []
+        logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
+
+        # Use tqdm via get_tqdm
+        for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
+            try:
+                # Use page.deskew to get the corrected PIL image
+                # Pass down resolutions and kwargs
+                deskewed_img = page.deskew(
+                    resolution=resolution,
+                    angle=None,  # Let page.deskew handle detection/caching
+                    detection_resolution=detection_resolution,
+                    **deskew_kwargs,
+                )
+
+                if not deskewed_img:
+                    logger.warning(
+                        f"Page {page.number}: Failed to generate deskewed image, skipping."
+                    )
+                    continue
+
+                # Convert image to bytes for img2pdf (use PNG for lossless quality)
+                with io.BytesIO() as buf:
+                    deskewed_img.save(buf, format="PNG")
+                    deskewed_images_bytes.append(buf.getvalue())
+
+            except Exception as e:
+                logger.error(
+                    f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
+                )
+                # Option: Raise a runtime error, or continue and skip the page?
+                # Raising makes the whole operation fail if one page fails.
+                raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
+
+        # --- Create PDF --- #
+        if not deskewed_images_bytes:
+            raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
+
+        logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
+        try:
+            # Use img2pdf to combine image bytes into PDF bytes
+            pdf_bytes = img2pdf.convert(deskewed_images_bytes)
+
+            # Wrap bytes in a stream
+            pdf_stream = io.BytesIO(pdf_bytes)
+
+            # Create a new PDF object from the stream using original config
+            logger.info("Creating new PDF object from deskewed stream...")
+            new_pdf = PDF(
+                pdf_stream,
+                reading_order=self._reading_order,
+                font_attrs=self._font_attrs,
+                keep_spaces=self._config.get("keep_spaces", True),
+            )
+            return new_pdf
+        except Exception as e:
+            logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
+            raise IOError("Failed to create deskewed PDF object from image stream.") from e
+
+    # --- End Deskew Method --- #
+
     # --- Classification Methods --- #

     def classify_pages(
         self,
-
+        labels: List[str],
         model: Optional[str] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         analysis_key: str = "classification",
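
The new `deskew()` above builds a fresh, image-only `PDF` from PNG renders combined with `img2pdf`, so text and OCR elements on the source pages are deliberately discarded. A hedged sketch of the intended round trip; the file and engine names are illustrative, and the `natural-pdf[deskew]` extra named in the hunk must be installed:

```python
pdf = PDF("skewed-scan.pdf")

# Returns a NEW in-memory PDF; without force_overwrite=True this raises
# ValueError whenever target pages already carry elements (safety check above).
straightened = pdf.deskew(resolution=300, force_overwrite=True)

# The result is image-based, so OCR must be (re-)applied to recover text.
straightened.apply_ocr(engine="easyocr")  # illustrative engine key
print(straightened.extract_text()[:200])
```
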
@@ -940,7 +1255,7 @@ class PDF(ExtractionMixin):
         Classifies specified pages of the PDF.

         Args:
-
+            labels: List of category names
             model: Model identifier ('text', 'vision', or specific HF ID)
             pages: Page indices, slice, or None for all pages
             analysis_key: Key to store results in page's analyses dict
@@ -950,23 +1265,24 @@ class PDF(ExtractionMixin):
         Returns:
             Self for method chaining
         """
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")

         try:
-            manager = self.get_manager(
+            manager = self.get_manager("classification")
         except (ValueError, RuntimeError) as e:
             raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e

         if not manager or not manager.is_available():
             try:
                 from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
+
                 if not _CLASSIFICATION_AVAILABLE:
                     raise ImportError("Classification dependencies missing.")
             except ImportError:
                 raise ImportError(
                     "Classification dependencies missing. "
-
+                    'Install with: pip install "natural-pdf[classification]"'
                 )
             raise ClassificationError("ClassificationManager not available.")

@@ -990,12 +1306,14 @@ class PDF(ExtractionMixin):
             return self

         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
-        logger.info(
+        logger.info(
+            f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
+        )

         page_contents = []
         pages_to_classify = []
         logger.debug(f"Gathering content for {len(target_pages)} pages...")
-
+
         for page in target_pages:
             try:
                 content = page._get_classification_content(model_type=inferred_using, **kwargs)
@@ -1009,13 +1327,13 @@ class PDF(ExtractionMixin):
         if not page_contents:
             logger.warning("No content could be gathered for batch classification.")
             return self
-
+
         logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")

         try:
             batch_results = manager.classify_batch(
                 item_contents=page_contents,
-
+                labels=labels,
                 model_id=model,
                 using=inferred_using,
                 **kwargs,
@@ -1025,17 +1343,23 @@ class PDF(ExtractionMixin):
             raise ClassificationError(f"Batch classification failed: {e}") from e

         if len(batch_results) != len(pages_to_classify):
-            logger.error(
+            logger.error(
+                f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
+            )
             return self

-        logger.debug(
+        logger.debug(
+            f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
+        )
         for page, result_obj in zip(pages_to_classify, batch_results):
             try:
-                if not hasattr(page,
+                if not hasattr(page, "analyses") or page.analyses is None:
                     page.analyses = {}
                 page.analyses[analysis_key] = result_obj
             except Exception as e:
-                logger.warning(
+                logger.warning(
+                    f"Failed to store classification results for page {page.number}: {e}"
+                )

         logger.info(f"Finished classifying PDF pages.")
         return self
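
`classify_pages` gathers per-page content (text or page images, depending on the inferred mode), runs one `classify_batch` call, and files each result under `analysis_key` in the page's `analyses` dict. A usage sketch consistent with the signature in these hunks; the labels and file name are illustrative, and the `natural-pdf[classification]` extra is required:

```python
pdf = PDF("mixed-documents.pdf")

pdf.classify_pages(
    labels=["invoice", "contract", "letter"],  # must be non-empty per the hunk above
    model=None,  # None falls back to the manager's default text model
    analysis_key="classification",
)

# Each page now carries its result under the chosen key
print(pdf.pages[0].analyses["classification"])
```
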
@@ -1043,7 +1367,7 @@ class PDF(ExtractionMixin):
     # --- End Classification Methods --- #

     # --- Extraction Support --- #
-    def _get_extraction_content(self, using: str =
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
         """
         Retrieves the content for the entire PDF.

@@ -1056,28 +1380,28 @@ class PDF(ExtractionMixin):
             List[PIL.Image.Image]: List of page images if using='vision'
             None: If content cannot be retrieved
         """
-        if using ==
+        if using == "text":
             try:
-                layout = kwargs.pop(
+                layout = kwargs.pop("layout", True)
                 return self.extract_text(layout=layout, **kwargs)
             except Exception as e:
                 logger.error(f"Error extracting text from PDF: {e}")
                 return None
-        elif using ==
+        elif using == "vision":
             page_images = []
             logger.info(f"Rendering {len(self.pages)} pages to images...")
-
-            resolution = kwargs.pop(
-            include_highlights = kwargs.pop(
-            labels = kwargs.pop(
-
+
+            resolution = kwargs.pop("resolution", 72)
+            include_highlights = kwargs.pop("include_highlights", False)
+            labels = kwargs.pop("labels", False)
+
             try:
                 for page in tqdm(self.pages, desc="Rendering Pages"):
                     img = page.to_image(
                         resolution=resolution,
                         include_highlights=include_highlights,
                         labels=labels,
-                        **kwargs
+                        **kwargs,
                     )
                     if img:
                         page_images.append(img)
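
`_get_extraction_content` now spells out its defaults: `using="text"` pops `layout` (default `True`) and delegates to `extract_text`, while `using="vision"` renders every page via `page.to_image` at `resolution=72` with `include_highlights` and `labels` off unless overridden. A sketch of calling this hook directly (it is a private helper backing the extraction mixin, so treat the call as illustrative):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical input file

text = pdf._get_extraction_content(using="text")                      # layout-preserving text
images = pdf._get_extraction_content(using="vision", resolution=150)  # list of PIL images
```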
@@ -1093,4 +1417,179 @@ class PDF(ExtractionMixin):
         else:
             logger.error(f"Unsupported value for 'using': {using}")
             return None
+
     # --- End Extraction Support --- #
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the PDF.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not hasattr(self, "_pages") or not self._pages:
+            logger.warning(f"No pages found in PDF {self.path}")
+            return []
+
+        all_data = []
+
+        for page in tqdm(self._pages, desc="Gathering page data", leave=False):
+            # Basic page information
+            page_data = {
+                "pdf_path": self.path,
+                "page_number": page.number,
+                "page_index": page.index,
+            }
+
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+
+            # Add analyses data
+            for key in analysis_keys:
+                if not hasattr(page, "analyses") or not page.analyses:
+                    raise ValueError(f"Page {page.number} does not have analyses data")
+
+                if key not in page.analyses:
+                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                # Get the analysis result
+                analysis_result = page.analyses[key]
+
+                # If the result has a to_dict method, use it
+                if hasattr(analysis_result, "to_dict"):
+                    analysis_data = analysis_result.to_dict()
+                else:
+                    # Otherwise, use the result directly if it's dict-like
+                    try:
+                        analysis_data = dict(analysis_result)
+                    except (TypeError, ValueError):
+                        # Last resort: convert to string
+                        analysis_data = {"raw_result": str(analysis_result)}
+
+                # Add analysis data to page data with the key as prefix
+                for k, v in analysis_data.items():
+                    page_data[f"{key}.{k}"] = v
+
+            all_data.append(page_data)
+
+        return all_data
+
+    def _get_target_pages(
+        self, pages: Optional[Union[Iterable[int], range, slice]] = None
+    ) -> List["Page"]:
+        """
+        Helper method to get a list of Page objects based on the input pages.
+
+        Args:
+            pages: Page indices, slice, or None for all pages
+
+        Returns:
+            List of Page objects
+        """
+        if pages is None:
+            return self._pages
+        elif isinstance(pages, slice):
+            return self._pages[pages]
+        elif hasattr(pages, "__iter__"):
+            try:
+                return [self._pages[i] for i in pages]
+            except IndexError:
+                raise ValueError("Invalid page index provided in 'pages' iterable.")
+            except TypeError:
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+
+    # --- Classification Mixin Implementation --- #
+
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Returns the ClassificationManager instance for this PDF."""
+        try:
+            return self.get_manager("classification")
+        except (KeyError, RuntimeError) as e:
+            raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
+
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
+        """
+        Provides the content for classifying the entire PDF.
+
+        Args:
+            model_type: 'text' or 'vision'.
+            **kwargs: Additional arguments (e.g., for text extraction or image rendering).
+
+        Returns:
+            Extracted text (str) or the first page's image (PIL.Image).
+
+        Raises:
+            ValueError: If model_type is 'vision' and PDF has != 1 page,
+                or if model_type is unsupported, or if content cannot be generated.
+        """
+        if model_type == "text":
+            try:
+                # Extract text from the whole document
+                text = self.extract_text(**kwargs)  # Pass relevant kwargs
+                if not text or text.isspace():
+                    raise ValueError("PDF contains no extractable text for classification.")
+                return text
+            except Exception as e:
+                logger.error(f"Error extracting text for PDF classification: {e}")
+                raise ValueError("Failed to extract text for classification.") from e
+
+        elif model_type == "vision":
+            if len(self.pages) == 1:
+                # Use the single page's content method
+                try:
+                    return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
+                except Exception as e:
+                    logger.error(f"Error getting image from single page for classification: {e}")
+                    raise ValueError("Failed to get image from single page.") from e
+            elif len(self.pages) == 0:
+                raise ValueError("Cannot classify empty PDF using vision model.")
+            else:
+                raise ValueError(
+                    f"Vision classification for a PDF object is only supported for single-page PDFs. "
+                    f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
+                )
+        else:
+            raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
+
+    # --- End Classification Mixin Implementation ---
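
The new `_get_target_pages` helper normalizes the `pages` argument used throughout these methods: `None` selects all pages, a slice is applied directly, and any iterable of ints is resolved index by index (an out-of-range index surfaces as `ValueError`). A sketch of the three accepted forms (private helper, shown for illustration only):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical input file

all_pages = pdf._get_target_pages()             # None -> every page
first_two = pdf._get_target_pages(slice(0, 2))  # slice -> pdf._pages[0:2]
picked = pdf._get_target_pages([0, 2])          # iterable -> those indices
```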