natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1
|
-
import copy
|
1
|
+
import copy
|
2
|
+
import io
|
2
3
|
import logging
|
3
4
|
import os
|
4
5
|
import re
|
5
6
|
import tempfile
|
7
|
+
import threading
|
8
|
+
import time
|
6
9
|
import urllib.request
|
7
|
-
from pathlib import Path
|
8
|
-
from typing import (
|
10
|
+
from pathlib import Path
|
11
|
+
from typing import (
|
9
12
|
TYPE_CHECKING,
|
10
13
|
Any,
|
11
14
|
Callable,
|
@@ -16,56 +19,72 @@ from typing import ( # Added Iterable and TYPE_CHECKING
|
|
16
19
|
Tuple,
|
17
20
|
Type,
|
18
21
|
Union,
|
22
|
+
overload,
|
19
23
|
)
|
20
|
-
from pathlib import Path
|
21
|
-
|
22
24
|
|
23
25
|
import pdfplumber
|
24
26
|
from PIL import Image
|
25
27
|
|
26
|
-
from natural_pdf.analyzers.layout.layout_manager import
|
27
|
-
|
28
|
-
|
29
|
-
from natural_pdf.
|
30
|
-
from natural_pdf.core.
|
31
|
-
from natural_pdf.elements.
|
28
|
+
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
29
|
+
from natural_pdf.classification.manager import ClassificationError, ClassificationManager
|
30
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
31
|
+
from natural_pdf.classification.results import ClassificationResult
|
32
|
+
from natural_pdf.core.highlighting_service import HighlightingService
|
33
|
+
from natural_pdf.elements.base import Element
|
32
34
|
from natural_pdf.elements.region import Region
|
35
|
+
from natural_pdf.export.mixin import ExportMixin
|
36
|
+
from natural_pdf.extraction.manager import StructuredDataManager
|
37
|
+
from natural_pdf.extraction.mixin import ExtractionMixin
|
33
38
|
from natural_pdf.ocr import OCRManager, OCROptions
|
34
39
|
from natural_pdf.selectors.parser import parse_selector
|
40
|
+
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
+
from natural_pdf.utils.tqdm_utils import get_tqdm
|
35
42
|
|
36
|
-
# Import the flag directly - this should always work
|
37
|
-
|
38
|
-
# --- Add Search Service Imports (needed for new methods) ---
|
39
43
|
try:
|
40
|
-
from typing import Any as TypingAny
|
44
|
+
from typing import Any as TypingAny
|
41
45
|
|
42
|
-
from natural_pdf.search import TextSearchOptions # Keep for ask default
|
43
46
|
from natural_pdf.search import (
|
44
47
|
BaseSearchOptions,
|
45
48
|
SearchOptions,
|
46
49
|
SearchServiceProtocol,
|
50
|
+
TextSearchOptions,
|
47
51
|
get_search_service,
|
48
52
|
)
|
49
53
|
except ImportError:
|
50
|
-
# Define dummies if needed for type hints within the class
|
51
54
|
SearchServiceProtocol = object
|
52
55
|
SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
|
53
56
|
TypingAny = object
|
54
57
|
|
55
|
-
# Dummy factory needed for default arg in methods
|
56
58
|
def get_search_service(**kwargs) -> SearchServiceProtocol:
|
57
59
|
raise ImportError(
|
58
60
|
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
59
61
|
)
|
60
62
|
|
61
63
|
|
62
|
-
# --- End Search Service Imports ---
|
63
|
-
|
64
|
-
# Set up logger early
|
65
64
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
65
|
+
tqdm = get_tqdm()
|
66
66
|
|
67
|
+
DEFAULT_MANAGERS = {
|
68
|
+
"classification": ClassificationManager,
|
69
|
+
"structured_data": StructuredDataManager,
|
70
|
+
}
|
67
71
|
|
68
|
-
|
72
|
+
# Deskew Imports (Conditional)
|
73
|
+
import numpy as np
|
74
|
+
from PIL import Image
|
75
|
+
|
76
|
+
try:
|
77
|
+
import img2pdf
|
78
|
+
from deskew import determine_skew
|
79
|
+
|
80
|
+
DESKEW_AVAILABLE = True
|
81
|
+
except ImportError:
|
82
|
+
DESKEW_AVAILABLE = False
|
83
|
+
img2pdf = None
|
84
|
+
# End Deskew Imports
|
85
|
+
|
86
|
+
|
87
|
+
class PDF(ExtractionMixin, ExportMixin):
|
69
88
|
"""
|
70
89
|
Enhanced PDF wrapper built on top of pdfplumber.
|
71
90
|
|
@@ -75,7 +94,7 @@ class PDF:
|
|
75
94
|
|
76
95
|
def __init__(
|
77
96
|
self,
|
78
|
-
|
97
|
+
path_or_url_or_stream,
|
79
98
|
reading_order: bool = True,
|
80
99
|
font_attrs: Optional[List[str]] = None,
|
81
100
|
keep_spaces: bool = True,
|
@@ -84,95 +103,132 @@ class PDF:
|
|
84
103
|
Initialize the enhanced PDF object.
|
85
104
|
|
86
105
|
Args:
|
87
|
-
|
106
|
+
path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
|
88
107
|
reading_order: Whether to use natural reading order
|
89
|
-
font_attrs: Font attributes
|
90
|
-
|
91
|
-
None: Only consider spatial relationships
|
92
|
-
List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
|
93
|
-
keep_spaces: Whether to include spaces in word elements (default: True).
|
94
|
-
True: Spaces are part of words, better for multi-word searching
|
95
|
-
False: Break text at spaces, each word is separate (legacy behavior)
|
108
|
+
font_attrs: Font attributes for grouping characters into words
|
109
|
+
keep_spaces: Whether to include spaces in word elements
|
96
110
|
"""
|
97
|
-
|
98
|
-
is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
99
|
-
|
100
|
-
# Initialize path-related attributes
|
101
|
-
self._original_path = path_or_url
|
111
|
+
self._original_path_or_stream = path_or_url_or_stream
|
102
112
|
self._temp_file = None
|
103
|
-
self._resolved_path = None
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
113
|
+
self._resolved_path = None
|
114
|
+
self._is_stream = False
|
115
|
+
stream_to_open = None
|
116
|
+
|
117
|
+
if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
|
118
|
+
logger.info("Initializing PDF from in-memory stream.")
|
119
|
+
self._is_stream = True
|
120
|
+
self._resolved_path = None # No resolved file path for streams
|
121
|
+
self.source_path = "<stream>" # Identifier for source
|
122
|
+
self.path = self.source_path # Use source identifier as path for streams
|
123
|
+
stream_to_open = path_or_url_or_stream
|
124
|
+
elif isinstance(path_or_url_or_stream, (str, Path)):
|
125
|
+
path_or_url = str(path_or_url_or_stream)
|
126
|
+
self.source_path = path_or_url # Store original path/URL as source
|
127
|
+
is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
128
|
+
|
129
|
+
if is_url:
|
130
|
+
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
131
|
+
try:
|
132
|
+
# Use a context manager for the temporary file
|
133
|
+
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
|
134
|
+
self._temp_file = temp_f # Store reference if needed for cleanup
|
135
|
+
with urllib.request.urlopen(path_or_url) as response:
|
136
|
+
temp_f.write(response.read())
|
137
|
+
temp_f.flush()
|
138
|
+
self._resolved_path = temp_f.name
|
139
|
+
logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
|
140
|
+
stream_to_open = self._resolved_path
|
141
|
+
except Exception as e:
|
142
|
+
if self._temp_file and hasattr(self._temp_file, "name"):
|
143
|
+
try:
|
144
|
+
os.unlink(self._temp_file.name)
|
145
|
+
except: # noqa E722
|
146
|
+
pass
|
147
|
+
logger.error(f"Failed to download PDF from URL: {e}")
|
148
|
+
raise ValueError(f"Failed to download PDF from URL: {e}")
|
149
|
+
else:
|
150
|
+
self._resolved_path = str(Path(path_or_url).resolve()) # Resolve local paths
|
151
|
+
stream_to_open = self._resolved_path
|
152
|
+
self.path = self._resolved_path # Use resolved path for file-based PDFs
|
128
153
|
else:
|
129
|
-
|
130
|
-
|
154
|
+
raise TypeError(
|
155
|
+
f"Invalid input type: {type(path_or_url_or_stream)}. "
|
156
|
+
f"Expected path (str/Path), URL (str), or file-like object."
|
157
|
+
)
|
131
158
|
|
132
|
-
logger.info(f"
|
159
|
+
logger.info(f"Opening PDF source: {self.source_path}")
|
133
160
|
logger.debug(
|
134
161
|
f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
|
135
162
|
)
|
136
163
|
|
137
164
|
try:
|
138
|
-
self._pdf = pdfplumber.open(
|
165
|
+
self._pdf = pdfplumber.open(stream_to_open)
|
139
166
|
except Exception as e:
|
140
|
-
logger.error(
|
141
|
-
|
142
|
-
|
143
|
-
)
|
144
|
-
# Clean up temp file if creation failed
|
145
|
-
self.close()
|
146
|
-
raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
|
147
|
-
|
148
|
-
self._path = self._resolved_path # Keep original path too?
|
149
|
-
self.path = self._resolved_path # Public attribute for the resolved path
|
150
|
-
self.source_path = self._original_path # Public attribute for the user-provided path/URL
|
167
|
+
logger.error(f"Failed to open PDF: {e}", exc_info=True)
|
168
|
+
self.close() # Attempt cleanup if opening fails
|
169
|
+
raise IOError(f"Failed to open PDF source: {self.source_path}") from e
|
151
170
|
|
171
|
+
# Store configuration used for initialization
|
152
172
|
self._reading_order = reading_order
|
153
173
|
self._config = {"keep_spaces": keep_spaces}
|
174
|
+
self._font_attrs = font_attrs
|
154
175
|
|
155
|
-
self._font_attrs = font_attrs # Store the font attribute configuration
|
156
|
-
|
157
|
-
# Initialize Managers and Services (conditionally available)
|
158
176
|
self._ocr_manager = OCRManager() if OCRManager else None
|
159
177
|
self._layout_manager = LayoutManager() if LayoutManager else None
|
160
178
|
self.highlighter = HighlightingService(self)
|
179
|
+
# self._classification_manager_instance = ClassificationManager() # Removed this line
|
180
|
+
self._manager_registry = {}
|
181
|
+
|
182
|
+
from natural_pdf.core.page import Page
|
161
183
|
|
162
|
-
# Initialize pages last, passing necessary refs
|
163
184
|
self._pages = [
|
164
185
|
Page(p, parent=self, index=i, font_attrs=font_attrs)
|
165
186
|
for i, p in enumerate(self._pdf.pages)
|
166
187
|
]
|
167
188
|
|
168
|
-
# Other state
|
169
189
|
self._element_cache = {}
|
170
|
-
self._exclusions = []
|
171
|
-
self._regions = []
|
190
|
+
self._exclusions = []
|
191
|
+
self._regions = []
|
172
192
|
|
173
|
-
logger.info("Initialized HighlightingService.")
|
174
193
|
logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
|
175
194
|
|
195
|
+
self._initialize_managers()
|
196
|
+
self._initialize_highlighter()
|
197
|
+
|
198
|
+
def _initialize_managers(self):
|
199
|
+
"""Initialize manager instances based on DEFAULT_MANAGERS."""
|
200
|
+
self._managers = {}
|
201
|
+
for key, manager_class in DEFAULT_MANAGERS.items():
|
202
|
+
try:
|
203
|
+
self._managers[key] = manager_class()
|
204
|
+
logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
|
205
|
+
except Exception as e:
|
206
|
+
logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
|
207
|
+
self._managers[key] = None
|
208
|
+
|
209
|
+
def get_manager(self, key: str) -> Any:
|
210
|
+
"""Retrieve a manager instance by its key."""
|
211
|
+
if key not in self._managers:
|
212
|
+
raise KeyError(
|
213
|
+
f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
|
214
|
+
)
|
215
|
+
|
216
|
+
manager_instance = self._managers.get(key)
|
217
|
+
|
218
|
+
if manager_instance is None:
|
219
|
+
manager_class = DEFAULT_MANAGERS.get(key)
|
220
|
+
if manager_class:
|
221
|
+
raise RuntimeError(
|
222
|
+
f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
|
223
|
+
)
|
224
|
+
else:
|
225
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
|
226
|
+
|
227
|
+
return manager_instance
|
228
|
+
|
229
|
+
def _initialize_highlighter(self):
|
230
|
+
pass
|
231
|
+
|
176
232
|
@property
|
177
233
|
def metadata(self) -> Dict[str, Any]:
|
178
234
|
"""Access metadata as a dictionary."""
|
@@ -183,7 +239,6 @@ class PDF:
|
|
183
239
|
"""Access pages as a PageCollection object."""
|
184
240
|
from natural_pdf.elements.collections import PageCollection
|
185
241
|
|
186
|
-
# Ensure _pages is initialized
|
187
242
|
if not hasattr(self, "_pages"):
|
188
243
|
raise AttributeError("PDF pages not yet initialized.")
|
189
244
|
return PageCollection(self._pages)
|
@@ -195,12 +250,10 @@ class PDF:
|
|
195
250
|
Returns:
|
196
251
|
Self for method chaining
|
197
252
|
"""
|
198
|
-
# Ensure _pages is initialized
|
199
253
|
if not hasattr(self, "_pages"):
|
200
254
|
raise AttributeError("PDF pages not yet initialized.")
|
201
255
|
|
202
256
|
self._exclusions = []
|
203
|
-
# Also clear from pages
|
204
257
|
for page in self._pages:
|
205
258
|
page.clear_exclusions()
|
206
259
|
return self
|
@@ -212,99 +265,90 @@ class PDF:
|
|
212
265
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
213
266
|
|
214
267
|
Args:
|
215
|
-
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
268
|
+
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
269
|
+
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
216
270
|
label: Optional label for this exclusion
|
217
271
|
|
218
272
|
Returns:
|
219
273
|
Self for method chaining
|
220
274
|
"""
|
221
|
-
# Ensure _pages is initialized
|
222
275
|
if not hasattr(self, "_pages"):
|
223
276
|
raise AttributeError("PDF pages not yet initialized.")
|
224
277
|
|
225
|
-
# Store exclusion with its label at PDF level
|
226
278
|
exclusion_data = (exclusion_func, label)
|
227
279
|
self._exclusions.append(exclusion_data)
|
228
280
|
|
229
|
-
# Apply this exclusion to all pages
|
230
281
|
for page in self._pages:
|
231
|
-
# We pass the original function, Page.add_exclusion handles calling it
|
232
282
|
page.add_exclusion(exclusion_func, label=label)
|
233
283
|
|
234
284
|
return self
|
235
285
|
|
236
286
|
def apply_ocr(
|
237
287
|
self,
|
238
|
-
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
239
288
|
engine: Optional[str] = None,
|
240
|
-
# --- Common OCR Parameters (Direct Arguments) ---
|
241
289
|
languages: Optional[List[str]] = None,
|
242
|
-
min_confidence: Optional[float] = None,
|
290
|
+
min_confidence: Optional[float] = None,
|
243
291
|
device: Optional[str] = None,
|
244
|
-
resolution: Optional[int] = None,
|
245
|
-
apply_exclusions: bool = True,
|
292
|
+
resolution: Optional[int] = None,
|
293
|
+
apply_exclusions: bool = True,
|
246
294
|
detect_only: bool = False,
|
247
|
-
|
248
|
-
options: Optional[Any] = None,
|
249
|
-
|
295
|
+
replace: bool = True,
|
296
|
+
options: Optional[Any] = None,
|
297
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
250
298
|
) -> "PDF":
|
251
299
|
"""
|
252
|
-
Applies OCR to specified pages
|
253
|
-
|
254
|
-
This method renders the specified pages to images, sends them as a batch
|
255
|
-
to the OCRManager, and adds the resulting TextElements to each respective page.
|
300
|
+
Applies OCR to specified pages of the PDF using batch processing.
|
301
|
+
Applies OCR to specified pages of the PDF using batch processing.
|
256
302
|
|
257
303
|
Args:
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
detect_only: If True, only detect text
|
275
|
-
|
276
|
-
|
304
|
+
engine: Name of the OCR engine
|
305
|
+
languages: List of language codes
|
306
|
+
min_confidence: Minimum confidence threshold
|
307
|
+
device: Device to run OCR on
|
308
|
+
resolution: DPI resolution for page images
|
309
|
+
apply_exclusions: Whether to mask excluded areas
|
310
|
+
detect_only: If True, only detect text boxes
|
311
|
+
replace: Whether to replace existing OCR elements
|
312
|
+
options: Engine-specific options
|
313
|
+
pages: Page indices to process or None for all pages
|
314
|
+
engine: Name of the OCR engine
|
315
|
+
languages: List of language codes
|
316
|
+
min_confidence: Minimum confidence threshold
|
317
|
+
device: Device to run OCR on
|
318
|
+
resolution: DPI resolution for page images
|
319
|
+
apply_exclusions: Whether to mask excluded areas
|
320
|
+
detect_only: If True, only detect text boxes
|
321
|
+
replace: Whether to replace existing OCR elements
|
322
|
+
options: Engine-specific options
|
323
|
+
pages: Page indices to process or None for all pages
|
277
324
|
|
278
325
|
Returns:
|
279
|
-
Self for method chaining
|
280
|
-
|
281
|
-
Raises:
|
282
|
-
ValueError: If page indices are invalid.
|
283
|
-
TypeError: If 'options' is not compatible with the engine.
|
284
|
-
RuntimeError: If the OCRManager or selected engine is not available.
|
326
|
+
Self for method chaining
|
327
|
+
Self for method chaining
|
285
328
|
"""
|
286
329
|
if not self._ocr_manager:
|
287
330
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
288
|
-
# Or raise RuntimeError("OCRManager not initialized.")
|
289
331
|
return self
|
290
332
|
|
291
|
-
|
292
|
-
|
333
|
+
thread_id = threading.current_thread().name
|
334
|
+
logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
|
335
|
+
|
336
|
+
target_pages = []
|
337
|
+
|
338
|
+
target_pages = []
|
293
339
|
if pages is None:
|
294
340
|
target_pages = self._pages
|
295
341
|
elif isinstance(pages, slice):
|
296
342
|
target_pages = self._pages[pages]
|
297
|
-
elif hasattr(pages, "__iter__"):
|
343
|
+
elif hasattr(pages, "__iter__"):
|
298
344
|
try:
|
299
345
|
target_pages = [self._pages[i] for i in pages]
|
300
346
|
except IndexError:
|
301
347
|
raise ValueError("Invalid page index provided in 'pages' iterable.")
|
302
348
|
except TypeError:
|
303
|
-
raise TypeError(
|
304
|
-
"'pages' must be None, a slice, or an iterable of page indices (int)."
|
305
|
-
)
|
349
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
306
350
|
else:
|
307
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices
|
351
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
308
352
|
|
309
353
|
if not target_pages:
|
310
354
|
logger.warning("No pages selected for OCR processing.")
|
@@ -312,26 +356,20 @@ class PDF:
|
|
312
356
|
|
313
357
|
page_numbers = [p.number for p in target_pages]
|
314
358
|
logger.info(f"Applying batch OCR to pages: {page_numbers}...")
|
315
|
-
# --- Determine Rendering Resolution ---
|
316
|
-
# Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
|
317
|
-
final_resolution = resolution # Use direct arg if provided
|
318
|
-
if final_resolution is None:
|
319
|
-
final_resolution = getattr(self, "_config", {}).get("resolution", 150)
|
320
359
|
|
321
|
-
|
360
|
+
final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
|
361
|
+
logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
|
362
|
+
|
363
|
+
images_pil = []
|
364
|
+
page_image_map = []
|
365
|
+
logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
|
366
|
+
failed_page_num = "unknown"
|
367
|
+
render_start_time = time.monotonic()
|
322
368
|
|
323
|
-
# --- Render Images for Batch ---
|
324
|
-
images_pil: List[Image.Image] = []
|
325
|
-
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
326
|
-
logger.info(
|
327
|
-
f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
|
328
|
-
)
|
329
|
-
failed_page_num = "unknown" # Keep track of potentially failing page
|
330
369
|
try:
|
331
|
-
for i, page in enumerate(target_pages):
|
332
|
-
failed_page_num = page.number
|
370
|
+
for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
|
371
|
+
failed_page_num = page.number
|
333
372
|
logger.debug(f" Rendering page {page.number} (index {page.index})...")
|
334
|
-
# Use the determined final_resolution and apply exclusions if requested
|
335
373
|
to_image_kwargs = {
|
336
374
|
"resolution": final_resolution,
|
337
375
|
"include_highlights": False,
|
@@ -340,58 +378,64 @@ class PDF:
|
|
340
378
|
img = page.to_image(**to_image_kwargs)
|
341
379
|
if img is None:
|
342
380
|
logger.error(f" Failed to render page {page.number} to image.")
|
343
|
-
|
344
|
-
continue
|
381
|
+
continue
|
382
|
+
continue
|
345
383
|
images_pil.append(img)
|
346
|
-
page_image_map.append((page, img))
|
384
|
+
page_image_map.append((page, img))
|
347
385
|
except Exception as e:
|
348
|
-
logger.error(f"Failed to render
|
386
|
+
logger.error(f"Failed to render pages for batch OCR: {e}")
|
387
|
+
logger.error(f"Failed to render pages for batch OCR: {e}")
|
349
388
|
raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
|
350
389
|
|
390
|
+
render_end_time = time.monotonic()
|
391
|
+
logger.debug(
|
392
|
+
f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
|
393
|
+
)
|
394
|
+
logger.debug(
|
395
|
+
f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
|
396
|
+
)
|
397
|
+
|
351
398
|
if not images_pil or not page_image_map:
|
352
399
|
logger.error("No images were successfully rendered for batch OCR.")
|
353
400
|
return self
|
354
401
|
|
355
|
-
# --- Prepare Arguments for Manager ---
|
356
|
-
# Pass common args directly, engine-specific via options
|
357
402
|
manager_args = {
|
358
403
|
"images": images_pil,
|
359
404
|
"engine": engine,
|
360
405
|
"languages": languages,
|
361
|
-
"min_confidence": min_confidence,
|
406
|
+
"min_confidence": min_confidence,
|
407
|
+
"min_confidence": min_confidence,
|
362
408
|
"device": device,
|
363
409
|
"options": options,
|
364
410
|
"detect_only": detect_only,
|
365
|
-
# Note: resolution is used for rendering, not passed to OCR manager directly
|
366
411
|
}
|
367
|
-
# Filter out None values so manager can use its defaults
|
368
412
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
369
413
|
|
370
|
-
|
371
|
-
logger.info(
|
372
|
-
|
373
|
-
)
|
414
|
+
ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
|
415
|
+
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
416
|
+
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
417
|
+
ocr_start_time = time.monotonic()
|
418
|
+
|
374
419
|
try:
|
375
|
-
# Manager's apply_ocr signature needs to accept common args directly
|
376
420
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
377
421
|
|
378
422
|
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
379
|
-
logger.error(
|
380
|
-
f"OCR Manager returned unexpected result format or length for batch processing. "
|
381
|
-
f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
|
382
|
-
f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
|
383
|
-
)
|
423
|
+
logger.error(f"OCR Manager returned unexpected result format or length.")
|
384
424
|
return self
|
385
425
|
|
386
426
|
logger.info("OCR Manager batch processing complete.")
|
387
|
-
|
388
427
|
except Exception as e:
|
389
|
-
logger.error(f"Batch OCR processing failed: {e}"
|
428
|
+
logger.error(f"Batch OCR processing failed: {e}")
|
390
429
|
return self
|
391
430
|
|
392
|
-
|
431
|
+
ocr_end_time = time.monotonic()
|
432
|
+
logger.debug(
|
433
|
+
f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
|
434
|
+
)
|
435
|
+
|
393
436
|
logger.info("Adding OCR results to respective pages...")
|
394
437
|
total_elements_added = 0
|
438
|
+
|
395
439
|
for i, (page, img) in enumerate(page_image_map):
|
396
440
|
results_for_page = batch_results[i]
|
397
441
|
if not isinstance(results_for_page, list):
|
@@ -402,6 +446,9 @@ class PDF:
|
|
402
446
|
|
403
447
|
logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
|
404
448
|
try:
|
449
|
+
if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
|
450
|
+
page._element_mgr.remove_ocr_elements()
|
451
|
+
|
405
452
|
img_scale_x = page.width / img.width if img.width > 0 else 1
|
406
453
|
img_scale_y = page.height / img.height if img.height > 0 else 1
|
407
454
|
elements = page._element_mgr.create_text_elements_from_ocr(
|
@@ -414,188 +461,225 @@ class PDF:
|
|
414
461
|
else:
|
415
462
|
logger.debug(f" No valid TextElements created for page {page.number}.")
|
416
463
|
except Exception as e:
|
417
|
-
logger.error(
|
418
|
-
f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
|
419
|
-
)
|
464
|
+
logger.error(f" Error adding OCR elements to page {page.number}: {e}")
|
420
465
|
|
421
|
-
logger.info(
|
422
|
-
f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
|
423
|
-
)
|
466
|
+
logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
|
424
467
|
return self
|
425
468
|
|
426
469
|
def add_region(
|
427
470
|
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
428
471
|
) -> "PDF":
|
429
472
|
"""
|
430
|
-
Add a region function to the PDF.
|
473
|
+
Add a region function to the PDF.
|
431
474
|
|
432
475
|
Args:
|
433
|
-
region_func: A function that takes a Page and returns a Region, or None
|
476
|
+
region_func: A function that takes a Page and returns a Region, or None
|
477
|
+
region_func: A function that takes a Page and returns a Region, or None
|
434
478
|
name: Optional name for the region
|
435
479
|
|
436
480
|
Returns:
|
437
481
|
Self for method chaining
|
438
482
|
"""
|
439
|
-
# Ensure _pages is initialized
|
440
483
|
if not hasattr(self, "_pages"):
|
441
484
|
raise AttributeError("PDF pages not yet initialized.")
|
442
485
|
|
443
|
-
# Store region with its name at PDF level
|
444
486
|
region_data = (region_func, name)
|
445
487
|
self._regions.append(region_data)
|
446
488
|
|
447
|
-
# Apply this region to all pages
|
448
489
|
for page in self._pages:
|
449
490
|
try:
|
450
|
-
# Call the function to get the region for this specific page
|
451
491
|
region_instance = region_func(page)
|
452
492
|
if region_instance and isinstance(region_instance, Region):
|
453
|
-
# If a valid region is returned, add it to the page
|
454
493
|
page.add_region(region_instance, name=name, source="named")
|
455
494
|
elif region_instance is not None:
|
456
495
|
logger.warning(
|
457
|
-
f"Region function did not return a valid Region
|
496
|
+
f"Region function did not return a valid Region for page {page.number}"
|
458
497
|
)
|
459
498
|
except Exception as e:
|
460
|
-
logger.error(
|
461
|
-
f"Error executing or adding region function for page {page.number}: {e}",
|
462
|
-
exc_info=True,
|
463
|
-
)
|
499
|
+
logger.error(f"Error adding region for page {page.number}: {e}")
|
464
500
|
|
465
501
|
return self
|
466
502
|
|
503
|
+
@overload
|
504
|
+
def find(
|
505
|
+
self,
|
506
|
+
*,
|
507
|
+
text: str,
|
508
|
+
apply_exclusions: bool = True,
|
509
|
+
regex: bool = False,
|
510
|
+
case: bool = True,
|
511
|
+
**kwargs,
|
512
|
+
) -> Optional[Any]: ...
|
513
|
+
|
514
|
+
@overload
|
515
|
+
def find(
|
516
|
+
self,
|
517
|
+
selector: str,
|
518
|
+
*,
|
519
|
+
apply_exclusions: bool = True,
|
520
|
+
regex: bool = False,
|
521
|
+
case: bool = True,
|
522
|
+
**kwargs,
|
523
|
+
) -> Optional[Any]: ...
|
524
|
+
|
467
525
|
def find(
|
468
|
-
self,
|
526
|
+
self,
|
527
|
+
selector: Optional[str] = None,
|
528
|
+
*,
|
529
|
+
text: Optional[str] = None,
|
530
|
+
apply_exclusions: bool = True,
|
531
|
+
regex: bool = False,
|
532
|
+
case: bool = True,
|
533
|
+
**kwargs,
|
469
534
|
) -> Optional[Any]:
|
470
535
|
"""
|
471
|
-
Find the first element matching the selector.
|
536
|
+
Find the first element matching the selector OR text content across all pages.
|
537
|
+
|
538
|
+
Provide EITHER `selector` OR `text`, but not both.
|
472
539
|
|
473
540
|
Args:
|
474
|
-
selector: CSS-like selector string
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
541
|
+
selector: CSS-like selector string.
|
542
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
543
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
544
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
545
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
546
|
+
**kwargs: Additional filter parameters.
|
479
547
|
|
480
548
|
Returns:
|
481
|
-
Element object or None if not found
|
549
|
+
Element object or None if not found.
|
482
550
|
"""
|
483
|
-
# Ensure _pages is initialized
|
484
551
|
if not hasattr(self, "_pages"):
|
485
552
|
raise AttributeError("PDF pages not yet initialized.")
|
486
553
|
|
487
|
-
|
554
|
+
if selector is not None and text is not None:
|
555
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
556
|
+
if selector is None and text is None:
|
557
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
488
558
|
|
489
|
-
#
|
559
|
+
# Construct selector if 'text' is provided
|
560
|
+
effective_selector = ""
|
561
|
+
if text is not None:
|
562
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
563
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
564
|
+
logger.debug(
|
565
|
+
f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
|
566
|
+
)
|
567
|
+
elif selector is not None:
|
568
|
+
effective_selector = selector
|
569
|
+
else:
|
570
|
+
raise ValueError("Internal error: No selector or text provided.")
|
571
|
+
|
572
|
+
selector_obj = parse_selector(effective_selector)
|
490
573
|
kwargs["regex"] = regex
|
491
574
|
kwargs["case"] = case
|
492
575
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
576
|
+
# Search page by page
|
577
|
+
for page in self.pages:
|
578
|
+
# Note: _apply_selector is on Page, so we call find directly here
|
579
|
+
# We pass the constructed/validated effective_selector
|
580
|
+
element = page.find(
|
581
|
+
selector=effective_selector, # Use the processed selector
|
582
|
+
apply_exclusions=apply_exclusions,
|
583
|
+
regex=regex, # Pass down flags
|
584
|
+
case=case,
|
585
|
+
**kwargs,
|
586
|
+
)
|
587
|
+
if element:
|
588
|
+
return element
|
589
|
+
return None # Not found on any page
|
590
|
+
|
591
|
+
@overload
|
592
|
+
def find_all(
|
593
|
+
self,
|
594
|
+
*,
|
595
|
+
text: str,
|
596
|
+
apply_exclusions: bool = True,
|
597
|
+
regex: bool = False,
|
598
|
+
case: bool = True,
|
599
|
+
**kwargs,
|
600
|
+
) -> "ElementCollection": ...
|
497
601
|
|
602
|
+
@overload
|
498
603
|
def find_all(
|
499
|
-
self,
|
500
|
-
|
604
|
+
self,
|
605
|
+
selector: str,
|
606
|
+
*,
|
607
|
+
apply_exclusions: bool = True,
|
608
|
+
regex: bool = False,
|
609
|
+
case: bool = True,
|
610
|
+
**kwargs,
|
611
|
+
) -> "ElementCollection": ...
|
612
|
+
|
613
|
+
def find_all(
|
614
|
+
self,
|
615
|
+
selector: Optional[str] = None,
|
616
|
+
*,
|
617
|
+
text: Optional[str] = None,
|
618
|
+
apply_exclusions: bool = True,
|
619
|
+
regex: bool = False,
|
620
|
+
case: bool = True,
|
621
|
+
**kwargs,
|
622
|
+
) -> "ElementCollection":
|
501
623
|
"""
|
502
|
-
Find all elements matching the selector.
|
624
|
+
Find all elements matching the selector OR text content across all pages.
|
625
|
+
|
626
|
+
Provide EITHER `selector` OR `text`, but not both.
|
503
627
|
|
504
628
|
Args:
|
505
|
-
selector: CSS-like selector string
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
629
|
+
selector: CSS-like selector string.
|
630
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
631
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
632
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
633
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
634
|
+
**kwargs: Additional filter parameters.
|
510
635
|
|
511
636
|
Returns:
|
512
|
-
ElementCollection with matching elements
|
637
|
+
ElementCollection with matching elements.
|
513
638
|
"""
|
514
|
-
# Ensure _pages is initialized
|
515
639
|
if not hasattr(self, "_pages"):
|
516
640
|
raise AttributeError("PDF pages not yet initialized.")
|
517
641
|
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
kwargs["case"] = case
|
523
|
-
|
524
|
-
results = self._apply_selector(
|
525
|
-
selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
|
526
|
-
)
|
527
|
-
return results
|
528
|
-
|
529
|
-
def _apply_selector(
|
530
|
-
self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
|
531
|
-
) -> ElementCollection:
|
532
|
-
"""
|
533
|
-
Apply selector to PDF elements across all pages.
|
534
|
-
|
535
|
-
Args:
|
536
|
-
selector_obj: Parsed selector dictionary
|
537
|
-
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
538
|
-
first_only: If True, stop searching after the first match is found.
|
539
|
-
**kwargs: Additional filter parameters
|
642
|
+
if selector is not None and text is not None:
|
643
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
644
|
+
if selector is None and text is None:
|
645
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
540
646
|
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
647
|
+
# Construct selector if 'text' is provided
|
648
|
+
effective_selector = ""
|
649
|
+
if text is not None:
|
650
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
651
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
652
|
+
logger.debug(
|
653
|
+
f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
|
654
|
+
)
|
655
|
+
elif selector is not None:
|
656
|
+
effective_selector = selector
|
657
|
+
else:
|
658
|
+
raise ValueError("Internal error: No selector or text provided.")
|
545
659
|
|
546
|
-
#
|
547
|
-
|
548
|
-
|
549
|
-
page_indices = [page_indices]
|
550
|
-
elif isinstance(page_indices, slice):
|
551
|
-
page_indices = range(*page_indices.indices(len(self._pages)))
|
660
|
+
# Instead of parsing here, let each page parse and apply
|
661
|
+
# This avoids parsing the same selector multiple times if not needed
|
662
|
+
# selector_obj = parse_selector(effective_selector)
|
552
663
|
|
553
|
-
#
|
554
|
-
|
555
|
-
if pseudo.get("name") in ("spans", "continues"):
|
556
|
-
logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
|
557
|
-
return ElementCollection([])
|
664
|
+
# kwargs["regex"] = regex # Removed: Already passed explicitly
|
665
|
+
# kwargs["case"] = case # Removed: Already passed explicitly
|
558
666
|
|
559
|
-
# Regular case: collect elements from each page
|
560
667
|
all_elements = []
|
561
|
-
for
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
if first_only and page_elements:
|
573
|
-
break # Stop iterating through pages
|
574
|
-
else:
|
575
|
-
logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
|
576
|
-
|
577
|
-
# Create a combined collection
|
578
|
-
combined = ElementCollection(all_elements)
|
668
|
+
for page in self.pages:
|
669
|
+
# Call page.find_all with the effective selector and flags
|
670
|
+
page_elements = page.find_all(
|
671
|
+
selector=effective_selector,
|
672
|
+
apply_exclusions=apply_exclusions,
|
673
|
+
regex=regex,
|
674
|
+
case=case,
|
675
|
+
**kwargs,
|
676
|
+
)
|
677
|
+
if page_elements:
|
678
|
+
all_elements.extend(page_elements.elements)
|
579
679
|
|
580
|
-
|
581
|
-
if not first_only and kwargs.get("document_order", True):
|
582
|
-
# Check if elements have page, top, x0 before sorting
|
583
|
-
if all(
|
584
|
-
hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
|
585
|
-
for el in combined.elements
|
586
|
-
):
|
587
|
-
combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
|
588
|
-
else:
|
589
|
-
# Elements might be Regions without inherent sorting order yet
|
590
|
-
# Attempt sorting by page index if possible
|
591
|
-
try:
|
592
|
-
combined.sort(key=lambda el: el.page.index)
|
593
|
-
except AttributeError:
|
594
|
-
logger.warning(
|
595
|
-
"Cannot sort elements in document order: Missing required attributes (e.g., page)."
|
596
|
-
)
|
680
|
+
from natural_pdf.elements.collections import ElementCollection
|
597
681
|
|
598
|
-
return
|
682
|
+
return ElementCollection(all_elements)
|
599
683
|
|
600
684
|
def extract_text(
|
601
685
|
self,
|
@@ -610,24 +694,24 @@ class PDF:
|
|
610
694
|
|
611
695
|
Args:
|
612
696
|
selector: Optional selector to filter elements
|
613
|
-
preserve_whitespace: Whether to keep blank characters
|
614
|
-
use_exclusions: Whether to apply exclusion regions
|
615
|
-
debug_exclusions: Whether to output detailed debugging for exclusions
|
697
|
+
preserve_whitespace: Whether to keep blank characters
|
698
|
+
use_exclusions: Whether to apply exclusion regions
|
699
|
+
debug_exclusions: Whether to output detailed debugging for exclusions
|
700
|
+
preserve_whitespace: Whether to keep blank characters
|
701
|
+
use_exclusions: Whether to apply exclusion regions
|
702
|
+
debug_exclusions: Whether to output detailed debugging for exclusions
|
616
703
|
**kwargs: Additional extraction parameters
|
617
704
|
|
618
705
|
Returns:
|
619
706
|
Extracted text as string
|
620
707
|
"""
|
621
|
-
# Ensure _pages is initialized
|
622
708
|
if not hasattr(self, "_pages"):
|
623
709
|
raise AttributeError("PDF pages not yet initialized.")
|
624
710
|
|
625
|
-
# If selector is provided, find elements first
|
626
711
|
if selector:
|
627
712
|
elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
|
628
713
|
return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
629
714
|
|
630
|
-
# Otherwise extract from all pages
|
631
715
|
if debug_exclusions:
|
632
716
|
print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
|
633
717
|
print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
|
@@ -648,25 +732,6 @@ class PDF:
|
|
648
732
|
|
649
733
|
return "\n".join(texts)
|
650
734
|
|
651
|
-
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
|
652
|
-
"""
|
653
|
-
Shorthand for finding elements and extracting their text.
|
654
|
-
|
655
|
-
Args:
|
656
|
-
selector: CSS-like selector string
|
657
|
-
preserve_whitespace: Whether to keep blank characters (default: True)
|
658
|
-
**kwargs: Additional extraction parameters
|
659
|
-
|
660
|
-
Returns:
|
661
|
-
Extracted text from matching elements
|
662
|
-
"""
|
663
|
-
# Ensure _pages is initialized
|
664
|
-
if not hasattr(self, "_pages"):
|
665
|
-
raise AttributeError("PDF pages not yet initialized.")
|
666
|
-
return self.extract_text(
|
667
|
-
selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
|
668
|
-
) # apply_exclusions is handled by find_all in extract_text
|
669
|
-
|
670
735
|
def extract_tables(
|
671
736
|
self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
|
672
737
|
) -> List[Any]:
|
@@ -681,54 +746,46 @@ class PDF:
|
|
681
746
|
Returns:
|
682
747
|
List of extracted tables
|
683
748
|
"""
|
684
|
-
# Ensure _pages is initialized
|
685
749
|
if not hasattr(self, "_pages"):
|
686
750
|
raise AttributeError("PDF pages not yet initialized.")
|
687
|
-
|
751
|
+
|
688
752
|
logger.warning("PDF.extract_tables is not fully implemented yet.")
|
689
753
|
all_tables = []
|
754
|
+
|
690
755
|
for page in self.pages:
|
691
|
-
# Assuming page.extract_tables(**kwargs) exists or is added
|
692
756
|
if hasattr(page, "extract_tables"):
|
693
757
|
all_tables.extend(page.extract_tables(**kwargs))
|
694
758
|
else:
|
695
759
|
logger.debug(f"Page {page.number} does not have extract_tables method.")
|
696
|
-
|
760
|
+
|
697
761
|
if selector:
|
698
762
|
logger.warning("Filtering extracted tables by selector is not implemented.")
|
699
|
-
|
700
|
-
# Placeholder merging
|
763
|
+
|
701
764
|
if merge_across_pages:
|
702
765
|
logger.warning("Merging tables across pages is not implemented.")
|
703
|
-
|
766
|
+
|
704
767
|
return all_tables
|
705
768
|
|
706
|
-
# --- New Method: save_searchable ---
|
707
769
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
708
770
|
"""
|
709
771
|
Saves the PDF with an OCR text layer, making content searchable.
|
710
772
|
|
711
773
|
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
712
774
|
|
713
|
-
Note: OCR must have been applied to the pages beforehand
|
714
|
-
(e.g., using pdf.apply_ocr()).
|
715
|
-
|
716
775
|
Args:
|
717
|
-
output_path: Path to save the searchable PDF
|
718
|
-
dpi: Resolution for rendering and OCR overlay
|
719
|
-
**kwargs: Additional keyword arguments passed to the exporter
|
776
|
+
output_path: Path to save the searchable PDF
|
777
|
+
dpi: Resolution for rendering and OCR overlay
|
778
|
+
**kwargs: Additional keyword arguments passed to the exporter
|
779
|
+
output_path: Path to save the searchable PDF
|
780
|
+
dpi: Resolution for rendering and OCR overlay
|
781
|
+
**kwargs: Additional keyword arguments passed to the exporter
|
720
782
|
"""
|
721
|
-
# Import moved here, assuming it's always available now
|
722
783
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
723
784
|
|
724
|
-
# Convert pathlib.Path to string if necessary
|
725
785
|
output_path_str = str(output_path)
|
726
|
-
|
727
786
|
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
728
787
|
logger.info(f"Searchable PDF saved to: {output_path_str}")
|
729
788
|
|
730
|
-
# --- End New Method ---
|
731
|
-
|
732
789
|
def ask(
|
733
790
|
self,
|
734
791
|
question: str,
|
@@ -750,27 +807,22 @@ class PDF:
|
|
750
807
|
**kwargs: Additional parameters passed to the QA engine
|
751
808
|
|
752
809
|
Returns:
|
753
|
-
A dictionary containing the answer, confidence, and other metadata
|
754
|
-
|
810
|
+
A dictionary containing the answer, confidence, and other metadata
|
811
|
+
A dictionary containing the answer, confidence, and other metadata
|
755
812
|
"""
|
756
813
|
from natural_pdf.qa import get_qa_engine
|
757
814
|
|
758
|
-
# Initialize or get QA engine
|
759
815
|
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
760
816
|
|
761
|
-
# Determine which pages to query
|
762
817
|
if pages is None:
|
763
818
|
target_pages = list(range(len(self.pages)))
|
764
819
|
elif isinstance(pages, int):
|
765
|
-
# Single page
|
766
820
|
target_pages = [pages]
|
767
821
|
elif isinstance(pages, (list, range)):
|
768
|
-
# List or range of pages
|
769
822
|
target_pages = pages
|
770
823
|
else:
|
771
824
|
raise ValueError(f"Invalid pages parameter: {pages}")
|
772
825
|
|
773
|
-
# Actually query each page and gather results
|
774
826
|
results = []
|
775
827
|
for page_idx in target_pages:
|
776
828
|
if 0 <= page_idx < len(self.pages):
|
@@ -779,136 +831,110 @@ class PDF:
|
|
779
831
|
page=page, question=question, min_confidence=min_confidence, **kwargs
|
780
832
|
)
|
781
833
|
|
782
|
-
# Add to results if it found an answer
|
783
834
|
if page_result and page_result.get("found", False):
|
784
835
|
results.append(page_result)
|
785
836
|
|
786
|
-
# Sort results by confidence
|
787
837
|
results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
788
838
|
|
789
|
-
# Return the best result, or a default result if none found
|
790
839
|
if results:
|
791
840
|
return results[0]
|
792
841
|
else:
|
793
|
-
# Return a structure indicating no answer found
|
794
842
|
return {
|
795
843
|
"answer": None,
|
796
844
|
"confidence": 0.0,
|
797
845
|
"found": False,
|
798
|
-
"page_num": None,
|
846
|
+
"page_num": None,
|
799
847
|
"source_elements": [],
|
800
848
|
}
|
801
849
|
|
802
850
|
def search_within_index(
|
803
851
|
self,
|
804
852
|
query: Union[str, Path, Image.Image, Region],
|
805
|
-
search_service: SearchServiceProtocol,
|
853
|
+
search_service: SearchServiceProtocol,
|
806
854
|
options: Optional[SearchOptions] = None,
|
807
855
|
) -> List[Dict[str, Any]]:
|
808
856
|
"""
|
809
|
-
Finds relevant documents
|
810
|
-
|
811
|
-
|
812
|
-
This method uses a pre-configured SearchService instance and adds
|
813
|
-
a filter to the search query to scope results only to pages from
|
814
|
-
this specific PDF object (based on its resolved path).
|
857
|
+
Finds relevant documents from this PDF within a search index.
|
858
|
+
Finds relevant documents from this PDF within a search index.
|
815
859
|
|
816
860
|
Args:
|
817
|
-
query: The search query (text, image path, PIL Image, Region)
|
818
|
-
search_service: A pre-configured SearchService instance
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
PDF-scoping filter using an 'AND' condition.
|
861
|
+
query: The search query (text, image path, PIL Image, Region)
|
862
|
+
search_service: A pre-configured SearchService instance
|
863
|
+
options: Optional SearchOptions to configure the query
|
864
|
+
query: The search query (text, image path, PIL Image, Region)
|
865
|
+
search_service: A pre-configured SearchService instance
|
866
|
+
options: Optional SearchOptions to configure the query
|
824
867
|
|
825
868
|
Returns:
|
826
|
-
A list of result dictionaries, sorted by relevance
|
827
|
-
|
869
|
+
A list of result dictionaries, sorted by relevance
|
870
|
+
A list of result dictionaries, sorted by relevance
|
828
871
|
|
829
872
|
Raises:
|
830
|
-
ImportError: If search dependencies are not installed
|
831
|
-
ValueError: If search_service is None
|
832
|
-
TypeError: If search_service does not conform to the protocol
|
833
|
-
FileNotFoundError: If the collection managed by the service does not exist
|
834
|
-
RuntimeError: For other search failures
|
873
|
+
ImportError: If search dependencies are not installed
|
874
|
+
ValueError: If search_service is None
|
875
|
+
TypeError: If search_service does not conform to the protocol
|
876
|
+
FileNotFoundError: If the collection managed by the service does not exist
|
877
|
+
RuntimeError: For other search failures
|
878
|
+
ImportError: If search dependencies are not installed
|
879
|
+
ValueError: If search_service is None
|
880
|
+
TypeError: If search_service does not conform to the protocol
|
881
|
+
FileNotFoundError: If the collection managed by the service does not exist
|
882
|
+
RuntimeError: For other search failures
|
835
883
|
"""
|
836
884
|
if not search_service:
|
837
885
|
raise ValueError("A configured SearchServiceProtocol instance must be provided.")
|
838
|
-
# Optional stricter check:
|
839
|
-
# if not isinstance(search_service, SearchServiceProtocol):
|
840
|
-
# raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
|
841
886
|
|
842
|
-
# Get collection name from service for logging
|
843
887
|
collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
|
844
888
|
logger.info(
|
845
|
-
f"Searching within index '{collection_name}'
|
889
|
+
f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
|
846
890
|
)
|
847
891
|
|
848
|
-
|
849
|
-
|
850
|
-
# if search_service:
|
851
|
-
# service = search_service
|
852
|
-
# else:
|
853
|
-
# logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
|
854
|
-
# factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
|
855
|
-
# # TODO: Pass embedding model from options/pdf config if needed?
|
856
|
-
# service = get_search_service(**factory_args)
|
857
|
-
service = search_service # Use validated provided service
|
858
|
-
|
859
|
-
# --- 2. Prepare Query and Options ---
|
892
|
+
service = search_service
|
893
|
+
|
860
894
|
query_input = query
|
861
|
-
# Resolve options (use default TextSearch if none provided)
|
862
895
|
effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
|
863
896
|
|
864
|
-
# Handle Region query - extract text for now
|
865
897
|
if isinstance(query, Region):
|
866
898
|
logger.debug("Query is a Region object. Extracting text.")
|
867
899
|
if not isinstance(effective_options, TextSearchOptions):
|
868
900
|
logger.warning(
|
869
|
-
"Querying with Region image requires MultiModalSearchOptions
|
901
|
+
"Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
|
870
902
|
)
|
871
903
|
query_input = query.extract_text()
|
872
904
|
if not query_input or query_input.isspace():
|
873
905
|
logger.error("Region has no extractable text for query.")
|
874
906
|
return []
|
875
907
|
|
876
|
-
#
|
877
|
-
#
|
908
|
+
# Add filter to scope search to THIS PDF
|
909
|
+
# Add filter to scope search to THIS PDF
|
878
910
|
pdf_scope_filter = {
|
879
|
-
"field": "pdf_path",
|
911
|
+
"field": "pdf_path",
|
880
912
|
"operator": "eq",
|
881
|
-
"value": self.path,
|
913
|
+
"value": self.path,
|
882
914
|
}
|
883
915
|
logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
|
884
916
|
|
885
917
|
# Combine with existing filters in options (if any)
|
886
918
|
if effective_options.filters:
|
887
|
-
logger.debug(
|
888
|
-
f"Combining PDF scope filter with existing filters: {effective_options.filters}"
|
889
|
-
)
|
890
|
-
# Assume filters are compatible with the underlying search service
|
891
|
-
# If existing filters aren't already in an AND block, wrap them
|
919
|
+
logger.debug(f"Combining PDF scope filter with existing filters")
|
892
920
|
if (
|
893
921
|
isinstance(effective_options.filters, dict)
|
894
922
|
and effective_options.filters.get("operator") == "AND"
|
895
923
|
):
|
896
|
-
# Already an AND block, just append the condition
|
897
924
|
effective_options.filters["conditions"].append(pdf_scope_filter)
|
898
925
|
elif isinstance(effective_options.filters, list):
|
899
|
-
# Assume list represents implicit AND conditions
|
900
926
|
effective_options.filters = {
|
901
927
|
"operator": "AND",
|
902
928
|
"conditions": effective_options.filters + [pdf_scope_filter],
|
903
929
|
}
|
904
|
-
elif isinstance(effective_options.filters, dict):
|
930
|
+
elif isinstance(effective_options.filters, dict):
|
905
931
|
effective_options.filters = {
|
906
932
|
"operator": "AND",
|
907
933
|
"conditions": [effective_options.filters, pdf_scope_filter],
|
908
934
|
}
|
909
935
|
else:
|
910
936
|
logger.warning(
|
911
|
-
f"Unsupported format for existing filters
|
937
|
+
f"Unsupported format for existing filters. Overwriting with PDF scope filter."
|
912
938
|
)
|
913
939
|
effective_options.filters = pdf_scope_filter
|
914
940
|
else:
|
@@ -916,39 +942,33 @@ class PDF:
|
|
916
942
|
|
917
943
|
logger.debug(f"Final filters for service search: {effective_options.filters}")
|
918
944
|
|
919
|
-
# --- 4. Call SearchService ---
|
920
945
|
try:
|
921
|
-
# Call the service's search method (no collection_name needed)
|
922
946
|
results = service.search(
|
923
947
|
query=query_input,
|
924
948
|
options=effective_options,
|
925
949
|
)
|
926
|
-
logger.info(
|
927
|
-
f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
|
928
|
-
)
|
950
|
+
logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
|
929
951
|
return results
|
930
952
|
except FileNotFoundError as fnf:
|
931
|
-
logger.error(
|
932
|
-
|
933
|
-
)
|
934
|
-
raise
|
953
|
+
logger.error(f"Search failed: Collection not found. Error: {fnf}")
|
954
|
+
raise
|
955
|
+
logger.error(f"Search failed: Collection not found. Error: {fnf}")
|
956
|
+
raise
|
935
957
|
except Exception as e:
|
936
|
-
logger.error(
|
937
|
-
|
938
|
-
|
939
|
-
)
|
940
|
-
raise RuntimeError(
|
941
|
-
f"Search within index failed for PDF '{self.path}'. See logs for details."
|
942
|
-
) from e
|
958
|
+
logger.error(f"SearchService search failed: {e}")
|
959
|
+
raise RuntimeError(f"Search within index failed. See logs for details.") from e
|
960
|
+
logger.error(f"SearchService search failed: {e}")
|
961
|
+
raise RuntimeError(f"Search within index failed. See logs for details.") from e
|
943
962
|
|
944
963
|
def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
|
945
964
|
"""
|
946
|
-
Exports OCR results from this PDF into a correction task package
|
965
|
+
Exports OCR results from this PDF into a correction task package.
|
966
|
+
Exports OCR results from this PDF into a correction task package.
|
947
967
|
|
948
968
|
Args:
|
949
|
-
output_zip_path: The path to save the output zip file
|
969
|
+
output_zip_path: The path to save the output zip file
|
970
|
+
output_zip_path: The path to save the output zip file
|
950
971
|
**kwargs: Additional arguments passed to create_correction_task_package
|
951
|
-
(e.g., image_render_scale, overwrite).
|
952
972
|
"""
|
953
973
|
try:
|
954
974
|
from natural_pdf.utils.packaging import create_correction_task_package
|
@@ -958,32 +978,41 @@ class PDF:
|
|
958
978
|
logger.error(
|
959
979
|
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
960
980
|
)
|
961
|
-
|
981
|
+
logger.error(
|
982
|
+
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
983
|
+
)
|
962
984
|
except Exception as e:
|
963
|
-
logger.error(f"Failed to export correction task
|
964
|
-
raise
|
985
|
+
logger.error(f"Failed to export correction task: {e}")
|
986
|
+
raise
|
987
|
+
logger.error(f"Failed to export correction task: {e}")
|
988
|
+
raise
|
965
989
|
|
966
990
|
def correct_ocr(
|
967
991
|
self,
|
968
992
|
correction_callback: Callable[[Any], Optional[str]],
|
969
993
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
970
|
-
|
994
|
+
max_workers: Optional[int] = None,
|
995
|
+
progress_callback: Optional[Callable[[], None]] = None,
|
996
|
+
) -> "PDF":
|
971
997
|
"""
|
972
|
-
Applies corrections to OCR
|
973
|
-
|
998
|
+
Applies corrections to OCR text elements using a callback function.
|
999
|
+
Applies corrections to OCR text elements using a callback function.
|
974
1000
|
|
975
1001
|
Args:
|
976
|
-
correction_callback:
|
977
|
-
|
978
|
-
corrected text string if an update is needed, otherwise None.
|
1002
|
+
correction_callback: Function that takes an element and returns corrected text or None
|
1003
|
+
correction_callback: Function that takes an element and returns corrected text or None
|
979
1004
|
pages: Optional page indices/slice to limit the scope of correction
|
980
|
-
|
1005
|
+
max_workers: Maximum number of threads to use for parallel execution
|
1006
|
+
progress_callback: Optional callback function for progress updates
|
1007
|
+
max_workers: Maximum number of threads to use for parallel execution
|
1008
|
+
progress_callback: Optional callback function for progress updates
|
981
1009
|
|
982
1010
|
Returns:
|
983
|
-
Self for method chaining
|
1011
|
+
Self for method chaining
|
1012
|
+
Self for method chaining
|
984
1013
|
"""
|
985
|
-
|
986
|
-
target_page_indices
|
1014
|
+
target_page_indices = []
|
1015
|
+
target_page_indices = []
|
987
1016
|
if pages is None:
|
988
1017
|
target_page_indices = list(range(len(self._pages)))
|
989
1018
|
elif isinstance(pages, slice):
|
@@ -991,56 +1020,55 @@ class PDF:
|
|
991
1020
|
elif hasattr(pages, "__iter__"):
|
992
1021
|
try:
|
993
1022
|
target_page_indices = [int(i) for i in pages]
|
994
|
-
# Validate indices
|
995
1023
|
for idx in target_page_indices:
|
996
1024
|
if not (0 <= idx < len(self._pages)):
|
997
1025
|
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
998
1026
|
except (IndexError, TypeError, ValueError) as e:
|
999
|
-
raise ValueError(
|
1000
|
-
|
1001
|
-
) from e
|
1027
|
+
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1028
|
+
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1002
1029
|
else:
|
1003
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices
|
1030
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1031
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1004
1032
|
|
1005
1033
|
if not target_page_indices:
|
1006
1034
|
logger.warning("No pages selected for OCR correction.")
|
1007
1035
|
return self
|
1008
1036
|
|
1009
|
-
logger.info(
|
1010
|
-
|
1011
|
-
)
|
1037
|
+
logger.info(f"Starting OCR correction for pages: {target_page_indices}")
|
1038
|
+
logger.info(f"Starting OCR correction for pages: {target_page_indices}")
|
1012
1039
|
|
1013
|
-
# Iterate through target pages and call their correct_ocr method
|
1014
1040
|
for page_idx in target_page_indices:
|
1015
1041
|
page = self._pages[page_idx]
|
1016
1042
|
try:
|
1017
|
-
page.correct_ocr(
|
1043
|
+
page.correct_ocr(
|
1044
|
+
correction_callback=correction_callback,
|
1045
|
+
max_workers=max_workers,
|
1046
|
+
progress_callback=progress_callback,
|
1047
|
+
)
|
1018
1048
|
except Exception as e:
|
1019
|
-
logger.error(f"Error during correct_ocr on page {page_idx}: {e}"
|
1020
|
-
|
1049
|
+
logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
|
1050
|
+
logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
|
1021
1051
|
|
1022
|
-
logger.info(
|
1052
|
+
logger.info("OCR correction process finished.")
|
1053
|
+
logger.info("OCR correction process finished.")
|
1023
1054
|
return self
|
1024
1055
|
|
1025
1056
|
def __len__(self) -> int:
|
1026
1057
|
"""Return the number of pages in the PDF."""
|
1027
|
-
# Ensure _pages is initialized
|
1028
1058
|
if not hasattr(self, "_pages"):
|
1029
|
-
# Return 0 or raise error if not fully initialized? Let's return 0.
|
1030
1059
|
return 0
|
1031
1060
|
return len(self._pages)
|
1032
1061
|
|
1033
|
-
def __getitem__(self, key) -> Union[Page, "PageCollection"]:
|
1062
|
+
def __getitem__(self, key) -> Union["Page", "PageCollection"]:
|
1034
1063
|
"""Access pages by index or slice."""
|
1035
|
-
# Check if self._pages has been initialized
|
1036
1064
|
if not hasattr(self, "_pages"):
|
1037
1065
|
raise AttributeError("PDF pages not initialized yet.")
|
1066
|
+
|
1038
1067
|
if isinstance(key, slice):
|
1039
|
-
# Return a PageCollection slice
|
1040
1068
|
from natural_pdf.elements.collections import PageCollection
|
1041
1069
|
|
1042
1070
|
return PageCollection(self._pages[key])
|
1043
|
-
|
1071
|
+
|
1044
1072
|
if isinstance(key, int):
|
1045
1073
|
if 0 <= key < len(self._pages):
|
1046
1074
|
return self._pages[key]
|
@@ -1054,25 +1082,23 @@ class PDF:
|
|
1054
1082
|
if hasattr(self, "_pdf") and self._pdf is not None:
|
1055
1083
|
try:
|
1056
1084
|
self._pdf.close()
|
1057
|
-
logger.debug(f"Closed
|
1085
|
+
logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
|
1058
1086
|
except Exception as e:
|
1059
1087
|
logger.warning(f"Error closing pdfplumber object: {e}")
|
1060
1088
|
finally:
|
1061
1089
|
self._pdf = None
|
1062
1090
|
|
1063
|
-
# Clean up temporary file if it exists
|
1064
1091
|
if hasattr(self, "_temp_file") and self._temp_file is not None:
|
1065
1092
|
temp_file_path = None
|
1066
1093
|
try:
|
1067
1094
|
if hasattr(self._temp_file, "name") and self._temp_file.name:
|
1068
1095
|
temp_file_path = self._temp_file.name
|
1069
|
-
if
|
1096
|
+
# Only unlink if it exists and _is_stream is False (meaning WE created it)
|
1097
|
+
if not self._is_stream and os.path.exists(temp_file_path):
|
1070
1098
|
os.unlink(temp_file_path)
|
1071
1099
|
logger.debug(f"Removed temporary PDF file: {temp_file_path}")
|
1072
1100
|
except Exception as e:
|
1073
|
-
logger.warning(f"Failed to clean up temporary
|
1074
|
-
finally:
|
1075
|
-
self._temp_file = None
|
1101
|
+
logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
|
1076
1102
|
|
1077
1103
|
def __enter__(self):
|
1078
1104
|
"""Context manager entry."""
|
@@ -1082,6 +1108,432 @@ class PDF:
|
|
1082
1108
|
"""Context manager exit."""
|
1083
1109
|
self.close()
|
1084
1110
|
|
1085
|
-
# --- Indexable Protocol Methods --- Needed for search/sync
|
1086
1111
|
def get_id(self) -> str:
|
1112
|
+
"""Get unique identifier for this PDF."""
|
1113
|
+
"""Get unique identifier for this PDF."""
|
1087
1114
|
return self.path
|
1115
|
+
|
1116
|
+
# --- Deskew Method --- #
|
1117
|
+
|
1118
|
+
def deskew(
|
1119
|
+
self,
|
1120
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
1121
|
+
resolution: int = 300,
|
1122
|
+
detection_resolution: int = 72,
|
1123
|
+
force_overwrite: bool = False,
|
1124
|
+
**deskew_kwargs,
|
1125
|
+
) -> "PDF":
|
1126
|
+
"""
|
1127
|
+
Creates a new, in-memory PDF object containing deskewed versions of the
|
1128
|
+
specified pages from the original PDF.
|
1129
|
+
|
1130
|
+
This method renders each selected page, detects and corrects skew using the 'deskew'
|
1131
|
+
library, and then combines the resulting images into a new PDF using 'img2pdf'.
|
1132
|
+
The new PDF object is returned directly.
|
1133
|
+
|
1134
|
+
Important: The returned PDF is image-based. Any existing text, OCR results,
|
1135
|
+
annotations, or other elements from the original pages will *not* be carried over.
|
1136
|
+
|
1137
|
+
Args:
|
1138
|
+
pages: Page indices/slice to include (0-based). If None, processes all pages.
|
1139
|
+
resolution: DPI resolution for rendering the output deskewed pages.
|
1140
|
+
detection_resolution: DPI resolution used for skew detection if angles are not
|
1141
|
+
already cached on the page objects.
|
1142
|
+
force_overwrite: If False (default), raises a ValueError if any target page
|
1143
|
+
already contains processed elements (text, OCR, regions) to
|
1144
|
+
prevent accidental data loss. Set to True to proceed anyway.
|
1145
|
+
**deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
|
1146
|
+
during automatic detection (e.g., `max_angle`, `num_peaks`).
|
1147
|
+
|
1148
|
+
Returns:
|
1149
|
+
A new PDF object representing the deskewed document.
|
1150
|
+
|
1151
|
+
Raises:
|
1152
|
+
ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
|
1153
|
+
ValueError: If `force_overwrite` is False and target pages contain elements.
|
1154
|
+
FileNotFoundError: If the source PDF cannot be read (if file-based).
|
1155
|
+
IOError: If creating the in-memory PDF fails.
|
1156
|
+
RuntimeError: If rendering or deskewing individual pages fails.
|
1157
|
+
"""
|
1158
|
+
if not DESKEW_AVAILABLE:
|
1159
|
+
raise ImportError(
|
1160
|
+
"Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
|
1161
|
+
)
|
1162
|
+
|
1163
|
+
target_pages = self._get_target_pages(pages) # Use helper to resolve pages
|
1164
|
+
|
1165
|
+
# --- Safety Check --- #
|
1166
|
+
if not force_overwrite:
|
1167
|
+
for page in target_pages:
|
1168
|
+
# Check if the element manager has been initialized and contains any elements
|
1169
|
+
if (
|
1170
|
+
hasattr(page, "_element_mgr")
|
1171
|
+
and page._element_mgr
|
1172
|
+
and page._element_mgr.has_elements()
|
1173
|
+
):
|
1174
|
+
raise ValueError(
|
1175
|
+
f"Page {page.number} contains existing elements (text, OCR, etc.). "
|
1176
|
+
f"Deskewing creates an image-only PDF, discarding these elements. "
|
1177
|
+
f"Set force_overwrite=True to proceed."
|
1178
|
+
)
|
1179
|
+
|
1180
|
+
# --- Process Pages --- #
|
1181
|
+
deskewed_images_bytes = []
|
1182
|
+
logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
|
1183
|
+
|
1184
|
+
# Use tqdm via get_tqdm
|
1185
|
+
for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
|
1186
|
+
try:
|
1187
|
+
# Use page.deskew to get the corrected PIL image
|
1188
|
+
# Pass down resolutions and kwargs
|
1189
|
+
deskewed_img = page.deskew(
|
1190
|
+
resolution=resolution,
|
1191
|
+
angle=None, # Let page.deskew handle detection/caching
|
1192
|
+
detection_resolution=detection_resolution,
|
1193
|
+
**deskew_kwargs,
|
1194
|
+
)
|
1195
|
+
|
1196
|
+
if not deskewed_img:
|
1197
|
+
logger.warning(
|
1198
|
+
f"Page {page.number}: Failed to generate deskewed image, skipping."
|
1199
|
+
)
|
1200
|
+
continue
|
1201
|
+
|
1202
|
+
# Convert image to bytes for img2pdf (use PNG for lossless quality)
|
1203
|
+
with io.BytesIO() as buf:
|
1204
|
+
deskewed_img.save(buf, format="PNG")
|
1205
|
+
deskewed_images_bytes.append(buf.getvalue())
|
1206
|
+
|
1207
|
+
except Exception as e:
|
1208
|
+
logger.error(
|
1209
|
+
f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
|
1210
|
+
)
|
1211
|
+
# Option: Raise a runtime error, or continue and skip the page?
|
1212
|
+
# Raising makes the whole operation fail if one page fails.
|
1213
|
+
raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
|
1214
|
+
|
1215
|
+
# --- Create PDF --- #
|
1216
|
+
if not deskewed_images_bytes:
|
1217
|
+
raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
|
1218
|
+
|
1219
|
+
logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
|
1220
|
+
try:
|
1221
|
+
# Use img2pdf to combine image bytes into PDF bytes
|
1222
|
+
pdf_bytes = img2pdf.convert(deskewed_images_bytes)
|
1223
|
+
|
1224
|
+
# Wrap bytes in a stream
|
1225
|
+
pdf_stream = io.BytesIO(pdf_bytes)
|
1226
|
+
|
1227
|
+
# Create a new PDF object from the stream using original config
|
1228
|
+
logger.info("Creating new PDF object from deskewed stream...")
|
1229
|
+
new_pdf = PDF(
|
1230
|
+
pdf_stream,
|
1231
|
+
reading_order=self._reading_order,
|
1232
|
+
font_attrs=self._font_attrs,
|
1233
|
+
keep_spaces=self._config.get("keep_spaces", True),
|
1234
|
+
)
|
1235
|
+
return new_pdf
|
1236
|
+
except Exception as e:
|
1237
|
+
logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
|
1238
|
+
raise IOError("Failed to create deskewed PDF object from image stream.") from e
|
1239
|
+
|
1240
|
+
# --- End Deskew Method --- #
|
1241
|
+
|
1242
|
+
# --- Classification Methods --- #
|
1243
|
+
|
1244
|
+
def classify_pages(
|
1245
|
+
self,
|
1246
|
+
categories: List[str],
|
1247
|
+
model: Optional[str] = None,
|
1248
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
1249
|
+
analysis_key: str = "classification",
|
1250
|
+
using: Optional[str] = None,
|
1251
|
+
**kwargs,
|
1252
|
+
) -> "PDF":
|
1253
|
+
"""
|
1254
|
+
Classifies specified pages of the PDF.
|
1255
|
+
|
1256
|
+
Args:
|
1257
|
+
categories: List of category names
|
1258
|
+
model: Model identifier ('text', 'vision', or specific HF ID)
|
1259
|
+
pages: Page indices, slice, or None for all pages
|
1260
|
+
analysis_key: Key to store results in page's analyses dict
|
1261
|
+
using: Processing mode ('text' or 'vision')
|
1262
|
+
**kwargs: Additional arguments for the ClassificationManager
|
1263
|
+
|
1264
|
+
Returns:
|
1265
|
+
Self for method chaining
|
1266
|
+
"""
|
1267
|
+
if not categories:
|
1268
|
+
raise ValueError("Categories list cannot be empty.")
|
1269
|
+
|
1270
|
+
try:
|
1271
|
+
manager = self.get_manager("classification")
|
1272
|
+
except (ValueError, RuntimeError) as e:
|
1273
|
+
raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
|
1274
|
+
|
1275
|
+
if not manager or not manager.is_available():
|
1276
|
+
try:
|
1277
|
+
from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
|
1278
|
+
|
1279
|
+
if not _CLASSIFICATION_AVAILABLE:
|
1280
|
+
raise ImportError("Classification dependencies missing.")
|
1281
|
+
except ImportError:
|
1282
|
+
raise ImportError(
|
1283
|
+
"Classification dependencies missing. "
|
1284
|
+
'Install with: pip install "natural-pdf[classification]"'
|
1285
|
+
)
|
1286
|
+
raise ClassificationError("ClassificationManager not available.")
|
1287
|
+
|
1288
|
+
target_pages = []
|
1289
|
+
if pages is None:
|
1290
|
+
target_pages = self._pages
|
1291
|
+
elif isinstance(pages, slice):
|
1292
|
+
target_pages = self._pages[pages]
|
1293
|
+
elif hasattr(pages, "__iter__"):
|
1294
|
+
try:
|
1295
|
+
target_pages = [self._pages[i] for i in pages]
|
1296
|
+
except IndexError:
|
1297
|
+
raise ValueError("Invalid page index provided.")
|
1298
|
+
except TypeError:
|
1299
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1300
|
+
else:
|
1301
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1302
|
+
|
1303
|
+
if not target_pages:
|
1304
|
+
logger.warning("No pages selected for classification.")
|
1305
|
+
return self
|
1306
|
+
|
1307
|
+
inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
|
1308
|
+
logger.info(
|
1309
|
+
f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
|
1310
|
+
)
|
1311
|
+
|
1312
|
+
page_contents = []
|
1313
|
+
pages_to_classify = []
|
1314
|
+
logger.debug(f"Gathering content for {len(target_pages)} pages...")
|
1315
|
+
|
1316
|
+
for page in target_pages:
|
1317
|
+
try:
|
1318
|
+
content = page._get_classification_content(model_type=inferred_using, **kwargs)
|
1319
|
+
page_contents.append(content)
|
1320
|
+
pages_to_classify.append(page)
|
1321
|
+
except ValueError as e:
|
1322
|
+
logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
|
1323
|
+
except Exception as e:
|
1324
|
+
logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
|
1325
|
+
|
1326
|
+
if not page_contents:
|
1327
|
+
logger.warning("No content could be gathered for batch classification.")
|
1328
|
+
return self
|
1329
|
+
|
1330
|
+
logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
|
1331
|
+
|
1332
|
+
try:
|
1333
|
+
batch_results = manager.classify_batch(
|
1334
|
+
item_contents=page_contents,
|
1335
|
+
categories=categories,
|
1336
|
+
model_id=model,
|
1337
|
+
using=inferred_using,
|
1338
|
+
**kwargs,
|
1339
|
+
)
|
1340
|
+
except Exception as e:
|
1341
|
+
logger.error(f"Batch classification failed: {e}")
|
1342
|
+
raise ClassificationError(f"Batch classification failed: {e}") from e
|
1343
|
+
|
1344
|
+
if len(batch_results) != len(pages_to_classify):
|
1345
|
+
logger.error(
|
1346
|
+
f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
|
1347
|
+
)
|
1348
|
+
return self
|
1349
|
+
|
1350
|
+
logger.debug(
|
1351
|
+
f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
|
1352
|
+
)
|
1353
|
+
for page, result_obj in zip(pages_to_classify, batch_results):
|
1354
|
+
try:
|
1355
|
+
if not hasattr(page, "analyses") or page.analyses is None:
|
1356
|
+
page.analyses = {}
|
1357
|
+
page.analyses[analysis_key] = result_obj
|
1358
|
+
except Exception as e:
|
1359
|
+
logger.warning(
|
1360
|
+
f"Failed to store classification results for page {page.number}: {e}"
|
1361
|
+
)
|
1362
|
+
|
1363
|
+
logger.info(f"Finished classifying PDF pages.")
|
1364
|
+
return self
|
1365
|
+
|
1366
|
+
# --- End Classification Methods --- #
|
1367
|
+
|
1368
|
+
# --- Extraction Support --- #
|
1369
|
+
def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
|
1370
|
+
"""
|
1371
|
+
Retrieves the content for the entire PDF.
|
1372
|
+
|
1373
|
+
Args:
|
1374
|
+
using: 'text' or 'vision'
|
1375
|
+
**kwargs: Additional arguments passed to extract_text or page.to_image
|
1376
|
+
|
1377
|
+
Returns:
|
1378
|
+
str: Extracted text if using='text'
|
1379
|
+
List[PIL.Image.Image]: List of page images if using='vision'
|
1380
|
+
None: If content cannot be retrieved
|
1381
|
+
"""
|
1382
|
+
if using == "text":
|
1383
|
+
try:
|
1384
|
+
layout = kwargs.pop("layout", True)
|
1385
|
+
return self.extract_text(layout=layout, **kwargs)
|
1386
|
+
except Exception as e:
|
1387
|
+
logger.error(f"Error extracting text from PDF: {e}")
|
1388
|
+
return None
|
1389
|
+
elif using == "vision":
|
1390
|
+
page_images = []
|
1391
|
+
logger.info(f"Rendering {len(self.pages)} pages to images...")
|
1392
|
+
|
1393
|
+
resolution = kwargs.pop("resolution", 72)
|
1394
|
+
include_highlights = kwargs.pop("include_highlights", False)
|
1395
|
+
labels = kwargs.pop("labels", False)
|
1396
|
+
|
1397
|
+
try:
|
1398
|
+
for page in tqdm(self.pages, desc="Rendering Pages"):
|
1399
|
+
img = page.to_image(
|
1400
|
+
resolution=resolution,
|
1401
|
+
include_highlights=include_highlights,
|
1402
|
+
labels=labels,
|
1403
|
+
**kwargs,
|
1404
|
+
)
|
1405
|
+
if img:
|
1406
|
+
page_images.append(img)
|
1407
|
+
else:
|
1408
|
+
logger.warning(f"Failed to render page {page.number}, skipping.")
|
1409
|
+
if not page_images:
|
1410
|
+
logger.error("Failed to render any pages.")
|
1411
|
+
return None
|
1412
|
+
return page_images
|
1413
|
+
except Exception as e:
|
1414
|
+
logger.error(f"Error rendering pages: {e}")
|
1415
|
+
return None
|
1416
|
+
else:
|
1417
|
+
logger.error(f"Unsupported value for 'using': {using}")
|
1418
|
+
return None
|
1419
|
+
|
1420
|
+
# --- End Extraction Support --- #
|
1421
|
+
|
1422
|
+
def _gather_analysis_data(
|
1423
|
+
self,
|
1424
|
+
analysis_keys: List[str],
|
1425
|
+
include_content: bool,
|
1426
|
+
include_images: bool,
|
1427
|
+
image_dir: Optional[Path],
|
1428
|
+
image_format: str,
|
1429
|
+
image_resolution: int,
|
1430
|
+
) -> List[Dict[str, Any]]:
|
1431
|
+
"""
|
1432
|
+
Gather analysis data from all pages in the PDF.
|
1433
|
+
|
1434
|
+
Args:
|
1435
|
+
analysis_keys: Keys in the analyses dictionary to export
|
1436
|
+
include_content: Whether to include extracted text
|
1437
|
+
include_images: Whether to export images
|
1438
|
+
image_dir: Directory to save images
|
1439
|
+
image_format: Format to save images
|
1440
|
+
image_resolution: Resolution for exported images
|
1441
|
+
|
1442
|
+
Returns:
|
1443
|
+
List of dictionaries containing analysis data
|
1444
|
+
"""
|
1445
|
+
if not hasattr(self, "_pages") or not self._pages:
|
1446
|
+
logger.warning(f"No pages found in PDF {self.path}")
|
1447
|
+
return []
|
1448
|
+
|
1449
|
+
all_data = []
|
1450
|
+
|
1451
|
+
for page in tqdm(self._pages, desc="Gathering page data", leave=False):
|
1452
|
+
# Basic page information
|
1453
|
+
page_data = {
|
1454
|
+
"pdf_path": self.path,
|
1455
|
+
"page_number": page.number,
|
1456
|
+
"page_index": page.index,
|
1457
|
+
}
|
1458
|
+
|
1459
|
+
# Include extracted text if requested
|
1460
|
+
if include_content:
|
1461
|
+
try:
|
1462
|
+
page_data["content"] = page.extract_text(preserve_whitespace=True)
|
1463
|
+
except Exception as e:
|
1464
|
+
logger.error(f"Error extracting text from page {page.number}: {e}")
|
1465
|
+
page_data["content"] = ""
|
1466
|
+
|
1467
|
+
# Save image if requested
|
1468
|
+
if include_images:
|
1469
|
+
try:
|
1470
|
+
# Create image filename
|
1471
|
+
image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
|
1472
|
+
image_path = image_dir / image_filename
|
1473
|
+
|
1474
|
+
# Save image
|
1475
|
+
page.save_image(
|
1476
|
+
str(image_path), resolution=image_resolution, include_highlights=True
|
1477
|
+
)
|
1478
|
+
|
1479
|
+
# Add relative path to data
|
1480
|
+
page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
|
1481
|
+
except Exception as e:
|
1482
|
+
logger.error(f"Error saving image for page {page.number}: {e}")
|
1483
|
+
page_data["image_path"] = None
|
1484
|
+
|
1485
|
+
# Add analyses data
|
1486
|
+
for key in analysis_keys:
|
1487
|
+
if not hasattr(page, "analyses") or not page.analyses:
|
1488
|
+
raise ValueError(f"Page {page.number} does not have analyses data")
|
1489
|
+
|
1490
|
+
if key not in page.analyses:
|
1491
|
+
raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
|
1492
|
+
|
1493
|
+
# Get the analysis result
|
1494
|
+
analysis_result = page.analyses[key]
|
1495
|
+
|
1496
|
+
# If the result has a to_dict method, use it
|
1497
|
+
if hasattr(analysis_result, "to_dict"):
|
1498
|
+
analysis_data = analysis_result.to_dict()
|
1499
|
+
else:
|
1500
|
+
# Otherwise, use the result directly if it's dict-like
|
1501
|
+
try:
|
1502
|
+
analysis_data = dict(analysis_result)
|
1503
|
+
except (TypeError, ValueError):
|
1504
|
+
# Last resort: convert to string
|
1505
|
+
analysis_data = {"raw_result": str(analysis_result)}
|
1506
|
+
|
1507
|
+
# Add analysis data to page data with the key as prefix
|
1508
|
+
for k, v in analysis_data.items():
|
1509
|
+
page_data[f"{key}.{k}"] = v
|
1510
|
+
|
1511
|
+
all_data.append(page_data)
|
1512
|
+
|
1513
|
+
return all_data
|
1514
|
+
|
1515
|
+
def _get_target_pages(
|
1516
|
+
self, pages: Optional[Union[Iterable[int], range, slice]] = None
|
1517
|
+
) -> List["Page"]:
|
1518
|
+
"""
|
1519
|
+
Helper method to get a list of Page objects based on the input pages.
|
1520
|
+
|
1521
|
+
Args:
|
1522
|
+
pages: Page indices, slice, or None for all pages
|
1523
|
+
|
1524
|
+
Returns:
|
1525
|
+
List of Page objects
|
1526
|
+
"""
|
1527
|
+
if pages is None:
|
1528
|
+
return self._pages
|
1529
|
+
elif isinstance(pages, slice):
|
1530
|
+
return self._pages[pages]
|
1531
|
+
elif hasattr(pages, "__iter__"):
|
1532
|
+
try:
|
1533
|
+
return [self._pages[i] for i in pages]
|
1534
|
+
except IndexError:
|
1535
|
+
raise ValueError("Invalid page index provided in 'pages' iterable.")
|
1536
|
+
except TypeError:
|
1537
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1538
|
+
else:
|
1539
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|