natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,62 +1,88 @@
|
|
1
|
-
import
|
1
|
+
import copy # Add import for deepcopy
|
2
2
|
import logging
|
3
|
-
import tempfile
|
4
3
|
import os
|
5
4
|
import re
|
5
|
+
import tempfile
|
6
6
|
import urllib.request
|
7
|
-
from
|
8
|
-
from
|
9
|
-
|
7
|
+
from pathlib import Path # Added Path
|
8
|
+
from typing import ( # Added Iterable and TYPE_CHECKING
|
9
|
+
TYPE_CHECKING,
|
10
|
+
Any,
|
11
|
+
Callable,
|
12
|
+
Dict,
|
13
|
+
Iterable,
|
14
|
+
List,
|
15
|
+
Optional,
|
16
|
+
Tuple,
|
17
|
+
Type,
|
18
|
+
Union,
|
19
|
+
)
|
20
|
+
from pathlib import Path
|
21
|
+
|
22
|
+
|
23
|
+
import pdfplumber
|
10
24
|
from PIL import Image
|
11
25
|
|
26
|
+
from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
|
27
|
+
LayoutManager,
|
28
|
+
)
|
29
|
+
from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
|
12
30
|
from natural_pdf.core.page import Page
|
13
|
-
from natural_pdf.selectors.parser import parse_selector
|
14
31
|
from natural_pdf.elements.collections import ElementCollection
|
15
32
|
from natural_pdf.elements.region import Region
|
16
33
|
from natural_pdf.ocr import OCRManager, OCROptions
|
17
|
-
from natural_pdf.
|
18
|
-
from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
|
34
|
+
from natural_pdf.selectors.parser import parse_selector
|
19
35
|
|
20
36
|
# Import the flag directly - this should always work
|
21
37
|
|
22
38
|
# --- Add Search Service Imports (needed for new methods) ---
|
23
39
|
try:
|
40
|
+
from typing import Any as TypingAny # Import Any if not already
|
41
|
+
|
42
|
+
from natural_pdf.search import TextSearchOptions # Keep for ask default
|
24
43
|
from natural_pdf.search import (
|
25
|
-
|
26
|
-
SearchServiceProtocol,
|
44
|
+
BaseSearchOptions,
|
27
45
|
SearchOptions,
|
28
|
-
|
29
|
-
|
46
|
+
SearchServiceProtocol,
|
47
|
+
get_search_service,
|
30
48
|
)
|
31
|
-
from typing import Any as TypingAny # Import Any if not already
|
32
49
|
except ImportError:
|
33
50
|
# Define dummies if needed for type hints within the class
|
34
51
|
SearchServiceProtocol = object
|
35
52
|
SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
|
36
53
|
TypingAny = object
|
54
|
+
|
37
55
|
# Dummy factory needed for default arg in methods
|
38
56
|
def get_search_service(**kwargs) -> SearchServiceProtocol:
|
39
|
-
raise ImportError(
|
57
|
+
raise ImportError(
|
58
|
+
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
59
|
+
)
|
60
|
+
|
40
61
|
|
41
62
|
# --- End Search Service Imports ---
|
42
63
|
|
43
64
|
# Set up logger early
|
44
65
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
45
66
|
|
67
|
+
|
46
68
|
class PDF:
|
47
69
|
"""
|
48
70
|
Enhanced PDF wrapper built on top of pdfplumber.
|
49
|
-
|
71
|
+
|
50
72
|
This class provides a fluent interface for working with PDF documents,
|
51
73
|
with improved selection, navigation, and extraction capabilities.
|
52
74
|
"""
|
53
|
-
|
54
|
-
def __init__(
|
55
|
-
|
56
|
-
|
75
|
+
|
76
|
+
def __init__(
|
77
|
+
self,
|
78
|
+
path_or_url: str,
|
79
|
+
reading_order: bool = True,
|
80
|
+
font_attrs: Optional[List[str]] = None,
|
81
|
+
keep_spaces: bool = True,
|
82
|
+
):
|
57
83
|
"""
|
58
84
|
Initialize the enhanced PDF object.
|
59
|
-
|
85
|
+
|
60
86
|
Args:
|
61
87
|
path_or_url: Path to the PDF file or a URL to a PDF
|
62
88
|
reading_order: Whether to use natural reading order
|
@@ -69,30 +95,30 @@ class PDF:
|
|
69
95
|
False: Break text at spaces, each word is separate (legacy behavior)
|
70
96
|
"""
|
71
97
|
# Check if the input is a URL
|
72
|
-
is_url = path_or_url.startswith(
|
73
|
-
|
98
|
+
is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
99
|
+
|
74
100
|
# Initialize path-related attributes
|
75
101
|
self._original_path = path_or_url
|
76
102
|
self._temp_file = None
|
77
|
-
self._resolved_path = None
|
103
|
+
self._resolved_path = None # Store the actual path used by pdfplumber
|
78
104
|
|
79
105
|
if is_url:
|
80
106
|
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
81
107
|
try:
|
82
108
|
# Create a temporary file to store the downloaded PDF
|
83
|
-
self._temp_file = tempfile.NamedTemporaryFile(suffix=
|
84
|
-
|
109
|
+
self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
110
|
+
|
85
111
|
# Download the PDF
|
86
112
|
with urllib.request.urlopen(path_or_url) as response:
|
87
113
|
self._temp_file.write(response.read())
|
88
114
|
self._temp_file.flush()
|
89
115
|
self._temp_file.close()
|
90
|
-
|
116
|
+
|
91
117
|
# Use the temporary file path
|
92
118
|
self._resolved_path = self._temp_file.name
|
93
119
|
logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
|
94
120
|
except Exception as e:
|
95
|
-
if self._temp_file and hasattr(self._temp_file,
|
121
|
+
if self._temp_file and hasattr(self._temp_file, "name"):
|
96
122
|
try:
|
97
123
|
os.unlink(self._temp_file.name)
|
98
124
|
except:
|
@@ -104,40 +130,46 @@ class PDF:
|
|
104
130
|
self._resolved_path = path_or_url
|
105
131
|
|
106
132
|
logger.info(f"Initializing PDF from {self._resolved_path}")
|
107
|
-
logger.debug(
|
108
|
-
|
133
|
+
logger.debug(
|
134
|
+
f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
|
135
|
+
)
|
136
|
+
|
109
137
|
try:
|
110
138
|
self._pdf = pdfplumber.open(self._resolved_path)
|
111
139
|
except Exception as e:
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
140
|
+
logger.error(
|
141
|
+
f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
|
142
|
+
exc_info=True,
|
143
|
+
)
|
144
|
+
# Clean up temp file if creation failed
|
145
|
+
self.close()
|
146
|
+
raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
|
116
147
|
|
117
|
-
self._path = self._resolved_path
|
118
|
-
self.path = self._resolved_path
|
119
|
-
self.source_path = self._original_path
|
148
|
+
self._path = self._resolved_path # Keep original path too?
|
149
|
+
self.path = self._resolved_path # Public attribute for the resolved path
|
150
|
+
self.source_path = self._original_path # Public attribute for the user-provided path/URL
|
120
151
|
|
121
152
|
self._reading_order = reading_order
|
122
|
-
self._config = {
|
123
|
-
'keep_spaces': keep_spaces
|
124
|
-
}
|
153
|
+
self._config = {"keep_spaces": keep_spaces}
|
125
154
|
|
126
155
|
self._font_attrs = font_attrs # Store the font attribute configuration
|
127
156
|
|
128
157
|
# Initialize Managers and Services (conditionally available)
|
129
158
|
self._ocr_manager = OCRManager() if OCRManager else None
|
130
159
|
self._layout_manager = LayoutManager() if LayoutManager else None
|
131
|
-
self.highlighter = HighlightingService(self)
|
160
|
+
self.highlighter = HighlightingService(self)
|
132
161
|
|
133
162
|
# Initialize pages last, passing necessary refs
|
134
|
-
self._pages = [
|
163
|
+
self._pages = [
|
164
|
+
Page(p, parent=self, index=i, font_attrs=font_attrs)
|
165
|
+
for i, p in enumerate(self._pdf.pages)
|
166
|
+
]
|
135
167
|
|
136
168
|
# Other state
|
137
169
|
self._element_cache = {}
|
138
170
|
self._exclusions = [] # List to store exclusion functions/regions
|
139
171
|
self._regions = [] # List to store region functions/definitions
|
140
|
-
|
172
|
+
|
141
173
|
logger.info("Initialized HighlightingService.")
|
142
174
|
logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
|
143
175
|
|
@@ -147,45 +179,48 @@ class PDF:
|
|
147
179
|
return self._pdf.metadata
|
148
180
|
|
149
181
|
@property
|
150
|
-
def pages(self) ->
|
182
|
+
def pages(self) -> "PageCollection":
|
151
183
|
"""Access pages as a PageCollection object."""
|
152
184
|
from natural_pdf.elements.collections import PageCollection
|
185
|
+
|
153
186
|
# Ensure _pages is initialized
|
154
|
-
if not hasattr(self,
|
155
|
-
|
187
|
+
if not hasattr(self, "_pages"):
|
188
|
+
raise AttributeError("PDF pages not yet initialized.")
|
156
189
|
return PageCollection(self._pages)
|
157
|
-
|
158
|
-
def clear_exclusions(self) ->
|
190
|
+
|
191
|
+
def clear_exclusions(self) -> "PDF":
|
159
192
|
"""
|
160
193
|
Clear all exclusion functions from the PDF.
|
161
|
-
|
194
|
+
|
162
195
|
Returns:
|
163
196
|
Self for method chaining
|
164
197
|
"""
|
165
198
|
# Ensure _pages is initialized
|
166
|
-
if not hasattr(self,
|
167
|
-
|
199
|
+
if not hasattr(self, "_pages"):
|
200
|
+
raise AttributeError("PDF pages not yet initialized.")
|
168
201
|
|
169
202
|
self._exclusions = []
|
170
203
|
# Also clear from pages
|
171
204
|
for page in self._pages:
|
172
205
|
page.clear_exclusions()
|
173
206
|
return self
|
174
|
-
|
175
|
-
def add_exclusion(
|
207
|
+
|
208
|
+
def add_exclusion(
|
209
|
+
self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
|
210
|
+
) -> "PDF":
|
176
211
|
"""
|
177
212
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
178
|
-
|
213
|
+
|
179
214
|
Args:
|
180
215
|
exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
|
181
216
|
label: Optional label for this exclusion
|
182
|
-
|
217
|
+
|
183
218
|
Returns:
|
184
219
|
Self for method chaining
|
185
220
|
"""
|
186
221
|
# Ensure _pages is initialized
|
187
|
-
if not hasattr(self,
|
188
|
-
|
222
|
+
if not hasattr(self, "_pages"):
|
223
|
+
raise AttributeError("PDF pages not yet initialized.")
|
189
224
|
|
190
225
|
# Store exclusion with its label at PDF level
|
191
226
|
exclusion_data = (exclusion_func, label)
|
@@ -202,12 +237,17 @@ class PDF:
|
|
202
237
|
self,
|
203
238
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
204
239
|
engine: Optional[str] = None,
|
205
|
-
|
240
|
+
# --- Common OCR Parameters (Direct Arguments) ---
|
206
241
|
languages: Optional[List[str]] = None,
|
207
|
-
min_confidence: Optional[float] = None,
|
242
|
+
min_confidence: Optional[float] = None, # Min confidence threshold
|
208
243
|
device: Optional[str] = None,
|
209
|
-
|
210
|
-
|
244
|
+
resolution: Optional[int] = None, # DPI for rendering before OCR
|
245
|
+
apply_exclusions: bool = True, # New parameter
|
246
|
+
detect_only: bool = False,
|
247
|
+
# --- Engine-Specific Options --- Use 'options=' for this
|
248
|
+
options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
|
249
|
+
# **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
|
250
|
+
) -> "PDF":
|
211
251
|
"""
|
212
252
|
Applies OCR to specified pages (or all pages) of the PDF using batch processing.
|
213
253
|
|
@@ -217,42 +257,54 @@ class PDF:
|
|
217
257
|
Args:
|
218
258
|
pages: An iterable of 0-based page indices (list, range, tuple),
|
219
259
|
a slice object, or None to process all pages.
|
220
|
-
engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
|
221
|
-
Uses manager's default
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
min_confidence: Minimum confidence threshold for
|
226
|
-
|
260
|
+
engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
|
261
|
+
Uses manager's default ('easyocr') if None.
|
262
|
+
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
|
263
|
+
**Must be codes understood by the specific selected engine.**
|
264
|
+
No mapping is performed. Overrides manager/engine default.
|
265
|
+
min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
|
266
|
+
Overrides manager/engine default.
|
267
|
+
device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
|
268
|
+
Overrides manager/engine default.
|
269
|
+
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
270
|
+
Affects input quality for OCR. Defaults to 150 if not set.
|
271
|
+
apply_exclusions: If True (default), render page image for OCR with
|
272
|
+
excluded areas masked (whited out). If False, OCR
|
273
|
+
the raw page image without masking exclusions.
|
274
|
+
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
275
|
+
options: An engine-specific options object (e.g., EasyOCROptions) or dict
|
276
|
+
containing parameters specific to the chosen engine.
|
227
277
|
|
228
278
|
Returns:
|
229
279
|
Self for method chaining.
|
230
280
|
|
231
281
|
Raises:
|
232
|
-
ValueError: If page indices are invalid
|
233
|
-
TypeError: If
|
282
|
+
ValueError: If page indices are invalid.
|
283
|
+
TypeError: If 'options' is not compatible with the engine.
|
234
284
|
RuntimeError: If the OCRManager or selected engine is not available.
|
235
285
|
"""
|
236
286
|
if not self._ocr_manager:
|
237
|
-
|
238
|
-
|
239
|
-
|
287
|
+
logger.error("OCRManager not available. Cannot apply OCR.")
|
288
|
+
# Or raise RuntimeError("OCRManager not initialized.")
|
289
|
+
return self
|
240
290
|
|
241
|
-
# --- Determine Target Pages ---
|
291
|
+
# --- Determine Target Pages (unchanged) ---
|
242
292
|
target_pages: List[Page] = []
|
243
293
|
if pages is None:
|
244
294
|
target_pages = self._pages
|
245
295
|
elif isinstance(pages, slice):
|
246
296
|
target_pages = self._pages[pages]
|
247
|
-
elif hasattr(pages,
|
297
|
+
elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
|
248
298
|
try:
|
249
299
|
target_pages = [self._pages[i] for i in pages]
|
250
300
|
except IndexError:
|
251
301
|
raise ValueError("Invalid page index provided in 'pages' iterable.")
|
252
302
|
except TypeError:
|
253
|
-
|
303
|
+
raise TypeError(
|
304
|
+
"'pages' must be None, a slice, or an iterable of page indices (int)."
|
305
|
+
)
|
254
306
|
else:
|
255
|
-
|
307
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
|
256
308
|
|
257
309
|
if not target_pages:
|
258
310
|
logger.warning("No pages selected for OCR processing.")
|
@@ -260,101 +312,129 @@ class PDF:
|
|
260
312
|
|
261
313
|
page_numbers = [p.number for p in target_pages]
|
262
314
|
logger.info(f"Applying batch OCR to pages: {page_numbers}...")
|
315
|
+
# --- Determine Rendering Resolution ---
|
316
|
+
# Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
|
317
|
+
final_resolution = resolution # Use direct arg if provided
|
318
|
+
if final_resolution is None:
|
319
|
+
final_resolution = getattr(self, "_config", {}).get("resolution", 150)
|
320
|
+
|
321
|
+
logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
|
263
322
|
|
264
323
|
# --- Render Images for Batch ---
|
265
324
|
images_pil: List[Image.Image] = []
|
266
|
-
page_image_map: List[Tuple[Page, Image.Image]] = []
|
267
|
-
logger.info(f"Rendering {len(target_pages)} pages to images...")
|
268
|
-
failed_page_num =
|
325
|
+
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
326
|
+
logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
|
327
|
+
failed_page_num = "unknown" # Keep track of potentially failing page
|
269
328
|
try:
|
270
|
-
ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
|
271
329
|
for i, page in enumerate(target_pages):
|
272
|
-
failed_page_num = page.number
|
330
|
+
failed_page_num = page.number # Update current page number in case of error
|
273
331
|
logger.debug(f" Rendering page {page.number} (index {page.index})...")
|
274
|
-
# Use
|
275
|
-
|
332
|
+
# Use the determined final_resolution and apply exclusions if requested
|
333
|
+
to_image_kwargs = {
|
334
|
+
"resolution": final_resolution,
|
335
|
+
"include_highlights": False,
|
336
|
+
"exclusions": "mask" if apply_exclusions else None,
|
337
|
+
}
|
338
|
+
img = page.to_image(**to_image_kwargs)
|
339
|
+
if img is None:
|
340
|
+
logger.error(f" Failed to render page {page.number} to image.")
|
341
|
+
# Decide how to handle: skip page, raise error? For now, skip.
|
342
|
+
continue # Skip this page if rendering failed
|
276
343
|
images_pil.append(img)
|
277
|
-
page_image_map.append((page, img))
|
344
|
+
page_image_map.append((page, img)) # Store pair
|
278
345
|
except Exception as e:
|
279
346
|
logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
|
280
347
|
raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
|
281
348
|
|
282
|
-
if not images_pil:
|
283
|
-
|
284
|
-
|
349
|
+
if not images_pil or not page_image_map:
|
350
|
+
logger.error("No images were successfully rendered for batch OCR.")
|
351
|
+
return self
|
285
352
|
|
286
353
|
# --- Prepare Arguments for Manager ---
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
354
|
+
# Pass common args directly, engine-specific via options
|
355
|
+
manager_args = {
|
356
|
+
"images": images_pil,
|
357
|
+
"engine": engine,
|
358
|
+
"languages": languages,
|
359
|
+
"min_confidence": min_confidence, # Use the renamed parameter
|
360
|
+
"device": device,
|
361
|
+
"options": options,
|
362
|
+
"detect_only": detect_only,
|
363
|
+
# Note: resolution is used for rendering, not passed to OCR manager directly
|
364
|
+
}
|
365
|
+
# Filter out None values so manager can use its defaults
|
366
|
+
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
293
367
|
|
294
368
|
# --- Call OCR Manager for Batch Processing ---
|
295
|
-
logger.info(f"Calling OCR Manager for
|
369
|
+
logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
|
296
370
|
try:
|
297
|
-
#
|
371
|
+
# Manager's apply_ocr signature needs to accept common args directly
|
298
372
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
299
373
|
|
300
374
|
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
301
|
-
logger.error(
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
375
|
+
logger.error(
|
376
|
+
f"OCR Manager returned unexpected result format or length for batch processing. "
|
377
|
+
f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
|
378
|
+
f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
|
379
|
+
)
|
380
|
+
return self
|
306
381
|
|
307
382
|
logger.info("OCR Manager batch processing complete.")
|
308
383
|
|
309
384
|
except Exception as e:
|
310
|
-
|
311
|
-
|
385
|
+
logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
|
386
|
+
return self
|
312
387
|
|
313
|
-
# --- Distribute Results and Add Elements to Pages ---
|
388
|
+
# --- Distribute Results and Add Elements to Pages (unchanged) ---
|
314
389
|
logger.info("Adding OCR results to respective pages...")
|
315
390
|
total_elements_added = 0
|
316
391
|
for i, (page, img) in enumerate(page_image_map):
|
317
392
|
results_for_page = batch_results[i]
|
318
393
|
if not isinstance(results_for_page, list):
|
319
|
-
|
320
|
-
|
394
|
+
logger.warning(
|
395
|
+
f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
|
396
|
+
)
|
397
|
+
continue
|
321
398
|
|
322
399
|
logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
|
323
|
-
# Use the page's element manager to create elements from its results
|
324
|
-
# Changed from page._create_text_elements_from_ocr to use element_mgr
|
325
400
|
try:
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
401
|
+
img_scale_x = page.width / img.width if img.width > 0 else 1
|
402
|
+
img_scale_y = page.height / img.height if img.height > 0 else 1
|
403
|
+
elements = page._element_mgr.create_text_elements_from_ocr(
|
404
|
+
results_for_page, img_scale_x, img_scale_y
|
405
|
+
)
|
406
|
+
|
407
|
+
if elements:
|
408
|
+
total_elements_added += len(elements)
|
409
|
+
logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
|
410
|
+
else:
|
411
|
+
logger.debug(f" No valid TextElements created for page {page.number}.")
|
337
412
|
except Exception as e:
|
338
|
-
|
339
|
-
|
413
|
+
logger.error(
|
414
|
+
f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
|
415
|
+
)
|
340
416
|
|
341
|
-
logger.info(
|
417
|
+
logger.info(
|
418
|
+
f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
|
419
|
+
)
|
342
420
|
return self
|
343
|
-
|
344
|
-
def add_region(
|
421
|
+
|
422
|
+
def add_region(
|
423
|
+
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
424
|
+
) -> "PDF":
|
345
425
|
"""
|
346
426
|
Add a region function to the PDF. This creates regions on all pages using the provided function.
|
347
|
-
|
427
|
+
|
348
428
|
Args:
|
349
429
|
region_func: A function that takes a Page and returns a Region, or None.
|
350
430
|
name: Optional name for the region
|
351
|
-
|
431
|
+
|
352
432
|
Returns:
|
353
433
|
Self for method chaining
|
354
434
|
"""
|
355
435
|
# Ensure _pages is initialized
|
356
|
-
if not hasattr(self,
|
357
|
-
|
436
|
+
if not hasattr(self, "_pages"):
|
437
|
+
raise AttributeError("PDF pages not yet initialized.")
|
358
438
|
|
359
439
|
# Store region with its name at PDF level
|
360
440
|
region_data = (region_func, name)
|
@@ -367,93 +447,108 @@ class PDF:
|
|
367
447
|
region_instance = region_func(page)
|
368
448
|
if region_instance and isinstance(region_instance, Region):
|
369
449
|
# If a valid region is returned, add it to the page
|
370
|
-
page.add_region(region_instance, name=name, source=
|
450
|
+
page.add_region(region_instance, name=name, source="named")
|
371
451
|
elif region_instance is not None:
|
372
|
-
|
452
|
+
logger.warning(
|
453
|
+
f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
|
454
|
+
)
|
373
455
|
except Exception as e:
|
374
|
-
logger.error(
|
456
|
+
logger.error(
|
457
|
+
f"Error executing or adding region function for page {page.number}: {e}",
|
458
|
+
exc_info=True,
|
459
|
+
)
|
375
460
|
|
376
461
|
return self
|
377
|
-
|
378
|
-
def find(
|
462
|
+
|
463
|
+
def find(
|
464
|
+
self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
|
465
|
+
) -> Optional[Any]:
|
379
466
|
"""
|
380
467
|
Find the first element matching the selector.
|
381
|
-
|
468
|
+
|
382
469
|
Args:
|
383
470
|
selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
|
384
471
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
385
472
|
regex: Whether to use regex for text search in :contains (default: False)
|
386
473
|
case: Whether to do case-sensitive text search (default: True)
|
387
474
|
**kwargs: Additional filter parameters
|
388
|
-
|
475
|
+
|
389
476
|
Returns:
|
390
477
|
Element object or None if not found
|
391
478
|
"""
|
392
479
|
# Ensure _pages is initialized
|
393
|
-
if not hasattr(self,
|
394
|
-
|
480
|
+
if not hasattr(self, "_pages"):
|
481
|
+
raise AttributeError("PDF pages not yet initialized.")
|
395
482
|
|
396
483
|
selector_obj = parse_selector(selector)
|
397
|
-
|
484
|
+
|
398
485
|
# Pass regex and case flags to selector function
|
399
|
-
kwargs[
|
400
|
-
kwargs[
|
401
|
-
|
402
|
-
results = self._apply_selector(
|
486
|
+
kwargs["regex"] = regex
|
487
|
+
kwargs["case"] = case
|
488
|
+
|
489
|
+
results = self._apply_selector(
|
490
|
+
selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
|
491
|
+
)
|
403
492
|
return results.first if results else None
|
404
|
-
|
405
|
-
def find_all(
|
493
|
+
|
494
|
+
def find_all(
|
495
|
+
self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
|
496
|
+
) -> ElementCollection:
|
406
497
|
"""
|
407
498
|
Find all elements matching the selector.
|
408
|
-
|
499
|
+
|
409
500
|
Args:
|
410
501
|
selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
|
411
502
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
412
503
|
regex: Whether to use regex for text search in :contains (default: False)
|
413
504
|
case: Whether to do case-sensitive text search (default: True)
|
414
505
|
**kwargs: Additional filter parameters
|
415
|
-
|
506
|
+
|
416
507
|
Returns:
|
417
508
|
ElementCollection with matching elements
|
418
509
|
"""
|
419
510
|
# Ensure _pages is initialized
|
420
|
-
if not hasattr(self,
|
421
|
-
|
511
|
+
if not hasattr(self, "_pages"):
|
512
|
+
raise AttributeError("PDF pages not yet initialized.")
|
422
513
|
|
423
514
|
selector_obj = parse_selector(selector)
|
424
|
-
|
515
|
+
|
425
516
|
# Pass regex and case flags to selector function
|
426
|
-
kwargs[
|
427
|
-
kwargs[
|
428
|
-
|
429
|
-
results = self._apply_selector(
|
517
|
+
kwargs["regex"] = regex
|
518
|
+
kwargs["case"] = case
|
519
|
+
|
520
|
+
results = self._apply_selector(
|
521
|
+
selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
|
522
|
+
)
|
430
523
|
return results
|
431
|
-
|
432
|
-
def _apply_selector(
|
524
|
+
|
525
|
+
def _apply_selector(
|
526
|
+
self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
|
527
|
+
) -> ElementCollection:
|
433
528
|
"""
|
434
529
|
Apply selector to PDF elements across all pages.
|
435
|
-
|
530
|
+
|
436
531
|
Args:
|
437
532
|
selector_obj: Parsed selector dictionary
|
438
533
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
439
534
|
first_only: If True, stop searching after the first match is found.
|
440
535
|
**kwargs: Additional filter parameters
|
441
|
-
|
536
|
+
|
442
537
|
Returns:
|
443
538
|
ElementCollection of matching elements
|
444
539
|
"""
|
445
540
|
from natural_pdf.elements.collections import ElementCollection
|
446
|
-
|
541
|
+
|
447
542
|
# Determine page range to search
|
448
|
-
page_indices = kwargs.get(
|
543
|
+
page_indices = kwargs.get("pages", range(len(self._pages)))
|
449
544
|
if isinstance(page_indices, int):
|
450
545
|
page_indices = [page_indices]
|
451
546
|
elif isinstance(page_indices, slice):
|
452
547
|
page_indices = range(*page_indices.indices(len(self._pages)))
|
453
548
|
|
454
549
|
# Check for cross-page pseudo-classes (currently not supported)
|
455
|
-
for pseudo in selector_obj.get(
|
456
|
-
if pseudo.get(
|
550
|
+
for pseudo in selector_obj.get("pseudo_classes", []):
|
551
|
+
if pseudo.get("name") in ("spans", "continues"):
|
457
552
|
logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
|
458
553
|
return ElementCollection([])
|
459
554
|
|
@@ -464,134 +559,148 @@ class PDF:
|
|
464
559
|
page = self._pages[page_idx]
|
465
560
|
# Pass first_only down to page._apply_selector
|
466
561
|
page_elements_collection = page._apply_selector(
|
467
|
-
selector_obj,
|
468
|
-
apply_exclusions=apply_exclusions,
|
469
|
-
first_only=first_only,
|
470
|
-
**kwargs
|
562
|
+
selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
|
471
563
|
)
|
472
564
|
if page_elements_collection:
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
565
|
+
page_elements = page_elements_collection.elements
|
566
|
+
all_elements.extend(page_elements)
|
567
|
+
# If we only need the first match overall, and we found one on this page, stop
|
568
|
+
if first_only and page_elements:
|
569
|
+
break # Stop iterating through pages
|
478
570
|
else:
|
479
|
-
|
571
|
+
logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
|
480
572
|
|
481
573
|
# Create a combined collection
|
482
574
|
combined = ElementCollection(all_elements)
|
483
575
|
|
484
576
|
# Sort in document order if requested and not first_only (already sorted by page)
|
485
|
-
if not first_only and kwargs.get(
|
577
|
+
if not first_only and kwargs.get("document_order", True):
|
486
578
|
# Check if elements have page, top, x0 before sorting
|
487
|
-
if all(
|
488
|
-
|
579
|
+
if all(
|
580
|
+
hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
|
581
|
+
for el in combined.elements
|
582
|
+
):
|
583
|
+
combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
|
489
584
|
else:
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
585
|
+
# Elements might be Regions without inherent sorting order yet
|
586
|
+
# Attempt sorting by page index if possible
|
587
|
+
try:
|
588
|
+
combined.sort(key=lambda el: el.page.index)
|
589
|
+
except AttributeError:
|
590
|
+
logger.warning(
|
591
|
+
"Cannot sort elements in document order: Missing required attributes (e.g., page)."
|
592
|
+
)
|
496
593
|
|
497
594
|
return combined
|
498
|
-
|
499
|
-
def extract_text(
|
500
|
-
|
595
|
+
|
596
|
+
def extract_text(
|
597
|
+
self,
|
598
|
+
selector: Optional[str] = None,
|
599
|
+
preserve_whitespace=True,
|
600
|
+
use_exclusions=True,
|
601
|
+
debug_exclusions=False,
|
602
|
+
**kwargs,
|
603
|
+
) -> str:
|
501
604
|
"""
|
502
605
|
Extract text from the entire document or matching elements.
|
503
|
-
|
606
|
+
|
504
607
|
Args:
|
505
608
|
selector: Optional selector to filter elements
|
506
609
|
preserve_whitespace: Whether to keep blank characters (default: True)
|
507
610
|
use_exclusions: Whether to apply exclusion regions (default: True)
|
508
611
|
debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
|
509
612
|
**kwargs: Additional extraction parameters
|
510
|
-
|
613
|
+
|
511
614
|
Returns:
|
512
615
|
Extracted text as string
|
513
616
|
"""
|
514
617
|
# Ensure _pages is initialized
|
515
|
-
if not hasattr(self,
|
516
|
-
|
618
|
+
if not hasattr(self, "_pages"):
|
619
|
+
raise AttributeError("PDF pages not yet initialized.")
|
517
620
|
|
518
621
|
# If selector is provided, find elements first
|
519
622
|
if selector:
|
520
623
|
elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
|
521
624
|
return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
522
|
-
|
625
|
+
|
523
626
|
# Otherwise extract from all pages
|
524
627
|
if debug_exclusions:
|
525
628
|
print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
|
526
629
|
print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
|
527
|
-
|
630
|
+
|
528
631
|
texts = []
|
529
632
|
for page in self.pages:
|
530
|
-
texts.append(
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
633
|
+
texts.append(
|
634
|
+
page.extract_text(
|
635
|
+
preserve_whitespace=preserve_whitespace,
|
636
|
+
use_exclusions=use_exclusions,
|
637
|
+
debug_exclusions=debug_exclusions,
|
638
|
+
**kwargs,
|
639
|
+
)
|
640
|
+
)
|
641
|
+
|
537
642
|
if debug_exclusions:
|
538
643
|
print(f"PDF: Combined {len(texts)} pages of text")
|
539
|
-
|
644
|
+
|
540
645
|
return "\n".join(texts)
|
541
|
-
|
646
|
+
|
542
647
|
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
|
543
648
|
"""
|
544
649
|
Shorthand for finding elements and extracting their text.
|
545
|
-
|
650
|
+
|
546
651
|
Args:
|
547
652
|
selector: CSS-like selector string
|
548
653
|
preserve_whitespace: Whether to keep blank characters (default: True)
|
549
654
|
**kwargs: Additional extraction parameters
|
550
|
-
|
655
|
+
|
551
656
|
Returns:
|
552
657
|
Extracted text from matching elements
|
553
658
|
"""
|
554
659
|
# Ensure _pages is initialized
|
555
|
-
if not hasattr(self,
|
556
|
-
|
557
|
-
return self.extract_text(
|
558
|
-
|
559
|
-
|
660
|
+
if not hasattr(self, "_pages"):
|
661
|
+
raise AttributeError("PDF pages not yet initialized.")
|
662
|
+
return self.extract_text(
|
663
|
+
selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
|
664
|
+
) # apply_exclusions is handled by find_all in extract_text
|
665
|
+
|
666
|
+
def extract_tables(
|
667
|
+
self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
|
668
|
+
) -> List[Any]:
|
560
669
|
"""
|
561
670
|
Extract tables from the document or matching elements.
|
562
|
-
|
671
|
+
|
563
672
|
Args:
|
564
673
|
selector: Optional selector to filter tables
|
565
674
|
merge_across_pages: Whether to merge tables that span across pages
|
566
675
|
**kwargs: Additional extraction parameters
|
567
|
-
|
676
|
+
|
568
677
|
Returns:
|
569
678
|
List of extracted tables
|
570
679
|
"""
|
571
680
|
# Ensure _pages is initialized
|
572
|
-
if not hasattr(self,
|
573
|
-
|
681
|
+
if not hasattr(self, "_pages"):
|
682
|
+
raise AttributeError("PDF pages not yet initialized.")
|
574
683
|
# TODO: Implement table extraction
|
575
684
|
logger.warning("PDF.extract_tables is not fully implemented yet.")
|
576
685
|
all_tables = []
|
577
686
|
for page in self.pages:
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
687
|
+
# Assuming page.extract_tables(**kwargs) exists or is added
|
688
|
+
if hasattr(page, "extract_tables"):
|
689
|
+
all_tables.extend(page.extract_tables(**kwargs))
|
690
|
+
else:
|
691
|
+
logger.debug(f"Page {page.number} does not have extract_tables method.")
|
583
692
|
# Placeholder filtering
|
584
693
|
if selector:
|
585
694
|
logger.warning("Filtering extracted tables by selector is not implemented.")
|
586
695
|
# Would need to parse selector and filter the list `all_tables`
|
587
696
|
# Placeholder merging
|
588
697
|
if merge_across_pages:
|
589
|
-
|
590
|
-
|
698
|
+
logger.warning("Merging tables across pages is not implemented.")
|
699
|
+
# Would need logic to detect and merge related tables
|
591
700
|
return all_tables
|
592
|
-
|
701
|
+
|
593
702
|
# --- New Method: save_searchable ---
|
594
|
-
def save_searchable(self, output_path: Union[str,
|
703
|
+
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
595
704
|
"""
|
596
705
|
Saves the PDF with an OCR text layer, making content searchable.
|
597
706
|
|
@@ -608,15 +717,6 @@ class PDF:
|
|
608
717
|
# Import moved here, assuming it's always available now
|
609
718
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
610
719
|
|
611
|
-
# TODO: Need a reliable way for Page to signal if it has OCR elements.
|
612
|
-
# This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
|
613
|
-
# or checking if page.get_elements(source='ocr') returns anything.
|
614
|
-
# For now, we pass through and let the exporter handle pages without OCR elements.
|
615
|
-
# if not any(page.get_elements(source='ocr') for page in self.pages):
|
616
|
-
# logger.warning("No OCR elements found on pages. "
|
617
|
-
# "Ensure apply_ocr() was called. "
|
618
|
-
# "Output PDF might not be searchable.")
|
619
|
-
|
620
720
|
# Convert pathlib.Path to string if necessary
|
621
721
|
output_path_str = str(output_path)
|
622
722
|
|
@@ -625,15 +725,18 @@ class PDF:
|
|
625
725
|
|
626
726
|
# --- End New Method ---
|
627
727
|
|
628
|
-
def ask(
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
728
|
+
def ask(
|
729
|
+
self,
|
730
|
+
question: str,
|
731
|
+
mode: str = "extractive",
|
732
|
+
pages: Union[int, List[int], range] = None,
|
733
|
+
min_confidence: float = 0.1,
|
734
|
+
model: str = None,
|
735
|
+
**kwargs,
|
736
|
+
) -> Dict[str, Any]:
|
634
737
|
"""
|
635
738
|
Ask a question about the document content.
|
636
|
-
|
739
|
+
|
637
740
|
Args:
|
638
741
|
question: Question to ask about the document
|
639
742
|
mode: "extractive" to extract answer from document, "generative" to generate
|
@@ -641,16 +744,16 @@ class PDF:
|
|
641
744
|
min_confidence: Minimum confidence threshold for answers
|
642
745
|
model: Optional model name for question answering
|
643
746
|
**kwargs: Additional parameters passed to the QA engine
|
644
|
-
|
747
|
+
|
645
748
|
Returns:
|
646
749
|
A dictionary containing the answer, confidence, and other metadata.
|
647
750
|
Result will have an 'answer' key containing the answer text.
|
648
751
|
"""
|
649
752
|
from natural_pdf.qa import get_qa_engine
|
650
|
-
|
753
|
+
|
651
754
|
# Initialize or get QA engine
|
652
755
|
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
653
|
-
|
756
|
+
|
654
757
|
# Determine which pages to query
|
655
758
|
if pages is None:
|
656
759
|
target_pages = list(range(len(self.pages)))
|
@@ -662,43 +765,40 @@ class PDF:
|
|
662
765
|
target_pages = pages
|
663
766
|
else:
|
664
767
|
raise ValueError(f"Invalid pages parameter: {pages}")
|
665
|
-
|
768
|
+
|
666
769
|
# Actually query each page and gather results
|
667
770
|
results = []
|
668
771
|
for page_idx in target_pages:
|
669
772
|
if 0 <= page_idx < len(self.pages):
|
670
773
|
page = self.pages[page_idx]
|
671
774
|
page_result = qa_engine.ask_pdf_page(
|
672
|
-
page=page,
|
673
|
-
question=question,
|
674
|
-
min_confidence=min_confidence,
|
675
|
-
**kwargs
|
775
|
+
page=page, question=question, min_confidence=min_confidence, **kwargs
|
676
776
|
)
|
677
|
-
|
777
|
+
|
678
778
|
# Add to results if it found an answer
|
679
779
|
if page_result and page_result.get("found", False):
|
680
780
|
results.append(page_result)
|
681
|
-
|
781
|
+
|
682
782
|
# Sort results by confidence
|
683
783
|
results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
684
|
-
|
784
|
+
|
685
785
|
# Return the best result, or a default result if none found
|
686
786
|
if results:
|
687
787
|
return results[0]
|
688
788
|
else:
|
689
789
|
# Return a structure indicating no answer found
|
690
790
|
return {
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
791
|
+
"answer": None,
|
792
|
+
"confidence": 0.0,
|
793
|
+
"found": False,
|
794
|
+
"page_num": None, # Or maybe the pages searched?
|
795
|
+
"source_elements": [],
|
696
796
|
}
|
697
797
|
|
698
798
|
def search_within_index(
|
699
799
|
self,
|
700
800
|
query: Union[str, Path, Image.Image, Region],
|
701
|
-
search_service: SearchServiceProtocol,
|
801
|
+
search_service: SearchServiceProtocol, # Now required
|
702
802
|
options: Optional[SearchOptions] = None,
|
703
803
|
) -> List[Dict[str, Any]]:
|
704
804
|
"""
|
@@ -730,14 +830,16 @@ class PDF:
|
|
730
830
|
RuntimeError: For other search failures.
|
731
831
|
"""
|
732
832
|
if not search_service:
|
733
|
-
|
833
|
+
raise ValueError("A configured SearchServiceProtocol instance must be provided.")
|
734
834
|
# Optional stricter check:
|
735
835
|
# if not isinstance(search_service, SearchServiceProtocol):
|
736
836
|
# raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
|
737
837
|
|
738
838
|
# Get collection name from service for logging
|
739
|
-
collection_name = getattr(search_service,
|
740
|
-
logger.info(
|
839
|
+
collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
|
840
|
+
logger.info(
|
841
|
+
f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
|
842
|
+
)
|
741
843
|
|
742
844
|
# --- 1. Get Search Service Instance --- (REMOVED - provided directly)
|
743
845
|
# service: SearchServiceProtocol
|
@@ -748,7 +850,7 @@ class PDF:
|
|
748
850
|
# factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
|
749
851
|
# # TODO: Pass embedding model from options/pdf config if needed?
|
750
852
|
# service = get_search_service(**factory_args)
|
751
|
-
service = search_service
|
853
|
+
service = search_service # Use validated provided service
|
752
854
|
|
753
855
|
# --- 2. Prepare Query and Options ---
|
754
856
|
query_input = query
|
@@ -757,119 +859,220 @@ class PDF:
|
|
757
859
|
|
758
860
|
# Handle Region query - extract text for now
|
759
861
|
if isinstance(query, Region):
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
862
|
+
logger.debug("Query is a Region object. Extracting text.")
|
863
|
+
if not isinstance(effective_options, TextSearchOptions):
|
864
|
+
logger.warning(
|
865
|
+
"Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
|
866
|
+
)
|
867
|
+
query_input = query.extract_text()
|
868
|
+
if not query_input or query_input.isspace():
|
869
|
+
logger.error("Region has no extractable text for query.")
|
870
|
+
return []
|
767
871
|
|
768
872
|
# --- 3. Add Filter to Scope Search to THIS PDF ---
|
769
873
|
# Assume metadata field 'pdf_path' stores the resolved path used during indexing
|
770
874
|
pdf_scope_filter = {
|
771
|
-
|
772
|
-
|
773
|
-
|
875
|
+
"field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
|
876
|
+
"operator": "eq",
|
877
|
+
"value": self.path, # Use the resolved path of this PDF instance
|
774
878
|
}
|
775
879
|
logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
|
776
880
|
|
777
881
|
# Combine with existing filters in options (if any)
|
778
882
|
if effective_options.filters:
|
779
|
-
logger.debug(
|
883
|
+
logger.debug(
|
884
|
+
f"Combining PDF scope filter with existing filters: {effective_options.filters}"
|
885
|
+
)
|
780
886
|
# Assume filters are compatible with the underlying search service
|
781
887
|
# If existing filters aren't already in an AND block, wrap them
|
782
|
-
if
|
783
|
-
|
784
|
-
|
888
|
+
if (
|
889
|
+
isinstance(effective_options.filters, dict)
|
890
|
+
and effective_options.filters.get("operator") == "AND"
|
891
|
+
):
|
892
|
+
# Already an AND block, just append the condition
|
893
|
+
effective_options.filters["conditions"].append(pdf_scope_filter)
|
785
894
|
elif isinstance(effective_options.filters, list):
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
895
|
+
# Assume list represents implicit AND conditions
|
896
|
+
effective_options.filters = {
|
897
|
+
"operator": "AND",
|
898
|
+
"conditions": effective_options.filters + [pdf_scope_filter],
|
899
|
+
}
|
900
|
+
elif isinstance(effective_options.filters, dict): # Single filter dict
|
901
|
+
effective_options.filters = {
|
902
|
+
"operator": "AND",
|
903
|
+
"conditions": [effective_options.filters, pdf_scope_filter],
|
904
|
+
}
|
790
905
|
else:
|
791
|
-
|
792
|
-
|
906
|
+
logger.warning(
|
907
|
+
f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
|
908
|
+
)
|
909
|
+
effective_options.filters = pdf_scope_filter
|
793
910
|
else:
|
794
911
|
effective_options.filters = pdf_scope_filter
|
795
912
|
|
796
913
|
logger.debug(f"Final filters for service search: {effective_options.filters}")
|
797
914
|
|
798
|
-
# --- 4. Call SearchService ---
|
915
|
+
# --- 4. Call SearchService ---
|
799
916
|
try:
|
800
917
|
# Call the service's search method (no collection_name needed)
|
801
918
|
results = service.search(
|
802
919
|
query=query_input,
|
803
920
|
options=effective_options,
|
804
921
|
)
|
805
|
-
logger.info(
|
922
|
+
logger.info(
|
923
|
+
f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
|
924
|
+
)
|
806
925
|
return results
|
807
926
|
except FileNotFoundError as fnf:
|
808
|
-
|
809
|
-
|
927
|
+
logger.error(
|
928
|
+
f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
|
929
|
+
)
|
930
|
+
raise # Re-raise specific error
|
931
|
+
except Exception as e:
|
932
|
+
logger.error(
|
933
|
+
f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
|
934
|
+
exc_info=True,
|
935
|
+
)
|
936
|
+
raise RuntimeError(
|
937
|
+
f"Search within index failed for PDF '{self.path}'. See logs for details."
|
938
|
+
) from e
|
939
|
+
|
940
|
+
def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
|
941
|
+
"""
|
942
|
+
Exports OCR results from this PDF into a correction task package (zip file).
|
943
|
+
|
944
|
+
Args:
|
945
|
+
output_zip_path: The path to save the output zip file.
|
946
|
+
**kwargs: Additional arguments passed to create_correction_task_package
|
947
|
+
(e.g., image_render_scale, overwrite).
|
948
|
+
"""
|
949
|
+
try:
|
950
|
+
from natural_pdf.utils.packaging import create_correction_task_package
|
951
|
+
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
952
|
+
except ImportError:
|
953
|
+
logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
|
954
|
+
# Or raise
|
810
955
|
except Exception as e:
|
811
|
-
|
812
|
-
|
956
|
+
logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
|
957
|
+
raise # Re-raise the exception from the utility function
|
958
|
+
|
959
|
+
def correct_ocr(
|
960
|
+
self,
|
961
|
+
correction_callback: Callable[[Any], Optional[str]],
|
962
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
963
|
+
) -> "PDF": # Return self for chaining
|
964
|
+
"""
|
965
|
+
Applies corrections to OCR-generated text elements using a callback function,
|
966
|
+
delegating the core work to the `Page.correct_ocr` method.
|
967
|
+
|
968
|
+
Args:
|
969
|
+
correction_callback: A function that accepts a single argument (an element
|
970
|
+
object) and returns `Optional[str]`. It returns the
|
971
|
+
corrected text string if an update is needed, otherwise None.
|
972
|
+
pages: Optional page indices/slice to limit the scope of correction
|
973
|
+
(default: all pages).
|
974
|
+
|
975
|
+
Returns:
|
976
|
+
Self for method chaining.
|
977
|
+
"""
|
978
|
+
# Determine target pages
|
979
|
+
target_page_indices: List[int] = []
|
980
|
+
if pages is None:
|
981
|
+
target_page_indices = list(range(len(self._pages)))
|
982
|
+
elif isinstance(pages, slice):
|
983
|
+
target_page_indices = list(range(*pages.indices(len(self._pages))))
|
984
|
+
elif hasattr(pages, "__iter__"):
|
985
|
+
try:
|
986
|
+
target_page_indices = [int(i) for i in pages]
|
987
|
+
# Validate indices
|
988
|
+
for idx in target_page_indices:
|
989
|
+
if not (0 <= idx < len(self._pages)):
|
990
|
+
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
991
|
+
except (IndexError, TypeError, ValueError) as e:
|
992
|
+
raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
|
993
|
+
else:
|
994
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
|
995
|
+
|
996
|
+
if not target_page_indices:
|
997
|
+
logger.warning("No pages selected for OCR correction.")
|
998
|
+
return self
|
999
|
+
|
1000
|
+
logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
|
1001
|
+
|
1002
|
+
# Iterate through target pages and call their correct_ocr method
|
1003
|
+
for page_idx in target_page_indices:
|
1004
|
+
page = self._pages[page_idx]
|
1005
|
+
try:
|
1006
|
+
page.correct_ocr(correction_callback=correction_callback)
|
1007
|
+
except Exception as e:
|
1008
|
+
logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
|
1009
|
+
# Optionally re-raise or just log and continue
|
1010
|
+
|
1011
|
+
logger.info(f"OCR correction process finished for requested pages.")
|
1012
|
+
return self
|
813
1013
|
|
814
1014
|
def __len__(self) -> int:
|
815
1015
|
"""Return the number of pages in the PDF."""
|
816
1016
|
# Ensure _pages is initialized
|
817
|
-
if not hasattr(self,
|
1017
|
+
if not hasattr(self, "_pages"):
|
818
1018
|
# Return 0 or raise error if not fully initialized? Let's return 0.
|
819
|
-
|
1019
|
+
return 0
|
820
1020
|
return len(self._pages)
|
821
|
-
|
822
|
-
def __getitem__(self, key) -> Union[Page,
|
1021
|
+
|
1022
|
+
def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
|
823
1023
|
"""Access pages by index or slice."""
|
824
1024
|
# Check if self._pages has been initialized
|
825
|
-
if not hasattr(self,
|
826
|
-
|
1025
|
+
if not hasattr(self, "_pages"):
|
1026
|
+
raise AttributeError("PDF pages not initialized yet.")
|
827
1027
|
if isinstance(key, slice):
|
828
|
-
|
829
|
-
|
830
|
-
|
1028
|
+
# Return a PageCollection slice
|
1029
|
+
from natural_pdf.elements.collections import PageCollection
|
1030
|
+
|
1031
|
+
return PageCollection(self._pages[key])
|
831
1032
|
# Check index bounds before accessing
|
832
1033
|
if isinstance(key, int):
|
833
1034
|
if 0 <= key < len(self._pages):
|
834
|
-
|
1035
|
+
return self._pages[key]
|
835
1036
|
else:
|
836
|
-
|
1037
|
+
raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
|
837
1038
|
else:
|
838
|
-
|
839
|
-
|
1039
|
+
raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
|
1040
|
+
|
840
1041
|
def close(self):
|
841
1042
|
"""Close the underlying PDF file and clean up any temporary files."""
|
842
|
-
if hasattr(self,
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
1043
|
+
if hasattr(self, "_pdf") and self._pdf is not None:
|
1044
|
+
try:
|
1045
|
+
self._pdf.close()
|
1046
|
+
logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
|
1047
|
+
except Exception as e:
|
1048
|
+
logger.warning(f"Error closing pdfplumber object: {e}")
|
1049
|
+
finally:
|
1050
|
+
self._pdf = None
|
850
1051
|
|
851
1052
|
# Clean up temporary file if it exists
|
852
|
-
if hasattr(self,
|
1053
|
+
if hasattr(self, "_temp_file") and self._temp_file is not None:
|
853
1054
|
temp_file_path = None
|
854
1055
|
try:
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
1056
|
+
if hasattr(self._temp_file, "name") and self._temp_file.name:
|
1057
|
+
temp_file_path = self._temp_file.name
|
1058
|
+
if os.path.exists(temp_file_path):
|
1059
|
+
os.unlink(temp_file_path)
|
1060
|
+
logger.debug(f"Removed temporary PDF file: {temp_file_path}")
|
860
1061
|
except Exception as e:
|
861
|
-
|
1062
|
+
logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
|
862
1063
|
finally:
|
863
|
-
|
1064
|
+
self._temp_file = None
|
864
1065
|
|
865
1066
|
def __enter__(self):
|
866
1067
|
"""Context manager entry."""
|
867
1068
|
return self
|
868
|
-
|
1069
|
+
|
869
1070
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
870
1071
|
"""Context manager exit."""
|
871
1072
|
self.close()
|
872
1073
|
|
873
|
-
|
874
|
-
|
875
|
-
|
1074
|
+
|
1075
|
+
# --- Indexable Protocol Methods --- Needed for search/sync
|
1076
|
+
def get_id(self) -> str:
|
1077
|
+
return self.path
|
1078
|
+
|