natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
-
import copy
|
1
|
+
import copy
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import re
|
5
5
|
import tempfile
|
6
6
|
import urllib.request
|
7
|
-
|
8
|
-
|
7
|
+
import time
|
8
|
+
import threading
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import (
|
9
11
|
TYPE_CHECKING,
|
10
12
|
Any,
|
11
13
|
Callable,
|
@@ -17,29 +19,33 @@ from typing import ( # Added Iterable and TYPE_CHECKING
|
|
17
19
|
Type,
|
18
20
|
Union,
|
19
21
|
)
|
20
|
-
from
|
21
|
-
|
22
|
+
from natural_pdf.utils.tqdm_utils import get_tqdm
|
22
23
|
|
23
24
|
import pdfplumber
|
24
25
|
from PIL import Image
|
25
26
|
|
26
|
-
from natural_pdf.analyzers.layout.layout_manager import
|
27
|
-
|
28
|
-
)
|
29
|
-
from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
|
27
|
+
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
28
|
+
from natural_pdf.core.highlighting_service import HighlightingService
|
30
29
|
from natural_pdf.core.page import Page
|
31
30
|
from natural_pdf.elements.collections import ElementCollection
|
32
31
|
from natural_pdf.elements.region import Region
|
33
32
|
from natural_pdf.ocr import OCRManager, OCROptions
|
34
33
|
from natural_pdf.selectors.parser import parse_selector
|
35
34
|
|
36
|
-
|
35
|
+
from natural_pdf.classification.manager import ClassificationManager
|
36
|
+
from natural_pdf.classification.manager import ClassificationError
|
37
|
+
from natural_pdf.classification.results import ClassificationResult
|
38
|
+
from natural_pdf.extraction.manager import StructuredDataManager
|
39
|
+
|
40
|
+
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
+
from natural_pdf.elements.base import Element
|
42
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
43
|
+
from natural_pdf.extraction.mixin import ExtractionMixin
|
37
44
|
|
38
|
-
# --- Add Search Service Imports (needed for new methods) ---
|
39
45
|
try:
|
40
|
-
from typing import Any as TypingAny
|
46
|
+
from typing import Any as TypingAny
|
41
47
|
|
42
|
-
from natural_pdf.search import TextSearchOptions
|
48
|
+
from natural_pdf.search import TextSearchOptions
|
43
49
|
from natural_pdf.search import (
|
44
50
|
BaseSearchOptions,
|
45
51
|
SearchOptions,
|
@@ -47,25 +53,24 @@ try:
|
|
47
53
|
get_search_service,
|
48
54
|
)
|
49
55
|
except ImportError:
|
50
|
-
# Define dummies if needed for type hints within the class
|
51
56
|
SearchServiceProtocol = object
|
52
57
|
SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
|
53
58
|
TypingAny = object
|
54
59
|
|
55
|
-
# Dummy factory needed for default arg in methods
|
56
60
|
def get_search_service(**kwargs) -> SearchServiceProtocol:
|
57
61
|
raise ImportError(
|
58
62
|
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
59
63
|
)
|
60
64
|
|
61
|
-
|
62
|
-
# --- End Search Service Imports ---
|
63
|
-
|
64
|
-
# Set up logger early
|
65
65
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
66
|
+
tqdm = get_tqdm()
|
66
67
|
|
68
|
+
DEFAULT_MANAGERS = {
|
69
|
+
"classification": ClassificationManager,
|
70
|
+
"structured_data": StructuredDataManager,
|
71
|
+
}
|
67
72
|
|
68
|
-
class PDF:
|
73
|
+
class PDF(ExtractionMixin):
|
69
74
|
"""
|
70
75
|
Enhanced PDF wrapper built on top of pdfplumber.
|
71
76
|
|
@@ -86,35 +91,23 @@ class PDF:
|
|
86
91
|
Args:
|
87
92
|
path_or_url: Path to the PDF file or a URL to a PDF
|
88
93
|
reading_order: Whether to use natural reading order
|
89
|
-
font_attrs: Font attributes
|
90
|
-
|
91
|
-
None: Only consider spatial relationships
|
92
|
-
List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
|
93
|
-
keep_spaces: Whether to include spaces in word elements (default: True).
|
94
|
-
True: Spaces are part of words, better for multi-word searching
|
95
|
-
False: Break text at spaces, each word is separate (legacy behavior)
|
94
|
+
font_attrs: Font attributes for grouping characters into words
|
95
|
+
keep_spaces: Whether to include spaces in word elements
|
96
96
|
"""
|
97
|
-
# Check if the input is a URL
|
98
97
|
is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
99
98
|
|
100
|
-
# Initialize path-related attributes
|
101
99
|
self._original_path = path_or_url
|
102
100
|
self._temp_file = None
|
103
|
-
self._resolved_path = None
|
101
|
+
self._resolved_path = None
|
104
102
|
|
105
103
|
if is_url:
|
106
104
|
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
107
105
|
try:
|
108
|
-
# Create a temporary file to store the downloaded PDF
|
109
106
|
self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
110
|
-
|
111
|
-
# Download the PDF
|
112
107
|
with urllib.request.urlopen(path_or_url) as response:
|
113
108
|
self._temp_file.write(response.read())
|
114
109
|
self._temp_file.flush()
|
115
110
|
self._temp_file.close()
|
116
|
-
|
117
|
-
# Use the temporary file path
|
118
111
|
self._resolved_path = self._temp_file.name
|
119
112
|
logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
|
120
113
|
except Exception as e:
|
@@ -126,7 +119,6 @@ class PDF:
|
|
126
119
|
logger.error(f"Failed to download PDF from URL: {e}")
|
127
120
|
raise ValueError(f"Failed to download PDF from URL: {e}")
|
128
121
|
else:
|
129
|
-
# Use the provided path directly
|
130
122
|
self._resolved_path = path_or_url
|
131
123
|
|
132
124
|
logger.info(f"Initializing PDF from {self._resolved_path}")
|
@@ -137,42 +129,68 @@ class PDF:
|
|
137
129
|
try:
|
138
130
|
self._pdf = pdfplumber.open(self._resolved_path)
|
139
131
|
except Exception as e:
|
140
|
-
logger.error(
|
141
|
-
f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
|
142
|
-
exc_info=True,
|
143
|
-
)
|
144
|
-
# Clean up temp file if creation failed
|
132
|
+
logger.error(f"Failed to open PDF: {e}", exc_info=True)
|
145
133
|
self.close()
|
146
134
|
raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
|
147
135
|
|
148
|
-
self._path = self._resolved_path
|
149
|
-
self.path = self._resolved_path
|
150
|
-
self.source_path = self._original_path
|
136
|
+
self._path = self._resolved_path
|
137
|
+
self.path = self._resolved_path
|
138
|
+
self.source_path = self._original_path
|
151
139
|
|
152
140
|
self._reading_order = reading_order
|
153
141
|
self._config = {"keep_spaces": keep_spaces}
|
142
|
+
self._font_attrs = font_attrs
|
154
143
|
|
155
|
-
self._font_attrs = font_attrs # Store the font attribute configuration
|
156
|
-
|
157
|
-
# Initialize Managers and Services (conditionally available)
|
158
144
|
self._ocr_manager = OCRManager() if OCRManager else None
|
159
145
|
self._layout_manager = LayoutManager() if LayoutManager else None
|
160
146
|
self.highlighter = HighlightingService(self)
|
147
|
+
self._classification_manager_instance = ClassificationManager()
|
148
|
+
self._manager_registry = {}
|
161
149
|
|
162
|
-
# Initialize pages last, passing necessary refs
|
163
150
|
self._pages = [
|
164
151
|
Page(p, parent=self, index=i, font_attrs=font_attrs)
|
165
152
|
for i, p in enumerate(self._pdf.pages)
|
166
153
|
]
|
167
154
|
|
168
|
-
# Other state
|
169
155
|
self._element_cache = {}
|
170
|
-
self._exclusions = []
|
171
|
-
self._regions = []
|
156
|
+
self._exclusions = []
|
157
|
+
self._regions = []
|
172
158
|
|
173
|
-
logger.info("Initialized HighlightingService.")
|
174
159
|
logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
|
175
160
|
|
161
|
+
self._initialize_managers()
|
162
|
+
self._initialize_highlighter()
|
163
|
+
|
164
|
+
def _initialize_managers(self):
|
165
|
+
"""Initialize manager instances based on DEFAULT_MANAGERS."""
|
166
|
+
self._managers = {}
|
167
|
+
for key, manager_class in DEFAULT_MANAGERS.items():
|
168
|
+
try:
|
169
|
+
self._managers[key] = manager_class()
|
170
|
+
logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
|
171
|
+
except Exception as e:
|
172
|
+
logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
|
173
|
+
self._managers[key] = None
|
174
|
+
|
175
|
+
def get_manager(self, key: str) -> Any:
|
176
|
+
"""Retrieve a manager instance by its key."""
|
177
|
+
if key not in self._managers:
|
178
|
+
raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
|
179
|
+
|
180
|
+
manager_instance = self._managers.get(key)
|
181
|
+
|
182
|
+
if manager_instance is None:
|
183
|
+
manager_class = DEFAULT_MANAGERS.get(key)
|
184
|
+
if manager_class:
|
185
|
+
raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
|
186
|
+
else:
|
187
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
|
188
|
+
|
189
|
+
return manager_instance
|
190
|
+
|
191
|
+
def _initialize_highlighter(self):
|
192
|
+
pass
|
193
|
+
|
176
194
|
@property
|
177
195
|
def metadata(self) -> Dict[str, Any]:
|
178
196
|
"""Access metadata as a dictionary."""
|
@@ -183,7 +201,6 @@ class PDF:
|
|
183
201
|
"""Access pages as a PageCollection object."""
|
184
202
|
from natural_pdf.elements.collections import PageCollection
|
185
203
|
|
186
|
-
# Ensure _pages is initialized
|
187
204
|
if not hasattr(self, "_pages"):
|
188
205
|
raise AttributeError("PDF pages not yet initialized.")
|
189
206
|
return PageCollection(self._pages)
|
@@ -195,12 +212,10 @@ class PDF:
|
|
195
212
|
Returns:
|
196
213
|
Self for method chaining
|
197
214
|
"""
|
198
|
-
# Ensure _pages is initialized
|
199
215
|
if not hasattr(self, "_pages"):
|
200
216
|
raise AttributeError("PDF pages not yet initialized.")
|
201
217
|
|
202
218
|
self._exclusions = []
|
203
|
-
# Also clear from pages
|
204
219
|
for page in self._pages:
|
205
220
|
page.clear_exclusions()
|
206
221
|
return self
|
@@ -212,99 +227,75 @@ class PDF:
|
|
212
227
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
213
228
|
|
214
229
|
Args:
|
215
|
-
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
230
|
+
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
216
231
|
label: Optional label for this exclusion
|
217
232
|
|
218
233
|
Returns:
|
219
234
|
Self for method chaining
|
220
235
|
"""
|
221
|
-
# Ensure _pages is initialized
|
222
236
|
if not hasattr(self, "_pages"):
|
223
237
|
raise AttributeError("PDF pages not yet initialized.")
|
224
238
|
|
225
|
-
# Store exclusion with its label at PDF level
|
226
239
|
exclusion_data = (exclusion_func, label)
|
227
240
|
self._exclusions.append(exclusion_data)
|
228
241
|
|
229
|
-
# Apply this exclusion to all pages
|
230
242
|
for page in self._pages:
|
231
|
-
# We pass the original function, Page.add_exclusion handles calling it
|
232
243
|
page.add_exclusion(exclusion_func, label=label)
|
233
244
|
|
234
245
|
return self
|
235
246
|
|
236
247
|
def apply_ocr(
|
237
248
|
self,
|
238
|
-
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
239
249
|
engine: Optional[str] = None,
|
240
|
-
# --- Common OCR Parameters (Direct Arguments) ---
|
241
250
|
languages: Optional[List[str]] = None,
|
242
|
-
min_confidence: Optional[float] = None,
|
251
|
+
min_confidence: Optional[float] = None,
|
243
252
|
device: Optional[str] = None,
|
244
|
-
resolution: Optional[int] = None,
|
245
|
-
apply_exclusions: bool = True,
|
253
|
+
resolution: Optional[int] = None,
|
254
|
+
apply_exclusions: bool = True,
|
246
255
|
detect_only: bool = False,
|
247
|
-
|
248
|
-
options: Optional[Any] = None,
|
249
|
-
|
256
|
+
replace: bool = True,
|
257
|
+
options: Optional[Any] = None,
|
258
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
250
259
|
) -> "PDF":
|
251
260
|
"""
|
252
|
-
Applies OCR to specified pages
|
253
|
-
|
254
|
-
This method renders the specified pages to images, sends them as a batch
|
255
|
-
to the OCRManager, and adds the resulting TextElements to each respective page.
|
261
|
+
Applies OCR to specified pages of the PDF using batch processing.
|
256
262
|
|
257
263
|
Args:
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
Overrides manager/engine default.
|
269
|
-
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
270
|
-
Affects input quality for OCR. Defaults to 150 if not set.
|
271
|
-
apply_exclusions: If True (default), render page image for OCR with
|
272
|
-
excluded areas masked (whited out). If False, OCR
|
273
|
-
the raw page image without masking exclusions.
|
274
|
-
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
275
|
-
options: An engine-specific options object (e.g., EasyOCROptions) or dict
|
276
|
-
containing parameters specific to the chosen engine.
|
264
|
+
engine: Name of the OCR engine
|
265
|
+
languages: List of language codes
|
266
|
+
min_confidence: Minimum confidence threshold
|
267
|
+
device: Device to run OCR on
|
268
|
+
resolution: DPI resolution for page images
|
269
|
+
apply_exclusions: Whether to mask excluded areas
|
270
|
+
detect_only: If True, only detect text boxes
|
271
|
+
replace: Whether to replace existing OCR elements
|
272
|
+
options: Engine-specific options
|
273
|
+
pages: Page indices to process or None for all pages
|
277
274
|
|
278
275
|
Returns:
|
279
|
-
Self for method chaining
|
280
|
-
|
281
|
-
Raises:
|
282
|
-
ValueError: If page indices are invalid.
|
283
|
-
TypeError: If 'options' is not compatible with the engine.
|
284
|
-
RuntimeError: If the OCRManager or selected engine is not available.
|
276
|
+
Self for method chaining
|
285
277
|
"""
|
286
278
|
if not self._ocr_manager:
|
287
279
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
288
|
-
# Or raise RuntimeError("OCRManager not initialized.")
|
289
280
|
return self
|
290
281
|
|
291
|
-
|
292
|
-
|
282
|
+
thread_id = threading.current_thread().name
|
283
|
+
logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
|
284
|
+
|
285
|
+
target_pages = []
|
293
286
|
if pages is None:
|
294
287
|
target_pages = self._pages
|
295
288
|
elif isinstance(pages, slice):
|
296
289
|
target_pages = self._pages[pages]
|
297
|
-
elif hasattr(pages, "__iter__"):
|
290
|
+
elif hasattr(pages, "__iter__"):
|
298
291
|
try:
|
299
292
|
target_pages = [self._pages[i] for i in pages]
|
300
293
|
except IndexError:
|
301
294
|
raise ValueError("Invalid page index provided in 'pages' iterable.")
|
302
295
|
except TypeError:
|
303
|
-
raise TypeError(
|
304
|
-
"'pages' must be None, a slice, or an iterable of page indices (int)."
|
305
|
-
)
|
296
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
306
297
|
else:
|
307
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices
|
298
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
308
299
|
|
309
300
|
if not target_pages:
|
310
301
|
logger.warning("No pages selected for OCR processing.")
|
@@ -312,26 +303,20 @@ class PDF:
|
|
312
303
|
|
313
304
|
page_numbers = [p.number for p in target_pages]
|
314
305
|
logger.info(f"Applying batch OCR to pages: {page_numbers}...")
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
logger.
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
326
|
-
logger.info(
|
327
|
-
f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
|
328
|
-
)
|
329
|
-
failed_page_num = "unknown" # Keep track of potentially failing page
|
306
|
+
|
307
|
+
final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
|
308
|
+
logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
|
309
|
+
|
310
|
+
images_pil = []
|
311
|
+
page_image_map = []
|
312
|
+
logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
|
313
|
+
failed_page_num = "unknown"
|
314
|
+
render_start_time = time.monotonic()
|
315
|
+
|
330
316
|
try:
|
331
|
-
for i, page in enumerate(target_pages):
|
332
|
-
failed_page_num = page.number
|
317
|
+
for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
|
318
|
+
failed_page_num = page.number
|
333
319
|
logger.debug(f" Rendering page {page.number} (index {page.index})...")
|
334
|
-
# Use the determined final_resolution and apply exclusions if requested
|
335
320
|
to_image_kwargs = {
|
336
321
|
"resolution": final_resolution,
|
337
322
|
"include_highlights": False,
|
@@ -340,68 +325,64 @@ class PDF:
|
|
340
325
|
img = page.to_image(**to_image_kwargs)
|
341
326
|
if img is None:
|
342
327
|
logger.error(f" Failed to render page {page.number} to image.")
|
343
|
-
|
344
|
-
continue # Skip this page if rendering failed
|
328
|
+
continue
|
345
329
|
images_pil.append(img)
|
346
|
-
page_image_map.append((page, img))
|
330
|
+
page_image_map.append((page, img))
|
347
331
|
except Exception as e:
|
348
|
-
logger.error(f"Failed to render
|
332
|
+
logger.error(f"Failed to render pages for batch OCR: {e}")
|
349
333
|
raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
|
334
|
+
|
335
|
+
render_end_time = time.monotonic()
|
336
|
+
logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
|
350
337
|
|
351
338
|
if not images_pil or not page_image_map:
|
352
339
|
logger.error("No images were successfully rendered for batch OCR.")
|
353
340
|
return self
|
354
341
|
|
355
|
-
# --- Prepare Arguments for Manager ---
|
356
|
-
# Pass common args directly, engine-specific via options
|
357
342
|
manager_args = {
|
358
343
|
"images": images_pil,
|
359
344
|
"engine": engine,
|
360
345
|
"languages": languages,
|
361
|
-
"min_confidence": min_confidence,
|
346
|
+
"min_confidence": min_confidence,
|
362
347
|
"device": device,
|
363
348
|
"options": options,
|
364
349
|
"detect_only": detect_only,
|
365
|
-
# Note: resolution is used for rendering, not passed to OCR manager directly
|
366
350
|
}
|
367
|
-
# Filter out None values so manager can use its defaults
|
368
351
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
369
352
|
|
370
|
-
|
371
|
-
logger.info(
|
372
|
-
|
373
|
-
|
353
|
+
ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
|
354
|
+
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
355
|
+
ocr_start_time = time.monotonic()
|
356
|
+
|
374
357
|
try:
|
375
|
-
# Manager's apply_ocr signature needs to accept common args directly
|
376
358
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
377
359
|
|
378
360
|
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
379
|
-
logger.error(
|
380
|
-
f"OCR Manager returned unexpected result format or length for batch processing. "
|
381
|
-
f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
|
382
|
-
f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
|
383
|
-
)
|
361
|
+
logger.error(f"OCR Manager returned unexpected result format or length.")
|
384
362
|
return self
|
385
363
|
|
386
364
|
logger.info("OCR Manager batch processing complete.")
|
387
|
-
|
388
365
|
except Exception as e:
|
389
|
-
logger.error(f"Batch OCR processing failed: {e}"
|
366
|
+
logger.error(f"Batch OCR processing failed: {e}")
|
390
367
|
return self
|
368
|
+
|
369
|
+
ocr_end_time = time.monotonic()
|
370
|
+
logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
|
391
371
|
|
392
|
-
# --- Distribute Results and Add Elements to Pages (unchanged) ---
|
393
372
|
logger.info("Adding OCR results to respective pages...")
|
394
373
|
total_elements_added = 0
|
374
|
+
|
395
375
|
for i, (page, img) in enumerate(page_image_map):
|
396
376
|
results_for_page = batch_results[i]
|
397
377
|
if not isinstance(results_for_page, list):
|
398
|
-
logger.warning(
|
399
|
-
f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
|
400
|
-
)
|
378
|
+
logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
|
401
379
|
continue
|
402
380
|
|
403
381
|
logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
|
404
382
|
try:
|
383
|
+
if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
|
384
|
+
page._element_mgr.remove_ocr_elements()
|
385
|
+
|
405
386
|
img_scale_x = page.width / img.width if img.width > 0 else 1
|
406
387
|
img_scale_y = page.height / img.height if img.height > 0 else 1
|
407
388
|
elements = page._element_mgr.create_text_elements_from_ocr(
|
@@ -414,53 +395,39 @@ class PDF:
|
|
414
395
|
else:
|
415
396
|
logger.debug(f" No valid TextElements created for page {page.number}.")
|
416
397
|
except Exception as e:
|
417
|
-
logger.error(
|
418
|
-
f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
|
419
|
-
)
|
398
|
+
logger.error(f" Error adding OCR elements to page {page.number}: {e}")
|
420
399
|
|
421
|
-
logger.info(
|
422
|
-
f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
|
423
|
-
)
|
400
|
+
logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
|
424
401
|
return self
|
425
402
|
|
426
403
|
def add_region(
|
427
404
|
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
428
405
|
) -> "PDF":
|
429
406
|
"""
|
430
|
-
Add a region function to the PDF.
|
407
|
+
Add a region function to the PDF.
|
431
408
|
|
432
409
|
Args:
|
433
|
-
region_func: A function that takes a Page and returns a Region, or None
|
410
|
+
region_func: A function that takes a Page and returns a Region, or None
|
434
411
|
name: Optional name for the region
|
435
412
|
|
436
413
|
Returns:
|
437
414
|
Self for method chaining
|
438
415
|
"""
|
439
|
-
# Ensure _pages is initialized
|
440
416
|
if not hasattr(self, "_pages"):
|
441
417
|
raise AttributeError("PDF pages not yet initialized.")
|
442
418
|
|
443
|
-
# Store region with its name at PDF level
|
444
419
|
region_data = (region_func, name)
|
445
420
|
self._regions.append(region_data)
|
446
421
|
|
447
|
-
# Apply this region to all pages
|
448
422
|
for page in self._pages:
|
449
423
|
try:
|
450
|
-
# Call the function to get the region for this specific page
|
451
424
|
region_instance = region_func(page)
|
452
425
|
if region_instance and isinstance(region_instance, Region):
|
453
|
-
# If a valid region is returned, add it to the page
|
454
426
|
page.add_region(region_instance, name=name, source="named")
|
455
427
|
elif region_instance is not None:
|
456
|
-
logger.warning(
|
457
|
-
f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
|
458
|
-
)
|
428
|
+
logger.warning(f"Region function did not return a valid Region for page {page.number}")
|
459
429
|
except Exception as e:
|
460
|
-
logger.error(
|
461
|
-
f"Error executing or adding region function for page {page.number}: {e}",
|
462
|
-
exc_info=True,
|
463
|
-
)
|
430
|
+
logger.error(f"Error adding region for page {page.number}: {e}")
|
464
431
|
|
465
432
|
return self
|
466
433
|
|
@@ -471,22 +438,19 @@ class PDF:
|
|
471
438
|
Find the first element matching the selector.
|
472
439
|
|
473
440
|
Args:
|
474
|
-
selector: CSS-like selector string
|
475
|
-
apply_exclusions: Whether to exclude elements in exclusion regions
|
476
|
-
regex: Whether to use regex for text search
|
477
|
-
case: Whether to do case-sensitive text search
|
441
|
+
selector: CSS-like selector string
|
442
|
+
apply_exclusions: Whether to exclude elements in exclusion regions
|
443
|
+
regex: Whether to use regex for text search
|
444
|
+
case: Whether to do case-sensitive text search
|
478
445
|
**kwargs: Additional filter parameters
|
479
446
|
|
480
447
|
Returns:
|
481
448
|
Element object or None if not found
|
482
449
|
"""
|
483
|
-
# Ensure _pages is initialized
|
484
450
|
if not hasattr(self, "_pages"):
|
485
451
|
raise AttributeError("PDF pages not yet initialized.")
|
486
452
|
|
487
453
|
selector_obj = parse_selector(selector)
|
488
|
-
|
489
|
-
# Pass regex and case flags to selector function
|
490
454
|
kwargs["regex"] = regex
|
491
455
|
kwargs["case"] = case
|
492
456
|
|
@@ -502,22 +466,19 @@ class PDF:
|
|
502
466
|
Find all elements matching the selector.
|
503
467
|
|
504
468
|
Args:
|
505
|
-
selector: CSS-like selector string
|
506
|
-
apply_exclusions: Whether to exclude elements in exclusion regions
|
507
|
-
regex: Whether to use regex for text search
|
508
|
-
case: Whether to do case-sensitive text search
|
469
|
+
selector: CSS-like selector string
|
470
|
+
apply_exclusions: Whether to exclude elements in exclusion regions
|
471
|
+
regex: Whether to use regex for text search
|
472
|
+
case: Whether to do case-sensitive text search
|
509
473
|
**kwargs: Additional filter parameters
|
510
474
|
|
511
475
|
Returns:
|
512
476
|
ElementCollection with matching elements
|
513
477
|
"""
|
514
|
-
# Ensure _pages is initialized
|
515
478
|
if not hasattr(self, "_pages"):
|
516
479
|
raise AttributeError("PDF pages not yet initialized.")
|
517
480
|
|
518
481
|
selector_obj = parse_selector(selector)
|
519
|
-
|
520
|
-
# Pass regex and case flags to selector function
|
521
482
|
kwargs["regex"] = regex
|
522
483
|
kwargs["case"] = case
|
523
484
|
|
@@ -534,8 +495,8 @@ class PDF:
|
|
534
495
|
|
535
496
|
Args:
|
536
497
|
selector_obj: Parsed selector dictionary
|
537
|
-
apply_exclusions: Whether to exclude elements in exclusion regions
|
538
|
-
first_only: If True, stop searching after the first match is found
|
498
|
+
apply_exclusions: Whether to exclude elements in exclusion regions
|
499
|
+
first_only: If True, stop searching after the first match is found
|
539
500
|
**kwargs: Additional filter parameters
|
540
501
|
|
541
502
|
Returns:
|
@@ -543,57 +504,45 @@ class PDF:
|
|
543
504
|
"""
|
544
505
|
from natural_pdf.elements.collections import ElementCollection
|
545
506
|
|
546
|
-
# Determine page range to search
|
547
507
|
page_indices = kwargs.get("pages", range(len(self._pages)))
|
548
508
|
if isinstance(page_indices, int):
|
549
509
|
page_indices = [page_indices]
|
550
510
|
elif isinstance(page_indices, slice):
|
551
511
|
page_indices = range(*page_indices.indices(len(self._pages)))
|
552
512
|
|
553
|
-
# Check for cross-page pseudo-classes (currently not supported)
|
554
513
|
for pseudo in selector_obj.get("pseudo_classes", []):
|
555
514
|
if pseudo.get("name") in ("spans", "continues"):
|
556
515
|
logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
|
557
516
|
return ElementCollection([])
|
558
517
|
|
559
|
-
# Regular case: collect elements from each page
|
560
518
|
all_elements = []
|
561
519
|
for page_idx in page_indices:
|
562
520
|
if 0 <= page_idx < len(self._pages):
|
563
521
|
page = self._pages[page_idx]
|
564
|
-
# Pass first_only down to page._apply_selector
|
565
522
|
page_elements_collection = page._apply_selector(
|
566
523
|
selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
|
567
524
|
)
|
568
525
|
if page_elements_collection:
|
569
526
|
page_elements = page_elements_collection.elements
|
570
527
|
all_elements.extend(page_elements)
|
571
|
-
# If we only need the first match overall, and we found one on this page, stop
|
572
528
|
if first_only and page_elements:
|
573
|
-
break
|
529
|
+
break
|
574
530
|
else:
|
575
531
|
logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
|
576
532
|
|
577
|
-
# Create a combined collection
|
578
533
|
combined = ElementCollection(all_elements)
|
579
534
|
|
580
|
-
# Sort in document order if requested and not first_only (already sorted by page)
|
581
535
|
if not first_only and kwargs.get("document_order", True):
|
582
|
-
# Check if elements have page, top, x0 before sorting
|
583
536
|
if all(
|
584
537
|
hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
|
585
538
|
for el in combined.elements
|
586
539
|
):
|
587
540
|
combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
|
588
541
|
else:
|
589
|
-
# Elements might be Regions without inherent sorting order yet
|
590
|
-
# Attempt sorting by page index if possible
|
591
542
|
try:
|
592
543
|
combined.sort(key=lambda el: el.page.index)
|
593
544
|
except AttributeError:
|
594
|
-
logger.warning(
|
595
|
-
"Cannot sort elements in document order: Missing required attributes (e.g., page)."
|
596
|
-
)
|
545
|
+
logger.warning("Cannot sort elements in document order: Missing required attributes.")
|
597
546
|
|
598
547
|
return combined
|
599
548
|
|
@@ -610,24 +559,21 @@ class PDF:
|
|
610
559
|
|
611
560
|
Args:
|
612
561
|
selector: Optional selector to filter elements
|
613
|
-
preserve_whitespace: Whether to keep blank characters
|
614
|
-
use_exclusions: Whether to apply exclusion regions
|
615
|
-
debug_exclusions: Whether to output detailed debugging for exclusions
|
562
|
+
preserve_whitespace: Whether to keep blank characters
|
563
|
+
use_exclusions: Whether to apply exclusion regions
|
564
|
+
debug_exclusions: Whether to output detailed debugging for exclusions
|
616
565
|
**kwargs: Additional extraction parameters
|
617
566
|
|
618
567
|
Returns:
|
619
568
|
Extracted text as string
|
620
569
|
"""
|
621
|
-
# Ensure _pages is initialized
|
622
570
|
if not hasattr(self, "_pages"):
|
623
571
|
raise AttributeError("PDF pages not yet initialized.")
|
624
572
|
|
625
|
-
# If selector is provided, find elements first
|
626
573
|
if selector:
|
627
574
|
elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
|
628
575
|
return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
629
576
|
|
630
|
-
# Otherwise extract from all pages
|
631
577
|
if debug_exclusions:
|
632
578
|
print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
|
633
579
|
print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
|
@@ -648,25 +594,6 @@ class PDF:
|
|
648
594
|
|
649
595
|
return "\n".join(texts)
|
650
596
|
|
651
|
-
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
|
652
|
-
"""
|
653
|
-
Shorthand for finding elements and extracting their text.
|
654
|
-
|
655
|
-
Args:
|
656
|
-
selector: CSS-like selector string
|
657
|
-
preserve_whitespace: Whether to keep blank characters (default: True)
|
658
|
-
**kwargs: Additional extraction parameters
|
659
|
-
|
660
|
-
Returns:
|
661
|
-
Extracted text from matching elements
|
662
|
-
"""
|
663
|
-
# Ensure _pages is initialized
|
664
|
-
if not hasattr(self, "_pages"):
|
665
|
-
raise AttributeError("PDF pages not yet initialized.")
|
666
|
-
return self.extract_text(
|
667
|
-
selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
|
668
|
-
) # apply_exclusions is handled by find_all in extract_text
|
669
|
-
|
670
597
|
def extract_tables(
|
671
598
|
self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
|
672
599
|
) -> List[Any]:
|
@@ -681,54 +608,43 @@ class PDF:
|
|
681
608
|
Returns:
|
682
609
|
List of extracted tables
|
683
610
|
"""
|
684
|
-
# Ensure _pages is initialized
|
685
611
|
if not hasattr(self, "_pages"):
|
686
612
|
raise AttributeError("PDF pages not yet initialized.")
|
687
|
-
|
613
|
+
|
688
614
|
logger.warning("PDF.extract_tables is not fully implemented yet.")
|
689
615
|
all_tables = []
|
616
|
+
|
690
617
|
for page in self.pages:
|
691
|
-
# Assuming page.extract_tables(**kwargs) exists or is added
|
692
618
|
if hasattr(page, "extract_tables"):
|
693
619
|
all_tables.extend(page.extract_tables(**kwargs))
|
694
620
|
else:
|
695
621
|
logger.debug(f"Page {page.number} does not have extract_tables method.")
|
696
|
-
|
622
|
+
|
697
623
|
if selector:
|
698
624
|
logger.warning("Filtering extracted tables by selector is not implemented.")
|
699
|
-
|
700
|
-
# Placeholder merging
|
625
|
+
|
701
626
|
if merge_across_pages:
|
702
627
|
logger.warning("Merging tables across pages is not implemented.")
|
703
|
-
|
628
|
+
|
704
629
|
return all_tables
|
705
630
|
|
706
|
-
# --- New Method: save_searchable ---
|
707
631
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
708
632
|
"""
|
709
633
|
Saves the PDF with an OCR text layer, making content searchable.
|
710
634
|
|
711
635
|
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
712
636
|
|
713
|
-
Note: OCR must have been applied to the pages beforehand
|
714
|
-
(e.g., using pdf.apply_ocr()).
|
715
|
-
|
716
637
|
Args:
|
717
|
-
output_path: Path to save the searchable PDF
|
718
|
-
dpi: Resolution for rendering and OCR overlay
|
719
|
-
**kwargs: Additional keyword arguments passed to the exporter
|
638
|
+
output_path: Path to save the searchable PDF
|
639
|
+
dpi: Resolution for rendering and OCR overlay
|
640
|
+
**kwargs: Additional keyword arguments passed to the exporter
|
720
641
|
"""
|
721
|
-
# Import moved here, assuming it's always available now
|
722
642
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
723
643
|
|
724
|
-
# Convert pathlib.Path to string if necessary
|
725
644
|
output_path_str = str(output_path)
|
726
|
-
|
727
645
|
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
728
646
|
logger.info(f"Searchable PDF saved to: {output_path_str}")
|
729
647
|
|
730
|
-
# --- End New Method ---
|
731
|
-
|
732
648
|
def ask(
|
733
649
|
self,
|
734
650
|
question: str,
|
@@ -750,27 +666,21 @@ class PDF:
|
|
750
666
|
**kwargs: Additional parameters passed to the QA engine
|
751
667
|
|
752
668
|
Returns:
|
753
|
-
A dictionary containing the answer, confidence, and other metadata
|
754
|
-
Result will have an 'answer' key containing the answer text.
|
669
|
+
A dictionary containing the answer, confidence, and other metadata
|
755
670
|
"""
|
756
671
|
from natural_pdf.qa import get_qa_engine
|
757
672
|
|
758
|
-
# Initialize or get QA engine
|
759
673
|
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
760
674
|
|
761
|
-
# Determine which pages to query
|
762
675
|
if pages is None:
|
763
676
|
target_pages = list(range(len(self.pages)))
|
764
677
|
elif isinstance(pages, int):
|
765
|
-
# Single page
|
766
678
|
target_pages = [pages]
|
767
679
|
elif isinstance(pages, (list, range)):
|
768
|
-
# List or range of pages
|
769
680
|
target_pages = pages
|
770
681
|
else:
|
771
682
|
raise ValueError(f"Invalid pages parameter: {pages}")
|
772
683
|
|
773
|
-
# Actually query each page and gather results
|
774
684
|
results = []
|
775
685
|
for page_idx in target_pages:
|
776
686
|
if 0 <= page_idx < len(self.pages):
|
@@ -779,211 +689,148 @@ class PDF:
|
|
779
689
|
page=page, question=question, min_confidence=min_confidence, **kwargs
|
780
690
|
)
|
781
691
|
|
782
|
-
# Add to results if it found an answer
|
783
692
|
if page_result and page_result.get("found", False):
|
784
693
|
results.append(page_result)
|
785
694
|
|
786
|
-
# Sort results by confidence
|
787
695
|
results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
788
696
|
|
789
|
-
# Return the best result, or a default result if none found
|
790
697
|
if results:
|
791
698
|
return results[0]
|
792
699
|
else:
|
793
|
-
# Return a structure indicating no answer found
|
794
700
|
return {
|
795
701
|
"answer": None,
|
796
702
|
"confidence": 0.0,
|
797
703
|
"found": False,
|
798
|
-
"page_num": None,
|
704
|
+
"page_num": None,
|
799
705
|
"source_elements": [],
|
800
706
|
}
|
801
707
|
|
802
708
|
def search_within_index(
|
803
709
|
self,
|
804
710
|
query: Union[str, Path, Image.Image, Region],
|
805
|
-
search_service: SearchServiceProtocol,
|
711
|
+
search_service: SearchServiceProtocol,
|
806
712
|
options: Optional[SearchOptions] = None,
|
807
713
|
) -> List[Dict[str, Any]]:
|
808
714
|
"""
|
809
|
-
Finds relevant documents
|
810
|
-
within a search index managed by the provided SearchService.
|
811
|
-
|
812
|
-
This method uses a pre-configured SearchService instance and adds
|
813
|
-
a filter to the search query to scope results only to pages from
|
814
|
-
this specific PDF object (based on its resolved path).
|
715
|
+
Finds relevant documents from this PDF within a search index.
|
815
716
|
|
816
717
|
Args:
|
817
|
-
query: The search query (text, image path, PIL Image, Region)
|
818
|
-
search_service: A pre-configured SearchService instance
|
819
|
-
|
820
|
-
is expected to be found.
|
821
|
-
options: Optional SearchOptions to configure the query (top_k, filters, etc.).
|
822
|
-
Any existing filters in `options` will be combined with the
|
823
|
-
PDF-scoping filter using an 'AND' condition.
|
718
|
+
query: The search query (text, image path, PIL Image, Region)
|
719
|
+
search_service: A pre-configured SearchService instance
|
720
|
+
options: Optional SearchOptions to configure the query
|
824
721
|
|
825
722
|
Returns:
|
826
|
-
A list of result dictionaries, sorted by relevance
|
827
|
-
results originating from this PDF's pages.
|
723
|
+
A list of result dictionaries, sorted by relevance
|
828
724
|
|
829
725
|
Raises:
|
830
|
-
ImportError: If search dependencies are not installed
|
831
|
-
ValueError: If search_service is None
|
832
|
-
TypeError: If search_service does not conform to the protocol
|
833
|
-
FileNotFoundError: If the collection managed by the service does not exist
|
834
|
-
RuntimeError: For other search failures
|
726
|
+
ImportError: If search dependencies are not installed
|
727
|
+
ValueError: If search_service is None
|
728
|
+
TypeError: If search_service does not conform to the protocol
|
729
|
+
FileNotFoundError: If the collection managed by the service does not exist
|
730
|
+
RuntimeError: For other search failures
|
835
731
|
"""
|
836
732
|
if not search_service:
|
837
733
|
raise ValueError("A configured SearchServiceProtocol instance must be provided.")
|
838
|
-
# Optional stricter check:
|
839
|
-
# if not isinstance(search_service, SearchServiceProtocol):
|
840
|
-
# raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
|
841
734
|
|
842
|
-
# Get collection name from service for logging
|
843
735
|
collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
|
844
|
-
logger.info(
|
845
|
-
|
846
|
-
|
736
|
+
logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
|
737
|
+
|
738
|
+
service = search_service
|
847
739
|
|
848
|
-
# --- 1. Get Search Service Instance --- (REMOVED - provided directly)
|
849
|
-
# service: SearchServiceProtocol
|
850
|
-
# if search_service:
|
851
|
-
# service = search_service
|
852
|
-
# else:
|
853
|
-
# logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
|
854
|
-
# factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
|
855
|
-
# # TODO: Pass embedding model from options/pdf config if needed?
|
856
|
-
# service = get_search_service(**factory_args)
|
857
|
-
service = search_service # Use validated provided service
|
858
|
-
|
859
|
-
# --- 2. Prepare Query and Options ---
|
860
740
|
query_input = query
|
861
|
-
# Resolve options (use default TextSearch if none provided)
|
862
741
|
effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
|
863
742
|
|
864
|
-
# Handle Region query - extract text for now
|
865
743
|
if isinstance(query, Region):
|
866
744
|
logger.debug("Query is a Region object. Extracting text.")
|
867
745
|
if not isinstance(effective_options, TextSearchOptions):
|
868
|
-
logger.warning(
|
869
|
-
"Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
|
870
|
-
)
|
746
|
+
logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
|
871
747
|
query_input = query.extract_text()
|
872
748
|
if not query_input or query_input.isspace():
|
873
749
|
logger.error("Region has no extractable text for query.")
|
874
750
|
return []
|
875
751
|
|
876
|
-
#
|
877
|
-
# Assume metadata field 'pdf_path' stores the resolved path used during indexing
|
752
|
+
# Add filter to scope search to THIS PDF
|
878
753
|
pdf_scope_filter = {
|
879
|
-
"field": "pdf_path",
|
754
|
+
"field": "pdf_path",
|
880
755
|
"operator": "eq",
|
881
|
-
"value": self.path,
|
756
|
+
"value": self.path,
|
882
757
|
}
|
883
758
|
logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
|
884
759
|
|
885
760
|
# Combine with existing filters in options (if any)
|
886
761
|
if effective_options.filters:
|
887
|
-
logger.debug(
|
888
|
-
|
889
|
-
)
|
890
|
-
# Assume filters are compatible with the underlying search service
|
891
|
-
# If existing filters aren't already in an AND block, wrap them
|
892
|
-
if (
|
893
|
-
isinstance(effective_options.filters, dict)
|
894
|
-
and effective_options.filters.get("operator") == "AND"
|
895
|
-
):
|
896
|
-
# Already an AND block, just append the condition
|
762
|
+
logger.debug(f"Combining PDF scope filter with existing filters")
|
763
|
+
if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
|
897
764
|
effective_options.filters["conditions"].append(pdf_scope_filter)
|
898
765
|
elif isinstance(effective_options.filters, list):
|
899
|
-
# Assume list represents implicit AND conditions
|
900
766
|
effective_options.filters = {
|
901
767
|
"operator": "AND",
|
902
768
|
"conditions": effective_options.filters + [pdf_scope_filter],
|
903
769
|
}
|
904
|
-
elif isinstance(effective_options.filters, dict):
|
770
|
+
elif isinstance(effective_options.filters, dict):
|
905
771
|
effective_options.filters = {
|
906
772
|
"operator": "AND",
|
907
773
|
"conditions": [effective_options.filters, pdf_scope_filter],
|
908
774
|
}
|
909
775
|
else:
|
910
|
-
logger.warning(
|
911
|
-
f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
|
912
|
-
)
|
776
|
+
logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
|
913
777
|
effective_options.filters = pdf_scope_filter
|
914
778
|
else:
|
915
779
|
effective_options.filters = pdf_scope_filter
|
916
780
|
|
917
781
|
logger.debug(f"Final filters for service search: {effective_options.filters}")
|
918
782
|
|
919
|
-
# --- 4. Call SearchService ---
|
920
783
|
try:
|
921
|
-
# Call the service's search method (no collection_name needed)
|
922
784
|
results = service.search(
|
923
785
|
query=query_input,
|
924
786
|
options=effective_options,
|
925
787
|
)
|
926
|
-
logger.info(
|
927
|
-
f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
|
928
|
-
)
|
788
|
+
logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
|
929
789
|
return results
|
930
790
|
except FileNotFoundError as fnf:
|
931
|
-
logger.error(
|
932
|
-
|
933
|
-
)
|
934
|
-
raise # Re-raise specific error
|
791
|
+
logger.error(f"Search failed: Collection not found. Error: {fnf}")
|
792
|
+
raise
|
935
793
|
except Exception as e:
|
936
|
-
logger.error(
|
937
|
-
|
938
|
-
exc_info=True,
|
939
|
-
)
|
940
|
-
raise RuntimeError(
|
941
|
-
f"Search within index failed for PDF '{self.path}'. See logs for details."
|
942
|
-
) from e
|
794
|
+
logger.error(f"SearchService search failed: {e}")
|
795
|
+
raise RuntimeError(f"Search within index failed. See logs for details.") from e
|
943
796
|
|
944
797
|
def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
|
945
798
|
"""
|
946
|
-
Exports OCR results from this PDF into a correction task package
|
799
|
+
Exports OCR results from this PDF into a correction task package.
|
947
800
|
|
948
801
|
Args:
|
949
|
-
output_zip_path: The path to save the output zip file
|
802
|
+
output_zip_path: The path to save the output zip file
|
950
803
|
**kwargs: Additional arguments passed to create_correction_task_package
|
951
|
-
(e.g., image_render_scale, overwrite).
|
952
804
|
"""
|
953
805
|
try:
|
954
806
|
from natural_pdf.utils.packaging import create_correction_task_package
|
955
|
-
|
956
807
|
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
957
808
|
except ImportError:
|
958
|
-
logger.error(
|
959
|
-
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
960
|
-
)
|
961
|
-
# Or raise
|
809
|
+
logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
|
962
810
|
except Exception as e:
|
963
|
-
logger.error(f"Failed to export correction task
|
964
|
-
raise
|
811
|
+
logger.error(f"Failed to export correction task: {e}")
|
812
|
+
raise
|
965
813
|
|
966
814
|
def correct_ocr(
|
967
815
|
self,
|
968
816
|
correction_callback: Callable[[Any], Optional[str]],
|
969
817
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
970
|
-
|
818
|
+
max_workers: Optional[int] = None,
|
819
|
+
progress_callback: Optional[Callable[[], None]] = None,
|
820
|
+
) -> "PDF":
|
971
821
|
"""
|
972
|
-
Applies corrections to OCR
|
973
|
-
delegating the core work to the `Page.correct_ocr` method.
|
822
|
+
Applies corrections to OCR text elements using a callback function.
|
974
823
|
|
975
824
|
Args:
|
976
|
-
correction_callback:
|
977
|
-
object) and returns `Optional[str]`. It returns the
|
978
|
-
corrected text string if an update is needed, otherwise None.
|
825
|
+
correction_callback: Function that takes an element and returns corrected text or None
|
979
826
|
pages: Optional page indices/slice to limit the scope of correction
|
980
|
-
|
827
|
+
max_workers: Maximum number of threads to use for parallel execution
|
828
|
+
progress_callback: Optional callback function for progress updates
|
981
829
|
|
982
830
|
Returns:
|
983
|
-
Self for method chaining
|
831
|
+
Self for method chaining
|
984
832
|
"""
|
985
|
-
|
986
|
-
target_page_indices: List[int] = []
|
833
|
+
target_page_indices = []
|
987
834
|
if pages is None:
|
988
835
|
target_page_indices = list(range(len(self._pages)))
|
989
836
|
elif isinstance(pages, slice):
|
@@ -991,56 +838,49 @@ class PDF:
|
|
991
838
|
elif hasattr(pages, "__iter__"):
|
992
839
|
try:
|
993
840
|
target_page_indices = [int(i) for i in pages]
|
994
|
-
# Validate indices
|
995
841
|
for idx in target_page_indices:
|
996
842
|
if not (0 <= idx < len(self._pages)):
|
997
843
|
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
998
844
|
except (IndexError, TypeError, ValueError) as e:
|
999
|
-
raise ValueError(
|
1000
|
-
f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
|
1001
|
-
) from e
|
845
|
+
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1002
846
|
else:
|
1003
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices
|
847
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1004
848
|
|
1005
849
|
if not target_page_indices:
|
1006
850
|
logger.warning("No pages selected for OCR correction.")
|
1007
851
|
return self
|
1008
852
|
|
1009
|
-
logger.info(
|
1010
|
-
f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
|
1011
|
-
)
|
853
|
+
logger.info(f"Starting OCR correction for pages: {target_page_indices}")
|
1012
854
|
|
1013
|
-
# Iterate through target pages and call their correct_ocr method
|
1014
855
|
for page_idx in target_page_indices:
|
1015
856
|
page = self._pages[page_idx]
|
1016
857
|
try:
|
1017
|
-
page.correct_ocr(
|
858
|
+
page.correct_ocr(
|
859
|
+
correction_callback=correction_callback,
|
860
|
+
max_workers=max_workers,
|
861
|
+
progress_callback=progress_callback,
|
862
|
+
)
|
1018
863
|
except Exception as e:
|
1019
|
-
logger.error(f"Error during correct_ocr on page {page_idx}: {e}"
|
1020
|
-
# Optionally re-raise or just log and continue
|
864
|
+
logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
|
1021
865
|
|
1022
|
-
logger.info(
|
866
|
+
logger.info("OCR correction process finished.")
|
1023
867
|
return self
|
1024
868
|
|
1025
869
|
def __len__(self) -> int:
|
1026
870
|
"""Return the number of pages in the PDF."""
|
1027
|
-
# Ensure _pages is initialized
|
1028
871
|
if not hasattr(self, "_pages"):
|
1029
|
-
# Return 0 or raise error if not fully initialized? Let's return 0.
|
1030
872
|
return 0
|
1031
873
|
return len(self._pages)
|
1032
874
|
|
1033
|
-
def __getitem__(self, key) -> Union[Page, "PageCollection"]:
|
875
|
+
def __getitem__(self, key) -> Union[Page, "PageCollection"]:
|
1034
876
|
"""Access pages by index or slice."""
|
1035
|
-
# Check if self._pages has been initialized
|
1036
877
|
if not hasattr(self, "_pages"):
|
1037
878
|
raise AttributeError("PDF pages not initialized yet.")
|
879
|
+
|
1038
880
|
if isinstance(key, slice):
|
1039
|
-
# Return a PageCollection slice
|
1040
881
|
from natural_pdf.elements.collections import PageCollection
|
1041
|
-
|
1042
882
|
return PageCollection(self._pages[key])
|
1043
|
-
|
883
|
+
|
1044
884
|
if isinstance(key, int):
|
1045
885
|
if 0 <= key < len(self._pages):
|
1046
886
|
return self._pages[key]
|
@@ -1054,13 +894,12 @@ class PDF:
|
|
1054
894
|
if hasattr(self, "_pdf") and self._pdf is not None:
|
1055
895
|
try:
|
1056
896
|
self._pdf.close()
|
1057
|
-
logger.debug(f"Closed
|
897
|
+
logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
|
1058
898
|
except Exception as e:
|
1059
899
|
logger.warning(f"Error closing pdfplumber object: {e}")
|
1060
900
|
finally:
|
1061
901
|
self._pdf = None
|
1062
902
|
|
1063
|
-
# Clean up temporary file if it exists
|
1064
903
|
if hasattr(self, "_temp_file") and self._temp_file is not None:
|
1065
904
|
temp_file_path = None
|
1066
905
|
try:
|
@@ -1070,7 +909,7 @@ class PDF:
|
|
1070
909
|
os.unlink(temp_file_path)
|
1071
910
|
logger.debug(f"Removed temporary PDF file: {temp_file_path}")
|
1072
911
|
except Exception as e:
|
1073
|
-
logger.warning(f"Failed to clean up temporary
|
912
|
+
logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
|
1074
913
|
finally:
|
1075
914
|
self._temp_file = None
|
1076
915
|
|
@@ -1082,6 +921,176 @@ class PDF:
|
|
1082
921
|
"""Context manager exit."""
|
1083
922
|
self.close()
|
1084
923
|
|
1085
|
-
# --- Indexable Protocol Methods --- Needed for search/sync
|
1086
924
|
def get_id(self) -> str:
|
925
|
+
"""Get unique identifier for this PDF."""
|
1087
926
|
return self.path
|
927
|
+
|
928
|
+
# --- Classification Methods --- #
|
929
|
+
|
930
|
+
def classify_pages(
|
931
|
+
self,
|
932
|
+
categories: List[str],
|
933
|
+
model: Optional[str] = None,
|
934
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
935
|
+
analysis_key: str = "classification",
|
936
|
+
using: Optional[str] = None,
|
937
|
+
**kwargs,
|
938
|
+
) -> "PDF":
|
939
|
+
"""
|
940
|
+
Classifies specified pages of the PDF.
|
941
|
+
|
942
|
+
Args:
|
943
|
+
categories: List of category names
|
944
|
+
model: Model identifier ('text', 'vision', or specific HF ID)
|
945
|
+
pages: Page indices, slice, or None for all pages
|
946
|
+
analysis_key: Key to store results in page's analyses dict
|
947
|
+
using: Processing mode ('text' or 'vision')
|
948
|
+
**kwargs: Additional arguments for the ClassificationManager
|
949
|
+
|
950
|
+
Returns:
|
951
|
+
Self for method chaining
|
952
|
+
"""
|
953
|
+
if not categories:
|
954
|
+
raise ValueError("Categories list cannot be empty.")
|
955
|
+
|
956
|
+
try:
|
957
|
+
manager = self.get_manager('classification')
|
958
|
+
except (ValueError, RuntimeError) as e:
|
959
|
+
raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
|
960
|
+
|
961
|
+
if not manager or not manager.is_available():
|
962
|
+
try:
|
963
|
+
from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
|
964
|
+
if not _CLASSIFICATION_AVAILABLE:
|
965
|
+
raise ImportError("Classification dependencies missing.")
|
966
|
+
except ImportError:
|
967
|
+
raise ImportError(
|
968
|
+
"Classification dependencies missing. "
|
969
|
+
"Install with: pip install \"natural-pdf[classification]\""
|
970
|
+
)
|
971
|
+
raise ClassificationError("ClassificationManager not available.")
|
972
|
+
|
973
|
+
target_pages = []
|
974
|
+
if pages is None:
|
975
|
+
target_pages = self._pages
|
976
|
+
elif isinstance(pages, slice):
|
977
|
+
target_pages = self._pages[pages]
|
978
|
+
elif hasattr(pages, "__iter__"):
|
979
|
+
try:
|
980
|
+
target_pages = [self._pages[i] for i in pages]
|
981
|
+
except IndexError:
|
982
|
+
raise ValueError("Invalid page index provided.")
|
983
|
+
except TypeError:
|
984
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
985
|
+
else:
|
986
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
987
|
+
|
988
|
+
if not target_pages:
|
989
|
+
logger.warning("No pages selected for classification.")
|
990
|
+
return self
|
991
|
+
|
992
|
+
inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
|
993
|
+
logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
|
994
|
+
|
995
|
+
page_contents = []
|
996
|
+
pages_to_classify = []
|
997
|
+
logger.debug(f"Gathering content for {len(target_pages)} pages...")
|
998
|
+
|
999
|
+
for page in target_pages:
|
1000
|
+
try:
|
1001
|
+
content = page._get_classification_content(model_type=inferred_using, **kwargs)
|
1002
|
+
page_contents.append(content)
|
1003
|
+
pages_to_classify.append(page)
|
1004
|
+
except ValueError as e:
|
1005
|
+
logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
|
1006
|
+
except Exception as e:
|
1007
|
+
logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
|
1008
|
+
|
1009
|
+
if not page_contents:
|
1010
|
+
logger.warning("No content could be gathered for batch classification.")
|
1011
|
+
return self
|
1012
|
+
|
1013
|
+
logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
|
1014
|
+
|
1015
|
+
try:
|
1016
|
+
batch_results = manager.classify_batch(
|
1017
|
+
item_contents=page_contents,
|
1018
|
+
categories=categories,
|
1019
|
+
model_id=model,
|
1020
|
+
using=inferred_using,
|
1021
|
+
**kwargs,
|
1022
|
+
)
|
1023
|
+
except Exception as e:
|
1024
|
+
logger.error(f"Batch classification failed: {e}")
|
1025
|
+
raise ClassificationError(f"Batch classification failed: {e}") from e
|
1026
|
+
|
1027
|
+
if len(batch_results) != len(pages_to_classify):
|
1028
|
+
logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
|
1029
|
+
return self
|
1030
|
+
|
1031
|
+
logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
|
1032
|
+
for page, result_obj in zip(pages_to_classify, batch_results):
|
1033
|
+
try:
|
1034
|
+
if not hasattr(page, 'analyses') or page.analyses is None:
|
1035
|
+
page.analyses = {}
|
1036
|
+
page.analyses[analysis_key] = result_obj
|
1037
|
+
except Exception as e:
|
1038
|
+
logger.warning(f"Failed to store classification results for page {page.number}: {e}")
|
1039
|
+
|
1040
|
+
logger.info(f"Finished classifying PDF pages.")
|
1041
|
+
return self
|
1042
|
+
|
1043
|
+
# --- End Classification Methods --- #
|
1044
|
+
|
1045
|
+
# --- Extraction Support --- #
|
1046
|
+
def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
|
1047
|
+
"""
|
1048
|
+
Retrieves the content for the entire PDF.
|
1049
|
+
|
1050
|
+
Args:
|
1051
|
+
using: 'text' or 'vision'
|
1052
|
+
**kwargs: Additional arguments passed to extract_text or page.to_image
|
1053
|
+
|
1054
|
+
Returns:
|
1055
|
+
str: Extracted text if using='text'
|
1056
|
+
List[PIL.Image.Image]: List of page images if using='vision'
|
1057
|
+
None: If content cannot be retrieved
|
1058
|
+
"""
|
1059
|
+
if using == 'text':
|
1060
|
+
try:
|
1061
|
+
layout = kwargs.pop('layout', True)
|
1062
|
+
return self.extract_text(layout=layout, **kwargs)
|
1063
|
+
except Exception as e:
|
1064
|
+
logger.error(f"Error extracting text from PDF: {e}")
|
1065
|
+
return None
|
1066
|
+
elif using == 'vision':
|
1067
|
+
page_images = []
|
1068
|
+
logger.info(f"Rendering {len(self.pages)} pages to images...")
|
1069
|
+
|
1070
|
+
resolution = kwargs.pop('resolution', 72)
|
1071
|
+
include_highlights = kwargs.pop('include_highlights', False)
|
1072
|
+
labels = kwargs.pop('labels', False)
|
1073
|
+
|
1074
|
+
try:
|
1075
|
+
for page in tqdm(self.pages, desc="Rendering Pages"):
|
1076
|
+
img = page.to_image(
|
1077
|
+
resolution=resolution,
|
1078
|
+
include_highlights=include_highlights,
|
1079
|
+
labels=labels,
|
1080
|
+
**kwargs
|
1081
|
+
)
|
1082
|
+
if img:
|
1083
|
+
page_images.append(img)
|
1084
|
+
else:
|
1085
|
+
logger.warning(f"Failed to render page {page.number}, skipping.")
|
1086
|
+
if not page_images:
|
1087
|
+
logger.error("Failed to render any pages.")
|
1088
|
+
return None
|
1089
|
+
return page_images
|
1090
|
+
except Exception as e:
|
1091
|
+
logger.error(f"Error rendering pages: {e}")
|
1092
|
+
return None
|
1093
|
+
else:
|
1094
|
+
logger.error(f"Unsupported value for 'using': {using}")
|
1095
|
+
return None
|
1096
|
+
# --- End Extraction Support --- #
|