natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
-
import copy
|
1
|
+
import copy
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import re
|
5
5
|
import tempfile
|
6
6
|
import urllib.request
|
7
|
-
|
8
|
-
|
7
|
+
import time
|
8
|
+
import threading
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import (
|
9
11
|
TYPE_CHECKING,
|
10
12
|
Any,
|
11
13
|
Callable,
|
@@ -17,29 +19,33 @@ from typing import ( # Added Iterable and TYPE_CHECKING
|
|
17
19
|
Type,
|
18
20
|
Union,
|
19
21
|
)
|
20
|
-
from
|
21
|
-
|
22
|
+
from natural_pdf.utils.tqdm_utils import get_tqdm
|
22
23
|
|
23
24
|
import pdfplumber
|
24
25
|
from PIL import Image
|
25
26
|
|
26
|
-
from natural_pdf.analyzers.layout.layout_manager import
|
27
|
-
|
28
|
-
)
|
29
|
-
from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
|
27
|
+
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
28
|
+
from natural_pdf.core.highlighting_service import HighlightingService
|
30
29
|
from natural_pdf.core.page import Page
|
31
30
|
from natural_pdf.elements.collections import ElementCollection
|
32
31
|
from natural_pdf.elements.region import Region
|
33
32
|
from natural_pdf.ocr import OCRManager, OCROptions
|
34
33
|
from natural_pdf.selectors.parser import parse_selector
|
35
34
|
|
36
|
-
|
35
|
+
from natural_pdf.classification.manager import ClassificationManager
|
36
|
+
from natural_pdf.classification.manager import ClassificationError
|
37
|
+
from natural_pdf.classification.results import ClassificationResult
|
38
|
+
from natural_pdf.extraction.manager import StructuredDataManager
|
39
|
+
|
40
|
+
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
+
from natural_pdf.elements.base import Element
|
42
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
43
|
+
from natural_pdf.extraction.mixin import ExtractionMixin
|
37
44
|
|
38
|
-
# --- Add Search Service Imports (needed for new methods) ---
|
39
45
|
try:
|
40
|
-
from typing import Any as TypingAny
|
46
|
+
from typing import Any as TypingAny
|
41
47
|
|
42
|
-
from natural_pdf.search import TextSearchOptions
|
48
|
+
from natural_pdf.search import TextSearchOptions
|
43
49
|
from natural_pdf.search import (
|
44
50
|
BaseSearchOptions,
|
45
51
|
SearchOptions,
|
@@ -47,25 +53,24 @@ try:
|
|
47
53
|
get_search_service,
|
48
54
|
)
|
49
55
|
except ImportError:
|
50
|
-
# Define dummies if needed for type hints within the class
|
51
56
|
SearchServiceProtocol = object
|
52
57
|
SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
|
53
58
|
TypingAny = object
|
54
59
|
|
55
|
-
# Dummy factory needed for default arg in methods
|
56
60
|
def get_search_service(**kwargs) -> SearchServiceProtocol:
|
57
61
|
raise ImportError(
|
58
62
|
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
59
63
|
)
|
60
64
|
|
61
|
-
|
62
|
-
# --- End Search Service Imports ---
|
63
|
-
|
64
|
-
# Set up logger early
|
65
65
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
66
|
+
tqdm = get_tqdm()
|
66
67
|
|
68
|
+
DEFAULT_MANAGERS = {
|
69
|
+
"classification": ClassificationManager,
|
70
|
+
"structured_data": StructuredDataManager,
|
71
|
+
}
|
67
72
|
|
68
|
-
class PDF:
|
73
|
+
class PDF(ExtractionMixin):
|
69
74
|
"""
|
70
75
|
Enhanced PDF wrapper built on top of pdfplumber.
|
71
76
|
|
@@ -86,35 +91,23 @@ class PDF:
|
|
86
91
|
Args:
|
87
92
|
path_or_url: Path to the PDF file or a URL to a PDF
|
88
93
|
reading_order: Whether to use natural reading order
|
89
|
-
font_attrs: Font attributes
|
90
|
-
|
91
|
-
None: Only consider spatial relationships
|
92
|
-
List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
|
93
|
-
keep_spaces: Whether to include spaces in word elements (default: True).
|
94
|
-
True: Spaces are part of words, better for multi-word searching
|
95
|
-
False: Break text at spaces, each word is separate (legacy behavior)
|
94
|
+
font_attrs: Font attributes for grouping characters into words
|
95
|
+
keep_spaces: Whether to include spaces in word elements
|
96
96
|
"""
|
97
|
-
# Check if the input is a URL
|
98
97
|
is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
99
98
|
|
100
|
-
# Initialize path-related attributes
|
101
99
|
self._original_path = path_or_url
|
102
100
|
self._temp_file = None
|
103
|
-
self._resolved_path = None
|
101
|
+
self._resolved_path = None
|
104
102
|
|
105
103
|
if is_url:
|
106
104
|
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
107
105
|
try:
|
108
|
-
# Create a temporary file to store the downloaded PDF
|
109
106
|
self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
110
|
-
|
111
|
-
# Download the PDF
|
112
107
|
with urllib.request.urlopen(path_or_url) as response:
|
113
108
|
self._temp_file.write(response.read())
|
114
109
|
self._temp_file.flush()
|
115
110
|
self._temp_file.close()
|
116
|
-
|
117
|
-
# Use the temporary file path
|
118
111
|
self._resolved_path = self._temp_file.name
|
119
112
|
logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
|
120
113
|
except Exception as e:
|
@@ -126,7 +119,6 @@ class PDF:
|
|
126
119
|
logger.error(f"Failed to download PDF from URL: {e}")
|
127
120
|
raise ValueError(f"Failed to download PDF from URL: {e}")
|
128
121
|
else:
|
129
|
-
# Use the provided path directly
|
130
122
|
self._resolved_path = path_or_url
|
131
123
|
|
132
124
|
logger.info(f"Initializing PDF from {self._resolved_path}")
|
@@ -137,42 +129,68 @@ class PDF:
|
|
137
129
|
try:
|
138
130
|
self._pdf = pdfplumber.open(self._resolved_path)
|
139
131
|
except Exception as e:
|
140
|
-
logger.error(
|
141
|
-
f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
|
142
|
-
exc_info=True,
|
143
|
-
)
|
144
|
-
# Clean up temp file if creation failed
|
132
|
+
logger.error(f"Failed to open PDF: {e}", exc_info=True)
|
145
133
|
self.close()
|
146
134
|
raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
|
147
135
|
|
148
|
-
self._path = self._resolved_path
|
149
|
-
self.path = self._resolved_path
|
150
|
-
self.source_path = self._original_path
|
136
|
+
self._path = self._resolved_path
|
137
|
+
self.path = self._resolved_path
|
138
|
+
self.source_path = self._original_path
|
151
139
|
|
152
140
|
self._reading_order = reading_order
|
153
141
|
self._config = {"keep_spaces": keep_spaces}
|
142
|
+
self._font_attrs = font_attrs
|
154
143
|
|
155
|
-
self._font_attrs = font_attrs # Store the font attribute configuration
|
156
|
-
|
157
|
-
# Initialize Managers and Services (conditionally available)
|
158
144
|
self._ocr_manager = OCRManager() if OCRManager else None
|
159
145
|
self._layout_manager = LayoutManager() if LayoutManager else None
|
160
146
|
self.highlighter = HighlightingService(self)
|
147
|
+
self._classification_manager_instance = ClassificationManager()
|
148
|
+
self._manager_registry = {}
|
161
149
|
|
162
|
-
# Initialize pages last, passing necessary refs
|
163
150
|
self._pages = [
|
164
151
|
Page(p, parent=self, index=i, font_attrs=font_attrs)
|
165
152
|
for i, p in enumerate(self._pdf.pages)
|
166
153
|
]
|
167
154
|
|
168
|
-
# Other state
|
169
155
|
self._element_cache = {}
|
170
|
-
self._exclusions = []
|
171
|
-
self._regions = []
|
156
|
+
self._exclusions = []
|
157
|
+
self._regions = []
|
172
158
|
|
173
|
-
logger.info("Initialized HighlightingService.")
|
174
159
|
logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
|
175
160
|
|
161
|
+
self._initialize_managers()
|
162
|
+
self._initialize_highlighter()
|
163
|
+
|
164
|
+
def _initialize_managers(self):
|
165
|
+
"""Initialize manager instances based on DEFAULT_MANAGERS."""
|
166
|
+
self._managers = {}
|
167
|
+
for key, manager_class in DEFAULT_MANAGERS.items():
|
168
|
+
try:
|
169
|
+
self._managers[key] = manager_class()
|
170
|
+
logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
|
171
|
+
except Exception as e:
|
172
|
+
logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
|
173
|
+
self._managers[key] = None
|
174
|
+
|
175
|
+
def get_manager(self, key: str) -> Any:
|
176
|
+
"""Retrieve a manager instance by its key."""
|
177
|
+
if key not in self._managers:
|
178
|
+
raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
|
179
|
+
|
180
|
+
manager_instance = self._managers.get(key)
|
181
|
+
|
182
|
+
if manager_instance is None:
|
183
|
+
manager_class = DEFAULT_MANAGERS.get(key)
|
184
|
+
if manager_class:
|
185
|
+
raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
|
186
|
+
else:
|
187
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
|
188
|
+
|
189
|
+
return manager_instance
|
190
|
+
|
191
|
+
def _initialize_highlighter(self):
|
192
|
+
pass
|
193
|
+
|
176
194
|
@property
|
177
195
|
def metadata(self) -> Dict[str, Any]:
|
178
196
|
"""Access metadata as a dictionary."""
|
@@ -183,7 +201,6 @@ class PDF:
|
|
183
201
|
"""Access pages as a PageCollection object."""
|
184
202
|
from natural_pdf.elements.collections import PageCollection
|
185
203
|
|
186
|
-
# Ensure _pages is initialized
|
187
204
|
if not hasattr(self, "_pages"):
|
188
205
|
raise AttributeError("PDF pages not yet initialized.")
|
189
206
|
return PageCollection(self._pages)
|
@@ -195,12 +212,10 @@ class PDF:
|
|
195
212
|
Returns:
|
196
213
|
Self for method chaining
|
197
214
|
"""
|
198
|
-
# Ensure _pages is initialized
|
199
215
|
if not hasattr(self, "_pages"):
|
200
216
|
raise AttributeError("PDF pages not yet initialized.")
|
201
217
|
|
202
218
|
self._exclusions = []
|
203
|
-
# Also clear from pages
|
204
219
|
for page in self._pages:
|
205
220
|
page.clear_exclusions()
|
206
221
|
return self
|
@@ -212,99 +227,75 @@ class PDF:
|
|
212
227
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
213
228
|
|
214
229
|
Args:
|
215
|
-
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
230
|
+
exclusion_func: A function that takes a Page and returns a Region to exclude, or None
|
216
231
|
label: Optional label for this exclusion
|
217
232
|
|
218
233
|
Returns:
|
219
234
|
Self for method chaining
|
220
235
|
"""
|
221
|
-
# Ensure _pages is initialized
|
222
236
|
if not hasattr(self, "_pages"):
|
223
237
|
raise AttributeError("PDF pages not yet initialized.")
|
224
238
|
|
225
|
-
# Store exclusion with its label at PDF level
|
226
239
|
exclusion_data = (exclusion_func, label)
|
227
240
|
self._exclusions.append(exclusion_data)
|
228
241
|
|
229
|
-
# Apply this exclusion to all pages
|
230
242
|
for page in self._pages:
|
231
|
-
# We pass the original function, Page.add_exclusion handles calling it
|
232
243
|
page.add_exclusion(exclusion_func, label=label)
|
233
244
|
|
234
245
|
return self
|
235
246
|
|
236
247
|
def apply_ocr(
|
237
248
|
self,
|
238
|
-
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
239
249
|
engine: Optional[str] = None,
|
240
|
-
# --- Common OCR Parameters (Direct Arguments) ---
|
241
250
|
languages: Optional[List[str]] = None,
|
242
|
-
min_confidence: Optional[float] = None,
|
251
|
+
min_confidence: Optional[float] = None,
|
243
252
|
device: Optional[str] = None,
|
244
|
-
resolution: Optional[int] = None,
|
245
|
-
apply_exclusions: bool = True,
|
253
|
+
resolution: Optional[int] = None,
|
254
|
+
apply_exclusions: bool = True,
|
246
255
|
detect_only: bool = False,
|
247
|
-
|
248
|
-
options: Optional[Any] = None,
|
249
|
-
|
256
|
+
replace: bool = True,
|
257
|
+
options: Optional[Any] = None,
|
258
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
250
259
|
) -> "PDF":
|
251
260
|
"""
|
252
|
-
Applies OCR to specified pages
|
253
|
-
|
254
|
-
This method renders the specified pages to images, sends them as a batch
|
255
|
-
to the OCRManager, and adds the resulting TextElements to each respective page.
|
261
|
+
Applies OCR to specified pages of the PDF using batch processing.
|
256
262
|
|
257
263
|
Args:
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
Overrides manager/engine default.
|
269
|
-
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
270
|
-
Affects input quality for OCR. Defaults to 150 if not set.
|
271
|
-
apply_exclusions: If True (default), render page image for OCR with
|
272
|
-
excluded areas masked (whited out). If False, OCR
|
273
|
-
the raw page image without masking exclusions.
|
274
|
-
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
275
|
-
options: An engine-specific options object (e.g., EasyOCROptions) or dict
|
276
|
-
containing parameters specific to the chosen engine.
|
264
|
+
engine: Name of the OCR engine
|
265
|
+
languages: List of language codes
|
266
|
+
min_confidence: Minimum confidence threshold
|
267
|
+
device: Device to run OCR on
|
268
|
+
resolution: DPI resolution for page images
|
269
|
+
apply_exclusions: Whether to mask excluded areas
|
270
|
+
detect_only: If True, only detect text boxes
|
271
|
+
replace: Whether to replace existing OCR elements
|
272
|
+
options: Engine-specific options
|
273
|
+
pages: Page indices to process or None for all pages
|
277
274
|
|
278
275
|
Returns:
|
279
|
-
Self for method chaining
|
280
|
-
|
281
|
-
Raises:
|
282
|
-
ValueError: If page indices are invalid.
|
283
|
-
TypeError: If 'options' is not compatible with the engine.
|
284
|
-
RuntimeError: If the OCRManager or selected engine is not available.
|
276
|
+
Self for method chaining
|
285
277
|
"""
|
286
278
|
if not self._ocr_manager:
|
287
279
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
288
|
-
# Or raise RuntimeError("OCRManager not initialized.")
|
289
280
|
return self
|
290
281
|
|
291
|
-
|
292
|
-
|
282
|
+
thread_id = threading.current_thread().name
|
283
|
+
logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
|
284
|
+
|
285
|
+
target_pages = []
|
293
286
|
if pages is None:
|
294
287
|
target_pages = self._pages
|
295
288
|
elif isinstance(pages, slice):
|
296
289
|
target_pages = self._pages[pages]
|
297
|
-
elif hasattr(pages, "__iter__"):
|
290
|
+
elif hasattr(pages, "__iter__"):
|
298
291
|
try:
|
299
292
|
target_pages = [self._pages[i] for i in pages]
|
300
293
|
except IndexError:
|
301
294
|
raise ValueError("Invalid page index provided in 'pages' iterable.")
|
302
295
|
except TypeError:
|
303
|
-
raise TypeError(
|
304
|
-
"'pages' must be None, a slice, or an iterable of page indices (int)."
|
305
|
-
)
|
296
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
306
297
|
else:
|
307
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices
|
298
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
308
299
|
|
309
300
|
if not target_pages:
|
310
301
|
logger.warning("No pages selected for OCR processing.")
|
@@ -312,24 +303,20 @@ class PDF:
|
|
312
303
|
|
313
304
|
page_numbers = [p.number for p in target_pages]
|
314
305
|
logger.info(f"Applying batch OCR to pages: {page_numbers}...")
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
logger.
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
326
|
-
logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
|
327
|
-
failed_page_num = "unknown" # Keep track of potentially failing page
|
306
|
+
|
307
|
+
final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
|
308
|
+
logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
|
309
|
+
|
310
|
+
images_pil = []
|
311
|
+
page_image_map = []
|
312
|
+
logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
|
313
|
+
failed_page_num = "unknown"
|
314
|
+
render_start_time = time.monotonic()
|
315
|
+
|
328
316
|
try:
|
329
|
-
for i, page in enumerate(target_pages):
|
330
|
-
failed_page_num = page.number
|
317
|
+
for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
|
318
|
+
failed_page_num = page.number
|
331
319
|
logger.debug(f" Rendering page {page.number} (index {page.index})...")
|
332
|
-
# Use the determined final_resolution and apply exclusions if requested
|
333
320
|
to_image_kwargs = {
|
334
321
|
"resolution": final_resolution,
|
335
322
|
"include_highlights": False,
|
@@ -338,66 +325,64 @@ class PDF:
|
|
338
325
|
img = page.to_image(**to_image_kwargs)
|
339
326
|
if img is None:
|
340
327
|
logger.error(f" Failed to render page {page.number} to image.")
|
341
|
-
|
342
|
-
continue # Skip this page if rendering failed
|
328
|
+
continue
|
343
329
|
images_pil.append(img)
|
344
|
-
page_image_map.append((page, img))
|
330
|
+
page_image_map.append((page, img))
|
345
331
|
except Exception as e:
|
346
|
-
logger.error(f"Failed to render
|
332
|
+
logger.error(f"Failed to render pages for batch OCR: {e}")
|
347
333
|
raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
|
334
|
+
|
335
|
+
render_end_time = time.monotonic()
|
336
|
+
logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
|
348
337
|
|
349
338
|
if not images_pil or not page_image_map:
|
350
339
|
logger.error("No images were successfully rendered for batch OCR.")
|
351
340
|
return self
|
352
341
|
|
353
|
-
# --- Prepare Arguments for Manager ---
|
354
|
-
# Pass common args directly, engine-specific via options
|
355
342
|
manager_args = {
|
356
343
|
"images": images_pil,
|
357
344
|
"engine": engine,
|
358
345
|
"languages": languages,
|
359
|
-
"min_confidence": min_confidence,
|
346
|
+
"min_confidence": min_confidence,
|
360
347
|
"device": device,
|
361
348
|
"options": options,
|
362
349
|
"detect_only": detect_only,
|
363
|
-
# Note: resolution is used for rendering, not passed to OCR manager directly
|
364
350
|
}
|
365
|
-
# Filter out None values so manager can use its defaults
|
366
351
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
367
352
|
|
368
|
-
|
369
|
-
logger.info(f"Calling OCR Manager with args: {
|
353
|
+
ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
|
354
|
+
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
355
|
+
ocr_start_time = time.monotonic()
|
356
|
+
|
370
357
|
try:
|
371
|
-
# Manager's apply_ocr signature needs to accept common args directly
|
372
358
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
373
359
|
|
374
360
|
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
375
|
-
logger.error(
|
376
|
-
f"OCR Manager returned unexpected result format or length for batch processing. "
|
377
|
-
f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
|
378
|
-
f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
|
379
|
-
)
|
361
|
+
logger.error(f"OCR Manager returned unexpected result format or length.")
|
380
362
|
return self
|
381
363
|
|
382
364
|
logger.info("OCR Manager batch processing complete.")
|
383
|
-
|
384
365
|
except Exception as e:
|
385
|
-
logger.error(f"Batch OCR processing failed: {e}"
|
366
|
+
logger.error(f"Batch OCR processing failed: {e}")
|
386
367
|
return self
|
368
|
+
|
369
|
+
ocr_end_time = time.monotonic()
|
370
|
+
logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
|
387
371
|
|
388
|
-
# --- Distribute Results and Add Elements to Pages (unchanged) ---
|
389
372
|
logger.info("Adding OCR results to respective pages...")
|
390
373
|
total_elements_added = 0
|
374
|
+
|
391
375
|
for i, (page, img) in enumerate(page_image_map):
|
392
376
|
results_for_page = batch_results[i]
|
393
377
|
if not isinstance(results_for_page, list):
|
394
|
-
logger.warning(
|
395
|
-
f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
|
396
|
-
)
|
378
|
+
logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
|
397
379
|
continue
|
398
380
|
|
399
381
|
logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
|
400
382
|
try:
|
383
|
+
if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
|
384
|
+
page._element_mgr.remove_ocr_elements()
|
385
|
+
|
401
386
|
img_scale_x = page.width / img.width if img.width > 0 else 1
|
402
387
|
img_scale_y = page.height / img.height if img.height > 0 else 1
|
403
388
|
elements = page._element_mgr.create_text_elements_from_ocr(
|
@@ -410,53 +395,39 @@ class PDF:
|
|
410
395
|
else:
|
411
396
|
logger.debug(f" No valid TextElements created for page {page.number}.")
|
412
397
|
except Exception as e:
|
413
|
-
logger.error(
|
414
|
-
f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
|
415
|
-
)
|
398
|
+
logger.error(f" Error adding OCR elements to page {page.number}: {e}")
|
416
399
|
|
417
|
-
logger.info(
|
418
|
-
f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
|
419
|
-
)
|
400
|
+
logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
|
420
401
|
return self
|
421
402
|
|
422
403
|
def add_region(
|
423
404
|
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
424
405
|
) -> "PDF":
|
425
406
|
"""
|
426
|
-
Add a region function to the PDF.
|
407
|
+
Add a region function to the PDF.
|
427
408
|
|
428
409
|
Args:
|
429
|
-
region_func: A function that takes a Page and returns a Region, or None
|
410
|
+
region_func: A function that takes a Page and returns a Region, or None
|
430
411
|
name: Optional name for the region
|
431
412
|
|
432
413
|
Returns:
|
433
414
|
Self for method chaining
|
434
415
|
"""
|
435
|
-
# Ensure _pages is initialized
|
436
416
|
if not hasattr(self, "_pages"):
|
437
417
|
raise AttributeError("PDF pages not yet initialized.")
|
438
418
|
|
439
|
-
# Store region with its name at PDF level
|
440
419
|
region_data = (region_func, name)
|
441
420
|
self._regions.append(region_data)
|
442
421
|
|
443
|
-
# Apply this region to all pages
|
444
422
|
for page in self._pages:
|
445
423
|
try:
|
446
|
-
# Call the function to get the region for this specific page
|
447
424
|
region_instance = region_func(page)
|
448
425
|
if region_instance and isinstance(region_instance, Region):
|
449
|
-
# If a valid region is returned, add it to the page
|
450
426
|
page.add_region(region_instance, name=name, source="named")
|
451
427
|
elif region_instance is not None:
|
452
|
-
logger.warning(
|
453
|
-
f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
|
454
|
-
)
|
428
|
+
logger.warning(f"Region function did not return a valid Region for page {page.number}")
|
455
429
|
except Exception as e:
|
456
|
-
logger.error(
|
457
|
-
f"Error executing or adding region function for page {page.number}: {e}",
|
458
|
-
exc_info=True,
|
459
|
-
)
|
430
|
+
logger.error(f"Error adding region for page {page.number}: {e}")
|
460
431
|
|
461
432
|
return self
|
462
433
|
|
@@ -467,22 +438,19 @@ class PDF:
|
|
467
438
|
Find the first element matching the selector.
|
468
439
|
|
469
440
|
Args:
|
470
|
-
selector: CSS-like selector string
|
471
|
-
apply_exclusions: Whether to exclude elements in exclusion regions
|
472
|
-
regex: Whether to use regex for text search
|
473
|
-
case: Whether to do case-sensitive text search
|
441
|
+
selector: CSS-like selector string
|
442
|
+
apply_exclusions: Whether to exclude elements in exclusion regions
|
443
|
+
regex: Whether to use regex for text search
|
444
|
+
case: Whether to do case-sensitive text search
|
474
445
|
**kwargs: Additional filter parameters
|
475
446
|
|
476
447
|
Returns:
|
477
448
|
Element object or None if not found
|
478
449
|
"""
|
479
|
-
# Ensure _pages is initialized
|
480
450
|
if not hasattr(self, "_pages"):
|
481
451
|
raise AttributeError("PDF pages not yet initialized.")
|
482
452
|
|
483
453
|
selector_obj = parse_selector(selector)
|
484
|
-
|
485
|
-
# Pass regex and case flags to selector function
|
486
454
|
kwargs["regex"] = regex
|
487
455
|
kwargs["case"] = case
|
488
456
|
|
@@ -498,22 +466,19 @@ class PDF:
|
|
498
466
|
Find all elements matching the selector.
|
499
467
|
|
500
468
|
Args:
|
501
|
-
selector: CSS-like selector string
|
502
|
-
apply_exclusions: Whether to exclude elements in exclusion regions
|
503
|
-
regex: Whether to use regex for text search
|
504
|
-
case: Whether to do case-sensitive text search
|
469
|
+
selector: CSS-like selector string
|
470
|
+
apply_exclusions: Whether to exclude elements in exclusion regions
|
471
|
+
regex: Whether to use regex for text search
|
472
|
+
case: Whether to do case-sensitive text search
|
505
473
|
**kwargs: Additional filter parameters
|
506
474
|
|
507
475
|
Returns:
|
508
476
|
ElementCollection with matching elements
|
509
477
|
"""
|
510
|
-
# Ensure _pages is initialized
|
511
478
|
if not hasattr(self, "_pages"):
|
512
479
|
raise AttributeError("PDF pages not yet initialized.")
|
513
480
|
|
514
481
|
selector_obj = parse_selector(selector)
|
515
|
-
|
516
|
-
# Pass regex and case flags to selector function
|
517
482
|
kwargs["regex"] = regex
|
518
483
|
kwargs["case"] = case
|
519
484
|
|
@@ -530,8 +495,8 @@ class PDF:
|
|
530
495
|
|
531
496
|
Args:
|
532
497
|
selector_obj: Parsed selector dictionary
|
533
|
-
apply_exclusions: Whether to exclude elements in exclusion regions
|
534
|
-
first_only: If True, stop searching after the first match is found
|
498
|
+
apply_exclusions: Whether to exclude elements in exclusion regions
|
499
|
+
first_only: If True, stop searching after the first match is found
|
535
500
|
**kwargs: Additional filter parameters
|
536
501
|
|
537
502
|
Returns:
|
@@ -539,57 +504,45 @@ class PDF:
|
|
539
504
|
"""
|
540
505
|
from natural_pdf.elements.collections import ElementCollection
|
541
506
|
|
542
|
-
# Determine page range to search
|
543
507
|
page_indices = kwargs.get("pages", range(len(self._pages)))
|
544
508
|
if isinstance(page_indices, int):
|
545
509
|
page_indices = [page_indices]
|
546
510
|
elif isinstance(page_indices, slice):
|
547
511
|
page_indices = range(*page_indices.indices(len(self._pages)))
|
548
512
|
|
549
|
-
# Check for cross-page pseudo-classes (currently not supported)
|
550
513
|
for pseudo in selector_obj.get("pseudo_classes", []):
|
551
514
|
if pseudo.get("name") in ("spans", "continues"):
|
552
515
|
logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
|
553
516
|
return ElementCollection([])
|
554
517
|
|
555
|
-
# Regular case: collect elements from each page
|
556
518
|
all_elements = []
|
557
519
|
for page_idx in page_indices:
|
558
520
|
if 0 <= page_idx < len(self._pages):
|
559
521
|
page = self._pages[page_idx]
|
560
|
-
# Pass first_only down to page._apply_selector
|
561
522
|
page_elements_collection = page._apply_selector(
|
562
523
|
selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
|
563
524
|
)
|
564
525
|
if page_elements_collection:
|
565
526
|
page_elements = page_elements_collection.elements
|
566
527
|
all_elements.extend(page_elements)
|
567
|
-
# If we only need the first match overall, and we found one on this page, stop
|
568
528
|
if first_only and page_elements:
|
569
|
-
break
|
529
|
+
break
|
570
530
|
else:
|
571
531
|
logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
|
572
532
|
|
573
|
-
# Create a combined collection
|
574
533
|
combined = ElementCollection(all_elements)
|
575
534
|
|
576
|
-
# Sort in document order if requested and not first_only (already sorted by page)
|
577
535
|
if not first_only and kwargs.get("document_order", True):
|
578
|
-
# Check if elements have page, top, x0 before sorting
|
579
536
|
if all(
|
580
537
|
hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
|
581
538
|
for el in combined.elements
|
582
539
|
):
|
583
540
|
combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
|
584
541
|
else:
|
585
|
-
# Elements might be Regions without inherent sorting order yet
|
586
|
-
# Attempt sorting by page index if possible
|
587
542
|
try:
|
588
543
|
combined.sort(key=lambda el: el.page.index)
|
589
544
|
except AttributeError:
|
590
|
-
logger.warning(
|
591
|
-
"Cannot sort elements in document order: Missing required attributes (e.g., page)."
|
592
|
-
)
|
545
|
+
logger.warning("Cannot sort elements in document order: Missing required attributes.")
|
593
546
|
|
594
547
|
return combined
|
595
548
|
|
@@ -606,24 +559,21 @@ class PDF:
|
|
606
559
|
|
607
560
|
Args:
|
608
561
|
selector: Optional selector to filter elements
|
609
|
-
preserve_whitespace: Whether to keep blank characters
|
610
|
-
use_exclusions: Whether to apply exclusion regions
|
611
|
-
debug_exclusions: Whether to output detailed debugging for exclusions
|
562
|
+
preserve_whitespace: Whether to keep blank characters
|
563
|
+
use_exclusions: Whether to apply exclusion regions
|
564
|
+
debug_exclusions: Whether to output detailed debugging for exclusions
|
612
565
|
**kwargs: Additional extraction parameters
|
613
566
|
|
614
567
|
Returns:
|
615
568
|
Extracted text as string
|
616
569
|
"""
|
617
|
-
# Ensure _pages is initialized
|
618
570
|
if not hasattr(self, "_pages"):
|
619
571
|
raise AttributeError("PDF pages not yet initialized.")
|
620
572
|
|
621
|
-
# If selector is provided, find elements first
|
622
573
|
if selector:
|
623
574
|
elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
|
624
575
|
return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
625
576
|
|
626
|
-
# Otherwise extract from all pages
|
627
577
|
if debug_exclusions:
|
628
578
|
print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
|
629
579
|
print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
|
@@ -644,25 +594,6 @@ class PDF:
|
|
644
594
|
|
645
595
|
return "\n".join(texts)
|
646
596
|
|
647
|
-
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
|
648
|
-
"""
|
649
|
-
Shorthand for finding elements and extracting their text.
|
650
|
-
|
651
|
-
Args:
|
652
|
-
selector: CSS-like selector string
|
653
|
-
preserve_whitespace: Whether to keep blank characters (default: True)
|
654
|
-
**kwargs: Additional extraction parameters
|
655
|
-
|
656
|
-
Returns:
|
657
|
-
Extracted text from matching elements
|
658
|
-
"""
|
659
|
-
# Ensure _pages is initialized
|
660
|
-
if not hasattr(self, "_pages"):
|
661
|
-
raise AttributeError("PDF pages not yet initialized.")
|
662
|
-
return self.extract_text(
|
663
|
-
selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
|
664
|
-
) # apply_exclusions is handled by find_all in extract_text
|
665
|
-
|
666
597
|
def extract_tables(
|
667
598
|
self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
|
668
599
|
) -> List[Any]:
|
@@ -677,54 +608,43 @@ class PDF:
|
|
677
608
|
Returns:
|
678
609
|
List of extracted tables
|
679
610
|
"""
|
680
|
-
# Ensure _pages is initialized
|
681
611
|
if not hasattr(self, "_pages"):
|
682
612
|
raise AttributeError("PDF pages not yet initialized.")
|
683
|
-
|
613
|
+
|
684
614
|
logger.warning("PDF.extract_tables is not fully implemented yet.")
|
685
615
|
all_tables = []
|
616
|
+
|
686
617
|
for page in self.pages:
|
687
|
-
# Assuming page.extract_tables(**kwargs) exists or is added
|
688
618
|
if hasattr(page, "extract_tables"):
|
689
619
|
all_tables.extend(page.extract_tables(**kwargs))
|
690
620
|
else:
|
691
621
|
logger.debug(f"Page {page.number} does not have extract_tables method.")
|
692
|
-
|
622
|
+
|
693
623
|
if selector:
|
694
624
|
logger.warning("Filtering extracted tables by selector is not implemented.")
|
695
|
-
|
696
|
-
# Placeholder merging
|
625
|
+
|
697
626
|
if merge_across_pages:
|
698
627
|
logger.warning("Merging tables across pages is not implemented.")
|
699
|
-
|
628
|
+
|
700
629
|
return all_tables
|
701
630
|
|
702
|
-
# --- New Method: save_searchable ---
|
703
631
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
704
632
|
"""
|
705
633
|
Saves the PDF with an OCR text layer, making content searchable.
|
706
634
|
|
707
635
|
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
708
636
|
|
709
|
-
Note: OCR must have been applied to the pages beforehand
|
710
|
-
(e.g., using pdf.apply_ocr()).
|
711
|
-
|
712
637
|
Args:
|
713
|
-
output_path: Path to save the searchable PDF
|
714
|
-
dpi: Resolution for rendering and OCR overlay
|
715
|
-
**kwargs: Additional keyword arguments passed to the exporter
|
638
|
+
output_path: Path to save the searchable PDF
|
639
|
+
dpi: Resolution for rendering and OCR overlay
|
640
|
+
**kwargs: Additional keyword arguments passed to the exporter
|
716
641
|
"""
|
717
|
-
# Import moved here, assuming it's always available now
|
718
642
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
719
643
|
|
720
|
-
# Convert pathlib.Path to string if necessary
|
721
644
|
output_path_str = str(output_path)
|
722
|
-
|
723
645
|
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
724
646
|
logger.info(f"Searchable PDF saved to: {output_path_str}")
|
725
647
|
|
726
|
-
# --- End New Method ---
|
727
|
-
|
728
648
|
def ask(
|
729
649
|
self,
|
730
650
|
question: str,
|
@@ -746,27 +666,21 @@ class PDF:
|
|
746
666
|
**kwargs: Additional parameters passed to the QA engine
|
747
667
|
|
748
668
|
Returns:
|
749
|
-
A dictionary containing the answer, confidence, and other metadata
|
750
|
-
Result will have an 'answer' key containing the answer text.
|
669
|
+
A dictionary containing the answer, confidence, and other metadata
|
751
670
|
"""
|
752
671
|
from natural_pdf.qa import get_qa_engine
|
753
672
|
|
754
|
-
# Initialize or get QA engine
|
755
673
|
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
756
674
|
|
757
|
-
# Determine which pages to query
|
758
675
|
if pages is None:
|
759
676
|
target_pages = list(range(len(self.pages)))
|
760
677
|
elif isinstance(pages, int):
|
761
|
-
# Single page
|
762
678
|
target_pages = [pages]
|
763
679
|
elif isinstance(pages, (list, range)):
|
764
|
-
# List or range of pages
|
765
680
|
target_pages = pages
|
766
681
|
else:
|
767
682
|
raise ValueError(f"Invalid pages parameter: {pages}")
|
768
683
|
|
769
|
-
# Actually query each page and gather results
|
770
684
|
results = []
|
771
685
|
for page_idx in target_pages:
|
772
686
|
if 0 <= page_idx < len(self.pages):
|
@@ -775,208 +689,148 @@ class PDF:
|
|
775
689
|
page=page, question=question, min_confidence=min_confidence, **kwargs
|
776
690
|
)
|
777
691
|
|
778
|
-
# Add to results if it found an answer
|
779
692
|
if page_result and page_result.get("found", False):
|
780
693
|
results.append(page_result)
|
781
694
|
|
782
|
-
# Sort results by confidence
|
783
695
|
results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
784
696
|
|
785
|
-
# Return the best result, or a default result if none found
|
786
697
|
if results:
|
787
698
|
return results[0]
|
788
699
|
else:
|
789
|
-
# Return a structure indicating no answer found
|
790
700
|
return {
|
791
701
|
"answer": None,
|
792
702
|
"confidence": 0.0,
|
793
703
|
"found": False,
|
794
|
-
"page_num": None,
|
704
|
+
"page_num": None,
|
795
705
|
"source_elements": [],
|
796
706
|
}
|
797
707
|
|
798
708
|
def search_within_index(
|
799
709
|
self,
|
800
710
|
query: Union[str, Path, Image.Image, Region],
|
801
|
-
search_service: SearchServiceProtocol,
|
711
|
+
search_service: SearchServiceProtocol,
|
802
712
|
options: Optional[SearchOptions] = None,
|
803
713
|
) -> List[Dict[str, Any]]:
|
804
714
|
"""
|
805
|
-
Finds relevant documents
|
806
|
-
within a search index managed by the provided SearchService.
|
807
|
-
|
808
|
-
This method uses a pre-configured SearchService instance and adds
|
809
|
-
a filter to the search query to scope results only to pages from
|
810
|
-
this specific PDF object (based on its resolved path).
|
715
|
+
Finds relevant documents from this PDF within a search index.
|
811
716
|
|
812
717
|
Args:
|
813
|
-
query: The search query (text, image path, PIL Image, Region)
|
814
|
-
search_service: A pre-configured SearchService instance
|
815
|
-
|
816
|
-
is expected to be found.
|
817
|
-
options: Optional SearchOptions to configure the query (top_k, filters, etc.).
|
818
|
-
Any existing filters in `options` will be combined with the
|
819
|
-
PDF-scoping filter using an 'AND' condition.
|
718
|
+
query: The search query (text, image path, PIL Image, Region)
|
719
|
+
search_service: A pre-configured SearchService instance
|
720
|
+
options: Optional SearchOptions to configure the query
|
820
721
|
|
821
722
|
Returns:
|
822
|
-
A list of result dictionaries, sorted by relevance
|
823
|
-
results originating from this PDF's pages.
|
723
|
+
A list of result dictionaries, sorted by relevance
|
824
724
|
|
825
725
|
Raises:
|
826
|
-
ImportError: If search dependencies are not installed
|
827
|
-
ValueError: If search_service is None
|
828
|
-
TypeError: If search_service does not conform to the protocol
|
829
|
-
FileNotFoundError: If the collection managed by the service does not exist
|
830
|
-
RuntimeError: For other search failures
|
726
|
+
ImportError: If search dependencies are not installed
|
727
|
+
ValueError: If search_service is None
|
728
|
+
TypeError: If search_service does not conform to the protocol
|
729
|
+
FileNotFoundError: If the collection managed by the service does not exist
|
730
|
+
RuntimeError: For other search failures
|
831
731
|
"""
|
832
732
|
if not search_service:
|
833
733
|
raise ValueError("A configured SearchServiceProtocol instance must be provided.")
|
834
|
-
# Optional stricter check:
|
835
|
-
# if not isinstance(search_service, SearchServiceProtocol):
|
836
|
-
# raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
|
837
734
|
|
838
|
-
# Get collection name from service for logging
|
839
735
|
collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
|
840
|
-
logger.info(
|
841
|
-
|
842
|
-
|
736
|
+
logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
|
737
|
+
|
738
|
+
service = search_service
|
843
739
|
|
844
|
-
# --- 1. Get Search Service Instance --- (REMOVED - provided directly)
|
845
|
-
# service: SearchServiceProtocol
|
846
|
-
# if search_service:
|
847
|
-
# service = search_service
|
848
|
-
# else:
|
849
|
-
# logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
|
850
|
-
# factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
|
851
|
-
# # TODO: Pass embedding model from options/pdf config if needed?
|
852
|
-
# service = get_search_service(**factory_args)
|
853
|
-
service = search_service # Use validated provided service
|
854
|
-
|
855
|
-
# --- 2. Prepare Query and Options ---
|
856
740
|
query_input = query
|
857
|
-
# Resolve options (use default TextSearch if none provided)
|
858
741
|
effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
|
859
742
|
|
860
|
-
# Handle Region query - extract text for now
|
861
743
|
if isinstance(query, Region):
|
862
744
|
logger.debug("Query is a Region object. Extracting text.")
|
863
745
|
if not isinstance(effective_options, TextSearchOptions):
|
864
|
-
logger.warning(
|
865
|
-
"Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
|
866
|
-
)
|
746
|
+
logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
|
867
747
|
query_input = query.extract_text()
|
868
748
|
if not query_input or query_input.isspace():
|
869
749
|
logger.error("Region has no extractable text for query.")
|
870
750
|
return []
|
871
751
|
|
872
|
-
#
|
873
|
-
# Assume metadata field 'pdf_path' stores the resolved path used during indexing
|
752
|
+
# Add filter to scope search to THIS PDF
|
874
753
|
pdf_scope_filter = {
|
875
|
-
"field": "pdf_path",
|
754
|
+
"field": "pdf_path",
|
876
755
|
"operator": "eq",
|
877
|
-
"value": self.path,
|
756
|
+
"value": self.path,
|
878
757
|
}
|
879
758
|
logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
|
880
759
|
|
881
760
|
# Combine with existing filters in options (if any)
|
882
761
|
if effective_options.filters:
|
883
|
-
logger.debug(
|
884
|
-
|
885
|
-
)
|
886
|
-
# Assume filters are compatible with the underlying search service
|
887
|
-
# If existing filters aren't already in an AND block, wrap them
|
888
|
-
if (
|
889
|
-
isinstance(effective_options.filters, dict)
|
890
|
-
and effective_options.filters.get("operator") == "AND"
|
891
|
-
):
|
892
|
-
# Already an AND block, just append the condition
|
762
|
+
logger.debug(f"Combining PDF scope filter with existing filters")
|
763
|
+
if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
|
893
764
|
effective_options.filters["conditions"].append(pdf_scope_filter)
|
894
765
|
elif isinstance(effective_options.filters, list):
|
895
|
-
# Assume list represents implicit AND conditions
|
896
766
|
effective_options.filters = {
|
897
767
|
"operator": "AND",
|
898
768
|
"conditions": effective_options.filters + [pdf_scope_filter],
|
899
769
|
}
|
900
|
-
elif isinstance(effective_options.filters, dict):
|
770
|
+
elif isinstance(effective_options.filters, dict):
|
901
771
|
effective_options.filters = {
|
902
772
|
"operator": "AND",
|
903
773
|
"conditions": [effective_options.filters, pdf_scope_filter],
|
904
774
|
}
|
905
775
|
else:
|
906
|
-
logger.warning(
|
907
|
-
f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
|
908
|
-
)
|
776
|
+
logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
|
909
777
|
effective_options.filters = pdf_scope_filter
|
910
778
|
else:
|
911
779
|
effective_options.filters = pdf_scope_filter
|
912
780
|
|
913
781
|
logger.debug(f"Final filters for service search: {effective_options.filters}")
|
914
782
|
|
915
|
-
# --- 4. Call SearchService ---
|
916
783
|
try:
|
917
|
-
# Call the service's search method (no collection_name needed)
|
918
784
|
results = service.search(
|
919
785
|
query=query_input,
|
920
786
|
options=effective_options,
|
921
787
|
)
|
922
|
-
logger.info(
|
923
|
-
f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
|
924
|
-
)
|
788
|
+
logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
|
925
789
|
return results
|
926
790
|
except FileNotFoundError as fnf:
|
927
|
-
logger.error(
|
928
|
-
|
929
|
-
)
|
930
|
-
raise # Re-raise specific error
|
791
|
+
logger.error(f"Search failed: Collection not found. Error: {fnf}")
|
792
|
+
raise
|
931
793
|
except Exception as e:
|
932
|
-
logger.error(
|
933
|
-
|
934
|
-
exc_info=True,
|
935
|
-
)
|
936
|
-
raise RuntimeError(
|
937
|
-
f"Search within index failed for PDF '{self.path}'. See logs for details."
|
938
|
-
) from e
|
794
|
+
logger.error(f"SearchService search failed: {e}")
|
795
|
+
raise RuntimeError(f"Search within index failed. See logs for details.") from e
|
939
796
|
|
940
797
|
def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
|
941
798
|
"""
|
942
|
-
Exports OCR results from this PDF into a correction task package
|
799
|
+
Exports OCR results from this PDF into a correction task package.
|
943
800
|
|
944
801
|
Args:
|
945
|
-
output_zip_path: The path to save the output zip file
|
802
|
+
output_zip_path: The path to save the output zip file
|
946
803
|
**kwargs: Additional arguments passed to create_correction_task_package
|
947
|
-
(e.g., image_render_scale, overwrite).
|
948
804
|
"""
|
949
805
|
try:
|
950
806
|
from natural_pdf.utils.packaging import create_correction_task_package
|
951
807
|
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
952
808
|
except ImportError:
|
953
809
|
logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
|
954
|
-
# Or raise
|
955
810
|
except Exception as e:
|
956
|
-
logger.error(f"Failed to export correction task
|
957
|
-
raise
|
811
|
+
logger.error(f"Failed to export correction task: {e}")
|
812
|
+
raise
|
958
813
|
|
959
814
|
def correct_ocr(
|
960
815
|
self,
|
961
816
|
correction_callback: Callable[[Any], Optional[str]],
|
962
817
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
963
|
-
|
818
|
+
max_workers: Optional[int] = None,
|
819
|
+
progress_callback: Optional[Callable[[], None]] = None,
|
820
|
+
) -> "PDF":
|
964
821
|
"""
|
965
|
-
Applies corrections to OCR
|
966
|
-
delegating the core work to the `Page.correct_ocr` method.
|
822
|
+
Applies corrections to OCR text elements using a callback function.
|
967
823
|
|
968
824
|
Args:
|
969
|
-
correction_callback:
|
970
|
-
object) and returns `Optional[str]`. It returns the
|
971
|
-
corrected text string if an update is needed, otherwise None.
|
825
|
+
correction_callback: Function that takes an element and returns corrected text or None
|
972
826
|
pages: Optional page indices/slice to limit the scope of correction
|
973
|
-
|
827
|
+
max_workers: Maximum number of threads to use for parallel execution
|
828
|
+
progress_callback: Optional callback function for progress updates
|
974
829
|
|
975
830
|
Returns:
|
976
|
-
Self for method chaining
|
831
|
+
Self for method chaining
|
977
832
|
"""
|
978
|
-
|
979
|
-
target_page_indices: List[int] = []
|
833
|
+
target_page_indices = []
|
980
834
|
if pages is None:
|
981
835
|
target_page_indices = list(range(len(self._pages)))
|
982
836
|
elif isinstance(pages, slice):
|
@@ -984,52 +838,49 @@ class PDF:
|
|
984
838
|
elif hasattr(pages, "__iter__"):
|
985
839
|
try:
|
986
840
|
target_page_indices = [int(i) for i in pages]
|
987
|
-
# Validate indices
|
988
841
|
for idx in target_page_indices:
|
989
842
|
if not (0 <= idx < len(self._pages)):
|
990
843
|
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
991
844
|
except (IndexError, TypeError, ValueError) as e:
|
992
|
-
raise ValueError(f"Invalid page index
|
845
|
+
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
993
846
|
else:
|
994
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices
|
847
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
995
848
|
|
996
849
|
if not target_page_indices:
|
997
850
|
logger.warning("No pages selected for OCR correction.")
|
998
851
|
return self
|
999
852
|
|
1000
|
-
logger.info(f"Starting OCR correction
|
853
|
+
logger.info(f"Starting OCR correction for pages: {target_page_indices}")
|
1001
854
|
|
1002
|
-
# Iterate through target pages and call their correct_ocr method
|
1003
855
|
for page_idx in target_page_indices:
|
1004
856
|
page = self._pages[page_idx]
|
1005
857
|
try:
|
1006
|
-
page.correct_ocr(
|
858
|
+
page.correct_ocr(
|
859
|
+
correction_callback=correction_callback,
|
860
|
+
max_workers=max_workers,
|
861
|
+
progress_callback=progress_callback,
|
862
|
+
)
|
1007
863
|
except Exception as e:
|
1008
|
-
logger.error(f"Error during correct_ocr on page {page_idx}: {e}"
|
1009
|
-
# Optionally re-raise or just log and continue
|
864
|
+
logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
|
1010
865
|
|
1011
|
-
logger.info(
|
866
|
+
logger.info("OCR correction process finished.")
|
1012
867
|
return self
|
1013
868
|
|
1014
869
|
def __len__(self) -> int:
|
1015
870
|
"""Return the number of pages in the PDF."""
|
1016
|
-
# Ensure _pages is initialized
|
1017
871
|
if not hasattr(self, "_pages"):
|
1018
|
-
# Return 0 or raise error if not fully initialized? Let's return 0.
|
1019
872
|
return 0
|
1020
873
|
return len(self._pages)
|
1021
874
|
|
1022
|
-
def __getitem__(self, key) -> Union[Page, "PageCollection"]:
|
875
|
+
def __getitem__(self, key) -> Union[Page, "PageCollection"]:
|
1023
876
|
"""Access pages by index or slice."""
|
1024
|
-
# Check if self._pages has been initialized
|
1025
877
|
if not hasattr(self, "_pages"):
|
1026
878
|
raise AttributeError("PDF pages not initialized yet.")
|
879
|
+
|
1027
880
|
if isinstance(key, slice):
|
1028
|
-
# Return a PageCollection slice
|
1029
881
|
from natural_pdf.elements.collections import PageCollection
|
1030
|
-
|
1031
882
|
return PageCollection(self._pages[key])
|
1032
|
-
|
883
|
+
|
1033
884
|
if isinstance(key, int):
|
1034
885
|
if 0 <= key < len(self._pages):
|
1035
886
|
return self._pages[key]
|
@@ -1043,13 +894,12 @@ class PDF:
|
|
1043
894
|
if hasattr(self, "_pdf") and self._pdf is not None:
|
1044
895
|
try:
|
1045
896
|
self._pdf.close()
|
1046
|
-
logger.debug(f"Closed
|
897
|
+
logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
|
1047
898
|
except Exception as e:
|
1048
899
|
logger.warning(f"Error closing pdfplumber object: {e}")
|
1049
900
|
finally:
|
1050
901
|
self._pdf = None
|
1051
902
|
|
1052
|
-
# Clean up temporary file if it exists
|
1053
903
|
if hasattr(self, "_temp_file") and self._temp_file is not None:
|
1054
904
|
temp_file_path = None
|
1055
905
|
try:
|
@@ -1059,7 +909,7 @@ class PDF:
|
|
1059
909
|
os.unlink(temp_file_path)
|
1060
910
|
logger.debug(f"Removed temporary PDF file: {temp_file_path}")
|
1061
911
|
except Exception as e:
|
1062
|
-
logger.warning(f"Failed to clean up temporary
|
912
|
+
logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
|
1063
913
|
finally:
|
1064
914
|
self._temp_file = None
|
1065
915
|
|
@@ -1071,8 +921,176 @@ class PDF:
|
|
1071
921
|
"""Context manager exit."""
|
1072
922
|
self.close()
|
1073
923
|
|
1074
|
-
|
1075
|
-
# --- Indexable Protocol Methods --- Needed for search/sync
|
1076
924
|
def get_id(self) -> str:
|
925
|
+
"""Get unique identifier for this PDF."""
|
1077
926
|
return self.path
|
1078
927
|
|
928
|
+
# --- Classification Methods --- #
|
929
|
+
|
930
|
+
def classify_pages(
|
931
|
+
self,
|
932
|
+
categories: List[str],
|
933
|
+
model: Optional[str] = None,
|
934
|
+
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
935
|
+
analysis_key: str = "classification",
|
936
|
+
using: Optional[str] = None,
|
937
|
+
**kwargs,
|
938
|
+
) -> "PDF":
|
939
|
+
"""
|
940
|
+
Classifies specified pages of the PDF.
|
941
|
+
|
942
|
+
Args:
|
943
|
+
categories: List of category names
|
944
|
+
model: Model identifier ('text', 'vision', or specific HF ID)
|
945
|
+
pages: Page indices, slice, or None for all pages
|
946
|
+
analysis_key: Key to store results in page's analyses dict
|
947
|
+
using: Processing mode ('text' or 'vision')
|
948
|
+
**kwargs: Additional arguments for the ClassificationManager
|
949
|
+
|
950
|
+
Returns:
|
951
|
+
Self for method chaining
|
952
|
+
"""
|
953
|
+
if not categories:
|
954
|
+
raise ValueError("Categories list cannot be empty.")
|
955
|
+
|
956
|
+
try:
|
957
|
+
manager = self.get_manager('classification')
|
958
|
+
except (ValueError, RuntimeError) as e:
|
959
|
+
raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
|
960
|
+
|
961
|
+
if not manager or not manager.is_available():
|
962
|
+
try:
|
963
|
+
from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
|
964
|
+
if not _CLASSIFICATION_AVAILABLE:
|
965
|
+
raise ImportError("Classification dependencies missing.")
|
966
|
+
except ImportError:
|
967
|
+
raise ImportError(
|
968
|
+
"Classification dependencies missing. "
|
969
|
+
"Install with: pip install \"natural-pdf[classification]\""
|
970
|
+
)
|
971
|
+
raise ClassificationError("ClassificationManager not available.")
|
972
|
+
|
973
|
+
target_pages = []
|
974
|
+
if pages is None:
|
975
|
+
target_pages = self._pages
|
976
|
+
elif isinstance(pages, slice):
|
977
|
+
target_pages = self._pages[pages]
|
978
|
+
elif hasattr(pages, "__iter__"):
|
979
|
+
try:
|
980
|
+
target_pages = [self._pages[i] for i in pages]
|
981
|
+
except IndexError:
|
982
|
+
raise ValueError("Invalid page index provided.")
|
983
|
+
except TypeError:
|
984
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
985
|
+
else:
|
986
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
987
|
+
|
988
|
+
if not target_pages:
|
989
|
+
logger.warning("No pages selected for classification.")
|
990
|
+
return self
|
991
|
+
|
992
|
+
inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
|
993
|
+
logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
|
994
|
+
|
995
|
+
page_contents = []
|
996
|
+
pages_to_classify = []
|
997
|
+
logger.debug(f"Gathering content for {len(target_pages)} pages...")
|
998
|
+
|
999
|
+
for page in target_pages:
|
1000
|
+
try:
|
1001
|
+
content = page._get_classification_content(model_type=inferred_using, **kwargs)
|
1002
|
+
page_contents.append(content)
|
1003
|
+
pages_to_classify.append(page)
|
1004
|
+
except ValueError as e:
|
1005
|
+
logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
|
1006
|
+
except Exception as e:
|
1007
|
+
logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
|
1008
|
+
|
1009
|
+
if not page_contents:
|
1010
|
+
logger.warning("No content could be gathered for batch classification.")
|
1011
|
+
return self
|
1012
|
+
|
1013
|
+
logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
|
1014
|
+
|
1015
|
+
try:
|
1016
|
+
batch_results = manager.classify_batch(
|
1017
|
+
item_contents=page_contents,
|
1018
|
+
categories=categories,
|
1019
|
+
model_id=model,
|
1020
|
+
using=inferred_using,
|
1021
|
+
**kwargs,
|
1022
|
+
)
|
1023
|
+
except Exception as e:
|
1024
|
+
logger.error(f"Batch classification failed: {e}")
|
1025
|
+
raise ClassificationError(f"Batch classification failed: {e}") from e
|
1026
|
+
|
1027
|
+
if len(batch_results) != len(pages_to_classify):
|
1028
|
+
logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
|
1029
|
+
return self
|
1030
|
+
|
1031
|
+
logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
|
1032
|
+
for page, result_obj in zip(pages_to_classify, batch_results):
|
1033
|
+
try:
|
1034
|
+
if not hasattr(page, 'analyses') or page.analyses is None:
|
1035
|
+
page.analyses = {}
|
1036
|
+
page.analyses[analysis_key] = result_obj
|
1037
|
+
except Exception as e:
|
1038
|
+
logger.warning(f"Failed to store classification results for page {page.number}: {e}")
|
1039
|
+
|
1040
|
+
logger.info(f"Finished classifying PDF pages.")
|
1041
|
+
return self
|
1042
|
+
|
1043
|
+
# --- End Classification Methods --- #
|
1044
|
+
|
1045
|
+
# --- Extraction Support --- #
|
1046
|
+
def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
|
1047
|
+
"""
|
1048
|
+
Retrieves the content for the entire PDF.
|
1049
|
+
|
1050
|
+
Args:
|
1051
|
+
using: 'text' or 'vision'
|
1052
|
+
**kwargs: Additional arguments passed to extract_text or page.to_image
|
1053
|
+
|
1054
|
+
Returns:
|
1055
|
+
str: Extracted text if using='text'
|
1056
|
+
List[PIL.Image.Image]: List of page images if using='vision'
|
1057
|
+
None: If content cannot be retrieved
|
1058
|
+
"""
|
1059
|
+
if using == 'text':
|
1060
|
+
try:
|
1061
|
+
layout = kwargs.pop('layout', True)
|
1062
|
+
return self.extract_text(layout=layout, **kwargs)
|
1063
|
+
except Exception as e:
|
1064
|
+
logger.error(f"Error extracting text from PDF: {e}")
|
1065
|
+
return None
|
1066
|
+
elif using == 'vision':
|
1067
|
+
page_images = []
|
1068
|
+
logger.info(f"Rendering {len(self.pages)} pages to images...")
|
1069
|
+
|
1070
|
+
resolution = kwargs.pop('resolution', 72)
|
1071
|
+
include_highlights = kwargs.pop('include_highlights', False)
|
1072
|
+
labels = kwargs.pop('labels', False)
|
1073
|
+
|
1074
|
+
try:
|
1075
|
+
for page in tqdm(self.pages, desc="Rendering Pages"):
|
1076
|
+
img = page.to_image(
|
1077
|
+
resolution=resolution,
|
1078
|
+
include_highlights=include_highlights,
|
1079
|
+
labels=labels,
|
1080
|
+
**kwargs
|
1081
|
+
)
|
1082
|
+
if img:
|
1083
|
+
page_images.append(img)
|
1084
|
+
else:
|
1085
|
+
logger.warning(f"Failed to render page {page.number}, skipping.")
|
1086
|
+
if not page_images:
|
1087
|
+
logger.error("Failed to render any pages.")
|
1088
|
+
return None
|
1089
|
+
return page_images
|
1090
|
+
except Exception as e:
|
1091
|
+
logger.error(f"Error rendering pages: {e}")
|
1092
|
+
return None
|
1093
|
+
else:
|
1094
|
+
logger.error(f"Unsupported value for 'using': {using}")
|
1095
|
+
return None
|
1096
|
+
# --- End Extraction Support --- #
|