natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -4,7 +4,9 @@ import tempfile
 import os
 import re
 import urllib.request
-from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable # Added Iterable
+from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable, TYPE_CHECKING # Added Iterable and TYPE_CHECKING
+from pathlib import Path # Added Path
+import copy # Add import for deepcopy
 from PIL import Image

 from natural_pdf.core.page import Page
@@ -15,9 +17,31 @@ from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager # Import the new LayoutManager
 from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service

-#
-
+# Import the flag directly - this should always work
+
+# --- Add Search Service Imports (needed for new methods) ---
+try:
+    from natural_pdf.search import (
+        get_search_service,
+        SearchServiceProtocol,
+        SearchOptions,
+        TextSearchOptions, # Keep for ask default
+        BaseSearchOptions
+    )
+    from typing import Any as TypingAny # Import Any if not already
+except ImportError:
+    # Define dummies if needed for type hints within the class
+    SearchServiceProtocol = object
+    SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
+    TypingAny = object
+    # Dummy factory needed for default arg in methods
+    def get_search_service(**kwargs) -> SearchServiceProtocol:
+        raise ImportError("Search dependencies are not installed. Install with: pip install natural-pdf[search]")

+# --- End Search Service Imports ---
+
+# Set up logger early
+logger = logging.getLogger("natural_pdf.core.pdf")

 class PDF:
     """
@@ -50,7 +74,8 @@ class PDF:
         # Initialize path-related attributes
         self._original_path = path_or_url
         self._temp_file = None
-
+        self._resolved_path = None # Store the actual path used by pdfplumber
+
         if is_url:
             logger.info(f"Downloading PDF from URL: {path_or_url}")
             try:
@@ -64,8 +89,8 @@
                 self._temp_file.close()

                 # Use the temporary file path
-
-                logger.info(f"PDF downloaded to temporary file: {
+                self._resolved_path = self._temp_file.name
+                logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
             except Exception as e:
                 if self._temp_file and hasattr(self._temp_file, 'name'):
                     try:
@@ -76,43 +101,45 @@
                 raise ValueError(f"Failed to download PDF from URL: {e}")
         else:
             # Use the provided path directly
-
-
-            logger.info(f"Initializing PDF from {
+            self._resolved_path = path_or_url
+
+        logger.info(f"Initializing PDF from {self._resolved_path}")
         logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")

-
-
+        try:
+            self._pdf = pdfplumber.open(self._resolved_path)
+        except Exception as e:
+            logger.error(f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}", exc_info=True)
+            # Clean up temp file if creation failed
+            self.close()
+            raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
+
+        self._path = self._resolved_path # Keep original path too?
+        self.path = self._resolved_path # Public attribute for the resolved path
+        self.source_path = self._original_path # Public attribute for the user-provided path/URL
+
         self._reading_order = reading_order
         self._config = {
             'keep_spaces': keep_spaces
         }
-        self.path = path
-
-        self._font_attrs = font_attrs # Store the font attribute configuration

-
-            self._ocr_manager = OCRManager()
-            logger.info(f"Initialized OCRManager. Available engines: {self._ocr_manager.get_available_engines()}")
-        else:
-            self._ocr_manager = None
-            logger.warning("OCRManager could not be imported. OCR functionality disabled.")
+        self._font_attrs = font_attrs # Store the font attribute configuration

-
-
-
-
-            self._layout_manager = None
-            logger.warning("LayoutManager could not be imported. Layout analysis disabled.")
+        # Initialize Managers and Services (conditionally available)
+        self._ocr_manager = OCRManager() if OCRManager else None
+        self._layout_manager = LayoutManager() if LayoutManager else None
+        self.highlighter = HighlightingService(self)

+        # Initialize pages last, passing necessary refs
         self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
+
+        # Other state
         self._element_cache = {}
         self._exclusions = [] # List to store exclusion functions/regions
         self._regions = [] # List to store region functions/definitions

-        # Initialize the Highlighting Service
-        self.highlighter = HighlightingService(self)
         logger.info("Initialized HighlightingService.")
+        logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")

     @property
     def metadata(self) -> Dict[str, Any]:
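The net effect of this hunk is that a PDF object now records both the location the caller supplied and the path pdfplumber actually opened. A rough illustration, not taken from the package docs (the URL is a placeholder):

    from natural_pdf import PDF

    pdf = PDF("https://example.com/report.pdf")
    print(pdf.source_path)  # the URL or path the caller passed in
    print(pdf.path)         # the resolved local path pdfplumber opened (a temp file for URLs)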
@@ -123,6 +150,9 @@
     def pages(self) -> 'PageCollection':
         """Access pages as a PageCollection object."""
         from natural_pdf.elements.collections import PageCollection
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
         return PageCollection(self._pages)

     def clear_exclusions(self) -> 'PDF':
@@ -132,38 +162,40 @@
         Returns:
             Self for method chaining
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")

         self._exclusions = []
+        # Also clear from pages
+        for page in self._pages:
+            page.clear_exclusions()
         return self

-    def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
+    def add_exclusion(self, exclusion_func: Callable[['Page'], Optional[Region]], label: str = None) -> 'PDF':
         """
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

         Args:
-            exclusion_func: A function that takes a Page and returns a Region to exclude
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
             label: Optional label for this exclusion

         Returns:
             Self for method chaining
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
+
         # Store exclusion with its label at PDF level
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-
-        #
-        def exclusion_wrapper(page):
-            try:
-                region = exclusion_func(page)
-                return region
-            except Exception as e:
-                print(f"Error in PDF-level exclusion for page {page.index}: {e}")
-                return None
-
-        # Apply this exclusion to all pages using the wrapper
+
+        # Apply this exclusion to all pages
         for page in self._pages:
-
-
+            # We pass the original function, Page.add_exclusion handles calling it
+            page.add_exclusion(exclusion_func, label=label)
+
         return self

     def apply_ocr_to_pages(
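Because Page.add_exclusion now receives the original callable, a per-page exclusion can be written directly against each Page; a sketch of how that might look (the page-level find() and .above() helpers used here are assumed, they are not part of this diff):

    def confidential_header(page):
        # Return a Region to exclude on this page, or None to skip the page.
        marker = page.find('text:contains("CONFIDENTIAL")')
        return marker.above() if marker else None

    pdf.add_exclusion(confidential_header, label="confidential-header")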
@@ -233,9 +265,11 @@
         images_pil: List[Image.Image] = []
         page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
         logger.info(f"Rendering {len(target_pages)} pages to images...")
+        failed_page_num = 'unknown' # Keep track of potentially failing page
         try:
             ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
             for i, page in enumerate(target_pages):
+                failed_page_num = page.number # Update current page number in case of error
                 logger.debug(f"  Rendering page {page.number} (index {page.index})...")
                 # Use page.to_image but ensure highlights are off for OCR base image
                 img = page.to_image(scale=ocr_scale, include_highlights=False)
@@ -243,9 +277,7 @@
                 page_image_map.append((page, img)) # Store pair
         except Exception as e:
             logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
-
-            # For now, let's fail if any page rendering fails.
-            raise RuntimeError(f"Failed to render page {page.number} for OCR.") from e
+            raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e

         if not images_pil:
             logger.error("No images were successfully rendered for batch OCR.")
@@ -253,9 +285,11 @@

         # --- Prepare Arguments for Manager ---
         manager_args = {'images': images_pil, 'options': options, 'engine': engine}
-
-        if
-        if
+        simple_args = {}
+        if languages is not None: simple_args['languages'] = languages
+        if min_confidence is not None: simple_args['min_confidence'] = min_confidence
+        if device is not None: simple_args['device'] = device
+        manager_args.update(simple_args) # Add simple args if options not provided

         # --- Call OCR Manager for Batch Processing ---
         logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
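The simple-argument path above means OCR can be tuned without constructing an OCROptions object; a usage sketch (the engine name and values are illustrative assumptions, not taken from this diff):

    pdf.apply_ocr_to_pages(engine="easyocr", languages=["en"], min_confidence=0.5)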
@@ -288,59 +322,60 @@
                 logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
                 # Use the page's element manager to create elements from its results
                 # Changed from page._create_text_elements_from_ocr to use element_mgr
-
+                try:
+                    # Calculate scale factors based on rendered image vs page dims
+                    img_scale_x = page.width / img.width if img.width > 0 else 1
+                    img_scale_y = page.height / img.height if img.height > 0 else 1
+                    elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img_scale_x, img_scale_y)

-
-
-
-
-
-
+                    if elements:
+                        # Note: element_mgr.create_text_elements_from_ocr already adds them
+                        total_elements_added += len(elements)
+                        logger.debug(f"  Added {len(elements)} OCR TextElements to page {page.number}.")
+                    else:
+                        logger.debug(f"  No valid TextElements created for page {page.number}.")
+                except Exception as e:
+                    logger.error(f"  Error adding OCR elements to page {page.number}: {e}", exc_info=True)
+                    # Continue to next page

         logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
         return self

-    def add_region(self, region_func: Callable[[Page], Region], name: str = None) -> 'PDF':
+    def add_region(self, region_func: Callable[['Page'], Optional[Region]], name: str = None) -> 'PDF':
         """
         Add a region function to the PDF. This creates regions on all pages using the provided function.

         Args:
-            region_func: A function that takes a Page and returns a Region
+            region_func: A function that takes a Page and returns a Region, or None.
             name: Optional name for the region

         Returns:
             Self for method chaining
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
+
         # Store region with its name at PDF level
         region_data = (region_func, name)
         self._regions.append(region_data)
-
-        # Create a wrapper function that properly evaluates on each page
-        def region_wrapper(page):
-            try:
-                region = region_func(page)
-                if region:
-                    # Apply name if provided
-                    if name:
-                        region.name = name
-                    region.source = 'named'
-                return region
-            except Exception as e:
-                print(f"Error in PDF-level region for page {page.index}: {e}")
-                return None
-
+
         # Apply this region to all pages
         for page in self._pages:
             try:
-                region
-
-
+                # Call the function to get the region for this specific page
+                region_instance = region_func(page)
+                if region_instance and isinstance(region_instance, Region):
+                    # If a valid region is returned, add it to the page
+                    page.add_region(region_instance, name=name, source='named')
+                elif region_instance is not None:
+                    logger.warning(f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}")
             except Exception as e:
-
-
+                logger.error(f"Error executing or adding region function for page {page.number}: {e}", exc_info=True)
+
         return self

-    def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
+    def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Optional[Any]:
         """
         Find the first element matching the selector.

@@ -354,13 +389,17 @@
         Returns:
             Element object or None if not found
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
+
         selector_obj = parse_selector(selector)

         # Pass regex and case flags to selector function
         kwargs['regex'] = regex
         kwargs['case'] = case

-        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
+        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs)
         return results.first if results else None

     def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
@@ -377,22 +416,27 @@
         Returns:
             ElementCollection with matching elements
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
+
         selector_obj = parse_selector(selector)

         # Pass regex and case flags to selector function
         kwargs['regex'] = regex
         kwargs['case'] = case

-        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
+        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs)
         return results

-    def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> ElementCollection:
+    def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs) -> ElementCollection:
         """
         Apply selector to PDF elements across all pages.

         Args:
             selector_obj: Parsed selector dictionary
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
+            first_only: If True, stop searching after the first match is found.
             **kwargs: Additional filter parameters

         Returns:
@@ -401,48 +445,55 @@
         from natural_pdf.elements.collections import ElementCollection

         # Determine page range to search
-
-        if isinstance(
-
-
-
-
-
-            stop = page_range.stop or len(self.pages)
-            step = page_range.step or 1
-            page_range = range(start, stop, step)
-
-        # Check for cross-page pseudo-classes
-        cross_page = False
+        page_indices = kwargs.get('pages', range(len(self._pages)))
+        if isinstance(page_indices, int):
+            page_indices = [page_indices]
+        elif isinstance(page_indices, slice):
+            page_indices = range(*page_indices.indices(len(self._pages)))
+
+        # Check for cross-page pseudo-classes (currently not supported)
         for pseudo in selector_obj.get('pseudo_classes', []):
             if pseudo.get('name') in ('spans', 'continues'):
-
-
-
-        # If searching across pages, handle specially
-        if cross_page:
-            # TODO: Implement cross-page element matching
-            return ElementCollection([])
-
+                logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
+                return ElementCollection([])
+
         # Regular case: collect elements from each page
         all_elements = []
-        for page_idx in
-            if 0 <= page_idx < len(self.
-                page = self.
-
-
-
+        for page_idx in page_indices:
+            if 0 <= page_idx < len(self._pages):
+                page = self._pages[page_idx]
+                # Pass first_only down to page._apply_selector
+                page_elements_collection = page._apply_selector(
+                    selector_obj,
+                    apply_exclusions=apply_exclusions,
+                    first_only=first_only,
+                    **kwargs
+                )
+                if page_elements_collection:
+                    page_elements = page_elements_collection.elements
+                    all_elements.extend(page_elements)
+                    # If we only need the first match overall, and we found one on this page, stop
+                    if first_only and page_elements:
+                        break # Stop iterating through pages
+            else:
+                logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
+
         # Create a combined collection
         combined = ElementCollection(all_elements)
-
-        # Sort in document order if requested
-        if kwargs.get('document_order', True):
+
+        # Sort in document order if requested and not first_only (already sorted by page)
+        if not first_only and kwargs.get('document_order', True):
             # Check if elements have page, top, x0 before sorting
             if all(hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0') for el in combined.elements):
                 combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
             else:
-
-
+                # Elements might be Regions without inherent sorting order yet
+                # Attempt sorting by page index if possible
+                try:
+                    combined.sort(key=lambda el: el.page.index)
+                except AttributeError:
+                    logger.warning("Cannot sort elements in document order: Missing required attributes (e.g., page).")
+
         return combined

     def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
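With first_only plumbed through, find() can stop at the first page that yields a match while find_all() still scans every requested page; for example (the selector strings follow natural-pdf's selector syntax and are illustrative):

    first_total = pdf.find('text:contains("Total")')                      # first_only=True path
    all_totals = pdf.find_all('text:contains("Total")', pages=[0, 1])     # 'pages' kwarg scopes the search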
@@ -460,9 +511,13 @@
         Returns:
             Extracted text as string
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
+
         # If selector is provided, find elements first
         if selector:
-            elements = self.find_all(selector)
+            elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
             return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

         # Otherwise extract from all pages
@@ -484,8 +539,6 @@

         return "\n".join(texts)

-    # Note: extract_text_compat method removed
-
     def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
         """
         Shorthand for finding elements and extracting their text.
@@ -498,45 +551,11 @@
         Returns:
             Extracted text from matching elements
         """
-
-
-
-
-        # Generate an interactive HTML debug report for OCR results.
-
-        # This creates a single-file HTML report with:
-        # - Side-by-side view of image regions and OCR text
-        # - Confidence scores with color coding
-        # - Editable correction fields
-        # - Filtering and sorting options
-        # - Export functionality for corrected text
-
-        # Args:
-        #     output_path: Path to save the HTML report
-        #     pages: Pages to include in the report (default: all pages)
-        #            Can be a page index, slice, or list of page indices
-
-        # Returns:
-        #     Self for method chaining
-        # """
-        # from natural_pdf.utils.ocr import debug_ocr_to_html
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
+        return self.extract_text(selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs) # apply_exclusions is handled by find_all in extract_text

-        # if pages is None:
-        #     # Include all pages
-        #     target_pages = self.pages
-        # elif isinstance(pages, int):
-        #     # Single page index
-        #     target_pages = [self.pages[pages]]
-        # elif isinstance(pages, slice):
-        #     # Slice of pages
-        #     target_pages = self.pages[pages]
-        # else:
-        #     # Assume it's an iterable of page indices
-        #     target_pages = [self.pages[i] for i in pages]
-
-        # debug_ocr_to_html(target_pages, output_path)
-        # return self
-
     def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
         """
         Extract tables from the document or matching elements.
@@ -549,9 +568,63 @@
         Returns:
             List of extracted tables
         """
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            raise AttributeError("PDF pages not yet initialized.")
         # TODO: Implement table extraction
-
+        logger.warning("PDF.extract_tables is not fully implemented yet.")
+        all_tables = []
+        for page in self.pages:
+            # Assuming page.extract_tables(**kwargs) exists or is added
+            if hasattr(page, 'extract_tables'):
+                all_tables.extend(page.extract_tables(**kwargs))
+            else:
+                logger.debug(f"Page {page.number} does not have extract_tables method.")
+        # Placeholder filtering
+        if selector:
+            logger.warning("Filtering extracted tables by selector is not implemented.")
+            # Would need to parse selector and filter the list `all_tables`
+        # Placeholder merging
+        if merge_across_pages:
+            logger.warning("Merging tables across pages is not implemented.")
+            # Would need logic to detect and merge related tables
+        return all_tables

+    # --- New Method: save_searchable ---
+    def save_searchable(self, output_path: Union[str, 'Path'], dpi: int = 300, **kwargs):
+        """
+        Saves the PDF with an OCR text layer, making content searchable.
+
+        Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
+
+        Note: OCR must have been applied to the pages beforehand
+              (e.g., using pdf.apply_ocr_to_pages()).
+
+        Args:
+            output_path: Path to save the searchable PDF.
+            dpi: Resolution for rendering and OCR overlay (default 300).
+            **kwargs: Additional keyword arguments passed to the exporter.
+        """
+        # Import moved here, assuming it's always available now
+        from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+
+        # TODO: Need a reliable way for Page to signal if it has OCR elements.
+        # This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
+        # or checking if page.get_elements(source='ocr') returns anything.
+        # For now, we pass through and let the exporter handle pages without OCR elements.
+        # if not any(page.get_elements(source='ocr') for page in self.pages):
+        #     logger.warning("No OCR elements found on pages. "
+        #                    "Ensure apply_ocr_to_pages() was called. "
+        #                    "Output PDF might not be searchable.")
+
+        # Convert pathlib.Path to string if necessary
+        output_path_str = str(output_path)
+
+        create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
+        logger.info(f"Searchable PDF saved to: {output_path_str}")
+
+    # --- End New Method ---
+
     def ask(self, question: str,
             mode: str = "extractive",
             pages: Union[int, List[int], range] = None,
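A minimal end-to-end sketch of the new exporter hook, assuming the optional "natural-pdf[ocr-save]" extra is installed and following the docstring above:

    pdf.apply_ocr_to_pages()                               # OCR must run before exporting
    pdf.save_searchable("report_searchable.pdf", dpi=300)  # writes a PDF with an OCR text layer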
@@ -621,12 +694,132 @@
             "page_num": None, # Or maybe the pages searched?
             "source_elements": []
         }
-
+
+    def search_within_index(
+        self,
+        query: Union[str, Path, Image.Image, Region],
+        search_service: SearchServiceProtocol, # Now required
+        options: Optional[SearchOptions] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Finds relevant documents specifically originating from THIS PDF document
+        within a search index managed by the provided SearchService.
+
+        This method uses a pre-configured SearchService instance and adds
+        a filter to the search query to scope results only to pages from
+        this specific PDF object (based on its resolved path).
+
+        Args:
+            query: The search query (text, image path, PIL Image, Region).
+            search_service: A pre-configured SearchService instance pointing to the
+                            index where this PDF's content (or related content)
+                            is expected to be found.
+            options: Optional SearchOptions to configure the query (top_k, filters, etc.).
+                     Any existing filters in `options` will be combined with the
+                     PDF-scoping filter using an 'AND' condition.
+
+        Returns:
+            A list of result dictionaries, sorted by relevance, containing only
+            results originating from this PDF's pages.
+
+        Raises:
+            ImportError: If search dependencies are not installed.
+            ValueError: If search_service is None.
+            TypeError: If search_service does not conform to the protocol.
+            FileNotFoundError: If the collection managed by the service does not exist.
+            RuntimeError: For other search failures.
+        """
+        if not search_service:
+            raise ValueError("A configured SearchServiceProtocol instance must be provided.")
+        # Optional stricter check:
+        # if not isinstance(search_service, SearchServiceProtocol):
+        #     raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
+
+        # Get collection name from service for logging
+        collection_name = getattr(search_service, 'collection_name', '<Unknown Collection>')
+        logger.info(f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}.")
+
+        # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
+        # service: SearchServiceProtocol
+        # if search_service:
+        #     service = search_service
+        # else:
+        #     logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
+        #     factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
+        #     # TODO: Pass embedding model from options/pdf config if needed?
+        #     service = get_search_service(**factory_args)
+        service = search_service # Use validated provided service
+
+        # --- 2. Prepare Query and Options ---
+        query_input = query
+        # Resolve options (use default TextSearch if none provided)
+        effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
+
+        # Handle Region query - extract text for now
+        if isinstance(query, Region):
+            logger.debug("Query is a Region object. Extracting text.")
+            if not isinstance(effective_options, TextSearchOptions):
+                logger.warning("Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction.")
+            query_input = query.extract_text()
+            if not query_input or query_input.isspace():
+                logger.error("Region has no extractable text for query.")
+                return []
+
+        # --- 3. Add Filter to Scope Search to THIS PDF ---
+        # Assume metadata field 'pdf_path' stores the resolved path used during indexing
+        pdf_scope_filter = {
+            "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
+            "operator": "eq",
+            "value": self.path # Use the resolved path of this PDF instance
+        }
+        logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
+
+        # Combine with existing filters in options (if any)
+        if effective_options.filters:
+            logger.debug(f"Combining PDF scope filter with existing filters: {effective_options.filters}")
+            # Assume filters are compatible with the underlying search service
+            # If existing filters aren't already in an AND block, wrap them
+            if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
+                # Already an AND block, just append the condition
+                effective_options.filters["conditions"].append(pdf_scope_filter)
+            elif isinstance(effective_options.filters, list):
+                # Assume list represents implicit AND conditions
+                effective_options.filters = {"operator": "AND", "conditions": effective_options.filters + [pdf_scope_filter]}
+            elif isinstance(effective_options.filters, dict): # Single filter dict
+                effective_options.filters = {"operator": "AND", "conditions": [effective_options.filters, pdf_scope_filter]}
+            else:
+                logger.warning(f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter.")
+                effective_options.filters = pdf_scope_filter
+        else:
+            effective_options.filters = pdf_scope_filter
+
+        logger.debug(f"Final filters for service search: {effective_options.filters}")
+
+        # --- 4. Call SearchService ---
+        try:
+            # Call the service's search method (no collection_name needed)
+            results = service.search(
+                query=query_input,
+                options=effective_options,
+            )
+            logger.info(f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'.")
+            return results
+        except FileNotFoundError as fnf:
+            logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
+            raise # Re-raise specific error
+        except Exception as e:
+            logger.error(f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}", exc_info=True)
+            raise RuntimeError(f"Search within index failed for PDF '{self.path}'. See logs for details.") from e
+
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
-
+        # Ensure _pages is initialized
+        if not hasattr(self, '_pages'):
+            # Return 0 or raise error if not fully initialized? Let's return 0.
+            return 0
+        return len(self._pages)

-    def __getitem__(self, key) -> Union[Page,
+    def __getitem__(self, key) -> Union[Page, 'PageCollection']: # Return PageCollection for slice
         """Access pages by index or slice."""
         # Check if self._pages has been initialized
         if not hasattr(self, '_pages'):
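Usage sketch for search_within_index(); get_search_service() and TextSearchOptions come from natural_pdf.search (see the import hunk above), the factory keyword arguments mirror the commented-out code in this hunk, and the collection name is a placeholder:

    from natural_pdf.search import get_search_service, TextSearchOptions

    service = get_search_service(collection_name="my_reports", persist=True)
    hits = pdf.search_within_index(
        "total revenue 2023",
        search_service=service,
        options=TextSearchOptions(),
    )
    for hit in hits:
        print(hit)  # result dicts scoped to this PDF's pages only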
@@ -635,30 +828,48 @@
         # Return a PageCollection slice
         from natural_pdf.elements.collections import PageCollection
         return PageCollection(self._pages[key])
-        #
-
+        # Check index bounds before accessing
+        if isinstance(key, int):
+            if 0 <= key < len(self._pages):
+                return self._pages[key]
+            else:
+                raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
+        else:
+            raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")

     def close(self):
         """Close the underlying PDF file and clean up any temporary files."""
         if hasattr(self, '_pdf') and self._pdf is not None:
-
-
-
+            try:
+                self._pdf.close()
+                logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
+            except Exception as e:
+                logger.warning(f"Error closing pdfplumber object: {e}")
+            finally:
+                self._pdf = None
+
         # Clean up temporary file if it exists
         if hasattr(self, '_temp_file') and self._temp_file is not None:
+            temp_file_path = None
             try:
-
-
-
+                if hasattr(self._temp_file, 'name') and self._temp_file.name:
+                    temp_file_path = self._temp_file.name
+                    if os.path.exists(temp_file_path):
+                        os.unlink(temp_file_path)
+                        logger.debug(f"Removed temporary PDF file: {temp_file_path}")
             except Exception as e:
-
+                logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
             finally:
-
-
+                self._temp_file = None
+
     def __enter__(self):
         """Context manager entry."""
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit."""
-        self.close()
+        self.close()
+
+# --- Added TYPE_CHECKING import (if not already present) ---
+if TYPE_CHECKING:
+    from pathlib import Path # Assuming Path is used for type hint
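Taken together with the temp-file cleanup above, the context-manager path now works like this in use (the URL is a placeholder):

    with PDF("https://example.com/report.pdf") as pdf:
        print(len(pdf), "pages")
    # __exit__ calls close(), which closes the pdfplumber handle and unlinks any downloaded temp file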