natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
natural_pdf/core/pdf.py
ADDED
@@ -0,0 +1,653 @@
import pdfplumber
import logging
import tempfile
import os
import re
import urllib.request
from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable  # Added Iterable
from PIL import Image

from natural_pdf.core.page import Page
from natural_pdf.selectors.parser import parse_selector
from natural_pdf.elements.collections import ElementCollection
from natural_pdf.elements.region import Region
from natural_pdf.ocr import OCRManager, OCROptions
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
from natural_pdf.core.highlighting_service import HighlightingService

# Set up module logger
logger = logging.getLogger("natural_pdf.core.pdf")


class PDF:
    """
    Enhanced PDF wrapper built on top of pdfplumber.

    This class provides a fluent interface for working with PDF documents,
    with improved selection, navigation, and extraction capabilities.
    """

    def __init__(self, path_or_url: str, reading_order: bool = True,
                 font_attrs: Optional[List[str]] = None,
                 keep_spaces: bool = True):
        """
        Initialize the enhanced PDF object.

        Args:
            path_or_url: Path to the PDF file or a URL to a PDF
            reading_order: Whether to use natural reading order
            font_attrs: Font attributes to consider when grouping characters into words.
                        Default: ['fontname', 'size'] (group by font name and size)
                        None: Only consider spatial relationships
                        List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
            keep_spaces: Whether to include spaces in word elements (default: True).
                        True: Spaces are part of words, better for multi-word searching
                        False: Break text at spaces, each word is separate (legacy behavior)
        """
        # Check if the input is a URL
        is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')

        # Initialize path-related attributes
        self._original_path = path_or_url
        self._temp_file = None

        if is_url:
            logger.info(f"Downloading PDF from URL: {path_or_url}")
            try:
                # Create a temporary file to store the downloaded PDF
                self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)

                # Download the PDF
                with urllib.request.urlopen(path_or_url) as response:
                    self._temp_file.write(response.read())
                    self._temp_file.flush()
                    self._temp_file.close()

                # Use the temporary file path
                path = self._temp_file.name
                logger.info(f"PDF downloaded to temporary file: {path}")
            except Exception as e:
                if self._temp_file and hasattr(self._temp_file, 'name'):
                    try:
                        os.unlink(self._temp_file.name)
                    except:
                        pass
                logger.error(f"Failed to download PDF from URL: {e}")
                raise ValueError(f"Failed to download PDF from URL: {e}")
        else:
            # Use the provided path directly
            path = path_or_url

        logger.info(f"Initializing PDF from {path}")
        logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")

        self._pdf = pdfplumber.open(path)
        self._path = path
        self._reading_order = reading_order
        self._config = {
            'keep_spaces': keep_spaces
        }
        self.path = path

        self._font_attrs = font_attrs  # Store the font attribute configuration

        if OCRManager:
            self._ocr_manager = OCRManager()
            logger.info(f"Initialized OCRManager. Available engines: {self._ocr_manager.get_available_engines()}")
        else:
            self._ocr_manager = None
            logger.warning("OCRManager could not be imported. OCR functionality disabled.")

        if LayoutManager:
            self._layout_manager = LayoutManager()
            logger.info(f"Initialized LayoutManager. Available engines: {self._layout_manager.get_available_engines()}")
        else:
            self._layout_manager = None
            logger.warning("LayoutManager could not be imported. Layout analysis disabled.")

        self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
        self._element_cache = {}
        self._exclusions = []  # List to store exclusion functions/regions
        self._regions = []  # List to store region functions/definitions

        # Initialize the Highlighting Service
        self.highlighter = HighlightingService(self)
        logger.info("Initialized HighlightingService.")

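    # Usage sketch: constructing a PDF. The constructor accepts either a local path or an
    # http(s) URL (URLs are downloaded to a temporary file that close() later removes).
    # The file name below is illustrative only.
    #
    #     from natural_pdf.core.pdf import PDF
    #
    #     with PDF("report.pdf", keep_spaces=True) as pdf:
    #         ...  # work with the document; __exit__ calls close() automatically
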
    @property
    def metadata(self) -> Dict[str, Any]:
        """Access metadata as a dictionary."""
        return self._pdf.metadata

    @property
    def pages(self) -> 'PageCollection':
        """Access pages as a PageCollection object."""
        from natural_pdf.elements.collections import PageCollection
        return PageCollection(self._pages)

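    # Usage sketch: page and metadata access. pdf.pages returns a PageCollection, while
    # len(pdf) and pdf[i] / pdf[i:j] come from __len__ / __getitem__ defined further below.
    #
    #     pdf = PDF("report.pdf")            # illustrative file name
    #     print(len(pdf), pdf.metadata.get("Title"))
    #     first_page = pdf[0]
    #     last_two = pdf[-2:]                # a PageCollection slice
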
    def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
        """
        Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

        Args:
            exclusion_func: A function that takes a Page and returns a Region to exclude
            label: Optional label for this exclusion

        Returns:
            Self for method chaining
        """
        # Store exclusion with its label at PDF level
        exclusion_data = (exclusion_func, label)
        self._exclusions.append(exclusion_data)

        # Create a wrapper function that properly evaluates on each page
        def exclusion_wrapper(page):
            try:
                region = exclusion_func(page)
                return region
            except Exception as e:
                print(f"Error in PDF-level exclusion for page {page.index}: {e}")
                return None

        # Apply this exclusion to all pages using the wrapper
        for page in self._pages:
            page.add_exclusion(exclusion_wrapper)

        return self

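    # Usage sketch: excluding a repeating header band from extraction. The callable gets
    # each Page and must return a Region (or None). How the Region is built here,
    # Region(page, bbox) and page.width, is an assumed signature for illustration only.
    #
    #     pdf.add_exclusion(
    #         lambda page: Region(page, (0, 0, page.width, 60)),  # assumed Region signature
    #         label="page-header",
    #     )
    #     body_text = pdf.extract_text()  # header text is now skipped
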
    def apply_ocr_to_pages(
        self,
        pages: Optional[Union[Iterable[int], range, slice]] = None,
        engine: Optional[str] = None,
        options: Optional['OCROptions'] = None,
        languages: Optional[List[str]] = None,
        min_confidence: Optional[float] = None,
        device: Optional[str] = None,
        # Add other simple mode args if needed
    ) -> 'PDF':
        """
        Applies OCR to specified pages (or all pages) of the PDF using batch processing.

        This method renders the specified pages to images, sends them as a batch
        to the OCRManager, and adds the resulting TextElements to each respective page.

        Args:
            pages: An iterable of 0-based page indices (list, range, tuple),
                   a slice object, or None to process all pages.
            engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
                    Uses manager's default if None. Ignored if 'options' is provided.
            options: A specific options object (e.g., EasyOCROptions) for
                     advanced configuration. Overrides simple arguments.
            languages: List of language codes for simple mode.
            min_confidence: Minimum confidence threshold for simple mode.
            device: Device string ('cpu', 'cuda', etc.) for simple mode.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If page indices are invalid or the engine name is invalid.
            TypeError: If unexpected keyword arguments are provided in simple mode.
            RuntimeError: If the OCRManager or selected engine is not available.
        """
        if not self._ocr_manager:
            logger.error("OCRManager not available. Cannot apply OCR.")
            # Or raise RuntimeError("OCRManager not initialized.")
            return self

        # --- Determine Target Pages ---
        target_pages: List[Page] = []
        if pages is None:
            target_pages = self._pages
        elif isinstance(pages, slice):
            target_pages = self._pages[pages]
        elif hasattr(pages, '__iter__'):  # Check if it's iterable (list, range, tuple, etc.)
            try:
                target_pages = [self._pages[i] for i in pages]
            except IndexError:
                raise ValueError("Invalid page index provided in 'pages' iterable.")
            except TypeError:
                raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
        else:
            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")

        if not target_pages:
            logger.warning("No pages selected for OCR processing.")
            return self

        page_numbers = [p.number for p in target_pages]
        logger.info(f"Applying batch OCR to pages: {page_numbers}...")

        # --- Render Images for Batch ---
        images_pil: List[Image.Image] = []
        page_image_map: List[Tuple[Page, Image.Image]] = []  # Store page and its image
        logger.info(f"Rendering {len(target_pages)} pages to images...")
        try:
            ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
            for i, page in enumerate(target_pages):
                logger.debug(f"  Rendering page {page.number} (index {page.index})...")
                # Use page.to_image but ensure highlights are off for OCR base image
                img = page.to_image(scale=ocr_scale, include_highlights=False)
                images_pil.append(img)
                page_image_map.append((page, img))  # Store pair
        except Exception as e:
            logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
            # Decide whether to continue with successfully rendered pages or fail completely
            # For now, let's fail if any page rendering fails.
            raise RuntimeError(f"Failed to render page {page.number} for OCR.") from e

        if not images_pil:
            logger.error("No images were successfully rendered for batch OCR.")
            return self

        # --- Prepare Arguments for Manager ---
        manager_args = {'images': images_pil, 'options': options, 'engine': engine}
        if languages is not None: manager_args['languages'] = languages
        if min_confidence is not None: manager_args['min_confidence'] = min_confidence
        if device is not None: manager_args['device'] = device

        # --- Call OCR Manager for Batch Processing ---
        logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
        try:
            # The manager's apply_ocr handles the batch input and returns List[List[Dict]]
            batch_results = self._ocr_manager.apply_ocr(**manager_args)

            if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
                logger.error(f"OCR Manager returned unexpected result format or length for batch processing. "
                             f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
                             f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}.")
                # Handle error - maybe return early or try processing valid parts?
                return self  # Return self without adding elements

            logger.info("OCR Manager batch processing complete.")

        except Exception as e:
            logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
            return self  # Return self without adding elements

        # --- Distribute Results and Add Elements to Pages ---
        logger.info("Adding OCR results to respective pages...")
        total_elements_added = 0
        for i, (page, img) in enumerate(page_image_map):
            results_for_page = batch_results[i]
            if not isinstance(results_for_page, list):
                logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
                continue

            logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
            # Use the page's element manager to create elements from its results
            elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img.width, img.height)

            if elements:
                # Note: element_mgr.create_text_elements_from_ocr already adds them
                total_elements_added += len(elements)
                logger.debug(f"  Added {len(elements)} OCR TextElements to page {page.number}.")
            else:
                logger.debug(f"  No valid TextElements created for page {page.number}.")

        logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
        return self

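    # Usage sketch: batch OCR over a subset of pages. Engine names mirror the docstring
    # above; whether a given engine is available depends on the optional OCR dependencies.
    #
    #     pdf.apply_ocr_to_pages(
    #         pages=range(0, 3),        # first three pages, 0-based
    #         engine="easyocr",
    #         languages=["en"],
    #         min_confidence=0.5,
    #     )
    #     # The resulting TextElements are added to each page and picked up by
    #     # extract_text() and find_all().
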
    def add_region(self, region_func: Callable[[Page], Region], name: str = None) -> 'PDF':
        """
        Add a region function to the PDF. This creates regions on all pages using the provided function.

        Args:
            region_func: A function that takes a Page and returns a Region
            name: Optional name for the region

        Returns:
            Self for method chaining
        """
        # Store region with its name at PDF level
        region_data = (region_func, name)
        self._regions.append(region_data)

        # Create a wrapper function that properly evaluates on each page
        def region_wrapper(page):
            try:
                region = region_func(page)
                if region:
                    # Apply name if provided
                    if name:
                        region.name = name
                    region.source = 'named'
                return region
            except Exception as e:
                print(f"Error in PDF-level region for page {page.index}: {e}")
                return None

        # Apply this region to all pages
        for page in self._pages:
            try:
                region = region_wrapper(page)
                if region:
                    page.add_region(region, name=name)
            except Exception as e:
                print(f"Error adding region to page {page.index}: {e}")

        return self

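    # Usage sketch: defining a named region on every page, e.g. a footer strip. As with
    # the add_exclusion sketch above, Region(page, bbox) and page.width/page.height are
    # assumed signatures used purely for illustration.
    #
    #     pdf.add_region(
    #         lambda page: Region(page, (0, page.height - 50, page.width, page.height)),
    #         name="footer",
    #     )
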
    def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
        """
        Find the first element matching the selector.

        Args:
            selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            regex: Whether to use regex for text search in :contains (default: False)
            case: Whether to do case-sensitive text search (default: True)
            **kwargs: Additional filter parameters

        Returns:
            Element object or None if not found
        """
        selector_obj = parse_selector(selector)

        # Pass regex and case flags to selector function
        kwargs['regex'] = regex
        kwargs['case'] = case

        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
        return results.first if results else None

    def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
        """
        Find all elements matching the selector.

        Args:
            selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            regex: Whether to use regex for text search in :contains (default: False)
            case: Whether to do case-sensitive text search (default: True)
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection with matching elements
        """
        selector_obj = parse_selector(selector)

        # Pass regex and case flags to selector function
        kwargs['regex'] = regex
        kwargs['case'] = case

        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
        return results

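    # Usage sketch: selector queries. The selector strings reuse the examples from the
    # docstrings above; regex=True switches :contains() to regular-expression matching.
    #
    #     heading = pdf.find('text:contains("Annual Report")')
    #     red_text = pdf.find_all('text[color=(1,0,0)]')
    #     totals = pdf.find_all('text:contains("Total")', case=False)
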
    def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> ElementCollection:
        """
        Apply selector to PDF elements across all pages.

        Args:
            selector_obj: Parsed selector dictionary
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection of matching elements
        """
        from natural_pdf.elements.collections import ElementCollection

        # Determine page range to search
        page_range = kwargs.get('pages', range(len(self.pages)))
        if isinstance(page_range, (int, slice)):
            # Convert int or slice to range
            if isinstance(page_range, int):
                page_range = [page_range]
            elif isinstance(page_range, slice):
                start = page_range.start or 0
                stop = page_range.stop or len(self.pages)
                step = page_range.step or 1
                page_range = range(start, stop, step)

        # Check for cross-page pseudo-classes
        cross_page = False
        for pseudo in selector_obj.get('pseudo_classes', []):
            if pseudo.get('name') in ('spans', 'continues'):
                cross_page = True
                break

        # If searching across pages, handle specially
        if cross_page:
            # TODO: Implement cross-page element matching
            return ElementCollection([])

        # Regular case: collect elements from each page
        all_elements = []
        for page_idx in page_range:
            if 0 <= page_idx < len(self.pages):
                page = self.pages[page_idx]
                page_elements = page._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
                all_elements.extend(page_elements.elements)

        # Create a combined collection
        combined = ElementCollection(all_elements)

        # Sort in document order if requested
        if kwargs.get('document_order', True):
            # Check if elements have page, top, x0 before sorting
            if all(hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0') for el in combined.elements):
                combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
            else:
                logger.warning("Cannot sort elements in document order: Missing required attributes (page, top, x0).")

        return combined

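    # Usage sketch: _apply_selector is internal, but its kwargs are reachable through
    # find()/find_all(): 'pages' restricts the search and 'document_order' controls sorting.
    #
    #     matches = pdf.find_all('text:contains("Summary")', pages=range(0, 5))
    #     unsorted = pdf.find_all('text:contains("Summary")', document_order=False)
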
    def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
                     use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
        """
        Extract text from the entire document or matching elements.

        Args:
            selector: Optional selector to filter elements
            preserve_whitespace: Whether to keep blank characters (default: True)
            use_exclusions: Whether to apply exclusion regions (default: True)
            debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
            **kwargs: Additional extraction parameters

        Returns:
            Extracted text as string
        """
        # If selector is provided, find elements first
        if selector:
            elements = self.find_all(selector)
            return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

        # Otherwise extract from all pages
        if debug_exclusions:
            print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
            print(f"PDF: Found {len(self._exclusions)} document-level exclusions")

        texts = []
        for page in self.pages:
            texts.append(page.extract_text(
                preserve_whitespace=preserve_whitespace,
                use_exclusions=use_exclusions,
                debug_exclusions=debug_exclusions,
                **kwargs
            ))

        if debug_exclusions:
            print(f"PDF: Combined {len(texts)} pages of text")

        return "\n".join(texts)

    # Note: extract_text_compat method removed

    def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
        """
        Shorthand for finding elements and extracting their text.

        Args:
            selector: CSS-like selector string
            preserve_whitespace: Whether to keep blank characters (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Extracted text from matching elements
        """
        return self.extract_text(selector, preserve_whitespace=preserve_whitespace, **kwargs)

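    # Usage sketch: text extraction with and without a selector. Exclusion regions
    # registered via add_exclusion() are honored unless use_exclusions=False.
    #
    #     full_text = pdf.extract_text()
    #     chapter_text = pdf.extract('text:contains("Chapter")')   # shorthand form
    #     raw_text = pdf.extract_text(use_exclusions=False, debug_exclusions=True)
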
    # def debug_ocr(self, output_path, pages=None):
    #     """
    #     Generate an interactive HTML debug report for OCR results.

    #     This creates a single-file HTML report with:
    #     - Side-by-side view of image regions and OCR text
    #     - Confidence scores with color coding
    #     - Editable correction fields
    #     - Filtering and sorting options
    #     - Export functionality for corrected text

    #     Args:
    #         output_path: Path to save the HTML report
    #         pages: Pages to include in the report (default: all pages)
    #                Can be a page index, slice, or list of page indices

    #     Returns:
    #         Self for method chaining
    #     """
    #     from natural_pdf.utils.ocr import debug_ocr_to_html

    #     if pages is None:
    #         # Include all pages
    #         target_pages = self.pages
    #     elif isinstance(pages, int):
    #         # Single page index
    #         target_pages = [self.pages[pages]]
    #     elif isinstance(pages, slice):
    #         # Slice of pages
    #         target_pages = self.pages[pages]
    #     else:
    #         # Assume it's an iterable of page indices
    #         target_pages = [self.pages[i] for i in pages]

    #     debug_ocr_to_html(target_pages, output_path)
    #     return self

    def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
        """
        Extract tables from the document or matching elements.

        Args:
            selector: Optional selector to filter tables
            merge_across_pages: Whether to merge tables that span across pages
            **kwargs: Additional extraction parameters

        Returns:
            List of extracted tables
        """
        # TODO: Implement table extraction
        return []  # Placeholder

    def ask(self, question: str,
            mode: str = "extractive",
            pages: Union[int, List[int], range] = None,
            min_confidence: float = 0.1,
            model: str = None,
            **kwargs) -> Dict[str, Any]:
        """
        Ask a question about the document content.

        Args:
            question: Question to ask about the document
            mode: "extractive" to extract answer from document, "generative" to generate
            pages: Specific pages to query (default: all pages)
            min_confidence: Minimum confidence threshold for answers
            model: Optional model name for question answering
            **kwargs: Additional parameters passed to the QA engine

        Returns:
            A dictionary containing the answer, confidence, and other metadata.
            Result will have an 'answer' key containing the answer text.
        """
        from natural_pdf.qa import get_qa_engine

        # Initialize or get QA engine
        qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)

        # Determine which pages to query
        if pages is None:
            target_pages = list(range(len(self.pages)))
        elif isinstance(pages, int):
            # Single page
            target_pages = [pages]
        elif isinstance(pages, (list, range)):
            # List or range of pages
            target_pages = pages
        else:
            raise ValueError(f"Invalid pages parameter: {pages}")

        # Actually query each page and gather results
        results = []
        for page_idx in target_pages:
            if 0 <= page_idx < len(self.pages):
                page = self.pages[page_idx]
                page_result = qa_engine.ask_pdf_page(
                    page=page,
                    question=question,
                    min_confidence=min_confidence,
                    **kwargs
                )

                # Add to results if it found an answer
                if page_result and page_result.get("found", False):
                    results.append(page_result)

        # Sort results by confidence
        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)

        # Return the best result, or a default result if none found
        if results:
            return results[0]
        else:
            # Return a structure indicating no answer found
            return {
                "answer": None,
                "confidence": 0.0,
                "found": False,
                "page_num": None,  # Or maybe the pages searched?
                "source_elements": []
            }

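    # Usage sketch: extractive question answering. This relies on the optional QA
    # dependencies; the question text is illustrative.
    #
    #     answer = pdf.ask("What is the total budget?", pages=range(0, 10), min_confidence=0.3)
    #     if answer["found"]:
    #         print(answer["answer"], answer["confidence"])
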
    def __len__(self) -> int:
        """Return the number of pages in the PDF."""
        return len(self.pages)

    def __getitem__(self, key) -> Union[Page, List[Page]]:
        """Access pages by index or slice."""
        # Check if self._pages has been initialized
        if not hasattr(self, '_pages'):
            raise AttributeError("PDF pages not initialized yet.")
        if isinstance(key, slice):
            # Return a PageCollection slice
            from natural_pdf.elements.collections import PageCollection
            return PageCollection(self._pages[key])
        # Return a single Page object
        return self._pages[key]

    def close(self):
        """Close the underlying PDF file and clean up any temporary files."""
        if hasattr(self, '_pdf') and self._pdf is not None:
            self._pdf.close()
            self._pdf = None

        # Clean up temporary file if it exists
        if hasattr(self, '_temp_file') and self._temp_file is not None:
            try:
                if hasattr(self._temp_file, 'name') and self._temp_file.name and os.path.exists(self._temp_file.name):
                    os.unlink(self._temp_file.name)
                    logger.debug(f"Removed temporary PDF file: {self._temp_file.name}")
            except Exception as e:
                logger.warning(f"Failed to clean up temporary PDF file: {e}")
            finally:
                self._temp_file = None

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()