natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
natural_pdf/core/pdf.py
ADDED
@@ -0,0 +1,572 @@
|
|
1
|
+
import pdfplumber
|
2
|
+
import logging
|
3
|
+
from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
|
4
|
+
|
5
|
+
from natural_pdf.core.page import Page
|
6
|
+
from natural_pdf.selectors.parser import parse_selector
|
7
|
+
from natural_pdf.elements.collections import ElementCollection
|
8
|
+
from natural_pdf.elements.region import Region
|
9
|
+
from natural_pdf.utils.ocr import OCRManager
|
10
|
+
|
11
|
+
# Set up module logger
|
12
|
+
logger = logging.getLogger("natural_pdf.core.pdf")
|
13
|
+
|
14
|
+
# Import OCR engines
|
15
|
+
try:
|
16
|
+
from natural_pdf.ocr import OCREngine, EasyOCREngine, PaddleOCREngine, get_engine
|
17
|
+
HAS_OCR_ENGINES = True
|
18
|
+
except ImportError:
|
19
|
+
# Fallback if the OCR engines are not available
|
20
|
+
HAS_OCR_ENGINES = False
|
21
|
+
|
22
|
+
|
23
|
+
class PDF:
|
24
|
+
"""
|
25
|
+
Enhanced PDF wrapper built on top of pdfplumber.
|
26
|
+
|
27
|
+
This class provides a fluent interface for working with PDF documents,
|
28
|
+
with improved selection, navigation, and extraction capabilities.
|
29
|
+
"""
|
30
|
+
|
31
|
+
def __init__(self, path: str, reading_order: bool = True,
             ocr: Optional[Union[bool, str, List, Dict]] = None,
             ocr_engine: Optional[Union[str, Any]] = None,
             font_attrs: Optional[List[str]] = None,
             keep_spaces: bool = True):
    """
    Initialize the enhanced PDF object.

    Args:
        path: Path to the PDF file
        reading_order: Whether to use natural reading order
        ocr: OCR configuration:
             - None or False: OCR disabled
             - True: OCR enabled with defaults
             - "auto": Auto OCR mode
             - ["en", "fr"]: Use these languages
             - {"languages": ["en"]}: Detailed configuration. When an OCR
               engine is in use, a dict without an "enabled" key is treated
               as disabled. The caller's dict is never modified.
        ocr_engine: OCR engine to use:
             - None: Use the default engine (EasyOCR)
             - "easyocr": Use EasyOCR engine
             - "paddleocr": Use PaddleOCR engine
             - OCREngine instance: Use the provided engine instance
               (any object exposing process_image and is_available)
        font_attrs: Font attributes to consider when grouping characters into words.
                   Stored on the PDF and passed unchanged to every Page.
                   List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
                   NOTE(review): earlier notes describe ['fontname', 'size'] as the
                   default grouping and None as "spatial relationships only", but the
                   signature default is None - confirm how Page interprets None.
        keep_spaces: Whether to include spaces in word elements (default: True).
                   True: Spaces are part of words, better for multi-word searching
                   False: Break text at spaces, each word is separate (legacy behavior)

    Raises:
        Any exception raised while opening the file or building the OCR
        configuration and pages. If the failure happens after the file was
        opened, the underlying pdfplumber handle is closed before re-raising.
    """
    logger.info(f"Initializing PDF from {path}")
    logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")

    self._pdf = pdfplumber.open(path)
    try:
        self._path = path
        self._reading_order = reading_order
        self._config = {
            'keep_spaces': keep_spaces
        }

        # Select the OCR engine (only possible when the engine package imported)
        if HAS_OCR_ENGINES:
            if ocr_engine is None:
                # Use default engine (EasyOCR)
                self._ocr_engine = EasyOCREngine()
            elif isinstance(ocr_engine, str):
                # String-based engine selection
                try:
                    self._ocr_engine = get_engine(ocr_engine)
                except (ImportError, ValueError) as e:
                    # Report through the module logger, like with_ocr does
                    logger.warning(f"OCR engine '{ocr_engine}' could not be loaded: {e}")
                    logger.warning("Falling back to default OCR engine.")
                    self._ocr_engine = EasyOCREngine()
            elif hasattr(ocr_engine, 'process_image') and hasattr(ocr_engine, 'is_available'):
                # Duck-typed engine instance
                self._ocr_engine = ocr_engine
            else:
                logger.warning("Invalid OCR engine provided. Using default engine.")
                self._ocr_engine = EasyOCREngine()
        else:
            # Fallback to legacy OCR manager
            self._ocr_engine = None

        # Normalize OCR configuration
        if self._ocr_engine:
            # Use new OCR engine system
            if ocr is None or ocr is False:
                # Missing or explicitly disabled configuration
                ocr = {"enabled": False}
            elif ocr is True:
                # Explicit enable
                ocr = {"enabled": True}
            elif isinstance(ocr, dict) and "enabled" not in ocr:
                # Config given without "enabled": disable by default, working
                # on a copy so the caller's dictionary is left untouched.
                ocr = {**ocr, "enabled": False}

            # Now normalize the config with the engine
            self._ocr_config = self._ocr_engine.normalize_config(ocr)
            logger.info(f"Initialized PDF with OCR engine: {self._ocr_engine.__class__.__name__}, enabled: {self._ocr_config.get('enabled')}")

            # Double-check enabled status for debugging
            if isinstance(ocr, dict) and "enabled" in ocr:
                if ocr["enabled"] != self._ocr_config.get("enabled"):
                    logger.warning(f"OCR enabled status changed during normalization: {ocr['enabled']} -> {self._ocr_config.get('enabled')}")
        else:
            # Fallback to legacy OCR manager
            self._ocr_manager = OCRManager.get_instance()
            if ocr is None:
                # If no OCR config is provided, disable OCR by default
                ocr = {"enabled": False}
            elif ocr is True:
                # Explicit enable
                ocr = {"enabled": True}

            self._ocr_config = self._ocr_manager.normalize_config(ocr)

        self._font_attrs = font_attrs  # Store the font attribute configuration
        self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
        self._element_cache = {}
        self._exclusions = []  # List to store exclusion functions/regions
        self._regions = []  # List to store region functions/definitions
    except Exception:
        # Do not leak the open file handle when construction fails part-way.
        self._pdf.close()
        raise
|
136
|
+
|
137
|
+
@property
def pages(self) -> 'PageCollection':
    """Return the document's pages wrapped in a PageCollection.

    A new PageCollection around the internal page list is built on every
    access.
    """
    # Imported at call time, as in the rest of this module's lazy imports.
    from natural_pdf.elements.collections import PageCollection

    page_collection = PageCollection(self._pages)
    return page_collection
|
142
|
+
|
143
|
+
def with_ocr(self, enabled: bool = False, languages: List[str] = None,
             engine: str = None, min_confidence: float = None) -> 'PDF':
    """
    Configure OCR settings using a builder pattern.

    Args:
        enabled: Whether OCR is enabled (default: False, so calling
                 with_ocr() without arguments disables OCR)
        languages: List of language codes (e.g., ["en", "fr"]); ignored when empty
        engine: OCR engine to use ("easyocr" or "paddleocr"). When given, the
                current engine is replaced. If the requested engine cannot be
                loaded, EasyOCR is tried; if that cannot be imported either,
                the legacy OCRManager is used instead of raising.
        min_confidence: Minimum confidence threshold for OCR results

    Returns:
        Self for method chaining
    """
    # Initialize the config object
    config = {"enabled": enabled}

    # Add optional parameters if provided
    if languages:
        config["languages"] = languages
    if min_confidence is not None:
        config["min_confidence"] = min_confidence

    # Set up the OCR engine if specified
    if engine:
        self._ocr_engine = None  # Clear existing engine
        try:
            from natural_pdf.ocr import get_engine
            self._ocr_engine = get_engine(engine)
        except (ImportError, ValueError) as e:
            logger.warning(f"OCR engine '{engine}' could not be loaded: {e}")
            logger.warning("Falling back to default OCR engine.")
            try:
                from natural_pdf.ocr import EasyOCREngine
                self._ocr_engine = EasyOCREngine()
            except ImportError as fallback_error:
                # The engine package itself is unavailable: leave the engine
                # unset so the legacy manager branch below takes over.
                logger.warning(f"Default OCR engine could not be loaded: {fallback_error}")

    # Normalize the configuration
    if self._ocr_engine:
        self._ocr_config = self._ocr_engine.normalize_config(config)
    else:
        from natural_pdf.utils.ocr import OCRManager
        self._ocr_manager = OCRManager.get_instance()
        self._ocr_config = self._ocr_manager.normalize_config(config)

    return self
|
187
|
+
|
188
|
+
def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
    """
    Register a document-wide exclusion and attach it to every page.

    Args:
        exclusion_func: Callable that receives a Page and returns the Region to exclude
        label: Optional label; it is recorded alongside the function in the
               PDF-level exclusion list and is not forwarded to the pages

    Returns:
        Self for method chaining
    """
    # Keep the (function, label) pair at PDF level
    self._exclusions.append((exclusion_func, label))

    def exclusion_wrapper(page):
        # Evaluate the user's function for one page; a failure is reported
        # on stdout and turned into "no exclusion" for that page.
        try:
            return exclusion_func(page)
        except Exception as err:
            print(f"Error in PDF-level exclusion for page {page.index}: {err}")
            return None

    # Every page receives the same guarded wrapper
    for pdf_page in self._pages:
        pdf_page.add_exclusion(exclusion_wrapper)

    return self
|
217
|
+
|
218
|
+
def add_region(self, region_func: Callable[[Page], Region], name: str = None) -> 'PDF':
    """
    Register a region function and materialize its region on every page.

    Args:
        region_func: Callable that receives a Page and returns a Region
        name: Optional name; when given it is set on each produced region
              (together with source='named') and passed to page.add_region

    Returns:
        Self for method chaining
    """
    # Keep the (function, name) pair at PDF level
    self._regions.append((region_func, name))

    def region_wrapper(page):
        # Build the region for one page; errors are printed and yield None.
        try:
            candidate = region_func(page)
            if not candidate:
                return None
            if name:
                candidate.name = name
                candidate.source = 'named'
            return candidate
        except Exception as err:
            print(f"Error in PDF-level region for page {page.index}: {err}")
            return None

    # Create the region on each page, reporting (not raising) per-page failures
    for pdf_page in self._pages:
        try:
            built = region_wrapper(pdf_page)
            if built:
                pdf_page.add_region(built, name=name)
        except Exception as err:
            print(f"Error adding region to page {pdf_page.index}: {err}")

    return self
|
257
|
+
|
258
|
+
def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
    """
    Return the first element in the document that matches a selector.

    Args:
        selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Additional filter parameters

    Returns:
        Element object or None if not found
    """
    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector machinery as keyword options
    search_options = dict(kwargs, regex=regex, case=case)

    matches = self._apply_selector(parsed, apply_exclusions=apply_exclusions, **search_options)
    if not matches:
        return None
    return matches.first
|
280
|
+
|
281
|
+
def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
    """
    Return every element in the document that matches a selector.

    Args:
        selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Additional filter parameters

    Returns:
        ElementCollection with matching elements
    """
    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector machinery as keyword options
    search_options = dict(kwargs, regex=regex, case=case)

    return self._apply_selector(parsed, apply_exclusions=apply_exclusions, **search_options)
|
303
|
+
|
304
|
+
def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> ElementCollection:
    """
    Apply selector to PDF elements across all pages.

    Args:
        selector_obj: Parsed selector dictionary
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        **kwargs: Additional filter parameters. Two keys are read here (and,
            like all others, are also forwarded to each page):
            - pages: an int, a slice, or an iterable of page indices
              (default, or None: all pages). Slices follow normal Python
              slice semantics, including negative bounds and empty slices.
              Indices outside 0..page_count-1 are skipped.
            - document_order: sort the result by (page index, top, x0)
              (default: True)

    Returns:
        ElementCollection of matching elements. Selectors using the
        cross-page pseudo-classes 'spans' or 'continues' currently yield
        an empty collection (not implemented).
    """
    from natural_pdf.elements.collections import ElementCollection

    # Build the page collection once instead of on every access
    pages = self.pages
    page_count = len(pages)

    # Determine page range to search
    page_range = kwargs.get('pages')
    if page_range is None:
        page_range = range(page_count)
    elif isinstance(page_range, int):
        page_range = [page_range]
    elif isinstance(page_range, slice):
        # slice.indices resolves None/negative/out-of-range bounds the same
        # way list slicing does (e.g. slice(0, 0) is empty, slice(-2, None)
        # is the last two pages).
        page_range = range(*page_range.indices(page_count))

    # Check for cross-page pseudo-classes
    cross_page = any(
        pseudo.get('name') in ('spans', 'continues')
        for pseudo in selector_obj.get('pseudo_classes', [])
    )

    # If searching across pages, handle specially
    if cross_page:
        # TODO: Implement cross-page element matching
        return ElementCollection([])

    # Regular case: collect elements from each page
    all_elements = []
    for page_idx in page_range:
        if 0 <= page_idx < page_count:
            page_elements = pages[page_idx]._apply_selector(
                selector_obj, apply_exclusions=apply_exclusions, **kwargs
            )
            all_elements.extend(page_elements.elements)

    # Create a combined collection
    combined = ElementCollection(all_elements)

    # Sort in document order if requested
    if kwargs.get('document_order', True):
        combined.sort(key=lambda el: (el.page.index, el.top, el.x0))

    return combined
|
358
|
+
|
359
|
+
def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
                 use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
    """
    Extract text from the entire document or matching elements.

    Args:
        selector: Optional selector to filter elements
        preserve_whitespace: Whether to keep blank characters (default: True)
        use_exclusions: Whether to apply exclusion regions (default: True).
            Honored both for whole-page extraction and when a selector is
            given (it is passed to find_all as apply_exclusions).
        debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
        **kwargs: Additional extraction parameters

    Returns:
        Extracted text as string; page texts are joined with newlines
    """
    # If selector is provided, find elements first
    if selector:
        elements = self.find_all(selector, apply_exclusions=use_exclusions)
        return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

    # Otherwise extract from all pages
    pages = self.pages
    if debug_exclusions:
        print(f"PDF: Extracting text with exclusions from {len(pages)} pages")
        print(f"PDF: Found {len(self._exclusions)} document-level exclusions")

    texts = [
        page.extract_text(
            preserve_whitespace=preserve_whitespace,
            use_exclusions=use_exclusions,
            debug_exclusions=debug_exclusions,
            **kwargs
        )
        for page in pages
    ]

    if debug_exclusions:
        print(f"PDF: Combined {len(texts)} pages of text")

    return "\n".join(texts)
|
397
|
+
|
398
|
+
# Note: extract_text_compat method removed
|
399
|
+
|
400
|
+
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
    """
    Find the elements matching a selector and return their text.

    This is a thin convenience wrapper that delegates to extract_text.

    Args:
        selector: CSS-like selector string
        preserve_whitespace: Whether to keep blank characters (default: True)
        **kwargs: Additional extraction parameters

    Returns:
        Extracted text from matching elements
    """
    extracted = self.extract_text(
        selector,
        preserve_whitespace=preserve_whitespace,
        **kwargs
    )
    return extracted
|
413
|
+
|
414
|
+
def debug_ocr(self, output_path, pages=None):
    """
    Write an interactive HTML debug report for OCR results.

    The report is produced by natural_pdf.utils.ocr.debug_ocr_to_html and is
    described as a single-file HTML page with:
    - Side-by-side view of image regions and OCR text
    - Confidence scores with color coding
    - Editable correction fields
    - Filtering and sorting options
    - Export functionality for corrected text

    Args:
        output_path: Path to save the HTML report
        pages: Pages to include in the report (default: all pages)
               Can be a page index, slice, or list of page indices

    Returns:
        Self for method chaining
    """
    from natural_pdf.utils.ocr import debug_ocr_to_html

    all_pages = self.pages

    if pages is None:
        # No restriction: report on the whole document
        selected = all_pages
    elif isinstance(pages, (int, slice)):
        picked = all_pages[pages]
        # A single index yields one page, which is wrapped in a list;
        # a slice result is passed through as returned.
        selected = [picked] if isinstance(pages, int) else picked
    else:
        # Anything else is treated as an iterable of page indices
        selected = [all_pages[idx] for idx in pages]

    debug_ocr_to_html(selected, output_path)
    return self
|
450
|
+
|
451
|
+
def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
    """
    Extract tables from the document or matching elements.

    Not implemented yet: the arguments are accepted but unused, and an
    empty list is always returned.

    Args:
        selector: Optional selector to filter tables
        merge_across_pages: Whether to merge tables that span across pages
        **kwargs: Additional extraction parameters

    Returns:
        List of extracted tables (currently always empty)
    """
    # TODO: Implement table extraction
    no_tables: List[Any] = []  # Placeholder
    return no_tables
|
465
|
+
|
466
|
+
def ask(self, question: str,
        mode: str = "extractive",
        pages: Union[int, List[int], range] = None,
        min_confidence: float = 0.1,
        model: str = None,
        **kwargs) -> Dict[str, Any]:
    """
    Ask a question about the document content.

    Each selected page is queried separately through the QA engine and the
    answer with the highest confidence is returned.

    Args:
        question: Question to ask about the document
        mode: "extractive" to extract answer from document, "generative" to generate
              (accepted but not read by this method)
        pages: Specific pages to query: an int, a list, or a range of page
               indices. When None, only the first 10 pages (or fewer, for
               shorter documents) are queried. Out-of-range indices are skipped.
        min_confidence: Minimum confidence threshold for answers
        model: Optional model name for question answering
        **kwargs: Additional parameters passed to the QA engine

    Returns:
        Dictionary with answer and confidence. Failures do not raise: a
        missing QA dependency, an invalid ``pages`` value, or any other
        error is reported as a dictionary with "found": False and an
        "error" entry.
    """
    try:
        from natural_pdf.qa import get_qa_engine

        # Obtain the QA engine, optionally for a specific model
        if model is None:
            qa_engine = get_qa_engine()
        else:
            qa_engine = get_qa_engine(model_name=model)

        # Work out which page indices to query
        if pages is None:
            # Default: the first few pages only
            target_pages = list(range(min(10, len(self.pages))))
        elif isinstance(pages, int):
            target_pages = [pages]
        elif isinstance(pages, (list, range)):
            target_pages = pages
        else:
            # Raised inside the try block, so it is reported via the
            # generic handler below rather than propagated.
            raise ValueError(f"Invalid pages parameter: {pages}")

        # Query page by page, keeping only pages that produced an answer
        answers = []
        for page_idx in target_pages:
            if not 0 <= page_idx < len(self.pages):
                continue
            current_page = self.pages[page_idx]
            page_result = qa_engine.ask_pdf_page(
                page=current_page,
                question=question,
                min_confidence=min_confidence,
                **kwargs
            )
            if page_result.get("found", False):
                answers.append(page_result)

        # Highest confidence first
        answers.sort(key=lambda x: x.get("confidence", 0), reverse=True)

        if not answers:
            return {
                "answer": "",
                "confidence": 0.0,
                "found": False,
                "message": "No answer found in document"
            }
        return answers[0]

    except ImportError as e:
        logger.warning(f"QA functionality not available: {e}")
        return {
            "answer": "",
            "confidence": 0.0,
            "error": "QA functionality not available",
            "found": False
        }
    except Exception as e:
        logger.error(f"Error in document QA: {e}")
        return {
            "answer": "",
            "confidence": 0.0,
            "error": str(e),
            "found": False
        }
|
551
|
+
|
552
|
+
def __len__(self) -> int:
    """Number of pages, as reported by the page collection."""
    page_total = len(self.pages)
    return page_total
|
555
|
+
|
556
|
+
def __getitem__(self, key) -> Union[Page, List[Page]]:
    """Look up pages by index or slice; indexing is delegated to the page collection."""
    selected = self.pages[key]
    return selected
|
559
|
+
|
560
|
+
def close(self):
    """Close the underlying PDF file, if one is still open.

    Safe to call repeatedly: once closed (or if the handle was never set),
    the call does nothing.
    """
    handle = getattr(self, '_pdf', None)
    if handle is None:
        return
    handle.close()
    # Forget the handle so later calls become no-ops
    self._pdf = None
|
565
|
+
|
566
|
+
def __enter__(self):
    """Enter a ``with`` block; the PDF object itself is the managed value."""
    managed = self
    return managed
|
569
|
+
|
570
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave a ``with`` block by closing the PDF.

    The exception details are not inspected, and None is returned.
    """
    self.close()
    return None
|