natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,770 @@
|
|
1
|
+
from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
|
2
|
+
|
3
|
+
if TYPE_CHECKING:
|
4
|
+
from natural_pdf.core.page import Page
|
5
|
+
from natural_pdf.elements.region import Region
|
6
|
+
|
7
|
+
T = TypeVar('T')
P = TypeVar('P', bound='Page')


class ElementCollection(Generic[T]):
    """
    Collection of PDF elements with batch operations.

    The collection wraps a plain list of elements and offers positional
    look-ups (highest, lowest, leftmost, rightmost), filtering, sorting,
    combined text extraction and highlighting helpers.
    """

    def __init__(self, elements: List[T]):
        """
        Initialize a collection of elements.

        Args:
            elements: List of Element objects. A falsy value (``None`` or an
                empty list) results in an empty collection.
        """
        self._elements = elements or []

    def __len__(self) -> int:
        """Get the number of elements in the collection."""
        return len(self._elements)

    def __getitem__(self, index: int) -> 'Element':
        """Get an element by index (delegates to the underlying list)."""
        return self._elements[index]

    def __iter__(self):
        """Iterate over elements."""
        return iter(self._elements)

    @property
    def elements(self) -> List['Element']:
        """Get the elements in this collection (the underlying list itself)."""
        return self._elements

    @property
    def first(self) -> Optional['Element']:
        """Get the first element in the collection, or None if it is empty."""
        return self._elements[0] if self._elements else None

    @property
    def last(self) -> Optional['Element']:
        """Get the last element in the collection, or None if it is empty."""
        return self._elements[-1] if self._elements else None

    def _select_extreme(self, chooser, attr: str, description: str) -> Optional['Element']:
        """
        Shared implementation of highest/lowest/leftmost/rightmost.

        Args:
            chooser: ``min`` or ``max``
            attr: Name of the coordinate attribute to compare
            description: Word used in the error message (e.g. 'highest')

        Raises:
            ValueError: If elements are on multiple pages

        Returns:
            The selected element, or None if the collection is empty
        """
        if not self._elements:
            return None

        # Coordinates are only comparable within one page
        if self._are_on_multiple_pages():
            raise ValueError(f"Cannot determine {description} element across multiple pages")

        return chooser(self._elements, key=lambda e: getattr(e, attr))

    def highest(self) -> Optional['Element']:
        """
        Get element with the smallest top y-coordinate (highest on page).

        Raises:
            ValueError: If elements are on multiple pages

        Returns:
            Element with smallest top value or None if empty
        """
        return self._select_extreme(min, 'top', 'highest')

    def lowest(self) -> Optional['Element']:
        """
        Get element with the largest bottom y-coordinate (lowest on page).

        Raises:
            ValueError: If elements are on multiple pages

        Returns:
            Element with largest bottom value or None if empty
        """
        return self._select_extreme(max, 'bottom', 'lowest')

    def leftmost(self) -> Optional['Element']:
        """
        Get element with the smallest x0 coordinate (leftmost on page).

        Raises:
            ValueError: If elements are on multiple pages

        Returns:
            Element with smallest x0 value or None if empty
        """
        return self._select_extreme(min, 'x0', 'leftmost')

    def rightmost(self) -> Optional['Element']:
        """
        Get element with the largest x1 coordinate (rightmost on page).

        Raises:
            ValueError: If elements are on multiple pages

        Returns:
            Element with largest x1 value or None if empty
        """
        return self._select_extreme(max, 'x1', 'rightmost')

    def _are_on_multiple_pages(self) -> bool:
        """
        Check if elements in this collection span multiple pages.

        Elements without a ``page`` attribute are ignored; if the first
        element has no ``page`` attribute the check reports False.

        Returns:
            True if elements are on different pages, False otherwise
        """
        if not self._elements:
            return False

        anchor = self._elements[0]
        if not hasattr(anchor, 'page'):
            return False

        anchor_index = anchor.page.index
        return any(
            hasattr(e, 'page') and e.page.index != anchor_index
            for e in self._elements
        )

    def exclude_regions(self, regions: List['Region']) -> 'ElementCollection':
        """
        Remove elements that are within any of the specified regions.

        Args:
            regions: List of Region objects to exclude

        Returns:
            New ElementCollection with filtered elements. The returned
            collection always owns its own list, so sorting it in place does
            not reorder this collection.
        """
        if not regions:
            # Copy so the "new" collection does not alias our list
            return ElementCollection(list(self._elements))

        kept = [
            element for element in self._elements
            if not any(region._is_element_in_region(element) for region in regions)
        ]
        return ElementCollection(kept)

    @staticmethod
    def _reading_order_key(element) -> Tuple[int, Any, Any]:
        """
        Sort key for document reading order: page, then top, then x0.

        Elements without a page (or without a page index) are treated as
        belonging to page 0 so that they still compare with the others.
        """
        page_index = getattr(getattr(element, 'page', None), 'index', None)
        if page_index is None:
            page_index = 0
        return (page_index, element.top, element.x0)

    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
        """
        Extract text from all elements in the collection.

        Only elements exposing an ``extract_text`` method contribute. They
        are visited in reading order (page, top-to-bottom, left-to-right) and
        their non-empty texts are joined with single spaces.

        Args:
            preserve_whitespace: Whether to keep blank characters (default: True)
            use_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Combined text from all elements
        """
        text_elements = [e for e in self._elements if hasattr(e, 'extract_text')]

        # Page index comes first so multi-page collections are not interleaved
        ordered = sorted(text_elements, key=self._reading_order_key)

        pieces = []
        for element in ordered:
            text = element.extract_text(
                preserve_whitespace=preserve_whitespace,
                use_exclusions=use_exclusions,
                **kwargs
            )
            if text:
                pieces.append(text)

        return " ".join(pieces)

    def filter(self, func: Callable[['Element'], bool]) -> 'ElementCollection':
        """
        Filter elements using a function.

        Args:
            func: Function that takes an element and returns True to keep it

        Returns:
            New ElementCollection with filtered elements
        """
        return ElementCollection([e for e in self._elements if func(e)])

    def sort(self, key=None, reverse=False) -> 'ElementCollection':
        """
        Sort elements in place by the given key function.

        Args:
            key: Function to generate a key for sorting
            reverse: Whether to sort in descending order

        Returns:
            Self for method chaining
        """
        self._elements.sort(key=key, reverse=reverse)
        return self

    def highlight(self,
                  label: Optional[str] = None,
                  color: Optional[tuple] = None,
                  use_color_cycling: bool = False,
                  cycle_colors: bool = False,
                  include_attrs: Optional[List[str]] = None,
                  existing: str = 'append') -> 'ElementCollection':
        """
        Highlight all elements in the collection.

        Each element is registered with the highlight manager of its own
        page; elements whose page exposes no manager fall back to the manager
        of the first element's page. If the first element has no page, or
        that page has no ``_highlight_mgr``, nothing is highlighted.

        Args:
            label: Optional label for the highlight
            color: Optional color for the highlight (RGBA tuple)
            use_color_cycling: Force color cycling even with no label (default: False)
            cycle_colors: Alias for use_color_cycling (deprecated, for backward compatibility)
            include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
            existing: How to handle existing highlights - 'append' (default) or 'replace'.
                It is applied to the first highlight sent to each manager;
                every later highlight is appended.

        Returns:
            Self for method chaining
        """
        # Use cycle_colors if provided (backward compatibility)
        color_cycle = use_color_cycling or cycle_colors

        if not self._elements:
            return self

        first_page = getattr(self._elements[0], 'page', None)
        default_mgr = getattr(first_page, '_highlight_mgr', None)
        if default_mgr is None:
            return self

        # Managers that already received their first highlight in this call.
        # Tracking by manager (not by "is this the first element") keeps
        # 'replace' from firing again when an element object is repeated.
        primed = set()

        for element in self._elements:
            manager = getattr(getattr(element, 'page', None), '_highlight_mgr', None)
            if manager is None:
                manager = default_mgr

            if id(manager) in primed:
                mode = 'append'
            else:
                primed.add(id(manager))
                mode = existing

            if getattr(element, 'has_polygon', False):
                # Element carries polygon coordinates: use polygon highlight
                manager.add_polygon_highlight(
                    element.polygon,
                    color,
                    label,
                    color_cycle,
                    element=element,
                    include_attrs=include_attrs,
                    existing=mode
                )
            else:
                bbox = (element.x0, element.top, element.x1, element.bottom)
                manager.add_highlight(
                    bbox,
                    color,
                    label,
                    color_cycle,
                    element=element,
                    include_attrs=include_attrs,
                    existing=mode
                )

        return self

    def show(self,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False) -> 'Image.Image':
        """
        Show the page with this collection's elements highlighted.

        Args:
            scale: Scale factor for rendering
            width: Optional width for the output image in pixels
            labels: Whether to include a legend for labels
            legend_position: Position of the legend
            render_ocr: Whether to render OCR text with white background boxes

        Returns:
            PIL Image of the page with elements highlighted (None when the
            collection has no element with a page, see ``to_image``)
        """
        return self.to_image(
            scale=scale,
            width=width,
            labels=labels,
            legend_position=legend_position,
            render_ocr=render_ocr
        )

    def save(self,
             filename: str,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False) -> 'ElementCollection':
        """
        Save the page with this collection's elements highlighted to an image file.

        Args:
            filename: Path to save the image to
            scale: Scale factor for rendering
            width: Optional width for the output image in pixels
            labels: Whether to include a legend for labels
            legend_position: Position of the legend
            render_ocr: Whether to render OCR text with white background boxes

        Returns:
            Self for method chaining
        """
        self.to_image(
            path=filename,
            scale=scale,
            width=width,
            labels=labels,
            legend_position=legend_position,
            render_ocr=render_ocr
        )
        return self

    def to_image(self,
                 path: Optional[str] = None,
                 scale: float = 2.0,
                 width: Optional[int] = None,
                 labels: bool = True,
                 legend_position: str = 'right',
                 render_ocr: bool = False) -> Optional['Image.Image']:
        """
        Generate an image of the page with this collection's elements highlighted,
        optionally saving it to a file.

        Rendering is delegated to the page of the first element.

        Args:
            path: Optional path to save the image to
            scale: Scale factor for rendering
            width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
            labels: Whether to include a legend for labels
            legend_position: Position of the legend
            render_ocr: Whether to render OCR text with white background boxes

        Returns:
            PIL Image of the page with elements highlighted, or None if no valid page
        """
        if not self._elements or not hasattr(self._elements[0], 'page'):
            return None

        page = self._elements[0].page
        return page.to_image(
            path=path,
            scale=scale,
            width=width,
            labels=labels,
            legend_position=legend_position,
            render_ocr=render_ocr
        )
|
381
|
+
|
382
|
+
class PageCollection(Generic[P]):
    """
    A collection of PDF pages with cross-page operations.

    This class provides methods for working with multiple pages, such as finding
    elements across pages, extracting text from page ranges, and more.
    """

    def __init__(self, pages: List[P]):
        """
        Initialize a page collection.

        Args:
            pages: List of Page objects
        """
        self.pages = pages

    def __len__(self) -> int:
        """Return the number of pages in the collection."""
        return len(self.pages)

    def __getitem__(self, idx) -> Union[P, 'PageCollection[P]']:
        """Support indexing (returns a page) and slicing (returns a PageCollection)."""
        if isinstance(idx, slice):
            return PageCollection(self.pages[idx])
        return self.pages[idx]

    def __iter__(self) -> Iterator[P]:
        """Support iteration."""
        return iter(self.pages)

    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
        """
        Extract text from all pages in the collection.

        Args:
            keep_blank_chars: Whether to keep blank characters (default: True)
            apply_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Combined text from all pages, one page per line-separated chunk
        """
        return "\n".join(
            page.extract_text(
                keep_blank_chars=keep_blank_chars,
                apply_exclusions=apply_exclusions,
                **kwargs
            )
            for page in self.pages
        )

    def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[Any]:
        """
        Find the first element matching the selector across all pages.

        Args:
            selector: CSS-like selector string
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            **kwargs: Additional filter parameters

        Returns:
            First matching element or None
        """
        for page in self.pages:
            element = page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
            # Compare against None explicitly: a matching element that happens
            # to be falsy (e.g. it defines __len__) must still be returned.
            if element is not None:
                return element
        return None

    def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
        """
        Find all elements matching the selector across all pages.

        Args:
            selector: CSS-like selector string
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection with matching elements from all pages
        """
        matches = []
        for page in self.pages:
            found = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
            if found:
                matches.extend(found.elements)

        return ElementCollection(matches)

    def debug_ocr(self, output_path):
        """
        Generate an interactive HTML debug report for OCR results.

        This creates a single-file HTML report with:
        - Side-by-side view of image regions and OCR text
        - Confidence scores with color coding
        - Editable correction fields
        - Filtering and sorting options
        - Export functionality for corrected text

        Args:
            output_path: Path to save the HTML report

        Returns:
            Path to the generated HTML file
        """
        # Imported lazily, as in the rest of this module's optional helpers
        from natural_pdf.utils.ocr import debug_ocr_to_html
        return debug_ocr_to_html(self.pages, output_path)

    @staticmethod
    def _collect_boundaries(elements, kind, all_elements):
        """
        Build boundary records of the given kind ('start' or 'end').

        Real elements are recorded with their position in ``all_elements``;
        virtual page-boundary regions (``is_page_boundary`` set) get the
        special position -1. Anything else is ignored.
        """
        records = []
        for element in elements:
            try:
                position = all_elements.index(element)
            except ValueError:
                if not getattr(element, 'is_page_boundary', False):
                    continue
                position = -1  # Special index for page boundaries
            records.append({
                'index': position,
                'element': element,
                'type': kind,
                'page_idx': element.page.index
            })
        return records

    @staticmethod
    def _boundary_sort_key(record):
        """Order boundaries by page, then document position.

        Virtual page boundaries sort first on their page when they are starts
        and last when they are ends.
        """
        if record['index'] != -1:
            position = record['index']
        else:
            position = 0 if record['type'] == 'start' else float('inf')
        return (record['page_idx'], position)

    @staticmethod
    def _region_to_page_end(start_element):
        """Create a region from the start element's top to the bottom of its page."""
        from natural_pdf.elements.region import Region

        page = start_element.page
        region = Region(page, (0, start_element.top, page.width, page.height))
        region.start_element = start_element
        return region

    @staticmethod
    def _build_multi_page_section(start_element, end_element, end_page, all_elements):
        """
        Create a region anchored on the start page that represents a section
        continuing through ``end_page``.

        The region is flagged with ``_spans_pages`` / ``_page_range`` and
        carries, in ``_multi_page_elements``, the elements at or below the
        start element on the first page, every element of the pages in
        between, and the elements of the end page that do not extend below
        ``end_element`` (all of them when ``end_element`` is None).
        """
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        region._spans_pages = True
        region._page_range = (start_page.index, end_page.index)
        region.start_element = start_element
        region.end_element = end_element

        members = [e for e in all_elements
                   if e.page == start_page and e.top >= start_element.top]

        for page_idx in range(start_page.index + 1, end_page.index):
            members.extend(e for e in all_elements if e.page.index == page_idx)

        members.extend(
            e for e in all_elements
            if e.page == end_page and (end_element is None or e.bottom <= end_element.bottom)
        )

        region._multi_page_elements = members
        return region

    def get_sections(self,
                     start_elements=None,
                     end_elements=None,
                     new_section_on_page_break=False,
                     boundary_inclusion='both') -> List['Region']:
        """
        Extract sections from a page collection based on start/end elements.

        Sections whose start and end lie on the same page are produced by
        that page's ``get_section_between``; sections crossing pages are
        returned as a region on the start page annotated with the elements
        it covers. The iterables passed in are never modified.

        Args:
            start_elements: Elements or selector string that mark the start of sections
            end_elements: Elements or selector string that mark the end of sections
            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')

        Returns:
            List of Region objects representing the extracted sections
        """
        # Resolve selector strings across all pages
        if isinstance(start_elements, str):
            start_elements = self.find_all(start_elements).elements
        if isinstance(end_elements, str):
            end_elements = self.find_all(end_elements).elements

        # If no start elements, return empty list
        if not start_elements:
            return []

        # Work on private lists: virtual boundaries are appended below, which
        # must neither leak into the caller's list nor require the input to
        # be a list (an ElementCollection has no append()).
        start_elements = list(start_elements)
        end_elements = list(end_elements) if end_elements is not None else None

        if new_section_on_page_break and len(self.pages) > 1:
            from natural_pdf.elements.region import Region

            if end_elements is None:
                end_elements = []

            for page, next_page in zip(self.pages, self.pages[1:]):
                # Virtual "end" marker: thin strip at the bottom of this page
                bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
                bottom_region.is_page_boundary = True
                end_elements.append(bottom_region)

                # Virtual "start" marker: thin strip at the top of the next page
                top_region = Region(next_page, (0, 0, next_page.width, 1))
                top_region.is_page_boundary = True
                start_elements.append(top_region)

        # All elements of all pages in document order
        all_elements = []
        for page in self.pages:
            all_elements.extend(page.get_elements())
        all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

        # Section boundaries, starts first so the stable sort keeps a start
        # ahead of an end that shares the same position
        boundaries = self._collect_boundaries(start_elements, 'start', all_elements)
        if end_elements:
            boundaries.extend(self._collect_boundaries(end_elements, 'end', all_elements))
        boundaries.sort(key=self._boundary_sort_key)

        sections = []
        current_start = None

        for boundary in boundaries:
            kind = boundary['type']

            if kind == 'start' and current_start is None:
                # Open a new section
                current_start = boundary

            elif kind == 'end' and current_start is not None:
                # Close the open section at this end boundary
                start_element = current_start['element']
                end_element = boundary['element']

                if start_element.page == end_element.page:
                    sections.append(start_element.page.get_section_between(
                        start_element,
                        end_element,
                        boundary_inclusion
                    ))
                else:
                    sections.append(self._build_multi_page_section(
                        start_element, end_element, end_element.page, all_elements
                    ))

                current_start = None

            elif kind == 'start' and current_start is not None and not end_elements:
                # Splitting by starts only: the open section ends just before
                # this next start
                start_element = current_start['element']
                next_start = boundary['element']

                if start_element.page == next_start.page:
                    page_elements = [e for e in all_elements if e.page == start_element.page]
                    page_elements.sort(key=lambda e: (e.top, e.x0))

                    # Last element before the next start (None if there is none)
                    end_idx = page_elements.index(next_start) - 1 if next_start in page_elements else -1
                    end_element = page_elements[end_idx] if end_idx >= 0 else None

                    sections.append(start_element.page.get_section_between(
                        start_element,
                        end_element,
                        boundary_inclusion
                    ))
                else:
                    # Next start is on another page: run to the end of this page
                    sections.append(self._region_to_page_end(start_element))

                current_start = boundary

        # Handle the last section if a start is still open
        if current_start is not None:
            start_element = current_start['element']
            start_page = start_element.page

            if end_elements:
                # With end_elements we need an explicit end: use the last
                # element on the last page of the collection
                last_page = self.pages[-1]
                last_page_elements = [e for e in all_elements if e.page == last_page]
                last_page_elements.sort(key=lambda e: (e.top, e.x0))
                end_element = last_page_elements[-1] if last_page_elements else None

                if start_page == last_page:
                    sections.append(start_page.get_section_between(
                        start_element,
                        end_element,
                        boundary_inclusion
                    ))
                else:
                    sections.append(self._build_multi_page_section(
                        start_element, end_element, last_page, all_elements
                    ))
            else:
                # With start_elements only, run to the end of the current page
                sections.append(self._region_to_page_end(start_element))

        return sections
|