natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,1366 @@
|
|
1
|
+
from typing import Optional, Union, List, Dict, Tuple, Any, Callable, TYPE_CHECKING
|
2
|
+
|
3
|
+
if TYPE_CHECKING:
|
4
|
+
from natural_pdf.core.page import Page
|
5
|
+
from natural_pdf.elements.base import Element
|
6
|
+
from natural_pdf.elements.text import TextElement
|
7
|
+
|
8
|
+
# Import OCRManager conditionally to avoid circular imports
|
9
|
+
try:
|
10
|
+
from natural_pdf.utils.ocr import OCRManager
|
11
|
+
except ImportError:
|
12
|
+
# OCRManager will be imported directly in methods that use it
|
13
|
+
pass
|
14
|
+
|
15
|
+
|
16
|
+
class Region:
|
17
|
+
"""
|
18
|
+
Represents a rectangular region on a page.
|
19
|
+
"""
|
20
|
+
|
21
|
+
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None):
|
22
|
+
"""
|
23
|
+
Initialize a region.
|
24
|
+
|
25
|
+
Args:
|
26
|
+
page: Parent page
|
27
|
+
bbox: Bounding box as (x0, top, x1, bottom)
|
28
|
+
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
|
29
|
+
"""
|
30
|
+
self._page = page
|
31
|
+
self._bbox = bbox
|
32
|
+
self._polygon = polygon
|
33
|
+
self._multi_page_elements = None
|
34
|
+
self._spans_pages = False
|
35
|
+
self._page_range = None
|
36
|
+
self.start_element = None
|
37
|
+
self.end_element = None
|
38
|
+
|
39
|
+
# Standard attributes for all elements
|
40
|
+
self.object_type = 'region' # For selector compatibility
|
41
|
+
|
42
|
+
# Layout detection attributes
|
43
|
+
self.region_type = None
|
44
|
+
self.normalized_type = None
|
45
|
+
self.confidence = None
|
46
|
+
self.model = None
|
47
|
+
|
48
|
+
# Region management attributes
|
49
|
+
self.name = None
|
50
|
+
self.source = None # Will be set by creation methods
|
51
|
+
|
52
|
+
@property
|
53
|
+
def page(self) -> 'Page':
|
54
|
+
"""Get the parent page."""
|
55
|
+
return self._page
|
56
|
+
|
57
|
+
@property
|
58
|
+
def bbox(self) -> Tuple[float, float, float, float]:
|
59
|
+
"""Get the bounding box as (x0, top, x1, bottom)."""
|
60
|
+
return self._bbox
|
61
|
+
|
62
|
+
@property
|
63
|
+
def x0(self) -> float:
|
64
|
+
"""Get the left coordinate."""
|
65
|
+
return self._bbox[0]
|
66
|
+
|
67
|
+
@property
|
68
|
+
def top(self) -> float:
|
69
|
+
"""Get the top coordinate."""
|
70
|
+
return self._bbox[1]
|
71
|
+
|
72
|
+
@property
|
73
|
+
def x1(self) -> float:
|
74
|
+
"""Get the right coordinate."""
|
75
|
+
return self._bbox[2]
|
76
|
+
|
77
|
+
@property
|
78
|
+
def bottom(self) -> float:
|
79
|
+
"""Get the bottom coordinate."""
|
80
|
+
return self._bbox[3]
|
81
|
+
|
82
|
+
@property
|
83
|
+
def width(self) -> float:
|
84
|
+
"""Get the width of the region."""
|
85
|
+
return self.x1 - self.x0
|
86
|
+
|
87
|
+
@property
|
88
|
+
def height(self) -> float:
|
89
|
+
"""Get the height of the region."""
|
90
|
+
return self.bottom - self.top
|
91
|
+
|
92
|
+
@property
|
93
|
+
def has_polygon(self) -> bool:
|
94
|
+
"""Check if this region has polygon coordinates."""
|
95
|
+
return self._polygon is not None and len(self._polygon) >= 3
|
96
|
+
|
97
|
+
@property
|
98
|
+
def polygon(self) -> List[Tuple[float, float]]:
|
99
|
+
"""Get polygon coordinates if available, otherwise return rectangle corners."""
|
100
|
+
if self._polygon:
|
101
|
+
return self._polygon
|
102
|
+
else:
|
103
|
+
# Create rectangle corners from bbox as fallback
|
104
|
+
return [
|
105
|
+
(self.x0, self.top), # top-left
|
106
|
+
(self.x1, self.top), # top-right
|
107
|
+
(self.x1, self.bottom), # bottom-right
|
108
|
+
(self.x0, self.bottom) # bottom-left
|
109
|
+
]
|
110
|
+
|
111
|
+
def _is_point_in_polygon(self, x: float, y: float) -> bool:
|
112
|
+
"""
|
113
|
+
Check if a point is inside the polygon using ray casting algorithm.
|
114
|
+
|
115
|
+
Args:
|
116
|
+
x: X-coordinate to check
|
117
|
+
y: Y-coordinate to check
|
118
|
+
|
119
|
+
Returns:
|
120
|
+
True if the point is inside the polygon
|
121
|
+
"""
|
122
|
+
# If no polygon, use simple rectangle check
|
123
|
+
if not self.has_polygon:
|
124
|
+
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
125
|
+
|
126
|
+
# Ray casting algorithm for complex polygons
|
127
|
+
poly = self.polygon
|
128
|
+
n = len(poly)
|
129
|
+
inside = False
|
130
|
+
|
131
|
+
p1x, p1y = poly[0]
|
132
|
+
for i in range(1, n + 1):
|
133
|
+
p2x, p2y = poly[i % n]
|
134
|
+
if y > min(p1y, p2y) and y <= max(p1y, p2y) and x <= max(p1x, p2x):
|
135
|
+
if p1y != p2y:
|
136
|
+
xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
|
137
|
+
if p1x == p2x or x <= xinters:
|
138
|
+
inside = not inside
|
139
|
+
p1x, p1y = p2x, p2y
|
140
|
+
|
141
|
+
return inside
|
142
|
+
|
143
|
+
def _is_element_in_region(self, element: 'Element', use_boundary_tolerance=True) -> bool:
|
144
|
+
"""
|
145
|
+
Check if an element is within this region.
|
146
|
+
|
147
|
+
Args:
|
148
|
+
element: Element to check
|
149
|
+
use_boundary_tolerance: Whether to apply a small tolerance for boundary elements
|
150
|
+
|
151
|
+
Returns:
|
152
|
+
True if the element is in the region, False otherwise
|
153
|
+
"""
|
154
|
+
# If we have multi-page elements cached, check if the element is in the list
|
155
|
+
if self._spans_pages and self._multi_page_elements is not None:
|
156
|
+
return element in self._multi_page_elements
|
157
|
+
|
158
|
+
# Check if element is on the same page
|
159
|
+
if element.page != self._page:
|
160
|
+
return False
|
161
|
+
|
162
|
+
# Calculate element center
|
163
|
+
element_center_x = (element.x0 + element.x1) / 2
|
164
|
+
element_center_y = (element.top + element.bottom) / 2
|
165
|
+
|
166
|
+
# If this is a boundary region with exclusions, apply strict boundary checking
|
167
|
+
# This helps enforce boundary_inclusion behavior in get_sections
|
168
|
+
if hasattr(self, 'start_element') or hasattr(self, 'end_element'):
|
169
|
+
# Apply a small tolerance to avoid border cases
|
170
|
+
# When an element is right at the border, we want to be more strict
|
171
|
+
tolerance = 2.0 if use_boundary_tolerance else 0.0
|
172
|
+
|
173
|
+
# Check if element center is strictly within the region (not just on border)
|
174
|
+
if (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
|
175
|
+
self.top + tolerance <= element_center_y <= self.bottom - tolerance):
|
176
|
+
return True
|
177
|
+
|
178
|
+
# For elements right at the boundary, be more conservative
|
179
|
+
return False
|
180
|
+
|
181
|
+
# If the element itself has a polygon, check if ANY corner is in this region
|
182
|
+
if hasattr(element, 'has_polygon') and element.has_polygon:
|
183
|
+
for point in element.polygon:
|
184
|
+
if self._is_point_in_polygon(point[0], point[1]):
|
185
|
+
return True
|
186
|
+
# If no point is inside, check if the center is inside
|
187
|
+
return self._is_point_in_polygon(element_center_x, element_center_y)
|
188
|
+
|
189
|
+
# For regular elements, check if center is in the region
|
190
|
+
# Add a small tolerance (1 pixel) to avoid including elements that are exactly on the boundary
|
191
|
+
# This ensures consistent behavior with the below() and above() method fixes
|
192
|
+
tolerance = 1.0 if use_boundary_tolerance else 0.0
|
193
|
+
|
194
|
+
# Check if within region with the tolerance applied
|
195
|
+
if self.has_polygon:
|
196
|
+
return self._is_point_in_polygon(element_center_x, element_center_y)
|
197
|
+
else:
|
198
|
+
# For rectangular regions, apply tolerance to all sides
|
199
|
+
return (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
|
200
|
+
self.top + tolerance <= element_center_y <= self.bottom - tolerance)
|
201
|
+
|
202
|
+
def highlight(self,
|
203
|
+
label: Optional[str] = None,
|
204
|
+
color: Optional[Tuple[int, int, int, int]] = None,
|
205
|
+
use_color_cycling: bool = False,
|
206
|
+
include_attrs: Optional[List[str]] = None) -> 'Region':
|
207
|
+
"""
|
208
|
+
Highlight this region on the page.
|
209
|
+
|
210
|
+
Args:
|
211
|
+
label: Optional label for the highlight
|
212
|
+
color: RGBA color tuple for the highlight, or None to use automatic color
|
213
|
+
use_color_cycling: Force color cycling even with no label (default: False)
|
214
|
+
include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
215
|
+
|
216
|
+
Returns:
|
217
|
+
Self for method chaining
|
218
|
+
"""
|
219
|
+
# Add highlight to the page's highlight manager
|
220
|
+
if self.has_polygon:
|
221
|
+
self._page._highlight_mgr.add_polygon_highlight(
|
222
|
+
self.polygon,
|
223
|
+
color,
|
224
|
+
label,
|
225
|
+
use_color_cycling,
|
226
|
+
element=self, # Pass the region itself so attributes can be accessed
|
227
|
+
include_attrs=include_attrs
|
228
|
+
)
|
229
|
+
else:
|
230
|
+
self._page._highlight_mgr.add_highlight(
|
231
|
+
self.bbox,
|
232
|
+
color,
|
233
|
+
label,
|
234
|
+
use_color_cycling,
|
235
|
+
element=self, # Pass the region itself so attributes can be accessed
|
236
|
+
include_attrs=include_attrs
|
237
|
+
)
|
238
|
+
return self
|
239
|
+
|
240
|
+
def to_image(self,
|
241
|
+
scale: float = 2.0,
|
242
|
+
resolution: float = 150,
|
243
|
+
crop_only: bool = False,
|
244
|
+
include_highlights: bool = True,
|
245
|
+
**kwargs) -> 'Image.Image':
|
246
|
+
"""
|
247
|
+
Generate an image of just this region.
|
248
|
+
|
249
|
+
Args:
|
250
|
+
resolution: Resolution in DPI for rendering (default: 150)
|
251
|
+
crop_only: If True, only crop the region without highlighting its boundaries
|
252
|
+
include_highlights: Whether to include existing highlights (default: True)
|
253
|
+
**kwargs: Additional parameters for page.to_image()
|
254
|
+
|
255
|
+
Returns:
|
256
|
+
PIL Image of just this region
|
257
|
+
"""
|
258
|
+
# First get the full page image with highlights if requested
|
259
|
+
page_image = self._page.to_image(scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs)
|
260
|
+
|
261
|
+
# Calculate the crop coordinates - apply resolution scaling factor
|
262
|
+
# PDF coordinates are in points (1/72 inch), but image is scaled by resolution
|
263
|
+
scale_factor = scale
|
264
|
+
|
265
|
+
# Apply scaling to the coordinates
|
266
|
+
x0 = int(self.x0 * scale_factor)
|
267
|
+
top = int(self.top * scale_factor)
|
268
|
+
x1 = int(self.x1 * scale_factor)
|
269
|
+
bottom = int(self.bottom * scale_factor)
|
270
|
+
|
271
|
+
# Crop the image to just this region
|
272
|
+
region_image = page_image.crop((x0, top, x1, bottom))
|
273
|
+
|
274
|
+
# If not crop_only, add a border to highlight the region boundaries
|
275
|
+
if not crop_only:
|
276
|
+
from PIL import ImageDraw
|
277
|
+
|
278
|
+
# Create a 1px border around the region
|
279
|
+
draw = ImageDraw.Draw(region_image)
|
280
|
+
draw.rectangle((0, 0, region_image.width-1, region_image.height-1),
|
281
|
+
outline=(255, 0, 0), width=1)
|
282
|
+
|
283
|
+
return region_image
|
284
|
+
|
285
|
+
def show(self,
|
286
|
+
scale: float = 2.0,
|
287
|
+
labels: bool = True,
|
288
|
+
legend_position: str = 'right') -> 'Image.Image':
|
289
|
+
"""
|
290
|
+
Show the page with this region highlighted.
|
291
|
+
|
292
|
+
Args:
|
293
|
+
scale: Scale factor for rendering
|
294
|
+
labels: Whether to include a legend for labels
|
295
|
+
legend_position: Position of the legend
|
296
|
+
|
297
|
+
Returns:
|
298
|
+
PIL Image of the page with this region highlighted
|
299
|
+
"""
|
300
|
+
# Highlight this region if not already highlighted
|
301
|
+
self.highlight()
|
302
|
+
|
303
|
+
# Get and display the highlighted image
|
304
|
+
return self._page.show(scale, labels=labels, legend_position=legend_position)
|
305
|
+
|
306
|
+
def save(self,
|
307
|
+
filename: str,
|
308
|
+
scale: float = 2.0,
|
309
|
+
labels: bool = True,
|
310
|
+
legend_position: str = 'right') -> 'Region':
|
311
|
+
"""
|
312
|
+
Save the page with this region highlighted to an image file.
|
313
|
+
|
314
|
+
Args:
|
315
|
+
filename: Path to save the image to
|
316
|
+
scale: Scale factor for rendering
|
317
|
+
labels: Whether to include a legend for labels
|
318
|
+
legend_position: Position of the legend
|
319
|
+
|
320
|
+
Returns:
|
321
|
+
Self for method chaining
|
322
|
+
"""
|
323
|
+
# Highlight this region if not already highlighted
|
324
|
+
self.highlight()
|
325
|
+
|
326
|
+
# Save the highlighted image
|
327
|
+
self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
|
328
|
+
return self
|
329
|
+
|
330
|
+
def save_image(self,
|
331
|
+
filename: str,
|
332
|
+
resolution: float = 150,
|
333
|
+
crop_only: bool = False,
|
334
|
+
include_highlights: bool = True,
|
335
|
+
**kwargs) -> 'Region':
|
336
|
+
"""
|
337
|
+
Save an image of just this region to a file.
|
338
|
+
|
339
|
+
Args:
|
340
|
+
filename: Path to save the image to
|
341
|
+
resolution: Resolution in DPI for rendering (default: 150)
|
342
|
+
crop_only: If True, only crop the region without highlighting its boundaries
|
343
|
+
include_highlights: Whether to include existing highlights (default: True)
|
344
|
+
**kwargs: Additional parameters for page.to_image()
|
345
|
+
|
346
|
+
Returns:
|
347
|
+
Self for method chaining
|
348
|
+
"""
|
349
|
+
# Get the region image
|
350
|
+
image = self.to_image(
|
351
|
+
resolution=resolution,
|
352
|
+
crop_only=crop_only,
|
353
|
+
include_highlights=include_highlights,
|
354
|
+
**kwargs
|
355
|
+
)
|
356
|
+
|
357
|
+
# Save the image
|
358
|
+
image.save(filename)
|
359
|
+
return self
|
360
|
+
|
361
|
+
def get_elements(self, selector: Optional[str] = None, apply_exclusions=True, **kwargs) -> List['Element']:
|
362
|
+
"""
|
363
|
+
Get all elements within this region.
|
364
|
+
|
365
|
+
Args:
|
366
|
+
selector: Optional selector to filter elements
|
367
|
+
apply_exclusions: Whether to apply exclusion regions
|
368
|
+
**kwargs: Additional parameters for element filtering
|
369
|
+
|
370
|
+
Returns:
|
371
|
+
List of elements in the region
|
372
|
+
"""
|
373
|
+
# If we have multi-page elements, return those
|
374
|
+
if self._spans_pages and self._multi_page_elements is not None:
|
375
|
+
return self._multi_page_elements
|
376
|
+
|
377
|
+
# Otherwise, get elements from the page
|
378
|
+
if selector:
|
379
|
+
elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
380
|
+
else:
|
381
|
+
elements = self.page.get_elements(apply_exclusions=apply_exclusions)
|
382
|
+
|
383
|
+
# Filter to elements in this region
|
384
|
+
return [e for e in elements if self._is_element_in_region(e)]
|
385
|
+
|
386
|
+
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, ocr=None, preserve_whitespace=None, debug=False, **kwargs) -> str:
|
387
|
+
"""
|
388
|
+
Extract text from this region using pdfplumber's native functionality.
|
389
|
+
|
390
|
+
Args:
|
391
|
+
keep_blank_chars: Whether to keep blank characters (legacy parameter)
|
392
|
+
apply_exclusions: Whether to apply exclusion regions
|
393
|
+
ocr: OCR configuration. If None, uses PDF settings
|
394
|
+
preserve_whitespace: Synonym for keep_blank_chars (for compatibility with page.extract_text)
|
395
|
+
debug: Enable verbose debugging for exclusion handling
|
396
|
+
**kwargs: Additional parameters for text extraction
|
397
|
+
|
398
|
+
Returns:
|
399
|
+
Extracted text as string
|
400
|
+
"""
|
401
|
+
# Handle preserve_whitespace parameter for consistency with Page.extract_text
|
402
|
+
if preserve_whitespace is not None:
|
403
|
+
keep_blank_chars = preserve_whitespace
|
404
|
+
|
405
|
+
# If we span multiple pages, use the original implementation
|
406
|
+
if self._spans_pages and self._multi_page_elements is not None:
|
407
|
+
# Sort elements in reading order - only include text-like elements
|
408
|
+
text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
|
409
|
+
|
410
|
+
# Sort in reading order (by page, then top-to-bottom, left-to-right)
|
411
|
+
sorted_elements = sorted(text_elements, key=lambda e: (e.page.index, e.top, e.x0))
|
412
|
+
|
413
|
+
# Extract text directly from elements to avoid recursion
|
414
|
+
texts = []
|
415
|
+
for element in sorted_elements:
|
416
|
+
if hasattr(element, 'text'):
|
417
|
+
texts.append(element.text)
|
418
|
+
|
419
|
+
text_result = " ".join(texts)
|
420
|
+
return text_result
|
421
|
+
|
422
|
+
# Check if we have exclusions to apply
|
423
|
+
exclusion_regions = []
|
424
|
+
if apply_exclusions and self._page._exclusions:
|
425
|
+
exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
|
426
|
+
|
427
|
+
if debug:
|
428
|
+
import logging
|
429
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
430
|
+
logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
|
431
|
+
|
432
|
+
# IMPROVEMENT 1: Check if the region intersects with any exclusion zone
|
433
|
+
# If not, ignore exclusions entirely
|
434
|
+
if exclusion_regions:
|
435
|
+
has_intersection = False
|
436
|
+
for i, exclusion in enumerate(exclusion_regions):
|
437
|
+
# Use a simple bbox overlap check
|
438
|
+
overlap = (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
|
439
|
+
self.top < exclusion.bottom and self.bottom > exclusion.top)
|
440
|
+
|
441
|
+
if overlap:
|
442
|
+
has_intersection = True
|
443
|
+
if debug:
|
444
|
+
import logging
|
445
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
446
|
+
logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
|
447
|
+
break
|
448
|
+
|
449
|
+
# If no intersection, process without exclusions
|
450
|
+
if not has_intersection:
|
451
|
+
if debug:
|
452
|
+
import logging
|
453
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
454
|
+
logger.debug(f" No intersection with any exclusion, ignoring exclusions")
|
455
|
+
apply_exclusions = False
|
456
|
+
exclusion_regions = []
|
457
|
+
|
458
|
+
# IMPROVEMENT 2: If rectangular region + full-width exclusions (headers/footers),
|
459
|
+
# we can use the simpler cropping approach
|
460
|
+
# Only use crop for simple cases
|
461
|
+
can_use_crop = not self.has_polygon
|
462
|
+
result = "" # Default empty result
|
463
|
+
if can_use_crop and apply_exclusions and exclusion_regions:
|
464
|
+
# We'll keep track of exclusions that are full-width horizontal bands (headers/footers)
|
465
|
+
# and those that are not
|
466
|
+
footer_header_exclusions = []
|
467
|
+
other_exclusions = []
|
468
|
+
|
469
|
+
for i, exclusion in enumerate(exclusion_regions):
|
470
|
+
# Check if exclusion spans the full width of the page
|
471
|
+
# and is either at the top or bottom
|
472
|
+
full_width = (abs(exclusion.x0) < 5 and
|
473
|
+
abs(exclusion.x1 - self.page.width) < 5)
|
474
|
+
|
475
|
+
if debug:
|
476
|
+
import logging
|
477
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
478
|
+
logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
|
479
|
+
|
480
|
+
if full_width:
|
481
|
+
footer_header_exclusions.append(exclusion)
|
482
|
+
else:
|
483
|
+
other_exclusions.append(exclusion)
|
484
|
+
|
485
|
+
# If we have only header/footer exclusions, we can use the cropping approach
|
486
|
+
all_are_bands = len(other_exclusions) == 0 and len(footer_header_exclusions) > 0
|
487
|
+
|
488
|
+
if all_are_bands:
|
489
|
+
# Find the actual content area after excluding header/footer
|
490
|
+
top_bound = self.top
|
491
|
+
bottom_bound = self.bottom
|
492
|
+
|
493
|
+
if debug:
|
494
|
+
import logging
|
495
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
496
|
+
logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
497
|
+
|
498
|
+
# Process only header/footer exclusions for cropping
|
499
|
+
for exclusion in footer_header_exclusions:
|
500
|
+
# If exclusion is at the top of our region
|
501
|
+
if exclusion.bottom > self.top and exclusion.top <= self.top:
|
502
|
+
# Move top bound to exclude the header
|
503
|
+
top_bound = max(top_bound, exclusion.bottom)
|
504
|
+
if debug:
|
505
|
+
import logging
|
506
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
507
|
+
logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
|
508
|
+
|
509
|
+
# If exclusion is at the bottom of our region
|
510
|
+
if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
|
511
|
+
# Move bottom bound to exclude the footer
|
512
|
+
bottom_bound = min(bottom_bound, exclusion.top)
|
513
|
+
if debug:
|
514
|
+
import logging
|
515
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
516
|
+
logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
|
517
|
+
|
518
|
+
|
519
|
+
if debug:
|
520
|
+
import logging
|
521
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
522
|
+
logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
523
|
+
|
524
|
+
# If we still have a valid region after exclusions
|
525
|
+
if top_bound < bottom_bound:
|
526
|
+
# Use direct crop with adjusted bounds
|
527
|
+
crop_bbox = (self.x0, top_bound, self.x1, bottom_bound)
|
528
|
+
cropped = self.page._page.crop(crop_bbox)
|
529
|
+
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
530
|
+
|
531
|
+
if debug:
|
532
|
+
import logging
|
533
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
534
|
+
logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
|
535
|
+
|
536
|
+
# Skip the complex filtering approach
|
537
|
+
return result
|
538
|
+
else:
|
539
|
+
# This would only happen if the region is entirely inside an exclusion zone
|
540
|
+
# or if both top and bottom of the region are excluded leaving no valid area
|
541
|
+
import logging
|
542
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
543
|
+
logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
|
544
|
+
return ""
|
545
|
+
# We have exclusions, but not all are headers/footers,
|
546
|
+
# or we have a non-rectangular region
|
547
|
+
else:
|
548
|
+
if debug:
|
549
|
+
import logging
|
550
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
551
|
+
logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
|
552
|
+
|
553
|
+
# Don't use crop for mixed exclusion types
|
554
|
+
can_use_crop = False
|
555
|
+
|
556
|
+
# If we got a result from header/footer cropping, return it
|
557
|
+
if result:
|
558
|
+
return result
|
559
|
+
|
560
|
+
# For single-page regions without exclusions, or when exclusions don't apply, use direct cropping
|
561
|
+
if can_use_crop and not apply_exclusions:
|
562
|
+
# Simple case: use direct crop
|
563
|
+
crop_bbox = self.bbox
|
564
|
+
cropped = self.page._page.crop(crop_bbox)
|
565
|
+
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
566
|
+
return result
|
567
|
+
|
568
|
+
# For all other cases (complex exclusions, polygons), we use element filtering
|
569
|
+
import warnings
|
570
|
+
import logging
|
571
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
572
|
+
|
573
|
+
if debug:
|
574
|
+
logger.debug(f"Using element filtering approach for region {self.bbox}")
|
575
|
+
|
576
|
+
# Get all elements in this region first
|
577
|
+
all_elements = self.get_elements(apply_exclusions=False)
|
578
|
+
|
579
|
+
if apply_exclusions and exclusion_regions:
|
580
|
+
if debug:
|
581
|
+
logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
|
582
|
+
|
583
|
+
# Filter out elements in exclusion zones
|
584
|
+
filtered_elements = []
|
585
|
+
for elem in all_elements:
|
586
|
+
in_exclusion = False
|
587
|
+
# For each element, check if it's in any exclusion zone
|
588
|
+
element_center_x = (elem.x0 + elem.x1) / 2
|
589
|
+
element_center_y = (elem.top + elem.bottom) / 2
|
590
|
+
|
591
|
+
for exclusion in exclusion_regions:
|
592
|
+
if (exclusion.x0 <= element_center_x <= exclusion.x1 and
|
593
|
+
exclusion.top <= element_center_y <= exclusion.bottom):
|
594
|
+
in_exclusion = True
|
595
|
+
break
|
596
|
+
|
597
|
+
if not in_exclusion:
|
598
|
+
filtered_elements.append(elem)
|
599
|
+
else:
|
600
|
+
# No exclusions, use all elements
|
601
|
+
filtered_elements = all_elements
|
602
|
+
|
603
|
+
# Now extract text from the filtered elements
|
604
|
+
if filtered_elements:
|
605
|
+
from natural_pdf.elements.collections import ElementCollection
|
606
|
+
collection = ElementCollection(filtered_elements)
|
607
|
+
# Sort in reading order
|
608
|
+
collection = collection.sort(key=lambda e: (e.top, e.x0))
|
609
|
+
# Extract text
|
610
|
+
result = " ".join(e.text for e in collection if hasattr(e, 'text'))
|
611
|
+
|
612
|
+
if debug:
|
613
|
+
logger.debug(f"Got {len(result)} characters from element-based extraction")
|
614
|
+
|
615
|
+
# Return the result
|
616
|
+
return result
|
617
|
+
else:
|
618
|
+
if debug:
|
619
|
+
logger.debug(f"No elements found after filtering")
|
620
|
+
return ""
|
621
|
+
|
622
|
+
# Handle OCR if needed
|
623
|
+
use_ocr = ocr is True or (isinstance(ocr, dict) and ocr.get('enabled', False))
|
624
|
+
auto_ocr = ocr is None and self.page._parent._ocr_config.get('enabled') == 'auto'
|
625
|
+
|
626
|
+
# Run OCR if explicitly requested or if in auto mode and no text found
|
627
|
+
if use_ocr or (auto_ocr and not result.strip()):
|
628
|
+
ocr_config = self.page._get_ocr_config(ocr or {}) if use_ocr else self.page._get_ocr_config({'enabled': 'auto'})
|
629
|
+
ocr_elements = self.apply_ocr(**ocr_config)
|
630
|
+
|
631
|
+
if ocr_elements:
|
632
|
+
# Filter OCR elements by exclusions if needed
|
633
|
+
if apply_exclusions and exclusion_regions:
|
634
|
+
filtered_ocr = []
|
635
|
+
for element in ocr_elements:
|
636
|
+
exclude = False
|
637
|
+
for region in exclusion_regions:
|
638
|
+
if region._is_element_in_region(element):
|
639
|
+
exclude = True
|
640
|
+
break
|
641
|
+
if not exclude:
|
642
|
+
filtered_ocr.append(element)
|
643
|
+
else:
|
644
|
+
filtered_ocr = ocr_elements
|
645
|
+
|
646
|
+
# Extract text from OCR elements
|
647
|
+
from natural_pdf.elements.collections import ElementCollection
|
648
|
+
ocr_collection = ElementCollection(filtered_ocr)
|
649
|
+
ocr_text = ocr_collection.extract_text(preserve_whitespace=keep_blank_chars, **kwargs)
|
650
|
+
|
651
|
+
# Use OCR text if it's not empty
|
652
|
+
if ocr_text.strip():
|
653
|
+
return ocr_text
|
654
|
+
|
655
|
+
return result
|
656
|
+
|
657
|
+
def extract_table(self, method: str = None, table_settings: dict = None,
                  use_ocr: bool = False, ocr_config: dict = None) -> List[List[str]]:
    """
    Extract a table from this region.

    Args:
        method: Extraction backend. 'tatr' selects the TATR-based extractor;
            any other non-None value selects pdfplumber. When None, 'tatr' is
            chosen only if this region has ``model == 'tatr'`` and
            ``region_type == 'table'``; otherwise pdfplumber is used.
        table_settings: Settings handed to the pdfplumber extractor
            (ignored by the 'tatr' path). None is treated as an empty dict.
        use_ocr: Forwarded to the TATR extractor only.
        ocr_config: Forwarded to the TATR extractor only.

    Returns:
        Table data as a list of rows, where each row is a list of cell values
    """
    settings = {} if table_settings is None else table_settings

    chosen = method
    if chosen is None:
        # Regions that were produced by TATR table detection get the TATR path
        detected_by_tatr = (hasattr(self, 'model')
                            and self.model == 'tatr'
                            and self.region_type == 'table')
        chosen = 'tatr' if detected_by_tatr else 'plumber'

    # Anything that is not explicitly 'tatr' falls back to pdfplumber
    if chosen != 'tatr':
        return self._extract_table_plumber(settings)
    return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
|
688
|
+
|
689
|
+
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
    """
    Extract table using pdfplumber's table extraction.

    The underlying page object (``self.page._page``) is cropped to this
    region's ``bbox`` and ``extract_tables`` is called on the crop.

    Args:
        table_settings: Settings passed straight to ``extract_tables``

    Returns:
        The first table found, as a list of rows (each row a list of cell
        values), or an empty list when no table was found
    """
    region_crop = self.page._page.crop(self.bbox)
    found = region_crop.extract_tables(table_settings)

    # Only the first detected table is reported
    return found[0] if found else []
|
709
|
+
|
710
|
+
def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
    """
    Extract table using TATR structure detection.

    Row, column-header and column regions are looked up on the page with
    ``region[...][model=tatr]`` selectors and kept when their center point
    lies inside this region. Detected headers (if any) form the first output
    row, one entry per header region. Each detected row then yields one output
    row: one cell per column when columns were detected, otherwise a single
    cell holding the whole row's text.

    Args:
        use_ocr: Whether to apply OCR to each cell for better text extraction.
            When OCR yields no non-empty text, normal text extraction is used.
        ocr_config: Optional OCR configuration parameters. A dict is merged
            over built-in low-threshold defaults (nested dicts are merged one
            level deep); any other truthy value replaces the defaults and
            must still support ``**`` unpacking.

    Returns:
        Table data as a list of rows, where each row is a list of cell values
    """
    def is_in_table(region):
        # Membership is decided by the candidate's center point only
        center_x = (region.x0 + region.x1) / 2
        center_y = (region.top + region.bottom) / 2
        return (self.x0 <= center_x <= self.x1 and
                self.top <= center_y <= self.bottom)

    # Find all rows, headers and columns on the page
    rows = self.page.find_all('region[type=table-row][model=tatr]')
    headers = self.page.find_all('region[type=table-column-header][model=tatr]')
    columns = self.page.find_all('region[type=table-column][model=tatr]')

    # Keep only those belonging to this table, in top-to-bottom / left-to-right order
    rows = sorted((row for row in rows if is_in_table(row)), key=lambda r: r.top)
    headers = [header for header in headers if is_in_table(header)]
    columns = sorted((column for column in columns if is_in_table(column)), key=lambda c: c.x0)

    if use_ocr:
        # Default OCR config focuses on small text with low confidence
        merged_config = {
            "enabled": True,
            "min_confidence": 0.1,  # Lower than default to catch more text
            "detection_params": {
                "text_threshold": 0.1,  # Lower threshold for low-contrast text
                "link_threshold": 0.1   # Lower threshold for connecting text components
            }
        }

        if ocr_config:
            if isinstance(ocr_config, dict):
                for key, value in ocr_config.items():
                    if isinstance(value, dict) and isinstance(merged_config.get(key), dict):
                        # Merge nested dicts
                        merged_config[key].update(value)
                    else:
                        # Replace value
                        merged_config[key] = value
            else:
                # Not a dict, use as is
                merged_config = ocr_config

        ocr_config = merged_config

    def read_text(region):
        # Shared OCR-first / plain-extraction-fallback logic for headers,
        # cells and whole rows.
        if use_ocr:
            ocr_elements = region.apply_ocr(**ocr_config)
            if ocr_elements:
                ocr_text = " ".join(e.text for e in ocr_elements).strip()
                if ocr_text:
                    return ocr_text
        return region.extract_text().strip()

    table_data = []

    # Add header row if headers were detected
    if headers:
        table_data.append([read_text(header) for header in headers])

    if columns:
        # Imported lazily (and only when cells are actually built) to avoid circular imports
        from natural_pdf.elements.region import Region

    for row in rows:
        if columns:
            # One cell per column: the intersection of the row band and the column band
            row_cells = [
                read_text(Region(self.page, (column.x0, row.top, column.x1, row.bottom)))
                for column in columns
            ]
        else:
            # No column information: the whole row becomes a single cell.
            # Previously a successful OCR read here hit a `continue` that
            # skipped the append below, silently dropping the row.
            row_cells = [read_text(row)]

        table_data.append(row_cells)

    return table_data
|
844
|
+
|
845
|
+
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
    """
    Find the first element in this region matching the selector.

    Delegates to ``find_all`` with the same arguments and returns the first
    entry of its result.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for element filtering

    Returns:
        First matching element, or None when ``find_all`` returns nothing
    """
    matches = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
    if not matches:
        return None
    return matches[0]
|
859
|
+
|
860
|
+
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
    """
    Find all elements in this region matching the selector.

    For a single-page region the page's own ``find_all`` is queried and the
    matches are narrowed to those inside this region. For a region that spans
    pages (and carries a multi-page element list), each involved page is
    queried and only matches that are also in that list are kept.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for element filtering

    Returns:
        ElementCollection with matching elements
    """
    from natural_pdf.elements.collections import ElementCollection

    multi_page = self._spans_pages and self._multi_page_elements is not None
    if not multi_page:
        # Single page: ask the page, then keep what lies inside this region
        on_page = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        return ElementCollection([e for e in on_page if self._is_element_in_region(e)])

    from natural_pdf.selectors.parser import parse_selector
    # The parsed selector itself is not used (each page's find_all does the
    # matching); the call is kept so whatever parse_selector raises for this
    # selector still surfaces here.
    parse_selector(selector)

    # Bucket this region's elements by the page they live on
    members_by_page = {}
    for member in self._multi_page_elements:
        members_by_page.setdefault(member.page, []).append(member)

    # Query every involved page and keep only hits that belong to this region
    matched = []
    for page, members in members_by_page.items():
        page_hits = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        matched.extend(hit for hit in page_hits if hit in members)

    return ElementCollection(matched)
|
907
|
+
|
908
|
+
def apply_ocr(self, **ocr_params) -> List['TextElement']:
    """
    Apply OCR to this region and return the created text elements.

    The page is rendered with ``self.page.to_image()``, the image is cropped
    to this region's box and passed to ``OCRManager.recognize_region``. Result
    boxes are shifted by this region's ``x0``/``top``; results whose shifted
    box lies fully inside the region become ``TextElement`` objects, which are
    also appended to the page's ``_elements['words']`` list when the page
    exposes ``_elements``.

    Args:
        **ocr_params: OCR parameters to override defaults. ``verbose`` is
            always forced to False.

    Returns:
        List of created text elements (empty when the resolved OCR
        configuration is not enabled)
    """
    from natural_pdf.utils.ocr import OCRManager

    # **ocr_params is always a dict, so verbose output can be switched off directly
    ocr_params["verbose"] = False
    ocr_config = self.page._get_ocr_config(ocr_params)

    # Skip if OCR is disabled
    if not ocr_config.get('enabled'):
        return []

    # NOTE(review): the crop box uses this region's page coordinates as image
    # coordinates, which assumes to_image() renders one image unit per page
    # unit -- confirm against Page.to_image.
    region_image = self.page.to_image().crop((self.x0, self.top, self.x1, self.bottom))
    results = OCRManager.get_instance().recognize_region(region_image, ocr_config)

    created = []
    for hit in results:
        # Shift the box from crop-relative to page-relative coordinates
        # (stored back on the result, as before)
        raw = hit['bbox']
        box = (
            raw[0] + self.x0,
            raw[1] + self.top,
            raw[2] + self.x0,
            raw[3] + self.top
        )
        hit['bbox'] = box

        # Only include results that are fully within the region
        fully_inside = (box[0] >= self.x0 and
                        box[1] >= self.top and
                        box[2] <= self.x1 and
                        box[3] <= self.bottom)
        if not fully_inside:
            continue

        from natural_pdf.elements.text import TextElement
        elem = TextElement({
            'text': hit['text'],
            'x0': box[0],
            'top': box[1],
            'x1': box[2],
            'bottom': box[3],
            'width': box[2] - box[0],
            'height': box[3] - box[1],
            'object_type': 'text',
            'source': 'ocr',
            'confidence': hit['confidence'],
            # Add default font information to work with existing expectations
            'fontname': 'OCR-detected',
            'size': 10.0,
            'page_number': self.page.number
        }, self.page)
        created.append(elem)

        # Register on the page's words list to make it accessible via standard API
        if hasattr(self.page, '_elements') and self.page._elements is not None:
            if 'words' in self.page._elements:
                self.page._elements['words'].append(elem)
            else:
                self.page._elements['words'] = [elem]

    return created
|
991
|
+
|
992
|
+
def expand(self,
          left: float = 0,
          right: float = 0,
          top_expand: float = 0,  # Renamed to avoid conflict
          bottom_expand: float = 0,  # Renamed to avoid conflict
          width_factor: float = 1.0,
          height_factor: float = 1.0,
          # Keep original parameter names for backward compatibility
          top: float = None,
          bottom: float = None) -> 'Region':
    """
    Create a new region expanded from this one.

    Absolute amounts are applied to the edges first. If either factor differs
    from 1.0, the resulting box is then scaled about its center.

    Args:
        left: Amount to expand left edge
        right: Amount to expand right edge
        top_expand: Amount to expand top edge (upward)
        bottom_expand: Amount to expand bottom edge (downward)
        width_factor: Factor to multiply width by
        height_factor: Factor to multiply height by
        top: (DEPRECATED, use top_expand) Amount to expand top edge (upward);
            overrides top_expand when not None
        bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge
            (downward); overrides bottom_expand when not None

    Returns:
        New expanded Region (multi-page bookkeeping is copied over when this
        region spans pages)
    """
    # Deprecated names take precedence when supplied
    if top is not None:
        top_expand = top
    if bottom is not None:
        bottom_expand = bottom

    # Absolute expansion: top shrinks (moves up), bottom grows (moves down)
    x0 = self.x0 - left
    x1 = self.x1 + right
    y0 = self.top - top_expand
    y1 = self.bottom + bottom_expand

    if width_factor != 1.0 or height_factor != 1.0:
        width = x1 - x0
        height = y1 - y0

        # Spread the size change evenly on both sides to keep the center fixed
        half_dw = (width * width_factor - width) / 2
        half_dh = (height * height_factor - height) / 2

        x0 -= half_dw
        x1 += half_dw
        y0 -= half_dh
        y1 += half_dh

    expanded = Region(self.page, (x0, y0, x1, y1))

    # Copy multi-page properties if present
    if self._spans_pages:
        expanded._spans_pages = True
        expanded._multi_page_elements = self._multi_page_elements
        expanded._page_range = self._page_range
        expanded.start_element = self.start_element
        expanded.end_element = self.end_element

    return expanded
|
1068
|
+
|
1069
|
+
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
    """
    Get a section between two elements within this region.

    The region's elements are sorted by (top, x0); the section is the bounding
    box of the elements from the start marker to the end marker. A marker
    that is missing or not found in the region falls back to the first/last
    element respectively.

    Args:
        start_element: Element marking the start of the section
        end_element: Element marking the end of the section
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        Region representing the section. This region itself is returned when
        it has no elements, and a zero-size region at (0, 0, 0, 0) when the
        boundary adjustments leave no elements in range.
    """
    elements = self.get_elements()
    if not elements:
        return self

    # Reading order: top to bottom, then left to right
    elements.sort(key=lambda e: (e.top, e.x0))
    last = len(elements) - 1

    def position_of(marker, fallback):
        # Index of the marker in reading order, or the fallback when the
        # marker is absent / not part of this region
        if not marker:
            return fallback
        try:
            return elements.index(marker)
        except ValueError:
            return fallback

    start_idx = position_of(start_element, 0)
    end_idx = position_of(end_element, last)

    # Step past whichever boundary elements are to be left out
    if boundary_inclusion in ('none', 'end'):
        start_idx += 1
    if boundary_inclusion in ('none', 'start'):
        end_idx -= 1

    # Clamp to valid indexes
    start_idx = max(0, start_idx)
    end_idx = min(last, end_idx)

    # Nothing left between the boundaries
    if start_idx > end_idx:
        return Region(self.page, (0, 0, 0, 0))

    section_elements = elements[start_idx:end_idx+1]

    # Bounding box around the selected elements
    x0 = min(e.x0 for e in section_elements)
    top = min(e.top for e in section_elements)
    x1 = max(e.x1 for e in section_elements)
    bottom = max(e.bottom for e in section_elements)

    keep_start = boundary_inclusion in ('start', 'both')
    keep_end = boundary_inclusion in ('end', 'both')
    pixel_adjustment = 2.0  # Amount to adjust for avoiding boundary elements

    # Excluded start element sitting within 10 points of the section top:
    # pull the top edge down a little so the box does not touch it
    if start_element and not keep_start and start_idx > 0:
        if abs(top - start_element.bottom) < 10:
            top += pixel_adjustment

    # Excluded end element sitting within 10 points of the section bottom:
    # pull the bottom edge up a little
    if end_element and not keep_end and end_idx < last:
        if abs(bottom - end_element.top) < 10:
            bottom -= pixel_adjustment

    # If the nudges collapsed the box, fall back to the unadjusted extent
    if top >= bottom:
        top = min(e.top for e in section_elements)
        bottom = max(e.bottom for e in section_elements)

    section = Region(self.page, (x0, top, x1, bottom))
    section.start_element = start_element if keep_start else None
    section.end_element = end_element if keep_end else None

    return section
|
1165
|
+
|
1166
|
+
def get_sections(self, start_elements=None, end_elements=None, boundary_inclusion='both') -> List['Region']:
    """
    Get sections within this region based on start/end elements.

    Boundaries are located in the region's elements sorted by (top, x0).
    With end elements, each section runs from a start boundary to the next end
    boundary. Without end elements, each start boundary closes the previous
    section at the element just before it. A start boundary still open at the
    end is closed at the region's last element.

    Args:
        start_elements: Elements or selector string that mark the start of sections.
            Collection-like objects exposing ``.elements`` are unwrapped.
        end_elements: Elements or selector string that mark the end of sections
            (same forms as start_elements)
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none';
            passed through to ``get_section_between``

    Returns:
        List of Region objects representing the extracted sections; empty when
        no start elements are given. Boundary elements that are not among this
        region's elements are skipped.
    """
    # Resolve selector strings, then unwrap collections to plain lists
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements)
    if hasattr(start_elements, 'elements'):
        start_elements = start_elements.elements

    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements)
    if hasattr(end_elements, 'elements'):
        end_elements = end_elements.elements

    # If no start elements, return empty list
    if not start_elements:
        return []

    # Sort elements in reading order
    all_elements = self.get_elements()
    all_elements.sort(key=lambda e: (e.top, e.x0))

    # Locate every boundary in the sorted list. Starts are collected before
    # ends so the stable sort below keeps a start ahead of an end that shares
    # its index.
    section_boundaries = []
    for kind, markers in (('start', start_elements), ('end', end_elements or ())):
        for element in markers:
            try:
                idx = all_elements.index(element)
            except ValueError:
                # Element not in this region, skip
                continue
            section_boundaries.append({
                'index': idx,
                'element': element,
                'type': kind
            })

    # Sort boundaries by index (document order)
    section_boundaries.sort(key=lambda b: b['index'])

    sections = []
    current_start = None

    for boundary in section_boundaries:
        if boundary['type'] == 'start' and current_start is None:
            # Open a new section
            current_start = boundary

        elif boundary['type'] == 'end' and current_start is not None:
            # Close the open section at this end boundary
            sections.append(self.get_section_between(
                current_start['element'],
                boundary['element'],
                boundary_inclusion
            ))
            current_start = None

        elif boundary['type'] == 'start' and current_start is not None and not end_elements:
            # Splitting by starts only: close the open section just before
            # this boundary and open the next one here
            end_element = all_elements[boundary['index'] - 1] if boundary['index'] > 0 else None
            sections.append(self.get_section_between(
                current_start['element'],
                end_element,
                boundary_inclusion
            ))
            current_start = boundary

    # Handle the last section if a start is still open
    if current_start is not None:
        # Use the last element in the region as the end
        end_element = all_elements[-1] if all_elements else None
        sections.append(self.get_section_between(
            current_start['element'],
            end_element,
            boundary_inclusion
        ))

    return sections
|
1280
|
+
|
1281
|
+
def create_cells(self):
    """
    Create cell regions for a TATR-detected table.

    TATR row and column regions whose center point lies inside this table are
    collected; one cell region is created through ``self.page.create_region``
    for every (row, column) pair, row by row from top to bottom and, within a
    row, left to right. Each cell is tagged with ``source='derived'``,
    ``region_type='table-cell'`` and ``model='tatr'``.

    Returns:
        List of cell regions

    Raises:
        ValueError: If this region is not a table region detected by TATR
    """
    if self.region_type != 'table' or self.model != 'tatr':
        raise ValueError("Only works for TATR-detected table regions")

    def center_inside(candidate):
        # A row/column belongs to this table when its center point is inside it
        cx = (candidate.x0 + candidate.x1) / 2
        cy = (candidate.top + candidate.bottom) / 2
        return (self.x0 <= cx <= self.x1 and
                self.top <= cy <= self.bottom)

    all_rows = self.page.find_all('region[type=table-row][model=tatr]')
    all_columns = self.page.find_all('region[type=table-column][model=tatr]')

    table_rows = sorted((r for r in all_rows if center_inside(r)), key=lambda r: r.top)
    table_columns = sorted((c for c in all_columns if center_inside(c)), key=lambda c: c.x0)

    cells = []
    for row in table_rows:
        for column in table_columns:
            # Cell = horizontal extent of the column x vertical extent of the row
            cell = self.page.create_region(column.x0, row.top, column.x1, row.bottom)
            cell.source = 'derived'
            cell.region_type = 'table-cell'
            cell.model = 'tatr'
            cells.append(cell)

    return cells
|
1325
|
+
|
1326
|
+
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Ask a question about the region content using document QA.

    The QA engine is obtained from ``natural_pdf.qa.document_qa.get_qa_engine``
    and the question is delegated to its ``ask_pdf_region`` method. If an
    ImportError is raised anywhere in that process, a warning is logged and a
    "not found" result is returned instead of raising.

    Args:
        question: The question to ask about the region content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA (if None or empty, the engine's default is used)
        debug: Forwarded unchanged to the QA engine
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        Dictionary with answer details: {
            "answer": extracted text,
            "confidence": confidence score,
            "found": whether an answer was found,
            "page_num": page number,
            "region": reference to this region,
            "source_elements": list of elements that contain the answer (if found)
        }
        When QA is unavailable the dictionary holds only "answer" (empty),
        "confidence" (0.0), "error" and "found" (False).
    """
    try:
        from natural_pdf.qa.document_qa import get_qa_engine

        # Only name a model when the caller asked for one
        if model:
            qa_engine = get_qa_engine(model_name=model)
        else:
            qa_engine = get_qa_engine()

        return qa_engine.ask_pdf_region(
            self, question, min_confidence=min_confidence, debug=debug, **kwargs
        )
    except ImportError as import_error:
        import logging
        qa_logger = logging.getLogger("natural_pdf.elements.region")
        qa_logger.warning(f"QA functionality not available: {import_error}")

        unavailable = {
            "answer": "",
            "confidence": 0.0,
            "error": "QA functionality not available",
            "found": False
        }
        return unavailable
|