natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,761 @@
|
|
1
|
+
"""
|
2
|
+
Base Element class for natural-pdf.
|
3
|
+
"""
|
4
|
+
from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union, Tuple
|
5
|
+
from PIL import Image
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from natural_pdf.core.page import Page
|
9
|
+
from natural_pdf.elements.region import Region
|
10
|
+
from natural_pdf.elements.base import Element, DirectionalMixin
|
11
|
+
|
12
|
+
|
13
|
+
class DirectionalMixin:
    """
    Mixin class providing directional methods for both Element and Region classes.

    The methods read ``x0``, ``x1``, ``top`` and ``bottom`` from the host object
    and ``width``, ``height`` and ``find_all`` from its ``page``.
    """

    def _direction(self, direction: str, size: Optional[float] = None,
                   cross_size: str = "full", include_element: bool = False,
                   until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Protected helper method to create a region in a specified direction relative to this element/region.

        Args:
            direction: 'left', 'right', 'above', or 'below'
            size: Size in the primary direction (width for horizontal, height for vertical)
            cross_size: Size in the cross direction ('full' or 'element')
            include_element: Whether to include this element/region's area in the result
            until: Optional selector string to specify a boundary element
            include_endpoint: Whether to include the boundary element found by 'until'
            **kwargs: Additional parameters for the 'until' selector search

        Returns:
            Region object

        Raises:
            ValueError: If ``direction`` is not one of the four supported values.
        """
        # Reject unknown directions up front; previously any other string was
        # silently handled as if it were 'above'.
        if direction not in ('left', 'right', 'above', 'below'):
            raise ValueError(
                f"direction must be 'left', 'right', 'above' or 'below', got {direction!r}"
            )

        is_horizontal = direction in ('left', 'right')
        is_positive = direction in ('right', 'below')  # right/below grow towards larger coordinates
        pixel_offset = 1  # Gap used when the source element or the endpoint is excluded
        full_cross = cross_size == "full"

        # 1. Cross-axis extent, and 2. primary-axis extent (limited by 'size' or the page edge)
        if is_horizontal:
            y0 = 0 if full_cross else self.top
            y1 = self.page.height if full_cross else self.bottom

            if is_positive:  # right: the x1 edge moves
                x0_final = self.x0 if include_element else self.x1 + pixel_offset
                x1_initial = self.x1
                x1_final = min(self.page.width, x1_initial + (size if size is not None else (self.page.width - x1_initial)))
            else:  # left: the x0 edge moves
                x0_initial = self.x0
                x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
                x1_final = self.x1 if include_element else self.x0 - pixel_offset
        else:
            x0 = 0 if full_cross else self.x0
            x1 = self.page.width if full_cross else self.x1

            if is_positive:  # below: the bottom edge moves
                y0_final = self.top if include_element else self.bottom + pixel_offset
                y1_initial = self.bottom
                y1_final = min(self.page.height, y1_initial + (size if size is not None else (self.page.height - y1_initial)))
            else:  # above: the top edge moves
                y0_initial = self.top
                y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
                y1_final = self.bottom if include_element else self.top - pixel_offset

        # 3. Handle 'until': pick the closest match lying entirely in the requested direction.
        # min()/max() return the first extreme item, which is the same element the
        # former stable sort followed by [0] selected.
        target = None
        if until:
            all_matches = self.page.find_all(until, **kwargs)

            if direction == 'above':
                in_direction = [m for m in all_matches if m.bottom <= self.top]
                target = max(in_direction, key=lambda e: e.bottom, default=None)
            elif direction == 'below':
                in_direction = [m for m in all_matches if m.top >= self.bottom]
                target = min(in_direction, key=lambda e: e.top, default=None)
            elif direction == 'left':
                in_direction = [m for m in all_matches if m.x1 <= self.x0]
                target = max(in_direction, key=lambda e: e.x1, default=None)
            else:  # right
                in_direction = [m for m in all_matches if m.x0 >= self.x1]
                target = min(in_direction, key=lambda e: e.x0, default=None)

            if target is not None:
                # Move the primary boundary onto (or just short of) the target
                if is_horizontal:
                    if is_positive:
                        x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
                    else:
                        x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
                else:
                    if is_positive:
                        y1_final = target.bottom if include_endpoint else target.top - pixel_offset
                    else:
                        y0_final = target.top if include_endpoint else target.bottom + pixel_offset

                # With element-sized cross extent, widen the cross range using the target;
                # when the endpoint is excluded the opposite target edges are used.
                if cross_size == "element":
                    if is_horizontal:
                        target_y0 = target.top if include_endpoint else target.bottom
                        target_y1 = target.bottom if include_endpoint else target.top
                        y0 = min(y0, target_y0)
                        y1 = max(y1, target_y1)
                    else:
                        target_x0 = target.x0 if include_endpoint else target.x1
                        target_x1 = target.x1 if include_endpoint else target.x0
                        x0 = min(x0, target_x0)
                        x1 = max(x1, target_x1)

        # 4. Finalize bbox coordinates, normalised so that x0 <= x1 and y0 <= y1
        if is_horizontal:
            bbox = (x0_final, y0, x1_final, y1)
        else:
            bbox = (x0, y0_final, x1, y1_final)

        final_bbox = (
            min(bbox[0], bbox[2]),
            min(bbox[1], bbox[3]),
            max(bbox[0], bbox[2]),
            max(bbox[1], bbox[3]),
        )

        # 5. Build the region and record where it came from
        from natural_pdf.elements.region import Region
        result = Region(self.page, final_bbox)
        result.source_element = self
        result.includes_source = include_element
        if target is not None:
            result.boundary_element = target

        return result

    def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region above this element/region.

        Args:
            height: Height of the region above, in points
            width: Width mode - "full" for full page width or "element" for element width
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify an upper boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters

        Returns:
            Region object representing the area above
        """
        return self._direction(
            direction='above',
            size=height,
            cross_size=width,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )

    def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region below this element/region.

        Args:
            height: Height of the region below, in points
            width: Width mode - "full" for full page width or "element" for element width
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify a lower boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters

        Returns:
            Region object representing the area below
        """
        return self._direction(
            direction='below',
            size=height,
            cross_size=width,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )

    def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region to the left of this element/region.

        Args:
            width: Width of the region to the left, in points
            height: Height mode - "full" for full page height or "element" for element height
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify a left boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters

        Returns:
            Region object representing the area to the left
        """
        return self._direction(
            direction='left',
            size=width,
            cross_size=height,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )

    def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region to the right of this element/region.

        Args:
            width: Width of the region to the right, in points
            height: Height mode - "full" for full page height or "element" for element height
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify a right boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters

        Returns:
            Region object representing the area to the right
        """
        return self._direction(
            direction='right',
            size=width,
            cross_size=height,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )
|
259
|
+
|
260
|
+
|
261
|
+
class Element(DirectionalMixin):
|
262
|
+
"""
|
263
|
+
Base class for all PDF elements.
|
264
|
+
|
265
|
+
This class provides common properties and methods for all PDF elements,
|
266
|
+
such as text, rectangles, lines, etc.
|
267
|
+
"""
|
268
|
+
|
269
|
+
def __init__(self, obj: Dict[str, Any], page: 'Page'):
    """
    Store the raw object and its owning page on the instance.

    Args:
        obj: The underlying pdfplumber object
        page: The parent Page object
    """
    # Both references are kept as-is; nothing is copied or validated here.
    self._obj, self._page = obj, page
|
279
|
+
|
280
|
+
@property
def type(self) -> str:
    """Element type: the ``object_type`` entry of the raw object, or 'unknown' if absent."""
    raw = self._obj
    return raw.get('object_type', 'unknown')
|
284
|
+
|
285
|
+
@property
def bbox(self) -> Tuple[float, float, float, float]:
    """Bounding box as the tuple (x0, top, x1, bottom)."""
    left_edge = self.x0
    upper_edge = self.top
    right_edge = self.x1
    lower_edge = self.bottom
    return (left_edge, upper_edge, right_edge, lower_edge)
|
289
|
+
|
290
|
+
@property
def x0(self) -> float:
    """Left x-coordinate: smallest polygon x if a polygon exists, else the raw 'x0' (default 0)."""
    if not self.has_polygon:
        return self._obj.get('x0', 0)
    xs = [vertex[0] for vertex in self.polygon]
    return min(xs)
|
296
|
+
|
297
|
+
@property
def top(self) -> float:
    """Top y-coordinate: smallest polygon y if a polygon exists, else the raw 'top' (default 0)."""
    if not self.has_polygon:
        return self._obj.get('top', 0)
    ys = [vertex[1] for vertex in self.polygon]
    return min(ys)
|
303
|
+
|
304
|
+
@property
def x1(self) -> float:
    """Right x-coordinate: largest polygon x if a polygon exists, else the raw 'x1' (default 0)."""
    if not self.has_polygon:
        return self._obj.get('x1', 0)
    xs = [vertex[0] for vertex in self.polygon]
    return max(xs)
|
310
|
+
|
311
|
+
@property
def bottom(self) -> float:
    """Bottom y-coordinate: largest polygon y if a polygon exists, else the raw 'bottom' (default 0)."""
    if not self.has_polygon:
        return self._obj.get('bottom', 0)
    ys = [vertex[1] for vertex in self.polygon]
    return max(ys)
|
317
|
+
|
318
|
+
@property
def width(self) -> float:
    """Element width, computed as x1 minus x0."""
    right_edge = self.x1
    left_edge = self.x0
    return right_edge - left_edge
|
322
|
+
|
323
|
+
@property
def height(self) -> float:
    """Element height, computed as bottom minus top."""
    lower_edge = self.bottom
    upper_edge = self.top
    return lower_edge - upper_edge
|
327
|
+
|
328
|
+
@property
def has_polygon(self) -> bool:
    """
    Check if this element has polygon coordinates.

    True when the raw object carries a non-empty 'polygon' with at least three
    points, or when the instance has a ``_polygon`` attribute. The attribute's
    value is not inspected, so even an empty ``_polygon`` counts.
    """
    raw = self._obj
    if 'polygon' in raw:
        points = raw['polygon']
        if points and len(points) >= 3:
            return True
    return hasattr(self, '_polygon')
|
332
|
+
|
333
|
+
@property
def polygon(self) -> List[Tuple[float, float]]:
    """
    Get polygon coordinates if available, otherwise return rectangle corners.

    Lookup order: a truthy ``_polygon`` attribute, then a truthy 'polygon' entry
    in the raw object, then the four corners built from the raw 'x0', 'top',
    'x1' and 'bottom' values (each defaulting to 0).
    """
    own = getattr(self, '_polygon', None)
    if own:
        return own

    raw = self._obj
    stored = raw.get('polygon')
    if stored:
        return stored

    # Fallback: rectangle corners, clockwise from the top-left
    left, right = raw.get('x0', 0), raw.get('x1', 0)
    upper, lower = raw.get('top', 0), raw.get('bottom', 0)
    return [(left, upper), (right, upper), (right, lower), (left, lower)]
|
348
|
+
|
349
|
+
def is_point_inside(self, x: float, y: float) -> bool:
    """
    Check if a point is inside this element using ray casting algorithm for polygons.

    Without a polygon the test is an inclusive bounding-box check.

    Args:
        x: X-coordinate to check
        y: Y-coordinate to check

    Returns:
        True if the point is inside the element
    """
    if not self.has_polygon:
        within_x = self.x0 <= x <= self.x1
        return within_x and (self.top <= y <= self.bottom)

    vertices = self.polygon
    count = len(vertices)
    crossings_odd = False

    # Walk every edge (vertex i -> vertex i+1, wrapping at the end) and toggle
    # the flag each time the edge is crossed by the ray tested below.
    for i in range(count):
        ax, ay = vertices[i]
        bx, by = vertices[(i + 1) % count]

        # The half-open y-range check also guarantees ay != by below.
        if not (min(ay, by) < y <= max(ay, by)):
            continue
        if x > max(ax, bx):
            continue

        if ax == bx or x <= (y - ay) * (bx - ax) / (by - ay) + ax:
            crossings_odd = not crossings_odd

    return crossings_odd
|
380
|
+
|
381
|
+
@property
def page(self) -> 'Page':
    """Get the parent page (the object stored in ``_page``)."""
    owner = self._page
    return owner
|
385
|
+
|
386
|
+
def next(self, selector: Optional[str] = None, limit: int = 10, apply_exclusions: bool = True, **kwargs) -> Optional['Element']:
    """
    Find next element in reading order.

    Args:
        selector: Optional selector to filter by
        limit: Maximum number of following elements to search through when a
            selector is given (default: 10); a falsy value disables the cap
        apply_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional parameters forwarded to the page's element filter

    Returns:
        Next element or None if not found
    """
    ordered = self.page.find_all('*', apply_exclusions=apply_exclusions)

    # Locate this element by identity, since a bbox could match several elements.
    position = None
    for i, candidate in enumerate(ordered):
        if candidate is self:
            position = i
            break
    if position is None:
        # Not in the list, e.g. filtered out by exclusions
        return None

    if not selector:
        following = position + 1
        return ordered[following] if following < len(ordered) else None

    # Only the elements after this one are considered, capped by 'limit'
    tail = ordered[position + 1:]
    if limit:
        tail = tail[:limit]

    hits = self.page.filter_elements(tail, selector, **kwargs)
    return hits[0] if hits else None
|
425
|
+
|
426
|
+
def prev(self, selector: Optional[str] = None, limit: int = 10, apply_exclusions: bool = True, **kwargs) -> Optional['Element']:
    """
    Find previous element in reading order.

    Args:
        selector: Optional selector to filter by
        limit: Maximum number of preceding elements to search through when a
            selector is given (default: 10); a falsy value disables the cap
        apply_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional parameters forwarded to the page's element filter

    Returns:
        Previous element or None if not found
    """
    ordered = self.page.find_all('*', apply_exclusions=apply_exclusions)

    # Locate this element by identity, since a bbox could match several elements.
    position = None
    for i, candidate in enumerate(ordered):
        if candidate is self:
            position = i
            break
    if position is None:
        # Not in the list, e.g. filtered out by exclusions
        return None

    if not selector:
        return ordered[position - 1] if position > 0 else None

    # Preceding elements, nearest first, capped by 'limit'
    nearest_first = ordered[:position][::-1]
    if limit:
        nearest_first = nearest_first[:limit]

    hits = self.page.filter_elements(nearest_first, selector, **kwargs)
    return hits[0] if hits else None
|
467
|
+
|
468
|
+
def nearest(self, selector: str, max_distance: Optional[float] = None, apply_exclusions: bool = True, **kwargs) -> Optional['Element']:
    """
    Find nearest element matching selector.

    Distance is measured between bounding-box centres (Euclidean).

    Args:
        selector: CSS-like selector string
        max_distance: Maximum distance to search (default: None = unlimited)
        apply_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional parameters

    Returns:
        Nearest element or None if not found
    """
    found = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
    if not found:
        return None

    cx = (self.x0 + self.x1) / 2
    cy = (self.top + self.bottom) / 2

    scored = []
    for other in found:
        if other is self:
            # Never report the element itself
            continue
        ox = (other.x0 + other.x1) / 2
        oy = (other.top + other.bottom) / 2
        gap = ((ox - cx) ** 2 + (oy - cy) ** 2) ** 0.5
        if max_distance is not None and gap > max_distance:
            continue
        scored.append((other, gap))

    if not scored:
        return None
    # min() keeps the first of equally distant candidates
    closest, _ = min(scored, key=lambda pair: pair[1])
    return closest
|
513
|
+
|
514
|
+
def until(self, selector: str, include_endpoint: bool = True, width: str = "element", **kwargs) -> 'Region':
    """
    Select content from this element until matching selector.

    Args:
        selector: CSS-like selector string
        include_endpoint: Whether to include the endpoint element in the region (default: True)
        width: Width mode - "element" to use element widths or "full" for full page width
        **kwargs: Additional selection parameters

    Returns:
        Region object representing the selected content. If nothing matches the
        selector, the region covers just this element's bbox.
    """
    from natural_pdf.elements.region import Region

    # NOTE(review): only the element-width result gets source_element/end_element
    # attached; the not-found and full-width results do not. Confirm this is intended.
    found = self.page.find(selector, **kwargs)
    if not found:
        return Region(self.page, self.bbox)

    found_is_below = found.top >= self.bottom

    if width == "full":
        # Full page width: only the vertical extent depends on the target.
        # Anything that is not strictly below is handled as "above".
        if found_is_below:
            upper = self.top
            lower = found.bottom if include_endpoint else found.top - 1  # stop 1 short when excluding
        else:
            upper = found.top if include_endpoint else found.bottom + 1  # start 1 past when excluding
            lower = self.bottom
        return Region(self.page, (0, upper, self.page.width, lower))

    # Cross-axis target edges; when the endpoint is excluded the opposite edges are used
    if include_endpoint:
        near_x, far_x = found.x0, found.x1
        near_y, far_y = found.top, found.bottom
    else:
        near_x, far_x = found.x1, found.x0
        near_y, far_y = found.bottom, found.top

    if found_is_below:
        # Target below this element (normal reading order)
        box = (
            min(self.x0, near_x),
            self.top,
            max(self.x1, far_x),
            found.bottom if include_endpoint else found.top - 1,
        )
    elif found.bottom <= self.top:
        # Target above this element (reverse reading order)
        box = (
            min(self.x0, near_x),
            found.top if include_endpoint else found.bottom + 1,
            max(self.x1, far_x),
            self.bottom,
        )
    elif found.x0 >= self.x1:
        # Side by side, target to the right
        box = (self.x0, min(self.top, near_y), far_x, max(self.bottom, far_y))
    else:
        # Side by side, target to the left
        box = (near_x, min(self.top, near_y), self.x1, max(self.bottom, far_y))

    span = Region(self.page, box)
    span.source_element = self
    span.end_element = found
    return span
|
578
|
+
|
579
|
+
# Note: select_until method removed in favor of until()
|
580
|
+
|
581
|
+
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
    """
    Extract text from this element.

    This base implementation ignores every argument and always yields an
    empty string; subclasses are expected to override it.

    Args:
        preserve_whitespace: Whether to keep blank characters (default: True)
        use_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional extraction parameters

    Returns:
        Extracted text as string
    """
    return ""
|
595
|
+
|
596
|
+
# Note: extract_text_compat method removed
|
597
|
+
|
598
|
+
def highlight(self,
              label: Optional[str] = None,
              color: Optional[Union[Tuple, str]] = None,  # Allow string color
              use_color_cycling: bool = False,
              include_attrs: Optional[List[str]] = None,
              existing: str = 'append') -> 'Element':
    """
    Highlight this element on the page.

    The request is handed to the page's highlighter: ``add_polygon`` when the
    element has polygon geometry, ``add`` with the bbox otherwise.

    Args:
        label: Optional label for the highlight
        color: Color tuple/string for the highlight, or None to use automatic color
        use_color_cycling: Force color cycling even with no label (default: False)
        include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
        existing: How to handle existing highlights - 'append' (default) or 'replace'

    Returns:
        Self for method chaining
    """
    service = self.page._highlighter

    # Arguments shared by both geometry variants
    shared = dict(
        page_index=self.page.index,
        color=color,
        label=label,
        use_color_cycling=use_color_cycling,
        element=self,  # lets the highlighter read attributes off the element
        include_attrs=include_attrs,
        existing=existing,
    )

    if self.has_polygon:
        service.add_polygon(**shared, polygon=self.polygon)
    else:
        service.add(**shared, bbox=self.bbox)

    return self
|
640
|
+
|
641
|
+
def show(self,
         scale: float = 2.0,
         labels: bool = True,
         legend_position: str = 'right',
         color: Optional[Union[Tuple, str]] = "red",  # Default color for single element
         label: Optional[str] = None) -> Optional['Image.Image']:
    """
    Show the page with only this element highlighted temporarily.

    Args:
        scale: Scale factor for rendering
        labels: Whether to include a legend for the highlight
        legend_position: Position of the legend
        color: Color to highlight this element (default: red)
        label: Optional label for this element in the legend

    Returns:
        PIL Image of the page with only this element highlighted, or None if error.
    """
    # This module defines no module-level logger, so the warning/error paths
    # below used to fail with NameError. Obtain one locally instead.
    import logging
    logger = logging.getLogger(__name__)

    if not hasattr(self, 'page') or not self.page:
        logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
        return None
    if not hasattr(self.page, '_highlighter') or not self.page._highlighter:
        logger.warning(f"Cannot show element, page lacks highlighter service: {self}")
        return None

    service = self.page._highlighter

    # Fall back to the class name when no label is given
    display_label = label if label is not None else f"{self.__class__.__name__}"

    # Temporary highlight for just this element; exactly one of bbox/polygon is set
    use_polygon = self.has_polygon
    temp_highlight_data = {
        "page_index": self.page.index,
        "bbox": self.bbox if not use_polygon else None,
        "polygon": self.polygon if use_polygon else None,
        "color": color,
        "label": display_label,
        "use_color_cycling": False  # Explicitly false for single preview
    }

    # Check if we actually got geometry data
    if temp_highlight_data['bbox'] is None and temp_highlight_data['polygon'] is None:
        logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
        return None

    # Rendering is best-effort: any failure is logged and reported as None
    try:
        return service.render_preview(
            page_index=self.page.index,
            temporary_highlights=[temp_highlight_data],
            scale=scale,
            labels=labels,
            legend_position=legend_position
        )
    except Exception as e:
        logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
        return None
|
699
|
+
|
700
|
+
def save(self,
         filename: str,
         scale: float = 2.0,
         labels: bool = True,
         legend_position: str = 'right') -> 'Element':
    """
    Save an image of this element's page to a file.

    The call is delegated to the page's ``save_image``; this method adds no
    highlight of its own, so the image presumably shows whatever highlights
    the page already has (verify against ``Page.save_image``).

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        labels: Whether to include a legend for labels
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    self.page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
    return self
|
720
|
+
|
721
|
+
# Note: save_image method removed in favor of save()
|
722
|
+
|
723
|
+
def __repr__(self) -> str:
    """String representation: the class name followed by the bounding box."""
    kind = self.__class__.__name__
    box = self.bbox
    return f"<{kind} bbox={box}>"
|
726
|
+
|
727
|
+
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
    """
    Find first element within this element's bounds matching the selector.

    A throwaway Region covering this element's bbox is built and the search
    is delegated to it.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for element filtering

    Returns:
        First matching element or None
    """
    from natural_pdf.elements.region import Region

    bounds = Region(self.page, self.bbox)
    return bounds.find(selector, apply_exclusions=apply_exclusions, **kwargs)
|
744
|
+
|
745
|
+
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
    """
    Find all elements within this element's bounds matching the selector.

    A throwaway Region covering this element's bbox is built and the search
    is delegated to it.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for element filtering

    Returns:
        ElementCollection with matching elements
    """
    from natural_pdf.elements.region import Region

    bounds = Region(self.page, self.bbox)
    return bounds.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|