natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -1,24 +1,33 @@
|
|
1
1
|
"""
|
2
2
|
Base Element class for natural-pdf.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
6
|
+
|
5
7
|
from PIL import Image
|
6
8
|
|
7
9
|
if TYPE_CHECKING:
|
8
10
|
from natural_pdf.core.page import Page
|
9
|
-
from natural_pdf.elements.region import Region
|
10
11
|
from natural_pdf.elements.base import Element
|
11
12
|
from natural_pdf.elements.collections import ElementCollection
|
13
|
+
from natural_pdf.elements.region import Region
|
12
14
|
|
13
15
|
|
14
16
|
class DirectionalMixin:
|
15
17
|
"""
|
16
18
|
Mixin class providing directional methods for both Element and Region classes.
|
17
19
|
"""
|
18
|
-
|
19
|
-
def _direction(
|
20
|
-
|
21
|
-
|
20
|
+
|
21
|
+
def _direction(
|
22
|
+
self,
|
23
|
+
direction: str,
|
24
|
+
size: Optional[float] = None,
|
25
|
+
cross_size: str = "full",
|
26
|
+
include_element: bool = False,
|
27
|
+
until: Optional[str] = None,
|
28
|
+
include_endpoint: bool = True,
|
29
|
+
**kwargs,
|
30
|
+
) -> "Region":
|
22
31
|
"""
|
23
32
|
Protected helper method to create a region in a specified direction relative to this element/region.
|
24
33
|
|
@@ -34,11 +43,11 @@ class DirectionalMixin:
|
|
34
43
|
Returns:
|
35
44
|
Region object
|
36
45
|
"""
|
37
|
-
import math
|
46
|
+
import math # Use math.inf for infinity
|
38
47
|
|
39
|
-
is_horizontal = direction in (
|
40
|
-
is_positive = direction in (
|
41
|
-
pixel_offset = 1
|
48
|
+
is_horizontal = direction in ("left", "right")
|
49
|
+
is_positive = direction in ("right", "below") # right/below are positive directions
|
50
|
+
pixel_offset = 1 # Offset for excluding elements/endpoints
|
42
51
|
|
43
52
|
# 1. Determine initial boundaries based on direction and include_element
|
44
53
|
if is_horizontal:
|
@@ -47,38 +56,44 @@ class DirectionalMixin:
|
|
47
56
|
y1 = self.page.height if cross_size == "full" else self.bottom
|
48
57
|
|
49
58
|
# Initial primary boundaries (horizontal)
|
50
|
-
if is_positive:
|
59
|
+
if is_positive: # right
|
51
60
|
x0_initial = self.x0 if include_element else self.x1 + pixel_offset
|
52
|
-
x1_initial = self.x1
|
53
|
-
else:
|
54
|
-
x0_initial = self.x0
|
61
|
+
x1_initial = self.x1 # This edge moves
|
62
|
+
else: # left
|
63
|
+
x0_initial = self.x0 # This edge moves
|
55
64
|
x1_initial = self.x1 if include_element else self.x0 - pixel_offset
|
56
|
-
else:
|
65
|
+
else: # Vertical
|
57
66
|
# Initial cross-boundaries (horizontal)
|
58
67
|
x0 = 0 if cross_size == "full" else self.x0
|
59
68
|
x1 = self.page.width if cross_size == "full" else self.x1
|
60
69
|
|
61
70
|
# Initial primary boundaries (vertical)
|
62
|
-
if is_positive:
|
71
|
+
if is_positive: # below
|
63
72
|
y0_initial = self.top if include_element else self.bottom + pixel_offset
|
64
|
-
y1_initial = self.bottom
|
65
|
-
else:
|
66
|
-
y0_initial = self.top
|
73
|
+
y1_initial = self.bottom # This edge moves
|
74
|
+
else: # above
|
75
|
+
y0_initial = self.top # This edge moves
|
67
76
|
y1_initial = self.bottom if include_element else self.top - pixel_offset
|
68
77
|
|
69
78
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
70
79
|
if is_horizontal:
|
71
|
-
if is_positive:
|
72
|
-
x1_final = min(
|
80
|
+
if is_positive: # right
|
81
|
+
x1_final = min(
|
82
|
+
self.page.width,
|
83
|
+
x1_initial + (size if size is not None else (self.page.width - x1_initial)),
|
84
|
+
)
|
73
85
|
x0_final = x0_initial
|
74
|
-
else:
|
86
|
+
else: # left
|
75
87
|
x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
|
76
88
|
x1_final = x1_initial
|
77
|
-
else:
|
78
|
-
if is_positive:
|
79
|
-
y1_final = min(
|
89
|
+
else: # Vertical
|
90
|
+
if is_positive: # below
|
91
|
+
y1_final = min(
|
92
|
+
self.page.height,
|
93
|
+
y1_initial + (size if size is not None else (self.page.height - y1_initial)),
|
94
|
+
)
|
80
95
|
y0_final = y0_initial
|
81
|
-
else:
|
96
|
+
else: # above
|
82
97
|
y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
|
83
98
|
y1_final = y1_initial
|
84
99
|
|
@@ -89,16 +104,16 @@ class DirectionalMixin:
|
|
89
104
|
matches_in_direction = []
|
90
105
|
|
91
106
|
# Filter and sort matches based on direction
|
92
|
-
if direction ==
|
107
|
+
if direction == "above":
|
93
108
|
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
94
109
|
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
95
|
-
elif direction ==
|
110
|
+
elif direction == "below":
|
96
111
|
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
97
112
|
matches_in_direction.sort(key=lambda e: e.top)
|
98
|
-
elif direction ==
|
113
|
+
elif direction == "left":
|
99
114
|
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
100
115
|
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
101
|
-
elif direction ==
|
116
|
+
elif direction == "right":
|
102
117
|
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
103
118
|
matches_in_direction.sort(key=lambda e: e.x0)
|
104
119
|
|
@@ -107,25 +122,29 @@ class DirectionalMixin:
|
|
107
122
|
|
108
123
|
# Adjust the primary boundary based on the target
|
109
124
|
if is_horizontal:
|
110
|
-
if is_positive:
|
125
|
+
if is_positive: # right
|
111
126
|
x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
|
112
|
-
else:
|
127
|
+
else: # left
|
113
128
|
x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
|
114
|
-
else:
|
115
|
-
if is_positive:
|
129
|
+
else: # Vertical
|
130
|
+
if is_positive: # below
|
116
131
|
y1_final = target.bottom if include_endpoint else target.top - pixel_offset
|
117
|
-
else:
|
132
|
+
else: # above
|
118
133
|
y0_final = target.top if include_endpoint else target.bottom + pixel_offset
|
119
134
|
|
120
135
|
# Adjust cross boundaries if cross_size is 'element'
|
121
136
|
if cross_size == "element":
|
122
|
-
if is_horizontal:
|
123
|
-
target_y0 =
|
137
|
+
if is_horizontal: # Adjust y0, y1
|
138
|
+
target_y0 = (
|
139
|
+
target.top if include_endpoint else target.bottom
|
140
|
+
) # Use opposite boundary if excluding
|
124
141
|
target_y1 = target.bottom if include_endpoint else target.top
|
125
142
|
y0 = min(y0, target_y0)
|
126
143
|
y1 = max(y1, target_y1)
|
127
|
-
else:
|
128
|
-
target_x0 =
|
144
|
+
else: # Adjust x0, x1
|
145
|
+
target_x0 = (
|
146
|
+
target.x0 if include_endpoint else target.x1
|
147
|
+
) # Use opposite boundary if excluding
|
129
148
|
target_x1 = target.x1 if include_endpoint else target.x0
|
130
149
|
x0 = min(x0, target_x0)
|
131
150
|
x1 = max(x1, target_x1)
|
@@ -145,6 +164,7 @@ class DirectionalMixin:
|
|
145
164
|
|
146
165
|
# 5. Create and return appropriate object based on self type
|
147
166
|
from natural_pdf.elements.region import Region
|
167
|
+
|
148
168
|
result = Region(self.page, final_bbox)
|
149
169
|
result.source_element = self
|
150
170
|
result.includes_source = include_element
|
@@ -154,11 +174,18 @@ class DirectionalMixin:
|
|
154
174
|
|
155
175
|
return result
|
156
176
|
|
157
|
-
def above(
|
158
|
-
|
177
|
+
def above(
|
178
|
+
self,
|
179
|
+
height: Optional[float] = None,
|
180
|
+
width: str = "full",
|
181
|
+
include_element: bool = False,
|
182
|
+
until: Optional[str] = None,
|
183
|
+
include_endpoint: bool = True,
|
184
|
+
**kwargs,
|
185
|
+
) -> "Region":
|
159
186
|
"""
|
160
187
|
Select region above this element/region.
|
161
|
-
|
188
|
+
|
162
189
|
Args:
|
163
190
|
height: Height of the region above, in points
|
164
191
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -166,25 +193,32 @@ class DirectionalMixin:
|
|
166
193
|
until: Optional selector string to specify an upper boundary element
|
167
194
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
168
195
|
**kwargs: Additional parameters
|
169
|
-
|
196
|
+
|
170
197
|
Returns:
|
171
198
|
Region object representing the area above
|
172
199
|
"""
|
173
200
|
return self._direction(
|
174
|
-
direction=
|
201
|
+
direction="above",
|
175
202
|
size=height,
|
176
203
|
cross_size=width,
|
177
204
|
include_element=include_element,
|
178
205
|
until=until,
|
179
206
|
include_endpoint=include_endpoint,
|
180
|
-
**kwargs
|
207
|
+
**kwargs,
|
181
208
|
)
|
182
209
|
|
183
|
-
def below(
|
184
|
-
|
210
|
+
def below(
|
211
|
+
self,
|
212
|
+
height: Optional[float] = None,
|
213
|
+
width: str = "full",
|
214
|
+
include_element: bool = False,
|
215
|
+
until: Optional[str] = None,
|
216
|
+
include_endpoint: bool = True,
|
217
|
+
**kwargs,
|
218
|
+
) -> "Region":
|
185
219
|
"""
|
186
220
|
Select region below this element/region.
|
187
|
-
|
221
|
+
|
188
222
|
Args:
|
189
223
|
height: Height of the region below, in points
|
190
224
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -192,25 +226,32 @@ class DirectionalMixin:
|
|
192
226
|
until: Optional selector string to specify a lower boundary element
|
193
227
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
194
228
|
**kwargs: Additional parameters
|
195
|
-
|
229
|
+
|
196
230
|
Returns:
|
197
231
|
Region object representing the area below
|
198
232
|
"""
|
199
233
|
return self._direction(
|
200
|
-
direction=
|
234
|
+
direction="below",
|
201
235
|
size=height,
|
202
236
|
cross_size=width,
|
203
237
|
include_element=include_element,
|
204
238
|
until=until,
|
205
239
|
include_endpoint=include_endpoint,
|
206
|
-
**kwargs
|
240
|
+
**kwargs,
|
207
241
|
)
|
208
242
|
|
209
|
-
def left(
|
210
|
-
|
243
|
+
def left(
|
244
|
+
self,
|
245
|
+
width: Optional[float] = None,
|
246
|
+
height: str = "full",
|
247
|
+
include_element: bool = False,
|
248
|
+
until: Optional[str] = None,
|
249
|
+
include_endpoint: bool = True,
|
250
|
+
**kwargs,
|
251
|
+
) -> "Region":
|
211
252
|
"""
|
212
253
|
Select region to the left of this element/region.
|
213
|
-
|
254
|
+
|
214
255
|
Args:
|
215
256
|
width: Width of the region to the left, in points
|
216
257
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -218,25 +259,32 @@ class DirectionalMixin:
|
|
218
259
|
until: Optional selector string to specify a left boundary element
|
219
260
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
220
261
|
**kwargs: Additional parameters
|
221
|
-
|
262
|
+
|
222
263
|
Returns:
|
223
264
|
Region object representing the area to the left
|
224
265
|
"""
|
225
266
|
return self._direction(
|
226
|
-
direction=
|
267
|
+
direction="left",
|
227
268
|
size=width,
|
228
269
|
cross_size=height,
|
229
270
|
include_element=include_element,
|
230
271
|
until=until,
|
231
272
|
include_endpoint=include_endpoint,
|
232
|
-
**kwargs
|
273
|
+
**kwargs,
|
233
274
|
)
|
234
275
|
|
235
|
-
def right(
|
236
|
-
|
276
|
+
def right(
|
277
|
+
self,
|
278
|
+
width: Optional[float] = None,
|
279
|
+
height: str = "full",
|
280
|
+
include_element: bool = False,
|
281
|
+
until: Optional[str] = None,
|
282
|
+
include_endpoint: bool = True,
|
283
|
+
**kwargs,
|
284
|
+
) -> "Region":
|
237
285
|
"""
|
238
286
|
Select region to the right of this element/region.
|
239
|
-
|
287
|
+
|
240
288
|
Args:
|
241
289
|
width: Width of the region to the right, in points
|
242
290
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -244,33 +292,35 @@ class DirectionalMixin:
|
|
244
292
|
until: Optional selector string to specify a right boundary element
|
245
293
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
246
294
|
**kwargs: Additional parameters
|
247
|
-
|
295
|
+
|
248
296
|
Returns:
|
249
297
|
Region object representing the area to the right
|
250
298
|
"""
|
251
299
|
return self._direction(
|
252
|
-
direction=
|
300
|
+
direction="right",
|
253
301
|
size=width,
|
254
302
|
cross_size=height,
|
255
303
|
include_element=include_element,
|
256
304
|
until=until,
|
257
305
|
include_endpoint=include_endpoint,
|
258
|
-
**kwargs
|
306
|
+
**kwargs,
|
259
307
|
)
|
260
308
|
|
261
|
-
def expand(
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
309
|
+
def expand(
|
310
|
+
self,
|
311
|
+
left: float = 0,
|
312
|
+
right: float = 0,
|
313
|
+
top_expand: float = 0, # Renamed to avoid conflict
|
314
|
+
bottom_expand: float = 0, # Renamed to avoid conflict
|
315
|
+
width_factor: float = 1.0,
|
316
|
+
height_factor: float = 1.0,
|
317
|
+
# Keep original parameter names for backward compatibility
|
318
|
+
top: float = None,
|
319
|
+
bottom: float = None,
|
320
|
+
) -> "Region":
|
271
321
|
"""
|
272
322
|
Create a new region expanded from this element/region.
|
273
|
-
|
323
|
+
|
274
324
|
Args:
|
275
325
|
left: Amount to expand left edge (positive value expands leftwards)
|
276
326
|
right: Amount to expand right edge (positive value expands rightwards)
|
@@ -280,7 +330,7 @@ class DirectionalMixin:
|
|
280
330
|
height_factor: Factor to multiply height by (applied after absolute expansion)
|
281
331
|
top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
|
282
332
|
bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
|
283
|
-
|
333
|
+
|
284
334
|
Returns:
|
285
335
|
New expanded Region object
|
286
336
|
"""
|
@@ -289,39 +339,39 @@ class DirectionalMixin:
|
|
289
339
|
new_x1 = self.x1
|
290
340
|
new_top = self.top
|
291
341
|
new_bottom = self.bottom
|
292
|
-
|
342
|
+
|
293
343
|
# Handle the deprecated parameter names for backward compatibility
|
294
344
|
if top is not None:
|
295
345
|
top_expand = top
|
296
346
|
if bottom is not None:
|
297
347
|
bottom_expand = bottom
|
298
|
-
|
348
|
+
|
299
349
|
# Apply absolute expansions first
|
300
350
|
new_x0 -= left
|
301
351
|
new_x1 += right
|
302
352
|
new_top -= top_expand # Expand upward (decrease top coordinate)
|
303
353
|
new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
|
304
|
-
|
354
|
+
|
305
355
|
# Apply percentage factors if provided
|
306
356
|
if width_factor != 1.0 or height_factor != 1.0:
|
307
357
|
# Calculate center point *after* absolute expansion
|
308
358
|
center_x = (new_x0 + new_x1) / 2
|
309
359
|
center_y = (new_top + new_bottom) / 2
|
310
|
-
|
360
|
+
|
311
361
|
# Calculate current width and height *after* absolute expansion
|
312
362
|
current_width = new_x1 - new_x0
|
313
363
|
current_height = new_bottom - new_top
|
314
|
-
|
364
|
+
|
315
365
|
# Calculate new width and height
|
316
366
|
new_width = current_width * width_factor
|
317
367
|
new_height = current_height * height_factor
|
318
|
-
|
368
|
+
|
319
369
|
# Adjust coordinates based on the new dimensions, keeping the center
|
320
370
|
new_x0 = center_x - new_width / 2
|
321
371
|
new_x1 = center_x + new_width / 2
|
322
372
|
new_top = center_y - new_height / 2
|
323
373
|
new_bottom = center_y + new_height / 2
|
324
|
-
|
374
|
+
|
325
375
|
# Clamp coordinates to page boundaries
|
326
376
|
new_x0 = max(0, new_x0)
|
327
377
|
new_top = max(0, new_top)
|
@@ -329,124 +379,129 @@ class DirectionalMixin:
|
|
329
379
|
new_bottom = min(self.page.height, new_bottom)
|
330
380
|
|
331
381
|
# Ensure coordinates are valid (x0 <= x1, top <= bottom)
|
332
|
-
if new_x0 > new_x1:
|
333
|
-
|
382
|
+
if new_x0 > new_x1:
|
383
|
+
new_x0 = new_x1 = (new_x0 + new_x1) / 2
|
384
|
+
if new_top > new_bottom:
|
385
|
+
new_top = new_bottom = (new_top + new_bottom) / 2
|
334
386
|
|
335
387
|
# Create new region with expanded bbox
|
336
388
|
from natural_pdf.elements.region import Region
|
389
|
+
|
337
390
|
new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
|
338
|
-
|
391
|
+
|
339
392
|
return new_region
|
340
393
|
|
341
394
|
|
342
395
|
class Element(DirectionalMixin):
|
343
396
|
"""
|
344
397
|
Base class for all PDF elements.
|
345
|
-
|
398
|
+
|
346
399
|
This class provides common properties and methods for all PDF elements,
|
347
400
|
such as text, rectangles, lines, etc.
|
348
401
|
"""
|
349
|
-
|
350
|
-
def __init__(self, obj: Dict[str, Any], page:
|
402
|
+
|
403
|
+
def __init__(self, obj: Dict[str, Any], page: "Page"):
|
351
404
|
"""
|
352
405
|
Initialize base element.
|
353
|
-
|
406
|
+
|
354
407
|
Args:
|
355
408
|
obj: The underlying pdfplumber object
|
356
409
|
page: The parent Page object
|
357
410
|
"""
|
358
411
|
self._obj = obj
|
359
412
|
self._page = page
|
360
|
-
|
413
|
+
|
361
414
|
@property
|
362
415
|
def type(self) -> str:
|
363
416
|
"""Element type."""
|
364
|
-
return self._obj.get(
|
365
|
-
|
417
|
+
return self._obj.get("object_type", "unknown")
|
418
|
+
|
366
419
|
@property
|
367
420
|
def bbox(self) -> Tuple[float, float, float, float]:
|
368
421
|
"""Bounding box (x0, top, x1, bottom)."""
|
369
422
|
return (self.x0, self.top, self.x1, self.bottom)
|
370
|
-
|
423
|
+
|
371
424
|
@property
|
372
425
|
def x0(self) -> float:
|
373
426
|
"""Left x-coordinate."""
|
374
427
|
if self.has_polygon:
|
375
428
|
return min(pt[0] for pt in self.polygon)
|
376
|
-
return self._obj.get(
|
377
|
-
|
429
|
+
return self._obj.get("x0", 0)
|
430
|
+
|
378
431
|
@property
|
379
432
|
def top(self) -> float:
|
380
433
|
"""Top y-coordinate."""
|
381
434
|
if self.has_polygon:
|
382
435
|
return min(pt[1] for pt in self.polygon)
|
383
|
-
return self._obj.get(
|
384
|
-
|
436
|
+
return self._obj.get("top", 0)
|
437
|
+
|
385
438
|
@property
|
386
439
|
def x1(self) -> float:
|
387
440
|
"""Right x-coordinate."""
|
388
441
|
if self.has_polygon:
|
389
442
|
return max(pt[0] for pt in self.polygon)
|
390
|
-
return self._obj.get(
|
391
|
-
|
443
|
+
return self._obj.get("x1", 0)
|
444
|
+
|
392
445
|
@property
|
393
446
|
def bottom(self) -> float:
|
394
447
|
"""Bottom y-coordinate."""
|
395
448
|
if self.has_polygon:
|
396
449
|
return max(pt[1] for pt in self.polygon)
|
397
|
-
return self._obj.get(
|
398
|
-
|
450
|
+
return self._obj.get("bottom", 0)
|
451
|
+
|
399
452
|
@property
|
400
453
|
def width(self) -> float:
|
401
454
|
"""Element width."""
|
402
455
|
return self.x1 - self.x0
|
403
|
-
|
456
|
+
|
404
457
|
@property
|
405
458
|
def height(self) -> float:
|
406
459
|
"""Element height."""
|
407
460
|
return self.bottom - self.top
|
408
|
-
|
461
|
+
|
409
462
|
@property
|
410
463
|
def has_polygon(self) -> bool:
|
411
464
|
"""Check if this element has polygon coordinates."""
|
412
|
-
return (
|
413
|
-
|
465
|
+
return (
|
466
|
+
"polygon" in self._obj and self._obj["polygon"] and len(self._obj["polygon"]) >= 3
|
467
|
+
) or hasattr(self, "_polygon")
|
468
|
+
|
414
469
|
@property
|
415
470
|
def polygon(self) -> List[Tuple[float, float]]:
|
416
471
|
"""Get polygon coordinates if available, otherwise return rectangle corners."""
|
417
|
-
if hasattr(self,
|
472
|
+
if hasattr(self, "_polygon") and self._polygon:
|
418
473
|
return self._polygon
|
419
|
-
elif
|
420
|
-
return self._obj[
|
474
|
+
elif "polygon" in self._obj and self._obj["polygon"]:
|
475
|
+
return self._obj["polygon"]
|
421
476
|
else:
|
422
477
|
# Create rectangle corners as fallback
|
423
478
|
return [
|
424
|
-
(self._obj.get(
|
425
|
-
(self._obj.get(
|
426
|
-
(self._obj.get(
|
427
|
-
(self._obj.get(
|
479
|
+
(self._obj.get("x0", 0), self._obj.get("top", 0)), # top-left
|
480
|
+
(self._obj.get("x1", 0), self._obj.get("top", 0)), # top-right
|
481
|
+
(self._obj.get("x1", 0), self._obj.get("bottom", 0)), # bottom-right
|
482
|
+
(self._obj.get("x0", 0), self._obj.get("bottom", 0)), # bottom-left
|
428
483
|
]
|
429
|
-
|
484
|
+
|
430
485
|
def is_point_inside(self, x: float, y: float) -> bool:
|
431
486
|
"""
|
432
487
|
Check if a point is inside this element using ray casting algorithm for polygons.
|
433
|
-
|
488
|
+
|
434
489
|
Args:
|
435
490
|
x: X-coordinate to check
|
436
491
|
y: Y-coordinate to check
|
437
|
-
|
492
|
+
|
438
493
|
Returns:
|
439
494
|
True if the point is inside the element
|
440
495
|
"""
|
441
496
|
if not self.has_polygon:
|
442
497
|
# Use simple rectangle check
|
443
498
|
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
444
|
-
|
499
|
+
|
445
500
|
# Ray casting algorithm for complex polygons
|
446
501
|
poly = self.polygon
|
447
502
|
n = len(poly)
|
448
503
|
inside = False
|
449
|
-
|
504
|
+
|
450
505
|
p1x, p1y = poly[0]
|
451
506
|
for i in range(1, n + 1):
|
452
507
|
p2x, p2y = poly[i % n]
|
@@ -456,30 +511,36 @@ class Element(DirectionalMixin):
|
|
456
511
|
if p1x == p2x or x <= xinters:
|
457
512
|
inside = not inside
|
458
513
|
p1x, p1y = p2x, p2y
|
459
|
-
|
514
|
+
|
460
515
|
return inside
|
461
|
-
|
516
|
+
|
462
517
|
@property
|
463
|
-
def page(self) ->
|
518
|
+
def page(self) -> "Page":
|
464
519
|
"""Get the parent page."""
|
465
520
|
return self._page
|
466
|
-
|
467
|
-
def next(
|
521
|
+
|
522
|
+
def next(
|
523
|
+
self,
|
524
|
+
selector: Optional[str] = None,
|
525
|
+
limit: int = 10,
|
526
|
+
apply_exclusions: bool = True,
|
527
|
+
**kwargs,
|
528
|
+
) -> Optional["Element"]:
|
468
529
|
"""
|
469
530
|
Find next element in reading order.
|
470
|
-
|
531
|
+
|
471
532
|
Args:
|
472
533
|
selector: Optional selector to filter by
|
473
534
|
limit: Maximum number of elements to search through (default: 10)
|
474
535
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
475
536
|
**kwargs: Additional parameters
|
476
|
-
|
537
|
+
|
477
538
|
Returns:
|
478
539
|
Next element or None if not found
|
479
540
|
"""
|
480
541
|
# Get all elements in reading order
|
481
|
-
all_elements = self.page.find_all(
|
482
|
-
|
542
|
+
all_elements = self.page.find_all("*", apply_exclusions=apply_exclusions)
|
543
|
+
|
483
544
|
# Find our index in the list
|
484
545
|
try:
|
485
546
|
# Compare by object identity since bbox could match multiple elements
|
@@ -487,40 +548,47 @@ class Element(DirectionalMixin):
|
|
487
548
|
except StopIteration:
|
488
549
|
# If not found, it might have been filtered out by exclusions
|
489
550
|
return None
|
490
|
-
|
551
|
+
|
491
552
|
# Search for next matching element
|
492
553
|
if selector:
|
493
554
|
# Filter elements after this one
|
494
|
-
candidates = all_elements[idx+1:]
|
555
|
+
candidates = all_elements[idx + 1 :]
|
495
556
|
# Limit search range for performance
|
496
557
|
candidates = candidates[:limit] if limit else candidates
|
497
|
-
|
558
|
+
|
498
559
|
# Find matching elements
|
499
560
|
from natural_pdf.elements.collections import ElementCollection
|
561
|
+
|
500
562
|
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
501
563
|
return matches[0] if matches else None
|
502
564
|
elif idx + 1 < len(all_elements):
|
503
565
|
# No selector, just return the next element
|
504
566
|
return all_elements[idx + 1]
|
505
|
-
|
567
|
+
|
506
568
|
return None
|
507
|
-
|
508
|
-
def prev(
|
569
|
+
|
570
|
+
def prev(
|
571
|
+
self,
|
572
|
+
selector: Optional[str] = None,
|
573
|
+
limit: int = 10,
|
574
|
+
apply_exclusions: bool = True,
|
575
|
+
**kwargs,
|
576
|
+
) -> Optional["Element"]:
|
509
577
|
"""
|
510
578
|
Find previous element in reading order.
|
511
|
-
|
579
|
+
|
512
580
|
Args:
|
513
581
|
selector: Optional selector to filter by
|
514
582
|
limit: Maximum number of elements to search through (default: 10)
|
515
583
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
516
584
|
**kwargs: Additional parameters
|
517
|
-
|
585
|
+
|
518
586
|
Returns:
|
519
587
|
Previous element or None if not found
|
520
588
|
"""
|
521
589
|
# Get all elements in reading order
|
522
|
-
all_elements = self.page.find_all(
|
523
|
-
|
590
|
+
all_elements = self.page.find_all("*", apply_exclusions=apply_exclusions)
|
591
|
+
|
524
592
|
# Find our index in the list
|
525
593
|
try:
|
526
594
|
# Compare by object identity since bbox could match multiple elements
|
@@ -528,7 +596,7 @@ class Element(DirectionalMixin):
|
|
528
596
|
except StopIteration:
|
529
597
|
# If not found, it might have been filtered out by exclusions
|
530
598
|
return None
|
531
|
-
|
599
|
+
|
532
600
|
# Search for previous matching element
|
533
601
|
if selector:
|
534
602
|
# Select elements before this one
|
@@ -537,27 +605,34 @@ class Element(DirectionalMixin):
|
|
537
605
|
candidates = candidates[::-1]
|
538
606
|
# Limit search range for performance
|
539
607
|
candidates = candidates[:limit] if limit else candidates
|
540
|
-
|
608
|
+
|
541
609
|
# Find matching elements using ElementCollection
|
542
610
|
from natural_pdf.elements.collections import ElementCollection
|
611
|
+
|
543
612
|
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
544
|
-
return matches[0] if matches else None
|
613
|
+
return matches[0] if matches else None # find_all returns a collection
|
545
614
|
elif idx > 0:
|
546
615
|
# No selector, just return the previous element
|
547
616
|
return all_elements[idx - 1]
|
548
|
-
|
617
|
+
|
549
618
|
return None
|
550
|
-
|
551
|
-
def nearest(
|
619
|
+
|
620
|
+
def nearest(
|
621
|
+
self,
|
622
|
+
selector: str,
|
623
|
+
max_distance: Optional[float] = None,
|
624
|
+
apply_exclusions: bool = True,
|
625
|
+
**kwargs,
|
626
|
+
) -> Optional["Element"]:
|
552
627
|
"""
|
553
628
|
Find nearest element matching selector.
|
554
|
-
|
629
|
+
|
555
630
|
Args:
|
556
631
|
selector: CSS-like selector string
|
557
632
|
max_distance: Maximum distance to search (default: None = unlimited)
|
558
633
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
559
634
|
**kwargs: Additional parameters
|
560
|
-
|
635
|
+
|
561
636
|
Returns:
|
562
637
|
Nearest element or None if not found
|
563
638
|
"""
|
@@ -565,56 +640,59 @@ class Element(DirectionalMixin):
|
|
565
640
|
matches = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
566
641
|
if not matches:
|
567
642
|
return None
|
568
|
-
|
643
|
+
|
569
644
|
# Calculate distance to center point of this element
|
570
645
|
self_center_x = (self.x0 + self.x1) / 2
|
571
646
|
self_center_y = (self.top + self.bottom) / 2
|
572
|
-
|
647
|
+
|
573
648
|
# Calculate distances to each match
|
574
649
|
distances = []
|
575
650
|
for match in matches:
|
576
651
|
if match is self: # Skip self
|
577
652
|
continue
|
578
|
-
|
653
|
+
|
579
654
|
match_center_x = (match.x0 + match.x1) / 2
|
580
655
|
match_center_y = (match.top + match.bottom) / 2
|
581
|
-
|
656
|
+
|
582
657
|
# Euclidean distance
|
583
|
-
distance = (
|
584
|
-
|
585
|
-
|
658
|
+
distance = (
|
659
|
+
(match_center_x - self_center_x) ** 2 + (match_center_y - self_center_y) ** 2
|
660
|
+
) ** 0.5
|
661
|
+
|
586
662
|
# Filter by max_distance if specified
|
587
663
|
if max_distance is None or distance <= max_distance:
|
588
664
|
distances.append((match, distance))
|
589
|
-
|
665
|
+
|
590
666
|
# Sort by distance and return the closest
|
591
667
|
if distances:
|
592
668
|
distances.sort(key=lambda x: x[1])
|
593
669
|
return distances[0][0]
|
594
|
-
|
670
|
+
|
595
671
|
return None
|
596
|
-
|
597
|
-
def until(
|
672
|
+
|
673
|
+
def until(
|
674
|
+
self, selector: str, include_endpoint: bool = True, width: str = "element", **kwargs
|
675
|
+
) -> "Region":
|
598
676
|
"""
|
599
677
|
Select content from this element until matching selector.
|
600
|
-
|
678
|
+
|
601
679
|
Args:
|
602
680
|
selector: CSS-like selector string
|
603
681
|
include_endpoint: Whether to include the endpoint element in the region (default: True)
|
604
682
|
width: Width mode - "element" to use element widths or "full" for full page width
|
605
683
|
**kwargs: Additional selection parameters
|
606
|
-
|
684
|
+
|
607
685
|
Returns:
|
608
686
|
Region object representing the selected content
|
609
687
|
"""
|
610
688
|
from natural_pdf.elements.region import Region
|
611
|
-
|
689
|
+
|
612
690
|
# Find the target element
|
613
691
|
target = self.page.find(selector, **kwargs)
|
614
692
|
if not target:
|
615
693
|
# If target not found, return a region with just this element
|
616
694
|
return Region(self.page, self.bbox)
|
617
|
-
|
695
|
+
|
618
696
|
# Use full page width if requested
|
619
697
|
if width == "full":
|
620
698
|
x0 = 0
|
@@ -622,12 +700,16 @@ class Element(DirectionalMixin):
|
|
622
700
|
# Determine vertical bounds based on element positions
|
623
701
|
if target.top >= self.bottom: # Target is below this element
|
624
702
|
top = self.top
|
625
|
-
bottom =
|
703
|
+
bottom = (
|
704
|
+
target.bottom if include_endpoint else target.top - 1
|
705
|
+
) # Subtract 1 pixel when excluding
|
626
706
|
else: # Target is above this element
|
627
|
-
top =
|
707
|
+
top = (
|
708
|
+
target.top if include_endpoint else target.bottom + 1
|
709
|
+
) # Add 1 pixel when excluding
|
628
710
|
bottom = self.bottom
|
629
711
|
return Region(self.page, (x0, top, x1, bottom))
|
630
|
-
|
712
|
+
|
631
713
|
# Otherwise use element-based width
|
632
714
|
# Determine the correct order for creating the region
|
633
715
|
# If the target is below this element (normal reading order)
|
@@ -635,12 +717,16 @@ class Element(DirectionalMixin):
|
|
635
717
|
x0 = min(self.x0, target.x0 if include_endpoint else target.x1)
|
636
718
|
x1 = max(self.x1, target.x1 if include_endpoint else target.x0)
|
637
719
|
top = self.top
|
638
|
-
bottom =
|
720
|
+
bottom = (
|
721
|
+
target.bottom if include_endpoint else target.top - 1
|
722
|
+
) # Subtract 1 pixel when excluding
|
639
723
|
# If the target is above this element (reverse reading order)
|
640
724
|
elif target.bottom <= self.top:
|
641
725
|
x0 = min(self.x0, target.x0 if include_endpoint else target.x1)
|
642
726
|
x1 = max(self.x1, target.x1 if include_endpoint else target.x0)
|
643
|
-
top =
|
727
|
+
top = (
|
728
|
+
target.top if include_endpoint else target.bottom + 1
|
729
|
+
) # Add 1 pixel when excluding
|
644
730
|
bottom = self.bottom
|
645
731
|
# If they're side by side, use the horizontal version
|
646
732
|
elif target.x0 >= self.x1: # Target is to the right
|
@@ -653,47 +739,49 @@ class Element(DirectionalMixin):
|
|
653
739
|
x1 = self.x1
|
654
740
|
top = min(self.top, target.top if include_endpoint else target.bottom)
|
655
741
|
bottom = max(self.bottom, target.bottom if include_endpoint else target.top)
|
656
|
-
|
742
|
+
|
657
743
|
region = Region(self.page, (x0, top, x1, bottom))
|
658
744
|
region.source_element = self
|
659
745
|
region.end_element = target
|
660
746
|
return region
|
661
|
-
|
747
|
+
|
662
748
|
# Note: select_until method removed in favor of until()
|
663
|
-
|
749
|
+
|
664
750
|
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
|
665
751
|
"""
|
666
752
|
Extract text from this element.
|
667
|
-
|
753
|
+
|
668
754
|
Args:
|
669
755
|
preserve_whitespace: Whether to keep blank characters (default: True)
|
670
756
|
use_exclusions: Whether to apply exclusion regions (default: True)
|
671
757
|
**kwargs: Additional extraction parameters
|
672
|
-
|
758
|
+
|
673
759
|
Returns:
|
674
760
|
Extracted text as string
|
675
761
|
"""
|
676
762
|
# Default implementation - override in subclasses
|
677
763
|
return ""
|
678
|
-
|
764
|
+
|
679
765
|
# Note: extract_text_compat method removed
|
680
|
-
|
681
|
-
def highlight(
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
766
|
+
|
767
|
+
def highlight(
|
768
|
+
self,
|
769
|
+
label: Optional[str] = None,
|
770
|
+
color: Optional[Union[Tuple, str]] = None, # Allow string color
|
771
|
+
use_color_cycling: bool = False,
|
772
|
+
include_attrs: Optional[List[str]] = None,
|
773
|
+
existing: str = "append",
|
774
|
+
) -> "Element":
|
687
775
|
"""
|
688
776
|
Highlight this element on the page.
|
689
|
-
|
777
|
+
|
690
778
|
Args:
|
691
779
|
label: Optional label for the highlight
|
692
780
|
color: Color tuple/string for the highlight, or None to use automatic color
|
693
781
|
use_color_cycling: Force color cycling even with no label (default: False)
|
694
782
|
include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
695
783
|
existing: How to handle existing highlights - 'append' (default) or 'replace'
|
696
|
-
|
784
|
+
|
697
785
|
Returns:
|
698
786
|
Self for method chaining
|
699
787
|
"""
|
@@ -708,7 +796,7 @@ class Element(DirectionalMixin):
|
|
708
796
|
"use_color_cycling": use_color_cycling,
|
709
797
|
"element": self, # Pass the element itself so attributes can be accessed
|
710
798
|
"include_attrs": include_attrs,
|
711
|
-
"existing": existing
|
799
|
+
"existing": existing,
|
712
800
|
}
|
713
801
|
|
714
802
|
# Call the appropriate service method based on geometry
|
@@ -720,13 +808,15 @@ class Element(DirectionalMixin):
|
|
720
808
|
highlighter.add(**highlight_args)
|
721
809
|
|
722
810
|
return self
|
723
|
-
|
724
|
-
def show(
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
811
|
+
|
812
|
+
def show(
|
813
|
+
self,
|
814
|
+
scale: float = 2.0,
|
815
|
+
labels: bool = True,
|
816
|
+
legend_position: str = "right",
|
817
|
+
color: Optional[Union[Tuple, str]] = "red", # Default color for single element
|
818
|
+
label: Optional[str] = None,
|
819
|
+
) -> Optional["Image.Image"]:
|
730
820
|
"""
|
731
821
|
Show the page with only this element highlighted temporarily.
|
732
822
|
|
@@ -740,12 +830,12 @@ class Element(DirectionalMixin):
|
|
740
830
|
Returns:
|
741
831
|
PIL Image of the page with only this element highlighted, or None if error.
|
742
832
|
"""
|
743
|
-
if not hasattr(self,
|
833
|
+
if not hasattr(self, "page") or not self.page:
|
744
834
|
logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
|
745
835
|
return None
|
746
|
-
if not hasattr(self.page,
|
747
|
-
|
748
|
-
|
836
|
+
if not hasattr(self.page, "_highlighter") or not self.page._highlighter:
|
837
|
+
logger.warning(f"Cannot show element, page lacks highlighter service: {self}")
|
838
|
+
return None
|
749
839
|
|
750
840
|
service = self.page._highlighter
|
751
841
|
|
@@ -757,15 +847,15 @@ class Element(DirectionalMixin):
|
|
757
847
|
"page_index": self.page.index,
|
758
848
|
"bbox": self.bbox if not self.has_polygon else None,
|
759
849
|
"polygon": self.polygon if self.has_polygon else None,
|
760
|
-
"color": color,
|
850
|
+
"color": color, # Use provided or default color
|
761
851
|
"label": display_label,
|
762
|
-
"use_color_cycling": False
|
852
|
+
"use_color_cycling": False, # Explicitly false for single preview
|
763
853
|
}
|
764
854
|
|
765
855
|
# Check if we actually got geometry data
|
766
|
-
if temp_highlight_data[
|
767
|
-
|
768
|
-
|
856
|
+
if temp_highlight_data["bbox"] is None and temp_highlight_data["polygon"] is None:
|
857
|
+
logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
|
858
|
+
return None
|
769
859
|
|
770
860
|
# Use render_preview to show only this highlight
|
771
861
|
try:
|
@@ -774,49 +864,47 @@ class Element(DirectionalMixin):
|
|
774
864
|
temporary_highlights=[temp_highlight_data],
|
775
865
|
scale=scale,
|
776
866
|
labels=labels,
|
777
|
-
legend_position=legend_position
|
867
|
+
legend_position=legend_position,
|
778
868
|
)
|
779
869
|
except Exception as e:
|
780
870
|
logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
|
781
871
|
return None
|
782
|
-
|
783
|
-
def save(
|
784
|
-
|
785
|
-
|
786
|
-
labels: bool = True,
|
787
|
-
legend_position: str = 'right') -> None:
|
872
|
+
|
873
|
+
def save(
|
874
|
+
self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
|
875
|
+
) -> None:
|
788
876
|
"""
|
789
877
|
Save the page with this element highlighted to an image file.
|
790
|
-
|
878
|
+
|
791
879
|
Args:
|
792
880
|
filename: Path to save the image to
|
793
881
|
scale: Scale factor for rendering
|
794
882
|
labels: Whether to include a legend for labels
|
795
883
|
legend_position: Position of the legend
|
796
|
-
|
884
|
+
|
797
885
|
Returns:
|
798
886
|
Self for method chaining
|
799
887
|
"""
|
800
888
|
# Save the highlighted image
|
801
889
|
self.page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
|
802
890
|
return self
|
803
|
-
|
891
|
+
|
804
892
|
# Note: save_image method removed in favor of save()
|
805
|
-
|
893
|
+
|
806
894
|
def __repr__(self) -> str:
|
807
895
|
"""String representation of the element."""
|
808
896
|
return f"<{self.__class__.__name__} bbox={self.bbox}>"
|
809
897
|
|
810
|
-
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[
|
898
|
+
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
|
811
899
|
"""
|
812
900
|
Find first element within this element's bounds matching the selector.
|
813
901
|
Creates a temporary region to perform the search.
|
814
|
-
|
902
|
+
|
815
903
|
Args:
|
816
904
|
selector: CSS-like selector string
|
817
905
|
apply_exclusions: Whether to apply exclusion regions
|
818
906
|
**kwargs: Additional parameters for element filtering
|
819
|
-
|
907
|
+
|
820
908
|
Returns:
|
821
909
|
First matching element or None
|
822
910
|
"""
|
@@ -826,16 +914,16 @@ class Element(DirectionalMixin):
|
|
826
914
|
temp_region = Region(self.page, self.bbox)
|
827
915
|
return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
|
828
916
|
|
829
|
-
def find_all(self, selector: str, apply_exclusions=True, **kwargs) ->
|
917
|
+
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> "ElementCollection":
|
830
918
|
"""
|
831
919
|
Find all elements within this element's bounds matching the selector.
|
832
920
|
Creates a temporary region to perform the search.
|
833
|
-
|
921
|
+
|
834
922
|
Args:
|
835
923
|
selector: CSS-like selector string
|
836
924
|
apply_exclusions: Whether to apply exclusion regions
|
837
925
|
**kwargs: Additional parameters for element filtering
|
838
|
-
|
926
|
+
|
839
927
|
Returns:
|
840
928
|
ElementCollection with matching elements
|
841
929
|
"""
|
@@ -843,4 +931,4 @@ class Element(DirectionalMixin):
|
|
843
931
|
|
844
932
|
# Create a temporary region from this element's bounds
|
845
933
|
temp_region = Region(self.page, self.bbox)
|
846
|
-
return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
934
|
+
return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|