natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -1,24 +1,33 @@
|
|
1
1
|
"""
|
2
2
|
Base Element class for natural-pdf.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
6
|
+
|
5
7
|
from PIL import Image
|
6
8
|
|
7
9
|
if TYPE_CHECKING:
|
8
10
|
from natural_pdf.core.page import Page
|
9
|
-
from natural_pdf.elements.region import Region
|
10
11
|
from natural_pdf.elements.base import Element
|
11
12
|
from natural_pdf.elements.collections import ElementCollection
|
13
|
+
from natural_pdf.elements.region import Region
|
12
14
|
|
13
15
|
|
14
16
|
class DirectionalMixin:
|
15
17
|
"""
|
16
18
|
Mixin class providing directional methods for both Element and Region classes.
|
17
19
|
"""
|
18
|
-
|
19
|
-
def _direction(
|
20
|
-
|
21
|
-
|
20
|
+
|
21
|
+
def _direction(
|
22
|
+
self,
|
23
|
+
direction: str,
|
24
|
+
size: Optional[float] = None,
|
25
|
+
cross_size: str = "full",
|
26
|
+
include_element: bool = False,
|
27
|
+
until: Optional[str] = None,
|
28
|
+
include_endpoint: bool = True,
|
29
|
+
**kwargs,
|
30
|
+
) -> "Region":
|
22
31
|
"""
|
23
32
|
Protected helper method to create a region in a specified direction relative to this element/region.
|
24
33
|
|
@@ -34,11 +43,11 @@ class DirectionalMixin:
|
|
34
43
|
Returns:
|
35
44
|
Region object
|
36
45
|
"""
|
37
|
-
import math
|
46
|
+
import math # Use math.inf for infinity
|
38
47
|
|
39
|
-
is_horizontal = direction in (
|
40
|
-
is_positive = direction in (
|
41
|
-
pixel_offset = 1
|
48
|
+
is_horizontal = direction in ("left", "right")
|
49
|
+
is_positive = direction in ("right", "below") # right/below are positive directions
|
50
|
+
pixel_offset = 1 # Offset for excluding elements/endpoints
|
42
51
|
|
43
52
|
# 1. Determine initial boundaries based on direction and include_element
|
44
53
|
if is_horizontal:
|
@@ -47,38 +56,44 @@ class DirectionalMixin:
|
|
47
56
|
y1 = self.page.height if cross_size == "full" else self.bottom
|
48
57
|
|
49
58
|
# Initial primary boundaries (horizontal)
|
50
|
-
if is_positive:
|
59
|
+
if is_positive: # right
|
51
60
|
x0_initial = self.x0 if include_element else self.x1 + pixel_offset
|
52
|
-
x1_initial = self.x1
|
53
|
-
else:
|
54
|
-
x0_initial = self.x0
|
61
|
+
x1_initial = self.x1 # This edge moves
|
62
|
+
else: # left
|
63
|
+
x0_initial = self.x0 # This edge moves
|
55
64
|
x1_initial = self.x1 if include_element else self.x0 - pixel_offset
|
56
|
-
else:
|
65
|
+
else: # Vertical
|
57
66
|
# Initial cross-boundaries (horizontal)
|
58
67
|
x0 = 0 if cross_size == "full" else self.x0
|
59
68
|
x1 = self.page.width if cross_size == "full" else self.x1
|
60
69
|
|
61
70
|
# Initial primary boundaries (vertical)
|
62
|
-
if is_positive:
|
71
|
+
if is_positive: # below
|
63
72
|
y0_initial = self.top if include_element else self.bottom + pixel_offset
|
64
|
-
y1_initial = self.bottom
|
65
|
-
else:
|
66
|
-
y0_initial = self.top
|
73
|
+
y1_initial = self.bottom # This edge moves
|
74
|
+
else: # above
|
75
|
+
y0_initial = self.top # This edge moves
|
67
76
|
y1_initial = self.bottom if include_element else self.top - pixel_offset
|
68
77
|
|
69
78
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
70
79
|
if is_horizontal:
|
71
|
-
if is_positive:
|
72
|
-
x1_final = min(
|
80
|
+
if is_positive: # right
|
81
|
+
x1_final = min(
|
82
|
+
self.page.width,
|
83
|
+
x1_initial + (size if size is not None else (self.page.width - x1_initial)),
|
84
|
+
)
|
73
85
|
x0_final = x0_initial
|
74
|
-
else:
|
86
|
+
else: # left
|
75
87
|
x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
|
76
88
|
x1_final = x1_initial
|
77
|
-
else:
|
78
|
-
if is_positive:
|
79
|
-
y1_final = min(
|
89
|
+
else: # Vertical
|
90
|
+
if is_positive: # below
|
91
|
+
y1_final = min(
|
92
|
+
self.page.height,
|
93
|
+
y1_initial + (size if size is not None else (self.page.height - y1_initial)),
|
94
|
+
)
|
80
95
|
y0_final = y0_initial
|
81
|
-
else:
|
96
|
+
else: # above
|
82
97
|
y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
|
83
98
|
y1_final = y1_initial
|
84
99
|
|
@@ -89,16 +104,16 @@ class DirectionalMixin:
|
|
89
104
|
matches_in_direction = []
|
90
105
|
|
91
106
|
# Filter and sort matches based on direction
|
92
|
-
if direction ==
|
107
|
+
if direction == "above":
|
93
108
|
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
94
109
|
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
95
|
-
elif direction ==
|
110
|
+
elif direction == "below":
|
96
111
|
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
97
112
|
matches_in_direction.sort(key=lambda e: e.top)
|
98
|
-
elif direction ==
|
113
|
+
elif direction == "left":
|
99
114
|
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
100
115
|
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
101
|
-
elif direction ==
|
116
|
+
elif direction == "right":
|
102
117
|
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
103
118
|
matches_in_direction.sort(key=lambda e: e.x0)
|
104
119
|
|
@@ -107,25 +122,29 @@ class DirectionalMixin:
|
|
107
122
|
|
108
123
|
# Adjust the primary boundary based on the target
|
109
124
|
if is_horizontal:
|
110
|
-
if is_positive:
|
125
|
+
if is_positive: # right
|
111
126
|
x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
|
112
|
-
else:
|
127
|
+
else: # left
|
113
128
|
x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
|
114
|
-
else:
|
115
|
-
if is_positive:
|
129
|
+
else: # Vertical
|
130
|
+
if is_positive: # below
|
116
131
|
y1_final = target.bottom if include_endpoint else target.top - pixel_offset
|
117
|
-
else:
|
132
|
+
else: # above
|
118
133
|
y0_final = target.top if include_endpoint else target.bottom + pixel_offset
|
119
134
|
|
120
135
|
# Adjust cross boundaries if cross_size is 'element'
|
121
136
|
if cross_size == "element":
|
122
|
-
if is_horizontal:
|
123
|
-
target_y0 =
|
137
|
+
if is_horizontal: # Adjust y0, y1
|
138
|
+
target_y0 = (
|
139
|
+
target.top if include_endpoint else target.bottom
|
140
|
+
) # Use opposite boundary if excluding
|
124
141
|
target_y1 = target.bottom if include_endpoint else target.top
|
125
142
|
y0 = min(y0, target_y0)
|
126
143
|
y1 = max(y1, target_y1)
|
127
|
-
else:
|
128
|
-
target_x0 =
|
144
|
+
else: # Adjust x0, x1
|
145
|
+
target_x0 = (
|
146
|
+
target.x0 if include_endpoint else target.x1
|
147
|
+
) # Use opposite boundary if excluding
|
129
148
|
target_x1 = target.x1 if include_endpoint else target.x0
|
130
149
|
x0 = min(x0, target_x0)
|
131
150
|
x1 = max(x1, target_x1)
|
@@ -145,6 +164,7 @@ class DirectionalMixin:
|
|
145
164
|
|
146
165
|
# 5. Create and return appropriate object based on self type
|
147
166
|
from natural_pdf.elements.region import Region
|
167
|
+
|
148
168
|
result = Region(self.page, final_bbox)
|
149
169
|
result.source_element = self
|
150
170
|
result.includes_source = include_element
|
@@ -154,11 +174,18 @@ class DirectionalMixin:
|
|
154
174
|
|
155
175
|
return result
|
156
176
|
|
157
|
-
def above(
|
158
|
-
|
177
|
+
def above(
|
178
|
+
self,
|
179
|
+
height: Optional[float] = None,
|
180
|
+
width: str = "full",
|
181
|
+
include_element: bool = False,
|
182
|
+
until: Optional[str] = None,
|
183
|
+
include_endpoint: bool = True,
|
184
|
+
**kwargs,
|
185
|
+
) -> "Region":
|
159
186
|
"""
|
160
187
|
Select region above this element/region.
|
161
|
-
|
188
|
+
|
162
189
|
Args:
|
163
190
|
height: Height of the region above, in points
|
164
191
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -166,25 +193,32 @@ class DirectionalMixin:
|
|
166
193
|
until: Optional selector string to specify an upper boundary element
|
167
194
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
168
195
|
**kwargs: Additional parameters
|
169
|
-
|
196
|
+
|
170
197
|
Returns:
|
171
198
|
Region object representing the area above
|
172
199
|
"""
|
173
200
|
return self._direction(
|
174
|
-
direction=
|
201
|
+
direction="above",
|
175
202
|
size=height,
|
176
203
|
cross_size=width,
|
177
204
|
include_element=include_element,
|
178
205
|
until=until,
|
179
206
|
include_endpoint=include_endpoint,
|
180
|
-
**kwargs
|
207
|
+
**kwargs,
|
181
208
|
)
|
182
209
|
|
183
|
-
def below(
|
184
|
-
|
210
|
+
def below(
|
211
|
+
self,
|
212
|
+
height: Optional[float] = None,
|
213
|
+
width: str = "full",
|
214
|
+
include_element: bool = False,
|
215
|
+
until: Optional[str] = None,
|
216
|
+
include_endpoint: bool = True,
|
217
|
+
**kwargs,
|
218
|
+
) -> "Region":
|
185
219
|
"""
|
186
220
|
Select region below this element/region.
|
187
|
-
|
221
|
+
|
188
222
|
Args:
|
189
223
|
height: Height of the region below, in points
|
190
224
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -192,25 +226,32 @@ class DirectionalMixin:
|
|
192
226
|
until: Optional selector string to specify a lower boundary element
|
193
227
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
194
228
|
**kwargs: Additional parameters
|
195
|
-
|
229
|
+
|
196
230
|
Returns:
|
197
231
|
Region object representing the area below
|
198
232
|
"""
|
199
233
|
return self._direction(
|
200
|
-
direction=
|
234
|
+
direction="below",
|
201
235
|
size=height,
|
202
236
|
cross_size=width,
|
203
237
|
include_element=include_element,
|
204
238
|
until=until,
|
205
239
|
include_endpoint=include_endpoint,
|
206
|
-
**kwargs
|
240
|
+
**kwargs,
|
207
241
|
)
|
208
242
|
|
209
|
-
def left(
|
210
|
-
|
243
|
+
def left(
|
244
|
+
self,
|
245
|
+
width: Optional[float] = None,
|
246
|
+
height: str = "full",
|
247
|
+
include_element: bool = False,
|
248
|
+
until: Optional[str] = None,
|
249
|
+
include_endpoint: bool = True,
|
250
|
+
**kwargs,
|
251
|
+
) -> "Region":
|
211
252
|
"""
|
212
253
|
Select region to the left of this element/region.
|
213
|
-
|
254
|
+
|
214
255
|
Args:
|
215
256
|
width: Width of the region to the left, in points
|
216
257
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -218,25 +259,32 @@ class DirectionalMixin:
|
|
218
259
|
until: Optional selector string to specify a left boundary element
|
219
260
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
220
261
|
**kwargs: Additional parameters
|
221
|
-
|
262
|
+
|
222
263
|
Returns:
|
223
264
|
Region object representing the area to the left
|
224
265
|
"""
|
225
266
|
return self._direction(
|
226
|
-
direction=
|
267
|
+
direction="left",
|
227
268
|
size=width,
|
228
269
|
cross_size=height,
|
229
270
|
include_element=include_element,
|
230
271
|
until=until,
|
231
272
|
include_endpoint=include_endpoint,
|
232
|
-
**kwargs
|
273
|
+
**kwargs,
|
233
274
|
)
|
234
275
|
|
235
|
-
def right(
|
236
|
-
|
276
|
+
def right(
|
277
|
+
self,
|
278
|
+
width: Optional[float] = None,
|
279
|
+
height: str = "full",
|
280
|
+
include_element: bool = False,
|
281
|
+
until: Optional[str] = None,
|
282
|
+
include_endpoint: bool = True,
|
283
|
+
**kwargs,
|
284
|
+
) -> "Region":
|
237
285
|
"""
|
238
286
|
Select region to the right of this element/region.
|
239
|
-
|
287
|
+
|
240
288
|
Args:
|
241
289
|
width: Width of the region to the right, in points
|
242
290
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -244,43 +292,45 @@ class DirectionalMixin:
|
|
244
292
|
until: Optional selector string to specify a right boundary element
|
245
293
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
246
294
|
**kwargs: Additional parameters
|
247
|
-
|
295
|
+
|
248
296
|
Returns:
|
249
297
|
Region object representing the area to the right
|
250
298
|
"""
|
251
299
|
return self._direction(
|
252
|
-
direction=
|
300
|
+
direction="right",
|
253
301
|
size=width,
|
254
302
|
cross_size=height,
|
255
303
|
include_element=include_element,
|
256
304
|
until=until,
|
257
305
|
include_endpoint=include_endpoint,
|
258
|
-
**kwargs
|
306
|
+
**kwargs,
|
259
307
|
)
|
260
308
|
|
261
|
-
def
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
309
|
+
def to_region(
|
310
|
+
self
|
311
|
+
):
|
312
|
+
return self.expand()
|
313
|
+
|
314
|
+
def expand(
|
315
|
+
self,
|
316
|
+
left: float = 0,
|
317
|
+
right: float = 0,
|
318
|
+
top: float = 0,
|
319
|
+
bottom: float = 0,
|
320
|
+
width_factor: float = 1.0,
|
321
|
+
height_factor: float = 1.0,
|
322
|
+
) -> "Region":
|
271
323
|
"""
|
272
324
|
Create a new region expanded from this element/region.
|
273
|
-
|
325
|
+
|
274
326
|
Args:
|
275
327
|
left: Amount to expand left edge (positive value expands leftwards)
|
276
328
|
right: Amount to expand right edge (positive value expands rightwards)
|
277
|
-
|
278
|
-
|
329
|
+
top: Amount to expand top edge (positive value expands upwards)
|
330
|
+
bottom: Amount to expand bottom edge (positive value expands downwards)
|
279
331
|
width_factor: Factor to multiply width by (applied after absolute expansion)
|
280
332
|
height_factor: Factor to multiply height by (applied after absolute expansion)
|
281
|
-
|
282
|
-
bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
|
283
|
-
|
333
|
+
|
284
334
|
Returns:
|
285
335
|
New expanded Region object
|
286
336
|
"""
|
@@ -289,39 +339,33 @@ class DirectionalMixin:
|
|
289
339
|
new_x1 = self.x1
|
290
340
|
new_top = self.top
|
291
341
|
new_bottom = self.bottom
|
292
|
-
|
293
|
-
# Handle the deprecated parameter names for backward compatibility
|
294
|
-
if top is not None:
|
295
|
-
top_expand = top
|
296
|
-
if bottom is not None:
|
297
|
-
bottom_expand = bottom
|
298
|
-
|
342
|
+
|
299
343
|
# Apply absolute expansions first
|
300
344
|
new_x0 -= left
|
301
345
|
new_x1 += right
|
302
|
-
new_top -=
|
303
|
-
new_bottom +=
|
304
|
-
|
346
|
+
new_top -= top # Expand upward (decrease top coordinate)
|
347
|
+
new_bottom += bottom # Expand downward (increase bottom coordinate)
|
348
|
+
|
305
349
|
# Apply percentage factors if provided
|
306
350
|
if width_factor != 1.0 or height_factor != 1.0:
|
307
351
|
# Calculate center point *after* absolute expansion
|
308
352
|
center_x = (new_x0 + new_x1) / 2
|
309
353
|
center_y = (new_top + new_bottom) / 2
|
310
|
-
|
354
|
+
|
311
355
|
# Calculate current width and height *after* absolute expansion
|
312
356
|
current_width = new_x1 - new_x0
|
313
357
|
current_height = new_bottom - new_top
|
314
|
-
|
358
|
+
|
315
359
|
# Calculate new width and height
|
316
360
|
new_width = current_width * width_factor
|
317
361
|
new_height = current_height * height_factor
|
318
|
-
|
362
|
+
|
319
363
|
# Adjust coordinates based on the new dimensions, keeping the center
|
320
364
|
new_x0 = center_x - new_width / 2
|
321
365
|
new_x1 = center_x + new_width / 2
|
322
366
|
new_top = center_y - new_height / 2
|
323
367
|
new_bottom = center_y + new_height / 2
|
324
|
-
|
368
|
+
|
325
369
|
# Clamp coordinates to page boundaries
|
326
370
|
new_x0 = max(0, new_x0)
|
327
371
|
new_top = max(0, new_top)
|
@@ -329,124 +373,129 @@ class DirectionalMixin:
|
|
329
373
|
new_bottom = min(self.page.height, new_bottom)
|
330
374
|
|
331
375
|
# Ensure coordinates are valid (x0 <= x1, top <= bottom)
|
332
|
-
if new_x0 > new_x1:
|
333
|
-
|
376
|
+
if new_x0 > new_x1:
|
377
|
+
new_x0 = new_x1 = (new_x0 + new_x1) / 2
|
378
|
+
if new_top > new_bottom:
|
379
|
+
new_top = new_bottom = (new_top + new_bottom) / 2
|
334
380
|
|
335
381
|
# Create new region with expanded bbox
|
336
382
|
from natural_pdf.elements.region import Region
|
383
|
+
|
337
384
|
new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
|
338
|
-
|
385
|
+
|
339
386
|
return new_region
|
340
387
|
|
341
388
|
|
342
389
|
class Element(DirectionalMixin):
|
343
390
|
"""
|
344
391
|
Base class for all PDF elements.
|
345
|
-
|
392
|
+
|
346
393
|
This class provides common properties and methods for all PDF elements,
|
347
394
|
such as text, rectangles, lines, etc.
|
348
395
|
"""
|
349
|
-
|
350
|
-
def __init__(self, obj: Dict[str, Any], page:
|
396
|
+
|
397
|
+
def __init__(self, obj: Dict[str, Any], page: "Page"):
|
351
398
|
"""
|
352
399
|
Initialize base element.
|
353
|
-
|
400
|
+
|
354
401
|
Args:
|
355
402
|
obj: The underlying pdfplumber object
|
356
403
|
page: The parent Page object
|
357
404
|
"""
|
358
405
|
self._obj = obj
|
359
406
|
self._page = page
|
360
|
-
|
407
|
+
|
361
408
|
@property
|
362
409
|
def type(self) -> str:
|
363
410
|
"""Element type."""
|
364
|
-
return self._obj.get(
|
365
|
-
|
411
|
+
return self._obj.get("object_type", "unknown")
|
412
|
+
|
366
413
|
@property
|
367
414
|
def bbox(self) -> Tuple[float, float, float, float]:
|
368
415
|
"""Bounding box (x0, top, x1, bottom)."""
|
369
416
|
return (self.x0, self.top, self.x1, self.bottom)
|
370
|
-
|
417
|
+
|
371
418
|
@property
|
372
419
|
def x0(self) -> float:
|
373
420
|
"""Left x-coordinate."""
|
374
421
|
if self.has_polygon:
|
375
422
|
return min(pt[0] for pt in self.polygon)
|
376
|
-
return self._obj.get(
|
377
|
-
|
423
|
+
return self._obj.get("x0", 0)
|
424
|
+
|
378
425
|
@property
|
379
426
|
def top(self) -> float:
|
380
427
|
"""Top y-coordinate."""
|
381
428
|
if self.has_polygon:
|
382
429
|
return min(pt[1] for pt in self.polygon)
|
383
|
-
return self._obj.get(
|
384
|
-
|
430
|
+
return self._obj.get("top", 0)
|
431
|
+
|
385
432
|
@property
|
386
433
|
def x1(self) -> float:
|
387
434
|
"""Right x-coordinate."""
|
388
435
|
if self.has_polygon:
|
389
436
|
return max(pt[0] for pt in self.polygon)
|
390
|
-
return self._obj.get(
|
391
|
-
|
437
|
+
return self._obj.get("x1", 0)
|
438
|
+
|
392
439
|
@property
|
393
440
|
def bottom(self) -> float:
|
394
441
|
"""Bottom y-coordinate."""
|
395
442
|
if self.has_polygon:
|
396
443
|
return max(pt[1] for pt in self.polygon)
|
397
|
-
return self._obj.get(
|
398
|
-
|
444
|
+
return self._obj.get("bottom", 0)
|
445
|
+
|
399
446
|
@property
|
400
447
|
def width(self) -> float:
|
401
448
|
"""Element width."""
|
402
449
|
return self.x1 - self.x0
|
403
|
-
|
450
|
+
|
404
451
|
@property
|
405
452
|
def height(self) -> float:
|
406
453
|
"""Element height."""
|
407
454
|
return self.bottom - self.top
|
408
|
-
|
455
|
+
|
409
456
|
@property
|
410
457
|
def has_polygon(self) -> bool:
|
411
458
|
"""Check if this element has polygon coordinates."""
|
412
|
-
return (
|
413
|
-
|
459
|
+
return (
|
460
|
+
"polygon" in self._obj and self._obj["polygon"] and len(self._obj["polygon"]) >= 3
|
461
|
+
) or hasattr(self, "_polygon")
|
462
|
+
|
414
463
|
@property
|
415
464
|
def polygon(self) -> List[Tuple[float, float]]:
|
416
465
|
"""Get polygon coordinates if available, otherwise return rectangle corners."""
|
417
|
-
if hasattr(self,
|
466
|
+
if hasattr(self, "_polygon") and self._polygon:
|
418
467
|
return self._polygon
|
419
|
-
elif
|
420
|
-
return self._obj[
|
468
|
+
elif "polygon" in self._obj and self._obj["polygon"]:
|
469
|
+
return self._obj["polygon"]
|
421
470
|
else:
|
422
471
|
# Create rectangle corners as fallback
|
423
472
|
return [
|
424
|
-
(self._obj.get(
|
425
|
-
(self._obj.get(
|
426
|
-
(self._obj.get(
|
427
|
-
(self._obj.get(
|
473
|
+
(self._obj.get("x0", 0), self._obj.get("top", 0)), # top-left
|
474
|
+
(self._obj.get("x1", 0), self._obj.get("top", 0)), # top-right
|
475
|
+
(self._obj.get("x1", 0), self._obj.get("bottom", 0)), # bottom-right
|
476
|
+
(self._obj.get("x0", 0), self._obj.get("bottom", 0)), # bottom-left
|
428
477
|
]
|
429
|
-
|
478
|
+
|
430
479
|
def is_point_inside(self, x: float, y: float) -> bool:
|
431
480
|
"""
|
432
481
|
Check if a point is inside this element using ray casting algorithm for polygons.
|
433
|
-
|
482
|
+
|
434
483
|
Args:
|
435
484
|
x: X-coordinate to check
|
436
485
|
y: Y-coordinate to check
|
437
|
-
|
486
|
+
|
438
487
|
Returns:
|
439
488
|
True if the point is inside the element
|
440
489
|
"""
|
441
490
|
if not self.has_polygon:
|
442
491
|
# Use simple rectangle check
|
443
492
|
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
444
|
-
|
493
|
+
|
445
494
|
# Ray casting algorithm for complex polygons
|
446
495
|
poly = self.polygon
|
447
496
|
n = len(poly)
|
448
497
|
inside = False
|
449
|
-
|
498
|
+
|
450
499
|
p1x, p1y = poly[0]
|
451
500
|
for i in range(1, n + 1):
|
452
501
|
p2x, p2y = poly[i % n]
|
@@ -456,30 +505,36 @@ class Element(DirectionalMixin):
|
|
456
505
|
if p1x == p2x or x <= xinters:
|
457
506
|
inside = not inside
|
458
507
|
p1x, p1y = p2x, p2y
|
459
|
-
|
508
|
+
|
460
509
|
return inside
|
461
|
-
|
510
|
+
|
462
511
|
@property
|
463
|
-
def page(self) ->
|
512
|
+
def page(self) -> "Page":
|
464
513
|
"""Get the parent page."""
|
465
514
|
return self._page
|
466
|
-
|
467
|
-
def next(
|
515
|
+
|
516
|
+
def next(
|
517
|
+
self,
|
518
|
+
selector: Optional[str] = None,
|
519
|
+
limit: int = 10,
|
520
|
+
apply_exclusions: bool = True,
|
521
|
+
**kwargs,
|
522
|
+
) -> Optional["Element"]:
|
468
523
|
"""
|
469
524
|
Find next element in reading order.
|
470
|
-
|
525
|
+
|
471
526
|
Args:
|
472
527
|
selector: Optional selector to filter by
|
473
528
|
limit: Maximum number of elements to search through (default: 10)
|
474
529
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
475
530
|
**kwargs: Additional parameters
|
476
|
-
|
531
|
+
|
477
532
|
Returns:
|
478
533
|
Next element or None if not found
|
479
534
|
"""
|
480
535
|
# Get all elements in reading order
|
481
|
-
all_elements = self.page.find_all(
|
482
|
-
|
536
|
+
all_elements = self.page.find_all("*", apply_exclusions=apply_exclusions)
|
537
|
+
|
483
538
|
# Find our index in the list
|
484
539
|
try:
|
485
540
|
# Compare by object identity since bbox could match multiple elements
|
@@ -487,40 +542,47 @@ class Element(DirectionalMixin):
|
|
487
542
|
except StopIteration:
|
488
543
|
# If not found, it might have been filtered out by exclusions
|
489
544
|
return None
|
490
|
-
|
545
|
+
|
491
546
|
# Search for next matching element
|
492
547
|
if selector:
|
493
548
|
# Filter elements after this one
|
494
|
-
candidates = all_elements[idx+1:]
|
549
|
+
candidates = all_elements[idx + 1 :]
|
495
550
|
# Limit search range for performance
|
496
551
|
candidates = candidates[:limit] if limit else candidates
|
497
|
-
|
552
|
+
|
498
553
|
# Find matching elements
|
499
554
|
from natural_pdf.elements.collections import ElementCollection
|
555
|
+
|
500
556
|
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
501
557
|
return matches[0] if matches else None
|
502
558
|
elif idx + 1 < len(all_elements):
|
503
559
|
# No selector, just return the next element
|
504
560
|
return all_elements[idx + 1]
|
505
|
-
|
561
|
+
|
506
562
|
return None
|
507
|
-
|
508
|
-
def prev(
|
563
|
+
|
564
|
+
def prev(
|
565
|
+
self,
|
566
|
+
selector: Optional[str] = None,
|
567
|
+
limit: int = 10,
|
568
|
+
apply_exclusions: bool = True,
|
569
|
+
**kwargs,
|
570
|
+
) -> Optional["Element"]:
|
509
571
|
"""
|
510
572
|
Find previous element in reading order.
|
511
|
-
|
573
|
+
|
512
574
|
Args:
|
513
575
|
selector: Optional selector to filter by
|
514
576
|
limit: Maximum number of elements to search through (default: 10)
|
515
577
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
516
578
|
**kwargs: Additional parameters
|
517
|
-
|
579
|
+
|
518
580
|
Returns:
|
519
581
|
Previous element or None if not found
|
520
582
|
"""
|
521
583
|
# Get all elements in reading order
|
522
|
-
all_elements = self.page.find_all(
|
523
|
-
|
584
|
+
all_elements = self.page.find_all("*", apply_exclusions=apply_exclusions)
|
585
|
+
|
524
586
|
# Find our index in the list
|
525
587
|
try:
|
526
588
|
# Compare by object identity since bbox could match multiple elements
|
@@ -528,7 +590,7 @@ class Element(DirectionalMixin):
|
|
528
590
|
except StopIteration:
|
529
591
|
# If not found, it might have been filtered out by exclusions
|
530
592
|
return None
|
531
|
-
|
593
|
+
|
532
594
|
# Search for previous matching element
|
533
595
|
if selector:
|
534
596
|
# Select elements before this one
|
@@ -537,27 +599,34 @@ class Element(DirectionalMixin):
|
|
537
599
|
candidates = candidates[::-1]
|
538
600
|
# Limit search range for performance
|
539
601
|
candidates = candidates[:limit] if limit else candidates
|
540
|
-
|
602
|
+
|
541
603
|
# Find matching elements using ElementCollection
|
542
604
|
from natural_pdf.elements.collections import ElementCollection
|
605
|
+
|
543
606
|
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
544
|
-
return matches[0] if matches else None
|
607
|
+
return matches[0] if matches else None # find_all returns a collection
|
545
608
|
elif idx > 0:
|
546
609
|
# No selector, just return the previous element
|
547
610
|
return all_elements[idx - 1]
|
548
|
-
|
611
|
+
|
549
612
|
return None
|
550
|
-
|
551
|
-
def nearest(
|
613
|
+
|
614
|
+
def nearest(
|
615
|
+
self,
|
616
|
+
selector: str,
|
617
|
+
max_distance: Optional[float] = None,
|
618
|
+
apply_exclusions: bool = True,
|
619
|
+
**kwargs,
|
620
|
+
) -> Optional["Element"]:
|
552
621
|
"""
|
553
622
|
Find nearest element matching selector.
|
554
|
-
|
623
|
+
|
555
624
|
Args:
|
556
625
|
selector: CSS-like selector string
|
557
626
|
max_distance: Maximum distance to search (default: None = unlimited)
|
558
627
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
559
628
|
**kwargs: Additional parameters
|
560
|
-
|
629
|
+
|
561
630
|
Returns:
|
562
631
|
Nearest element or None if not found
|
563
632
|
"""
|
@@ -565,56 +634,59 @@ class Element(DirectionalMixin):
|
|
565
634
|
matches = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
566
635
|
if not matches:
|
567
636
|
return None
|
568
|
-
|
637
|
+
|
569
638
|
# Calculate distance to center point of this element
|
570
639
|
self_center_x = (self.x0 + self.x1) / 2
|
571
640
|
self_center_y = (self.top + self.bottom) / 2
|
572
|
-
|
641
|
+
|
573
642
|
# Calculate distances to each match
|
574
643
|
distances = []
|
575
644
|
for match in matches:
|
576
645
|
if match is self: # Skip self
|
577
646
|
continue
|
578
|
-
|
647
|
+
|
579
648
|
match_center_x = (match.x0 + match.x1) / 2
|
580
649
|
match_center_y = (match.top + match.bottom) / 2
|
581
|
-
|
650
|
+
|
582
651
|
# Euclidean distance
|
583
|
-
distance = (
|
584
|
-
|
585
|
-
|
652
|
+
distance = (
|
653
|
+
(match_center_x - self_center_x) ** 2 + (match_center_y - self_center_y) ** 2
|
654
|
+
) ** 0.5
|
655
|
+
|
586
656
|
# Filter by max_distance if specified
|
587
657
|
if max_distance is None or distance <= max_distance:
|
588
658
|
distances.append((match, distance))
|
589
|
-
|
659
|
+
|
590
660
|
# Sort by distance and return the closest
|
591
661
|
if distances:
|
592
662
|
distances.sort(key=lambda x: x[1])
|
593
663
|
return distances[0][0]
|
594
|
-
|
664
|
+
|
595
665
|
return None
|
596
|
-
|
597
|
-
def until(
|
666
|
+
|
667
|
+
def until(
|
668
|
+
self, selector: str, include_endpoint: bool = True, width: str = "element", **kwargs
|
669
|
+
) -> "Region":
|
598
670
|
"""
|
599
671
|
Select content from this element until matching selector.
|
600
|
-
|
672
|
+
|
601
673
|
Args:
|
602
674
|
selector: CSS-like selector string
|
603
675
|
include_endpoint: Whether to include the endpoint element in the region (default: True)
|
604
676
|
width: Width mode - "element" to use element widths or "full" for full page width
|
605
677
|
**kwargs: Additional selection parameters
|
606
|
-
|
678
|
+
|
607
679
|
Returns:
|
608
680
|
Region object representing the selected content
|
609
681
|
"""
|
610
682
|
from natural_pdf.elements.region import Region
|
611
|
-
|
683
|
+
|
612
684
|
# Find the target element
|
613
685
|
target = self.page.find(selector, **kwargs)
|
614
686
|
if not target:
|
615
687
|
# If target not found, return a region with just this element
|
616
688
|
return Region(self.page, self.bbox)
|
617
|
-
|
689
|
+
|
618
690
|
# Use full page width if requested
|
619
691
|
if width == "full":
|
620
692
|
x0 = 0
|
@@ -622,12 +694,16 @@ class Element(DirectionalMixin):
|
|
622
694
|
# Determine vertical bounds based on element positions
|
623
695
|
if target.top >= self.bottom: # Target is below this element
|
624
696
|
top = self.top
|
625
|
-
bottom =
|
697
|
+
bottom = (
|
698
|
+
target.bottom if include_endpoint else target.top - 1
|
699
|
+
) # Subtract 1 pixel when excluding
|
626
700
|
else: # Target is above this element
|
627
|
-
top =
|
701
|
+
top = (
|
702
|
+
target.top if include_endpoint else target.bottom + 1
|
703
|
+
) # Add 1 pixel when excluding
|
628
704
|
bottom = self.bottom
|
629
705
|
return Region(self.page, (x0, top, x1, bottom))
|
630
|
-
|
706
|
+
|
631
707
|
# Otherwise use element-based width
|
632
708
|
# Determine the correct order for creating the region
|
633
709
|
# If the target is below this element (normal reading order)
|
@@ -635,12 +711,16 @@ class Element(DirectionalMixin):
|
|
635
711
|
x0 = min(self.x0, target.x0 if include_endpoint else target.x1)
|
636
712
|
x1 = max(self.x1, target.x1 if include_endpoint else target.x0)
|
637
713
|
top = self.top
|
638
|
-
bottom =
|
714
|
+
bottom = (
|
715
|
+
target.bottom if include_endpoint else target.top - 1
|
716
|
+
) # Subtract 1 pixel when excluding
|
639
717
|
# If the target is above this element (reverse reading order)
|
640
718
|
elif target.bottom <= self.top:
|
641
719
|
x0 = min(self.x0, target.x0 if include_endpoint else target.x1)
|
642
720
|
x1 = max(self.x1, target.x1 if include_endpoint else target.x0)
|
643
|
-
top =
|
721
|
+
top = (
|
722
|
+
target.top if include_endpoint else target.bottom + 1
|
723
|
+
) # Add 1 pixel when excluding
|
644
724
|
bottom = self.bottom
|
645
725
|
# If they're side by side, use the horizontal version
|
646
726
|
elif target.x0 >= self.x1: # Target is to the right
|
@@ -653,47 +733,49 @@ class Element(DirectionalMixin):
|
|
653
733
|
x1 = self.x1
|
654
734
|
top = min(self.top, target.top if include_endpoint else target.bottom)
|
655
735
|
bottom = max(self.bottom, target.bottom if include_endpoint else target.top)
|
656
|
-
|
736
|
+
|
657
737
|
region = Region(self.page, (x0, top, x1, bottom))
|
658
738
|
region.source_element = self
|
659
739
|
region.end_element = target
|
660
740
|
return region
|
661
|
-
|
741
|
+
|
662
742
|
# Note: select_until method removed in favor of until()
|
663
|
-
|
743
|
+
|
664
744
|
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
|
665
745
|
"""
|
666
746
|
Extract text from this element.
|
667
|
-
|
747
|
+
|
668
748
|
Args:
|
669
749
|
preserve_whitespace: Whether to keep blank characters (default: True)
|
670
750
|
use_exclusions: Whether to apply exclusion regions (default: True)
|
671
751
|
**kwargs: Additional extraction parameters
|
672
|
-
|
752
|
+
|
673
753
|
Returns:
|
674
754
|
Extracted text as string
|
675
755
|
"""
|
676
756
|
# Default implementation - override in subclasses
|
677
757
|
return ""
|
678
|
-
|
758
|
+
|
679
759
|
# Note: extract_text_compat method removed
|
680
|
-
|
681
|
-
def highlight(
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
760
|
+
|
761
|
+
def highlight(
|
762
|
+
self,
|
763
|
+
label: Optional[str] = None,
|
764
|
+
color: Optional[Union[Tuple, str]] = None, # Allow string color
|
765
|
+
use_color_cycling: bool = False,
|
766
|
+
include_attrs: Optional[List[str]] = None,
|
767
|
+
existing: str = "append",
|
768
|
+
) -> "Element":
|
687
769
|
"""
|
688
770
|
Highlight this element on the page.
|
689
|
-
|
771
|
+
|
690
772
|
Args:
|
691
773
|
label: Optional label for the highlight
|
692
774
|
color: Color tuple/string for the highlight, or None to use automatic color
|
693
775
|
use_color_cycling: Force color cycling even with no label (default: False)
|
694
776
|
include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
695
777
|
existing: How to handle existing highlights - 'append' (default) or 'replace'
|
696
|
-
|
778
|
+
|
697
779
|
Returns:
|
698
780
|
Self for method chaining
|
699
781
|
"""
|
@@ -708,7 +790,7 @@ class Element(DirectionalMixin):
|
|
708
790
|
"use_color_cycling": use_color_cycling,
|
709
791
|
"element": self, # Pass the element itself so attributes can be accessed
|
710
792
|
"include_attrs": include_attrs,
|
711
|
-
"existing": existing
|
793
|
+
"existing": existing,
|
712
794
|
}
|
713
795
|
|
714
796
|
# Call the appropriate service method based on geometry
|
@@ -720,13 +802,15 @@ class Element(DirectionalMixin):
|
|
720
802
|
highlighter.add(**highlight_args)
|
721
803
|
|
722
804
|
return self
|
723
|
-
|
724
|
-
def show(
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
805
|
+
|
806
|
+
def show(
|
807
|
+
self,
|
808
|
+
scale: float = 2.0,
|
809
|
+
labels: bool = True,
|
810
|
+
legend_position: str = "right",
|
811
|
+
color: Optional[Union[Tuple, str]] = "red", # Default color for single element
|
812
|
+
label: Optional[str] = None,
|
813
|
+
) -> Optional["Image.Image"]:
|
730
814
|
"""
|
731
815
|
Show the page with only this element highlighted temporarily.
|
732
816
|
|
@@ -740,12 +824,12 @@ class Element(DirectionalMixin):
|
|
740
824
|
Returns:
|
741
825
|
PIL Image of the page with only this element highlighted, or None if error.
|
742
826
|
"""
|
743
|
-
if not hasattr(self,
|
827
|
+
if not hasattr(self, "page") or not self.page:
|
744
828
|
logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
|
745
829
|
return None
|
746
|
-
if not hasattr(self.page,
|
747
|
-
|
748
|
-
|
830
|
+
if not hasattr(self.page, "_highlighter") or not self.page._highlighter:
|
831
|
+
logger.warning(f"Cannot show element, page lacks highlighter service: {self}")
|
832
|
+
return None
|
749
833
|
|
750
834
|
service = self.page._highlighter
|
751
835
|
|
@@ -757,15 +841,15 @@ class Element(DirectionalMixin):
|
|
757
841
|
"page_index": self.page.index,
|
758
842
|
"bbox": self.bbox if not self.has_polygon else None,
|
759
843
|
"polygon": self.polygon if self.has_polygon else None,
|
760
|
-
"color": color,
|
844
|
+
"color": color, # Use provided or default color
|
761
845
|
"label": display_label,
|
762
|
-
"use_color_cycling": False
|
846
|
+
"use_color_cycling": False, # Explicitly false for single preview
|
763
847
|
}
|
764
848
|
|
765
849
|
# Check if we actually got geometry data
|
766
|
-
if temp_highlight_data[
|
767
|
-
|
768
|
-
|
850
|
+
if temp_highlight_data["bbox"] is None and temp_highlight_data["polygon"] is None:
|
851
|
+
logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
|
852
|
+
return None
|
769
853
|
|
770
854
|
# Use render_preview to show only this highlight
|
771
855
|
try:
|
@@ -774,49 +858,47 @@ class Element(DirectionalMixin):
|
|
774
858
|
temporary_highlights=[temp_highlight_data],
|
775
859
|
scale=scale,
|
776
860
|
labels=labels,
|
777
|
-
legend_position=legend_position
|
861
|
+
legend_position=legend_position,
|
778
862
|
)
|
779
863
|
except Exception as e:
|
780
864
|
logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
|
781
865
|
return None
|
782
|
-
|
783
|
-
def save(
|
784
|
-
|
785
|
-
|
786
|
-
labels: bool = True,
|
787
|
-
legend_position: str = 'right') -> None:
|
866
|
+
|
867
|
+
def save(
|
868
|
+
self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
|
869
|
+
) -> None:
|
788
870
|
"""
|
789
871
|
Save the page with this element highlighted to an image file.
|
790
|
-
|
872
|
+
|
791
873
|
Args:
|
792
874
|
filename: Path to save the image to
|
793
875
|
scale: Scale factor for rendering
|
794
876
|
labels: Whether to include a legend for labels
|
795
877
|
legend_position: Position of the legend
|
796
|
-
|
878
|
+
|
797
879
|
Returns:
|
798
880
|
Self for method chaining
|
799
881
|
"""
|
800
882
|
# Save the highlighted image
|
801
883
|
self.page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
|
802
884
|
return self
|
803
|
-
|
885
|
+
|
804
886
|
# Note: save_image method removed in favor of save()
|
805
|
-
|
887
|
+
|
806
888
|
def __repr__(self) -> str:
|
807
889
|
"""String representation of the element."""
|
808
890
|
return f"<{self.__class__.__name__} bbox={self.bbox}>"
|
809
891
|
|
810
|
-
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[
|
892
|
+
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
|
811
893
|
"""
|
812
894
|
Find first element within this element's bounds matching the selector.
|
813
895
|
Creates a temporary region to perform the search.
|
814
|
-
|
896
|
+
|
815
897
|
Args:
|
816
898
|
selector: CSS-like selector string
|
817
899
|
apply_exclusions: Whether to apply exclusion regions
|
818
900
|
**kwargs: Additional parameters for element filtering
|
819
|
-
|
901
|
+
|
820
902
|
Returns:
|
821
903
|
First matching element or None
|
822
904
|
"""
|
@@ -826,16 +908,16 @@ class Element(DirectionalMixin):
|
|
826
908
|
temp_region = Region(self.page, self.bbox)
|
827
909
|
return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
|
828
910
|
|
829
|
-
def find_all(self, selector: str, apply_exclusions=True, **kwargs) ->
|
911
|
+
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> "ElementCollection":
|
830
912
|
"""
|
831
913
|
Find all elements within this element's bounds matching the selector.
|
832
914
|
Creates a temporary region to perform the search.
|
833
|
-
|
915
|
+
|
834
916
|
Args:
|
835
917
|
selector: CSS-like selector string
|
836
918
|
apply_exclusions: Whether to apply exclusion regions
|
837
919
|
**kwargs: Additional parameters for element filtering
|
838
|
-
|
920
|
+
|
839
921
|
Returns:
|
840
922
|
ElementCollection with matching elements
|
841
923
|
"""
|
@@ -843,4 +925,4 @@ class Element(DirectionalMixin):
|
|
843
925
|
|
844
926
|
# Create a temporary region from this element's bounds
|
845
927
|
temp_region = Region(self.page, self.bbox)
|
846
|
-
return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
928
|
+
return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|