natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1,6 +1,16 @@
|
|
1
|
-
|
1
|
+
import logging
|
2
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
3
|
+
|
4
|
+
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
|
+
|
6
|
+
# New Imports
|
7
|
+
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
8
|
+
|
2
9
|
from natural_pdf.elements.base import DirectionalMixin
|
3
10
|
|
11
|
+
# Import new utils
|
12
|
+
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
13
|
+
|
4
14
|
if TYPE_CHECKING:
|
5
15
|
from natural_pdf.core.page import Page
|
6
16
|
from natural_pdf.elements.text import TextElement
|
@@ -12,22 +22,29 @@ except ImportError:
|
|
12
22
|
# OCRManager will be imported directly in methods that use it
|
13
23
|
pass
|
14
24
|
|
25
|
+
logger = logging.getLogger(__name__)
|
26
|
+
|
15
27
|
|
16
28
|
class Region(DirectionalMixin):
|
17
29
|
"""
|
18
30
|
Represents a rectangular region on a page.
|
19
31
|
"""
|
20
|
-
|
21
|
-
def __init__(
|
32
|
+
|
33
|
+
def __init__(
|
34
|
+
self,
|
35
|
+
page: "Page",
|
36
|
+
bbox: Tuple[float, float, float, float],
|
37
|
+
polygon: List[Tuple[float, float]] = None,
|
38
|
+
parent=None,
|
39
|
+
):
|
22
40
|
"""
|
23
41
|
Initialize a region.
|
24
|
-
|
42
|
+
|
25
43
|
Args:
|
26
44
|
page: Parent page
|
27
45
|
bbox: Bounding box as (x0, top, x1, bottom)
|
28
46
|
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
|
29
47
|
parent: Optional parent region (for hierarchical document structure)
|
30
|
-
label: Optional label for the region (e.g., for exclusions)
|
31
48
|
"""
|
32
49
|
self._page = page
|
33
50
|
self._bbox = bbox
|
@@ -37,30 +54,36 @@ class Region(DirectionalMixin):
|
|
37
54
|
self._page_range = None
|
38
55
|
self.start_element = None
|
39
56
|
self.end_element = None
|
40
|
-
|
57
|
+
|
41
58
|
# Standard attributes for all elements
|
42
|
-
self.object_type =
|
43
|
-
|
59
|
+
self.object_type = "region" # For selector compatibility
|
60
|
+
|
44
61
|
# Layout detection attributes
|
45
62
|
self.region_type = None
|
46
63
|
self.normalized_type = None
|
47
64
|
self.confidence = None
|
48
65
|
self.model = None
|
49
|
-
|
66
|
+
|
50
67
|
# Region management attributes
|
51
68
|
self.name = None
|
52
69
|
self.source = None # Will be set by creation methods
|
53
|
-
|
54
|
-
|
70
|
+
|
55
71
|
# Hierarchy support for nested document structure
|
56
72
|
self.parent_region = parent
|
57
73
|
self.child_regions = []
|
58
74
|
self.text_content = None # Direct text content (e.g., from Docling)
|
59
75
|
self.associated_text_elements = [] # Native text elements that overlap with this region
|
60
|
-
|
61
|
-
def _direction(
|
62
|
-
|
63
|
-
|
76
|
+
|
77
|
+
def _direction(
|
78
|
+
self,
|
79
|
+
direction: str,
|
80
|
+
size: Optional[float] = None,
|
81
|
+
cross_size: str = "full",
|
82
|
+
include_element: bool = False,
|
83
|
+
until: Optional[str] = None,
|
84
|
+
include_endpoint: bool = True,
|
85
|
+
**kwargs,
|
86
|
+
) -> "Region":
|
64
87
|
"""
|
65
88
|
Protected helper method to create a region in a specified direction relative to this region.
|
66
89
|
|
@@ -76,11 +99,11 @@ class Region(DirectionalMixin):
|
|
76
99
|
Returns:
|
77
100
|
Region object
|
78
101
|
"""
|
79
|
-
import math
|
102
|
+
import math # Use math.inf for infinity
|
80
103
|
|
81
|
-
is_horizontal = direction in (
|
82
|
-
is_positive = direction in (
|
83
|
-
pixel_offset = 1
|
104
|
+
is_horizontal = direction in ("left", "right")
|
105
|
+
is_positive = direction in ("right", "below") # right/below are positive directions
|
106
|
+
pixel_offset = 1 # Offset for excluding elements/endpoints
|
84
107
|
|
85
108
|
# 1. Determine initial boundaries based on direction and include_element
|
86
109
|
if is_horizontal:
|
@@ -89,38 +112,44 @@ class Region(DirectionalMixin):
|
|
89
112
|
y1 = self.page.height if cross_size == "full" else self.bottom
|
90
113
|
|
91
114
|
# Initial primary boundaries (horizontal)
|
92
|
-
if is_positive:
|
115
|
+
if is_positive: # right
|
93
116
|
x0_initial = self.x0 if include_element else self.x1 + pixel_offset
|
94
|
-
x1_initial = self.x1
|
95
|
-
else:
|
96
|
-
x0_initial = self.x0
|
117
|
+
x1_initial = self.x1 # This edge moves
|
118
|
+
else: # left
|
119
|
+
x0_initial = self.x0 # This edge moves
|
97
120
|
x1_initial = self.x1 if include_element else self.x0 - pixel_offset
|
98
|
-
else:
|
121
|
+
else: # Vertical
|
99
122
|
# Initial cross-boundaries (horizontal)
|
100
123
|
x0 = 0 if cross_size == "full" else self.x0
|
101
124
|
x1 = self.page.width if cross_size == "full" else self.x1
|
102
125
|
|
103
126
|
# Initial primary boundaries (vertical)
|
104
|
-
if is_positive:
|
127
|
+
if is_positive: # below
|
105
128
|
y0_initial = self.top if include_element else self.bottom + pixel_offset
|
106
|
-
y1_initial = self.bottom
|
107
|
-
else:
|
108
|
-
y0_initial = self.top
|
129
|
+
y1_initial = self.bottom # This edge moves
|
130
|
+
else: # above
|
131
|
+
y0_initial = self.top # This edge moves
|
109
132
|
y1_initial = self.bottom if include_element else self.top - pixel_offset
|
110
133
|
|
111
134
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
112
135
|
if is_horizontal:
|
113
|
-
if is_positive:
|
114
|
-
x1_final = min(
|
136
|
+
if is_positive: # right
|
137
|
+
x1_final = min(
|
138
|
+
self.page.width,
|
139
|
+
x1_initial + (size if size is not None else (self.page.width - x1_initial)),
|
140
|
+
)
|
115
141
|
x0_final = x0_initial
|
116
|
-
else:
|
142
|
+
else: # left
|
117
143
|
x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
|
118
144
|
x1_final = x1_initial
|
119
|
-
else:
|
120
|
-
if is_positive:
|
121
|
-
y1_final = min(
|
145
|
+
else: # Vertical
|
146
|
+
if is_positive: # below
|
147
|
+
y1_final = min(
|
148
|
+
self.page.height,
|
149
|
+
y1_initial + (size if size is not None else (self.page.height - y1_initial)),
|
150
|
+
)
|
122
151
|
y0_final = y0_initial
|
123
|
-
else:
|
152
|
+
else: # above
|
124
153
|
y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
|
125
154
|
y1_final = y1_initial
|
126
155
|
|
@@ -131,16 +160,16 @@ class Region(DirectionalMixin):
|
|
131
160
|
matches_in_direction = []
|
132
161
|
|
133
162
|
# Filter and sort matches based on direction
|
134
|
-
if direction ==
|
163
|
+
if direction == "above":
|
135
164
|
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
136
165
|
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
137
|
-
elif direction ==
|
166
|
+
elif direction == "below":
|
138
167
|
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
139
168
|
matches_in_direction.sort(key=lambda e: e.top)
|
140
|
-
elif direction ==
|
169
|
+
elif direction == "left":
|
141
170
|
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
142
171
|
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
143
|
-
elif direction ==
|
172
|
+
elif direction == "right":
|
144
173
|
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
145
174
|
matches_in_direction.sort(key=lambda e: e.x0)
|
146
175
|
|
@@ -149,25 +178,29 @@ class Region(DirectionalMixin):
|
|
149
178
|
|
150
179
|
# Adjust the primary boundary based on the target
|
151
180
|
if is_horizontal:
|
152
|
-
if is_positive:
|
181
|
+
if is_positive: # right
|
153
182
|
x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
|
154
|
-
else:
|
183
|
+
else: # left
|
155
184
|
x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
|
156
|
-
else:
|
157
|
-
if is_positive:
|
185
|
+
else: # Vertical
|
186
|
+
if is_positive: # below
|
158
187
|
y1_final = target.bottom if include_endpoint else target.top - pixel_offset
|
159
|
-
else:
|
188
|
+
else: # above
|
160
189
|
y0_final = target.top if include_endpoint else target.bottom + pixel_offset
|
161
190
|
|
162
191
|
# Adjust cross boundaries if cross_size is 'element'
|
163
192
|
if cross_size == "element":
|
164
|
-
if is_horizontal:
|
165
|
-
target_y0 =
|
193
|
+
if is_horizontal: # Adjust y0, y1
|
194
|
+
target_y0 = (
|
195
|
+
target.top if include_endpoint else target.bottom
|
196
|
+
) # Use opposite boundary if excluding
|
166
197
|
target_y1 = target.bottom if include_endpoint else target.top
|
167
198
|
y0 = min(y0, target_y0)
|
168
199
|
y1 = max(y1, target_y1)
|
169
|
-
else:
|
170
|
-
target_x0 =
|
200
|
+
else: # Adjust x0, x1
|
201
|
+
target_x0 = (
|
202
|
+
target.x0 if include_endpoint else target.x1
|
203
|
+
) # Use opposite boundary if excluding
|
171
204
|
target_x1 = target.x1 if include_endpoint else target.x0
|
172
205
|
x0 = min(x0, target_x0)
|
173
206
|
x1 = max(x1, target_x1)
|
@@ -195,11 +228,18 @@ class Region(DirectionalMixin):
|
|
195
228
|
|
196
229
|
return region
|
197
230
|
|
198
|
-
def above(
|
199
|
-
|
231
|
+
def above(
|
232
|
+
self,
|
233
|
+
height: Optional[float] = None,
|
234
|
+
width: str = "full",
|
235
|
+
include_element: bool = False,
|
236
|
+
until: Optional[str] = None,
|
237
|
+
include_endpoint: bool = True,
|
238
|
+
**kwargs,
|
239
|
+
) -> "Region":
|
200
240
|
"""
|
201
241
|
Select region above this region.
|
202
|
-
|
242
|
+
|
203
243
|
Args:
|
204
244
|
height: Height of the region above, in points
|
205
245
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -207,25 +247,32 @@ class Region(DirectionalMixin):
|
|
207
247
|
until: Optional selector string to specify an upper boundary element
|
208
248
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
209
249
|
**kwargs: Additional parameters
|
210
|
-
|
250
|
+
|
211
251
|
Returns:
|
212
252
|
Region object representing the area above
|
213
253
|
"""
|
214
254
|
return self._direction(
|
215
|
-
direction=
|
255
|
+
direction="above",
|
216
256
|
size=height,
|
217
257
|
cross_size=width,
|
218
258
|
include_element=include_element,
|
219
259
|
until=until,
|
220
260
|
include_endpoint=include_endpoint,
|
221
|
-
**kwargs
|
261
|
+
**kwargs,
|
222
262
|
)
|
223
263
|
|
224
|
-
def below(
|
225
|
-
|
264
|
+
def below(
|
265
|
+
self,
|
266
|
+
height: Optional[float] = None,
|
267
|
+
width: str = "full",
|
268
|
+
include_element: bool = False,
|
269
|
+
until: Optional[str] = None,
|
270
|
+
include_endpoint: bool = True,
|
271
|
+
**kwargs,
|
272
|
+
) -> "Region":
|
226
273
|
"""
|
227
274
|
Select region below this region.
|
228
|
-
|
275
|
+
|
229
276
|
Args:
|
230
277
|
height: Height of the region below, in points
|
231
278
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -233,25 +280,32 @@ class Region(DirectionalMixin):
|
|
233
280
|
until: Optional selector string to specify a lower boundary element
|
234
281
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
235
282
|
**kwargs: Additional parameters
|
236
|
-
|
283
|
+
|
237
284
|
Returns:
|
238
285
|
Region object representing the area below
|
239
286
|
"""
|
240
287
|
return self._direction(
|
241
|
-
direction=
|
288
|
+
direction="below",
|
242
289
|
size=height,
|
243
290
|
cross_size=width,
|
244
291
|
include_element=include_element,
|
245
292
|
until=until,
|
246
293
|
include_endpoint=include_endpoint,
|
247
|
-
**kwargs
|
294
|
+
**kwargs,
|
248
295
|
)
|
249
296
|
|
250
|
-
def left(
|
251
|
-
|
297
|
+
def left(
|
298
|
+
self,
|
299
|
+
width: Optional[float] = None,
|
300
|
+
height: str = "full",
|
301
|
+
include_element: bool = False,
|
302
|
+
until: Optional[str] = None,
|
303
|
+
include_endpoint: bool = True,
|
304
|
+
**kwargs,
|
305
|
+
) -> "Region":
|
252
306
|
"""
|
253
307
|
Select region to the left of this region.
|
254
|
-
|
308
|
+
|
255
309
|
Args:
|
256
310
|
width: Width of the region to the left, in points
|
257
311
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -259,25 +313,32 @@ class Region(DirectionalMixin):
|
|
259
313
|
until: Optional selector string to specify a left boundary element
|
260
314
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
261
315
|
**kwargs: Additional parameters
|
262
|
-
|
316
|
+
|
263
317
|
Returns:
|
264
318
|
Region object representing the area to the left
|
265
319
|
"""
|
266
320
|
return self._direction(
|
267
|
-
direction=
|
321
|
+
direction="left",
|
268
322
|
size=width,
|
269
323
|
cross_size=height,
|
270
324
|
include_element=include_element,
|
271
325
|
until=until,
|
272
326
|
include_endpoint=include_endpoint,
|
273
|
-
**kwargs
|
327
|
+
**kwargs,
|
274
328
|
)
|
275
329
|
|
276
|
-
def right(
|
277
|
-
|
330
|
+
def right(
|
331
|
+
self,
|
332
|
+
width: Optional[float] = None,
|
333
|
+
height: str = "full",
|
334
|
+
include_element: bool = False,
|
335
|
+
until: Optional[str] = None,
|
336
|
+
include_endpoint: bool = True,
|
337
|
+
**kwargs,
|
338
|
+
) -> "Region":
|
278
339
|
"""
|
279
340
|
Select region to the right of this region.
|
280
|
-
|
341
|
+
|
281
342
|
Args:
|
282
343
|
width: Width of the region to the right, in points
|
283
344
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -285,72 +346,72 @@ class Region(DirectionalMixin):
|
|
285
346
|
until: Optional selector string to specify a right boundary element
|
286
347
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
287
348
|
**kwargs: Additional parameters
|
288
|
-
|
349
|
+
|
289
350
|
Returns:
|
290
351
|
Region object representing the area to the right
|
291
352
|
"""
|
292
353
|
return self._direction(
|
293
|
-
direction=
|
354
|
+
direction="right",
|
294
355
|
size=width,
|
295
356
|
cross_size=height,
|
296
357
|
include_element=include_element,
|
297
358
|
until=until,
|
298
359
|
include_endpoint=include_endpoint,
|
299
|
-
**kwargs
|
360
|
+
**kwargs,
|
300
361
|
)
|
301
|
-
|
362
|
+
|
302
363
|
@property
|
303
364
|
def type(self) -> str:
|
304
365
|
"""Element type."""
|
305
366
|
# Return the specific type if detected (e.g., from layout analysis)
|
306
367
|
# or 'region' as a default.
|
307
|
-
return self.region_type or
|
308
|
-
|
368
|
+
return self.region_type or "region" # Prioritize specific region_type if set
|
369
|
+
|
309
370
|
@property
|
310
|
-
def page(self) ->
|
371
|
+
def page(self) -> "Page":
|
311
372
|
"""Get the parent page."""
|
312
373
|
return self._page
|
313
|
-
|
374
|
+
|
314
375
|
@property
|
315
376
|
def bbox(self) -> Tuple[float, float, float, float]:
|
316
377
|
"""Get the bounding box as (x0, top, x1, bottom)."""
|
317
378
|
return self._bbox
|
318
|
-
|
379
|
+
|
319
380
|
@property
|
320
381
|
def x0(self) -> float:
|
321
382
|
"""Get the left coordinate."""
|
322
383
|
return self._bbox[0]
|
323
|
-
|
384
|
+
|
324
385
|
@property
|
325
386
|
def top(self) -> float:
|
326
387
|
"""Get the top coordinate."""
|
327
388
|
return self._bbox[1]
|
328
|
-
|
389
|
+
|
329
390
|
@property
|
330
391
|
def x1(self) -> float:
|
331
392
|
"""Get the right coordinate."""
|
332
393
|
return self._bbox[2]
|
333
|
-
|
394
|
+
|
334
395
|
@property
|
335
396
|
def bottom(self) -> float:
|
336
397
|
"""Get the bottom coordinate."""
|
337
398
|
return self._bbox[3]
|
338
|
-
|
399
|
+
|
339
400
|
@property
|
340
401
|
def width(self) -> float:
|
341
402
|
"""Get the width of the region."""
|
342
403
|
return self.x1 - self.x0
|
343
|
-
|
404
|
+
|
344
405
|
@property
|
345
406
|
def height(self) -> float:
|
346
407
|
"""Get the height of the region."""
|
347
408
|
return self.bottom - self.top
|
348
|
-
|
409
|
+
|
349
410
|
@property
|
350
411
|
def has_polygon(self) -> bool:
|
351
412
|
"""Check if this region has polygon coordinates."""
|
352
413
|
return self._polygon is not None and len(self._polygon) >= 3
|
353
|
-
|
414
|
+
|
354
415
|
@property
|
355
416
|
def polygon(self) -> List[Tuple[float, float]]:
|
356
417
|
"""Get polygon coordinates if available, otherwise return rectangle corners."""
|
@@ -359,141 +420,122 @@ class Region(DirectionalMixin):
|
|
359
420
|
else:
|
360
421
|
# Create rectangle corners from bbox as fallback
|
361
422
|
return [
|
362
|
-
(self.x0, self.top),
|
363
|
-
(self.x1, self.top),
|
364
|
-
(self.x1, self.bottom),
|
365
|
-
(self.x0, self.bottom)
|
423
|
+
(self.x0, self.top), # top-left
|
424
|
+
(self.x1, self.top), # top-right
|
425
|
+
(self.x1, self.bottom), # bottom-right
|
426
|
+
(self.x0, self.bottom), # bottom-left
|
366
427
|
]
|
367
|
-
|
428
|
+
|
368
429
|
def _is_point_in_polygon(self, x: float, y: float) -> bool:
|
369
430
|
"""
|
370
431
|
Check if a point is inside the polygon using ray casting algorithm.
|
371
|
-
|
432
|
+
|
372
433
|
Args:
|
373
434
|
x: X coordinate of the point
|
374
435
|
y: Y coordinate of the point
|
375
|
-
|
436
|
+
|
376
437
|
Returns:
|
377
438
|
bool: True if the point is inside the polygon
|
378
439
|
"""
|
379
440
|
if not self.has_polygon:
|
380
441
|
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
381
|
-
|
442
|
+
|
382
443
|
# Ray casting algorithm
|
383
444
|
inside = False
|
384
445
|
j = len(self.polygon) - 1
|
385
|
-
|
446
|
+
|
386
447
|
for i in range(len(self.polygon)):
|
387
|
-
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and
|
388
|
-
|
389
|
-
(self.polygon[j][
|
448
|
+
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
|
449
|
+
x
|
450
|
+
< (self.polygon[j][0] - self.polygon[i][0])
|
451
|
+
* (y - self.polygon[i][1])
|
452
|
+
/ (self.polygon[j][1] - self.polygon[i][1])
|
453
|
+
+ self.polygon[i][0]
|
454
|
+
):
|
390
455
|
inside = not inside
|
391
456
|
j = i
|
392
|
-
|
457
|
+
|
393
458
|
return inside
|
394
459
|
|
395
460
|
def is_point_inside(self, x: float, y: float) -> bool:
|
396
461
|
"""
|
397
462
|
Check if a point is inside this region using ray casting algorithm for polygons.
|
398
|
-
|
463
|
+
|
399
464
|
Args:
|
400
465
|
x: X coordinate of the point
|
401
466
|
y: Y coordinate of the point
|
402
|
-
|
467
|
+
|
403
468
|
Returns:
|
404
469
|
bool: True if the point is inside the region
|
405
470
|
"""
|
406
471
|
if not self.has_polygon:
|
407
472
|
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
408
|
-
|
473
|
+
|
409
474
|
# Ray casting algorithm
|
410
475
|
inside = False
|
411
476
|
j = len(self.polygon) - 1
|
412
|
-
|
477
|
+
|
413
478
|
for i in range(len(self.polygon)):
|
414
|
-
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and
|
415
|
-
|
416
|
-
(self.polygon[j][
|
479
|
+
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
|
480
|
+
x
|
481
|
+
< (self.polygon[j][0] - self.polygon[i][0])
|
482
|
+
* (y - self.polygon[i][1])
|
483
|
+
/ (self.polygon[j][1] - self.polygon[i][1])
|
484
|
+
+ self.polygon[i][0]
|
485
|
+
):
|
417
486
|
inside = not inside
|
418
487
|
j = i
|
419
|
-
|
488
|
+
|
420
489
|
return inside
|
421
490
|
|
422
|
-
def _is_element_in_region(self, element:
|
491
|
+
def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
|
423
492
|
"""
|
424
493
|
Check if an element is within this region.
|
425
|
-
|
494
|
+
|
426
495
|
Args:
|
427
496
|
element: Element to check
|
428
497
|
use_boundary_tolerance: Whether to apply a small tolerance for boundary elements
|
429
|
-
|
498
|
+
|
430
499
|
Returns:
|
431
500
|
True if the element is in the region, False otherwise
|
432
501
|
"""
|
433
502
|
# If we have multi-page elements cached, check if the element is in the list
|
434
503
|
if self._spans_pages and self._multi_page_elements is not None:
|
435
504
|
return element in self._multi_page_elements
|
436
|
-
|
505
|
+
|
437
506
|
# Check if element is on the same page
|
438
|
-
if element.page != self._page:
|
507
|
+
if not hasattr(element, "page") or element.page != self._page:
|
439
508
|
return False
|
440
|
-
|
509
|
+
|
441
510
|
# Calculate element center
|
511
|
+
# Ensure element has necessary attributes
|
512
|
+
if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
|
513
|
+
return False # Cannot determine position
|
514
|
+
|
442
515
|
element_center_x = (element.x0 + element.x1) / 2
|
443
516
|
element_center_y = (element.top + element.bottom) / 2
|
444
|
-
|
445
|
-
#
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
# For elements right at the boundary, be more conservative
|
458
|
-
return False
|
459
|
-
|
460
|
-
# If the element itself has a polygon, check if ANY corner is in this region
|
461
|
-
if hasattr(element, 'has_polygon') and element.has_polygon:
|
462
|
-
for point in element.polygon:
|
463
|
-
if self.is_point_inside(point[0], point[1]):
|
464
|
-
return True
|
465
|
-
# If no point is inside, check if the center is inside
|
466
|
-
return self.is_point_inside(element_center_x, element_center_y)
|
467
|
-
|
468
|
-
# For regular elements, check if center is in the region
|
469
|
-
# Add a small tolerance (1 pixel) to avoid including elements that are exactly on the boundary
|
470
|
-
# This ensures consistent behavior with the below() and above() method fixes
|
471
|
-
tolerance = 1.0 if use_boundary_tolerance else 0.0
|
472
|
-
|
473
|
-
# Check if within region with the tolerance applied
|
474
|
-
if self.has_polygon:
|
475
|
-
return self.is_point_inside(element_center_x, element_center_y)
|
476
|
-
else:
|
477
|
-
# For rectangular regions, apply tolerance to all sides
|
478
|
-
return (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
|
479
|
-
self.top + tolerance <= element_center_y <= self.bottom - tolerance)
|
480
|
-
|
481
|
-
def highlight(self,
|
482
|
-
label: Optional[str] = None,
|
483
|
-
color: Optional[Union[Tuple, str]] = None,
|
484
|
-
use_color_cycling: bool = False,
|
485
|
-
include_attrs: Optional[List[str]] = None,
|
486
|
-
existing: str = 'append') -> 'Region':
|
517
|
+
|
518
|
+
# Check if center point is inside the region's geometry
|
519
|
+
return self.is_point_inside(element_center_x, element_center_y)
|
520
|
+
|
521
|
+
def highlight(
|
522
|
+
self,
|
523
|
+
label: Optional[str] = None,
|
524
|
+
color: Optional[Union[Tuple, str]] = None,
|
525
|
+
use_color_cycling: bool = False,
|
526
|
+
include_attrs: Optional[List[str]] = None,
|
527
|
+
existing: str = "append",
|
528
|
+
) -> "Region":
|
487
529
|
"""
|
488
530
|
Highlight this region on the page.
|
489
|
-
|
531
|
+
|
490
532
|
Args:
|
491
533
|
label: Optional label for the highlight
|
492
534
|
color: Color tuple/string for the highlight, or None to use automatic color
|
493
535
|
use_color_cycling: Force color cycling even with no label (default: False)
|
494
536
|
include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
495
537
|
existing: How to handle existing highlights ('append' or 'replace').
|
496
|
-
|
538
|
+
|
497
539
|
Returns:
|
498
540
|
Self for method chaining
|
499
541
|
"""
|
@@ -508,7 +550,7 @@ class Region(DirectionalMixin):
|
|
508
550
|
"use_color_cycling": use_color_cycling,
|
509
551
|
"element": self, # Pass the region itself so attributes can be accessed
|
510
552
|
"include_attrs": include_attrs,
|
511
|
-
"existing": existing
|
553
|
+
"existing": existing,
|
512
554
|
}
|
513
555
|
|
514
556
|
# Call the appropriate service method
|
@@ -520,59 +562,68 @@ class Region(DirectionalMixin):
|
|
520
562
|
highlighter.add(**highlight_args)
|
521
563
|
|
522
564
|
return self
|
523
|
-
|
524
|
-
def to_image(
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
565
|
+
|
566
|
+
def to_image(
|
567
|
+
self,
|
568
|
+
scale: float = 2.0,
|
569
|
+
resolution: float = 150,
|
570
|
+
crop_only: bool = False,
|
571
|
+
include_highlights: bool = True,
|
572
|
+
**kwargs,
|
573
|
+
) -> "Image.Image":
|
530
574
|
"""
|
531
575
|
Generate an image of just this region.
|
532
|
-
|
576
|
+
|
533
577
|
Args:
|
534
578
|
resolution: Resolution in DPI for rendering (default: 150)
|
535
579
|
crop_only: If True, only crop the region without highlighting its boundaries
|
536
580
|
include_highlights: Whether to include existing highlights (default: True)
|
537
581
|
**kwargs: Additional parameters for page.to_image()
|
538
|
-
|
582
|
+
|
539
583
|
Returns:
|
540
584
|
PIL Image of just this region
|
541
585
|
"""
|
542
586
|
# First get the full page image with highlights if requested
|
543
|
-
page_image = self._page.to_image(
|
544
|
-
|
587
|
+
page_image = self._page.to_image(
|
588
|
+
scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
|
589
|
+
)
|
590
|
+
|
545
591
|
# Calculate the crop coordinates - apply resolution scaling factor
|
546
592
|
# PDF coordinates are in points (1/72 inch), but image is scaled by resolution
|
547
|
-
scale_factor =
|
548
|
-
|
593
|
+
scale_factor = resolution / 72.0 # Scale based on DPI
|
594
|
+
|
549
595
|
# Apply scaling to the coordinates
|
550
596
|
x0 = int(self.x0 * scale_factor)
|
551
597
|
top = int(self.top * scale_factor)
|
552
598
|
x1 = int(self.x1 * scale_factor)
|
553
599
|
bottom = int(self.bottom * scale_factor)
|
554
|
-
|
600
|
+
|
555
601
|
# Crop the image to just this region
|
556
602
|
region_image = page_image.crop((x0, top, x1, bottom))
|
557
|
-
|
603
|
+
|
558
604
|
# If not crop_only, add a border to highlight the region boundaries
|
559
605
|
if not crop_only:
|
560
606
|
from PIL import ImageDraw
|
561
|
-
|
607
|
+
|
562
608
|
# Create a 1px border around the region
|
563
609
|
draw = ImageDraw.Draw(region_image)
|
564
|
-
draw.rectangle(
|
565
|
-
|
566
|
-
|
610
|
+
draw.rectangle(
|
611
|
+
(0, 0, region_image.width - 1, region_image.height - 1),
|
612
|
+
outline=(255, 0, 0),
|
613
|
+
width=1,
|
614
|
+
)
|
615
|
+
|
567
616
|
return region_image
|
568
|
-
|
569
|
-
def show(
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
617
|
+
|
618
|
+
def show(
|
619
|
+
self,
|
620
|
+
scale: float = 2.0,
|
621
|
+
labels: bool = True,
|
622
|
+
legend_position: str = "right",
|
623
|
+
# Add a default color for standalone show
|
624
|
+
color: Optional[Union[Tuple, str]] = "blue",
|
625
|
+
label: Optional[str] = None,
|
626
|
+
) -> "Image.Image":
|
576
627
|
"""
|
577
628
|
Show the page with just this region highlighted temporarily.
|
578
629
|
|
@@ -593,16 +644,18 @@ class Region(DirectionalMixin):
|
|
593
644
|
service = self._page._highlighter
|
594
645
|
|
595
646
|
# Determine the label if not provided
|
596
|
-
display_label =
|
647
|
+
display_label = (
|
648
|
+
label if label is not None else f"Region ({self.type})" if self.type else "Region"
|
649
|
+
)
|
597
650
|
|
598
651
|
# Prepare temporary highlight data for just this region
|
599
652
|
temp_highlight_data = {
|
600
653
|
"page_index": self._page.index,
|
601
654
|
"bbox": self.bbox,
|
602
655
|
"polygon": self.polygon if self.has_polygon else None,
|
603
|
-
"color": color,
|
656
|
+
"color": color, # Use provided or default color
|
604
657
|
"label": display_label,
|
605
|
-
"use_color_cycling": False
|
658
|
+
"use_color_cycling": False, # Explicitly false for single preview
|
606
659
|
}
|
607
660
|
|
608
661
|
# Use render_preview to show only this highlight
|
@@ -611,452 +664,271 @@ class Region(DirectionalMixin):
|
|
611
664
|
temporary_highlights=[temp_highlight_data],
|
612
665
|
scale=scale,
|
613
666
|
labels=labels,
|
614
|
-
legend_position=legend_position
|
667
|
+
legend_position=legend_position,
|
615
668
|
)
|
616
669
|
|
617
|
-
def save(
|
618
|
-
|
619
|
-
|
620
|
-
labels: bool = True,
|
621
|
-
legend_position: str = 'right') -> 'Region':
|
670
|
+
def save(
|
671
|
+
self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
|
672
|
+
) -> "Region":
|
622
673
|
"""
|
623
674
|
Save the page with this region highlighted to an image file.
|
624
|
-
|
675
|
+
|
625
676
|
Args:
|
626
677
|
filename: Path to save the image to
|
627
678
|
scale: Scale factor for rendering
|
628
679
|
labels: Whether to include a legend for labels
|
629
680
|
legend_position: Position of the legend
|
630
|
-
|
681
|
+
|
631
682
|
Returns:
|
632
683
|
Self for method chaining
|
633
684
|
"""
|
634
685
|
# Highlight this region if not already highlighted
|
635
686
|
self.highlight()
|
636
|
-
|
687
|
+
|
637
688
|
# Save the highlighted image
|
638
689
|
self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
|
639
690
|
return self
|
640
|
-
|
641
|
-
def save_image(
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
691
|
+
|
692
|
+
def save_image(
|
693
|
+
self,
|
694
|
+
filename: str,
|
695
|
+
resolution: float = 150,
|
696
|
+
crop_only: bool = False,
|
697
|
+
include_highlights: bool = True,
|
698
|
+
**kwargs,
|
699
|
+
) -> "Region":
|
647
700
|
"""
|
648
701
|
Save an image of just this region to a file.
|
649
|
-
|
702
|
+
|
650
703
|
Args:
|
651
704
|
filename: Path to save the image to
|
652
705
|
resolution: Resolution in DPI for rendering (default: 150)
|
653
706
|
crop_only: If True, only crop the region without highlighting its boundaries
|
654
707
|
include_highlights: Whether to include existing highlights (default: True)
|
655
708
|
**kwargs: Additional parameters for page.to_image()
|
656
|
-
|
709
|
+
|
657
710
|
Returns:
|
658
711
|
Self for method chaining
|
659
712
|
"""
|
660
713
|
# Get the region image
|
661
714
|
image = self.to_image(
|
662
|
-
resolution=resolution,
|
663
|
-
crop_only=crop_only,
|
715
|
+
resolution=resolution,
|
716
|
+
crop_only=crop_only,
|
664
717
|
include_highlights=include_highlights,
|
665
|
-
**kwargs
|
718
|
+
**kwargs,
|
666
719
|
)
|
667
|
-
|
720
|
+
|
668
721
|
# Save the image
|
669
722
|
image.save(filename)
|
670
723
|
return self
|
671
|
-
|
672
|
-
def get_elements(
|
724
|
+
|
725
|
+
def get_elements(
|
726
|
+
self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
|
727
|
+
) -> List["Element"]:
|
673
728
|
"""
|
674
729
|
Get all elements within this region.
|
675
|
-
|
730
|
+
|
676
731
|
Args:
|
677
732
|
selector: Optional selector to filter elements
|
678
733
|
apply_exclusions: Whether to apply exclusion regions
|
679
734
|
**kwargs: Additional parameters for element filtering
|
680
|
-
|
735
|
+
|
681
736
|
Returns:
|
682
737
|
List of elements in the region
|
683
738
|
"""
|
684
739
|
# If we have multi-page elements, return those
|
685
740
|
if self._spans_pages and self._multi_page_elements is not None:
|
741
|
+
# TODO: Apply selector to multi-page elements if needed
|
686
742
|
return self._multi_page_elements
|
687
|
-
|
743
|
+
|
688
744
|
# Otherwise, get elements from the page
|
689
745
|
if selector:
|
690
|
-
elements
|
746
|
+
# Find elements on the page matching the selector
|
747
|
+
page_elements = self.page.find_all(
|
748
|
+
selector, apply_exclusions=apply_exclusions, **kwargs
|
749
|
+
)
|
750
|
+
# Filter those elements to only include ones within this region
|
751
|
+
return [e for e in page_elements if self._is_element_in_region(e)]
|
691
752
|
else:
|
692
|
-
elements
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
def extract_text(self,
|
753
|
+
# Get all elements from the page
|
754
|
+
page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
|
755
|
+
# Filter to elements in this region
|
756
|
+
return [e for e in page_elements if self._is_element_in_region(e)]
|
757
|
+
|
758
|
+
def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
|
698
759
|
"""
|
699
|
-
Extract text from this region using pdfplumber's
|
700
|
-
|
701
|
-
|
702
|
-
1. Associated text elements from the PDF (if available)
|
703
|
-
2. Direct text content from Docling (if available)
|
704
|
-
3. Fall back to standard pdfplumber extraction
|
705
|
-
|
760
|
+
Extract text from this region, respecting page exclusions and using pdfplumber's
|
761
|
+
layout engine (chars_to_textmap).
|
762
|
+
|
706
763
|
Args:
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
764
|
+
apply_exclusions: Whether to apply exclusion regions defined on the parent page.
|
765
|
+
debug: Enable verbose debugging output for filtering steps.
|
766
|
+
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
767
|
+
`chars_to_textmap` function (e.g., layout, x_density, y_density).
|
768
|
+
See Page.extract_text docstring for more.
|
769
|
+
|
714
770
|
Returns:
|
715
|
-
Extracted text as string
|
771
|
+
Extracted text as string, potentially with layout-based spacing.
|
716
772
|
"""
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
if self._spans_pages and self._multi_page_elements is not None:
|
745
|
-
# Sort elements in reading order - only include text-like elements
|
746
|
-
text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
|
747
|
-
|
748
|
-
# Sort in reading order (by page, then top-to-bottom, left-to-right)
|
749
|
-
sorted_elements = sorted(text_elements, key=lambda e: (e.page.index, e.top, e.x0))
|
750
|
-
|
751
|
-
# Extract text directly from elements to avoid recursion
|
752
|
-
texts = []
|
753
|
-
for element in sorted_elements:
|
754
|
-
if hasattr(element, 'text'):
|
755
|
-
texts.append(element.text)
|
756
|
-
|
757
|
-
text_result = " ".join(texts)
|
758
|
-
return text_result
|
759
|
-
|
760
|
-
# Check if we have exclusions to apply
|
773
|
+
# Allow 'debug_exclusions' for backward compatibility
|
774
|
+
debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
|
775
|
+
logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
|
776
|
+
|
777
|
+
# --- Handle Docling source (priority) --- DEPRECATED or Adapt?
|
778
|
+
# For now, let's bypass this and always use the standard extraction flow
|
779
|
+
# based on contained elements to ensure consistency.
|
780
|
+
# if self.model == 'docling' or hasattr(self, 'text_content'): ...
|
781
|
+
|
782
|
+
# 1. Get Word Elements potentially within this region (initial broad phase)
|
783
|
+
# Optimization: Could use spatial query if page elements were indexed
|
784
|
+
page_words = self.page.words # Get all words from the page
|
785
|
+
|
786
|
+
# 2. Gather all character dicts from words potentially in region
|
787
|
+
# We filter precisely in filter_chars_spatially
|
788
|
+
all_char_dicts = []
|
789
|
+
for word in page_words:
|
790
|
+
# Quick bbox check to avoid processing words clearly outside
|
791
|
+
if get_bbox_overlap(self.bbox, word.bbox) is not None:
|
792
|
+
all_char_dicts.extend(getattr(word, "_char_dicts", []))
|
793
|
+
|
794
|
+
if not all_char_dicts:
|
795
|
+
logger.debug(f"Region {self.bbox}: No character dicts found overlapping region bbox.")
|
796
|
+
return ""
|
797
|
+
|
798
|
+
# 3. Get Relevant Exclusions (overlapping this region)
|
799
|
+
apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
|
761
800
|
exclusion_regions = []
|
762
|
-
if
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
has_intersection = False
|
772
|
-
for i, exclusion in enumerate(exclusion_regions):
|
773
|
-
# Use a simple bbox overlap check
|
774
|
-
overlap = (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
|
775
|
-
self.top < exclusion.bottom and self.bottom > exclusion.top)
|
776
|
-
|
777
|
-
if overlap:
|
778
|
-
has_intersection = True
|
779
|
-
if debug:
|
780
|
-
logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
|
781
|
-
break
|
782
|
-
|
783
|
-
# If no intersection, process without exclusions
|
784
|
-
if not has_intersection:
|
785
|
-
if debug:
|
786
|
-
logger.debug(f" No intersection with any exclusion, ignoring exclusions")
|
787
|
-
apply_exclusions = False
|
788
|
-
exclusion_regions = []
|
789
|
-
|
790
|
-
# IMPROVEMENT 2: If rectangular region + full-width exclusions (headers/footers),
|
791
|
-
# we can use the simpler cropping approach
|
792
|
-
# Only use crop for simple cases
|
793
|
-
can_use_crop = not self.has_polygon
|
794
|
-
result = "" # Default empty result
|
795
|
-
if can_use_crop and apply_exclusions and exclusion_regions:
|
796
|
-
# We'll keep track of exclusions that are full-width horizontal bands (headers/footers)
|
797
|
-
# and those that are not
|
798
|
-
footer_header_exclusions = []
|
799
|
-
other_exclusions = []
|
800
|
-
|
801
|
-
for i, exclusion in enumerate(exclusion_regions):
|
802
|
-
# Check if exclusion spans the full width of the page
|
803
|
-
# and is either at the top or bottom
|
804
|
-
full_width = (abs(exclusion.x0) < 5 and
|
805
|
-
abs(exclusion.x1 - self.page.width) < 5)
|
806
|
-
|
807
|
-
if debug:
|
808
|
-
logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
|
809
|
-
|
810
|
-
if full_width:
|
811
|
-
footer_header_exclusions.append(exclusion)
|
812
|
-
else:
|
813
|
-
other_exclusions.append(exclusion)
|
814
|
-
|
815
|
-
# If we have only header/footer exclusions, we can use the cropping approach
|
816
|
-
all_are_bands = len(other_exclusions) == 0 and len(footer_header_exclusions) > 0
|
817
|
-
|
818
|
-
if all_are_bands:
|
819
|
-
# Find the actual content area after excluding header/footer
|
820
|
-
top_bound = self.top
|
821
|
-
bottom_bound = self.bottom
|
822
|
-
|
823
|
-
if debug:
|
824
|
-
logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
825
|
-
|
826
|
-
# Process only header/footer exclusions for cropping
|
827
|
-
for exclusion in footer_header_exclusions:
|
828
|
-
# If exclusion is at the top of our region
|
829
|
-
if exclusion.bottom > self.top and exclusion.top <= self.top:
|
830
|
-
# Move top bound to exclude the header
|
831
|
-
top_bound = max(top_bound, exclusion.bottom)
|
832
|
-
if debug:
|
833
|
-
logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
|
834
|
-
|
835
|
-
# If exclusion is at the bottom of our region
|
836
|
-
if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
|
837
|
-
# Move bottom bound to exclude the footer
|
838
|
-
bottom_bound = min(bottom_bound, exclusion.top)
|
839
|
-
if debug:
|
840
|
-
logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
|
841
|
-
|
842
|
-
|
843
|
-
if debug:
|
844
|
-
logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
845
|
-
|
846
|
-
# If we still have a valid region after exclusions
|
847
|
-
if top_bound < bottom_bound:
|
848
|
-
# Use direct crop with adjusted bounds
|
849
|
-
crop_bbox = (self.x0, top_bound, self.x1, bottom_bound)
|
850
|
-
cropped = self.page._page.crop(crop_bbox)
|
851
|
-
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
852
|
-
|
853
|
-
if debug:
|
854
|
-
logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
|
855
|
-
|
856
|
-
# Skip the complex filtering approach
|
857
|
-
return result
|
858
|
-
else:
|
859
|
-
# This would only happen if the region is entirely inside an exclusion zone
|
860
|
-
# or if both top and bottom of the region are excluded leaving no valid area
|
861
|
-
logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
|
862
|
-
return ""
|
863
|
-
# We have exclusions, but not all are headers/footers,
|
864
|
-
# or we have a non-rectangular region
|
865
|
-
else:
|
866
|
-
if debug:
|
867
|
-
logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
|
868
|
-
|
869
|
-
# Don't use crop for mixed exclusion types
|
870
|
-
can_use_crop = False
|
871
|
-
|
872
|
-
# If we got a result from header/footer cropping, return it
|
873
|
-
if result:
|
874
|
-
return result
|
875
|
-
|
876
|
-
# For single-page regions without exclusions, or when exclusions don't apply, use direct cropping
|
877
|
-
if can_use_crop and not apply_exclusions:
|
878
|
-
# Simple case: use direct crop
|
879
|
-
crop_bbox = self.bbox
|
880
|
-
cropped = self.page._page.crop(crop_bbox)
|
881
|
-
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
882
|
-
return result
|
883
|
-
|
884
|
-
# For all other cases (complex exclusions, polygons), we use element filtering
|
885
|
-
if debug:
|
886
|
-
logger.debug(f"Using element filtering approach for region {self.bbox}")
|
887
|
-
|
888
|
-
# Get only word elements in this region first (instead of ALL elements)
|
889
|
-
# This prevents duplication from joining both char and word text
|
890
|
-
all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
|
891
|
-
|
892
|
-
if apply_exclusions and exclusion_regions:
|
893
|
-
if debug:
|
894
|
-
logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
|
895
|
-
|
896
|
-
# Filter out elements in exclusion zones
|
897
|
-
filtered_elements = []
|
898
|
-
for elem in all_elements:
|
899
|
-
in_exclusion = False
|
900
|
-
# For each element, check if it's in any exclusion zone
|
901
|
-
element_center_x = (elem.x0 + elem.x1) / 2
|
902
|
-
element_center_y = (elem.top + elem.bottom) / 2
|
903
|
-
|
904
|
-
for exclusion in exclusion_regions:
|
905
|
-
if (exclusion.x0 <= element_center_x <= exclusion.x1 and
|
906
|
-
exclusion.top <= element_center_y <= exclusion.bottom):
|
907
|
-
in_exclusion = True
|
908
|
-
break
|
909
|
-
|
910
|
-
if not in_exclusion:
|
911
|
-
filtered_elements.append(elem)
|
912
|
-
else:
|
913
|
-
# No exclusions, use all elements
|
914
|
-
filtered_elements = all_elements
|
915
|
-
|
916
|
-
# Now extract text from the filtered elements
|
917
|
-
if filtered_elements:
|
918
|
-
from natural_pdf.elements.collections import ElementCollection
|
919
|
-
collection = ElementCollection(filtered_elements)
|
920
|
-
# Sort in reading order
|
921
|
-
collection = collection.sort(key=lambda e: (e.top, e.x0))
|
922
|
-
# Extract text
|
923
|
-
result = " ".join(e.text for e in collection if hasattr(e, 'text'))
|
924
|
-
|
925
|
-
if debug:
|
926
|
-
logger.debug(f"Got {len(result)} characters from element-based extraction")
|
927
|
-
|
928
|
-
# Return the result
|
929
|
-
return result
|
930
|
-
else:
|
801
|
+
if apply_exclusions_flag and self._page._exclusions:
|
802
|
+
all_page_exclusions = self._page._get_exclusion_regions(
|
803
|
+
include_callable=True, debug=debug
|
804
|
+
)
|
805
|
+
overlapping_exclusions = []
|
806
|
+
for excl in all_page_exclusions:
|
807
|
+
if get_bbox_overlap(self.bbox, excl.bbox) is not None:
|
808
|
+
overlapping_exclusions.append(excl)
|
809
|
+
exclusion_regions = overlapping_exclusions
|
931
810
|
if debug:
|
932
|
-
logger.debug(
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
|
939
|
-
#
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
filtered_ocr.append(element)
|
956
|
-
else:
|
957
|
-
filtered_ocr = ocr_elements
|
958
|
-
|
959
|
-
# Extract text from OCR elements
|
960
|
-
from natural_pdf.elements.collections import ElementCollection
|
961
|
-
ocr_collection = ElementCollection(filtered_ocr)
|
962
|
-
ocr_text = ocr_collection.extract_text(preserve_whitespace=keep_blank_chars, **kwargs)
|
963
|
-
|
964
|
-
# Use OCR text if it's not empty
|
965
|
-
if ocr_text.strip():
|
966
|
-
return ocr_text
|
967
|
-
|
811
|
+
logger.debug(
|
812
|
+
f"Region {self.bbox}: Applying {len(exclusion_regions)} overlapping exclusions."
|
813
|
+
)
|
814
|
+
elif debug:
|
815
|
+
logger.debug(f"Region {self.bbox}: Not applying exclusions.")
|
816
|
+
|
817
|
+
# 4. Spatially Filter Characters using Utility
|
818
|
+
# Pass self as the target_region for precise polygon checks etc.
|
819
|
+
filtered_chars = filter_chars_spatially(
|
820
|
+
char_dicts=all_char_dicts,
|
821
|
+
exclusion_regions=exclusion_regions,
|
822
|
+
target_region=self, # Pass self!
|
823
|
+
debug=debug,
|
824
|
+
)
|
825
|
+
|
826
|
+
# 5. Generate Text Layout using Utility
|
827
|
+
result = generate_text_layout(
|
828
|
+
char_dicts=filtered_chars,
|
829
|
+
layout_context_bbox=self.bbox, # Use region's bbox for context
|
830
|
+
user_kwargs=kwargs,
|
831
|
+
)
|
832
|
+
|
833
|
+
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
968
834
|
return result
|
969
|
-
|
970
|
-
def extract_table(
|
971
|
-
|
835
|
+
|
836
|
+
def extract_table(
|
837
|
+
self,
|
838
|
+
method: str = None,
|
839
|
+
table_settings: dict = None,
|
840
|
+
use_ocr: bool = False,
|
841
|
+
ocr_config: dict = None,
|
842
|
+
) -> List[List[str]]:
|
972
843
|
"""
|
973
844
|
Extract a table from this region.
|
974
|
-
|
845
|
+
|
975
846
|
Args:
|
976
847
|
method: Method to use for extraction ('tatr', 'plumber', or None for auto-detection)
|
977
848
|
table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
|
978
849
|
use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
|
979
850
|
ocr_config: OCR configuration parameters
|
980
|
-
|
851
|
+
|
981
852
|
Returns:
|
982
853
|
Table data as a list of rows, where each row is a list of cell values
|
983
854
|
"""
|
984
855
|
# Default settings if none provided
|
985
856
|
if table_settings is None:
|
986
857
|
table_settings = {}
|
987
|
-
|
858
|
+
|
988
859
|
# Auto-detect method if not specified
|
989
860
|
if method is None:
|
990
861
|
# If this is a TATR-detected region, use TATR method
|
991
|
-
if hasattr(self,
|
992
|
-
method =
|
862
|
+
if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
|
863
|
+
method = "tatr"
|
993
864
|
else:
|
994
|
-
method =
|
995
|
-
|
865
|
+
method = "plumber"
|
866
|
+
|
996
867
|
# Use the selected method
|
997
|
-
if method ==
|
868
|
+
if method == "tatr":
|
998
869
|
return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
|
999
870
|
else: # Default to pdfplumber
|
1000
871
|
return self._extract_table_plumber(table_settings)
|
1001
|
-
|
872
|
+
|
1002
873
|
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
|
1003
874
|
"""
|
1004
875
|
Extract table using pdfplumber's table extraction.
|
1005
|
-
|
876
|
+
|
1006
877
|
Args:
|
1007
878
|
table_settings: Settings for pdfplumber table extraction
|
1008
|
-
|
879
|
+
|
1009
880
|
Returns:
|
1010
881
|
Table data as a list of rows, where each row is a list of cell values
|
1011
882
|
"""
|
1012
883
|
# Create a crop of the page for this region
|
1013
884
|
cropped = self.page._page.crop(self.bbox)
|
1014
|
-
|
885
|
+
|
1015
886
|
# Extract table from the cropped area
|
1016
887
|
tables = cropped.extract_tables(table_settings)
|
1017
|
-
|
888
|
+
|
1018
889
|
# Return the first table or an empty list if none found
|
1019
890
|
if tables:
|
1020
891
|
return tables[0]
|
1021
892
|
return []
|
1022
|
-
|
893
|
+
|
1023
894
|
def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
|
1024
895
|
"""
|
1025
896
|
Extract table using TATR structure detection.
|
1026
|
-
|
897
|
+
|
1027
898
|
Args:
|
1028
899
|
use_ocr: Whether to apply OCR to each cell for better text extraction
|
1029
900
|
ocr_config: Optional OCR configuration parameters
|
1030
|
-
|
901
|
+
|
1031
902
|
Returns:
|
1032
903
|
Table data as a list of rows, where each row is a list of cell values
|
1033
904
|
"""
|
1034
905
|
# Find all rows and headers in this table
|
1035
|
-
rows = self.page.find_all(f
|
1036
|
-
headers = self.page.find_all(f
|
1037
|
-
columns = self.page.find_all(f
|
1038
|
-
|
906
|
+
rows = self.page.find_all(f"region[type=table-row][model=tatr]")
|
907
|
+
headers = self.page.find_all(f"region[type=table-column-header][model=tatr]")
|
908
|
+
columns = self.page.find_all(f"region[type=table-column][model=tatr]")
|
909
|
+
|
1039
910
|
# Filter to only include rows/headers/columns that overlap with this table region
|
1040
911
|
def is_in_table(region):
|
1041
912
|
# Check for overlap - simplifying to center point for now
|
1042
913
|
region_center_x = (region.x0 + region.x1) / 2
|
1043
914
|
region_center_y = (region.top + region.bottom) / 2
|
1044
|
-
return (
|
1045
|
-
|
1046
|
-
|
915
|
+
return (
|
916
|
+
self.x0 <= region_center_x <= self.x1 and self.top <= region_center_y <= self.bottom
|
917
|
+
)
|
918
|
+
|
1047
919
|
rows = [row for row in rows if is_in_table(row)]
|
1048
920
|
headers = [header for header in headers if is_in_table(header)]
|
1049
921
|
columns = [column for column in columns if is_in_table(column)]
|
1050
|
-
|
922
|
+
|
1051
923
|
# Sort rows by vertical position (top to bottom)
|
1052
924
|
rows.sort(key=lambda r: r.top)
|
1053
|
-
|
925
|
+
|
1054
926
|
# Sort columns by horizontal position (left to right)
|
1055
927
|
columns.sort(key=lambda c: c.x0)
|
1056
|
-
|
928
|
+
|
1057
929
|
# Create table data structure
|
1058
930
|
table_data = []
|
1059
|
-
|
931
|
+
|
1060
932
|
# Prepare OCR config if needed
|
1061
933
|
if use_ocr:
|
1062
934
|
# Default OCR config focuses on small text with low confidence
|
@@ -1065,16 +937,20 @@ class Region(DirectionalMixin):
|
|
1065
937
|
"min_confidence": 0.1, # Lower than default to catch more text
|
1066
938
|
"detection_params": {
|
1067
939
|
"text_threshold": 0.1, # Lower threshold for low-contrast text
|
1068
|
-
"link_threshold": 0.1 # Lower threshold for connecting text components
|
1069
|
-
}
|
940
|
+
"link_threshold": 0.1, # Lower threshold for connecting text components
|
941
|
+
},
|
1070
942
|
}
|
1071
|
-
|
943
|
+
|
1072
944
|
# Merge with provided config if any
|
1073
945
|
if ocr_config:
|
1074
946
|
if isinstance(ocr_config, dict):
|
1075
947
|
# Update default config with provided values
|
1076
948
|
for key, value in ocr_config.items():
|
1077
|
-
if
|
949
|
+
if (
|
950
|
+
isinstance(value, dict)
|
951
|
+
and key in default_ocr_config
|
952
|
+
and isinstance(default_ocr_config[key], dict)
|
953
|
+
):
|
1078
954
|
# Merge nested dicts
|
1079
955
|
default_ocr_config[key].update(value)
|
1080
956
|
else:
|
@@ -1083,10 +959,10 @@ class Region(DirectionalMixin):
|
|
1083
959
|
else:
|
1084
960
|
# Not a dict, use as is
|
1085
961
|
default_ocr_config = ocr_config
|
1086
|
-
|
962
|
+
|
1087
963
|
# Use the merged config
|
1088
964
|
ocr_config = default_ocr_config
|
1089
|
-
|
965
|
+
|
1090
966
|
# Add header row if headers were detected
|
1091
967
|
if headers:
|
1092
968
|
header_texts = []
|
@@ -1099,30 +975,28 @@ class Region(DirectionalMixin):
|
|
1099
975
|
if ocr_text:
|
1100
976
|
header_texts.append(ocr_text)
|
1101
977
|
continue
|
1102
|
-
|
978
|
+
|
1103
979
|
# Fallback to normal extraction
|
1104
980
|
header_texts.append(header.extract_text().strip())
|
1105
981
|
table_data.append(header_texts)
|
1106
|
-
|
982
|
+
|
1107
983
|
# Process rows
|
1108
984
|
for row in rows:
|
1109
985
|
row_cells = []
|
1110
|
-
|
986
|
+
|
1111
987
|
# If we have columns, use them to extract cells
|
1112
988
|
if columns:
|
1113
989
|
for column in columns:
|
1114
990
|
# Create a cell region at the intersection of row and column
|
1115
|
-
cell_bbox = (
|
1116
|
-
|
1117
|
-
row.top,
|
1118
|
-
column.x1,
|
1119
|
-
row.bottom
|
1120
|
-
)
|
1121
|
-
|
991
|
+
cell_bbox = (column.x0, row.top, column.x1, row.bottom)
|
992
|
+
|
1122
993
|
# Create a region for this cell
|
1123
|
-
from natural_pdf.elements.region import
|
994
|
+
from natural_pdf.elements.region import ( # Import here to avoid circular imports
|
995
|
+
Region,
|
996
|
+
)
|
997
|
+
|
1124
998
|
cell_region = Region(self.page, cell_bbox)
|
1125
|
-
|
999
|
+
|
1126
1000
|
# Extract text from the cell
|
1127
1001
|
if use_ocr:
|
1128
1002
|
# Apply OCR to the cell
|
@@ -1133,7 +1007,7 @@ class Region(DirectionalMixin):
|
|
1133
1007
|
if ocr_text:
|
1134
1008
|
row_cells.append(ocr_text)
|
1135
1009
|
continue
|
1136
|
-
|
1010
|
+
|
1137
1011
|
# Fallback to normal extraction
|
1138
1012
|
cell_text = cell_region.extract_text().strip()
|
1139
1013
|
row_cells.append(cell_text)
|
@@ -1147,182 +1021,215 @@ class Region(DirectionalMixin):
|
|
1147
1021
|
if ocr_text:
|
1148
1022
|
row_cells.append(ocr_text)
|
1149
1023
|
continue
|
1150
|
-
|
1024
|
+
|
1151
1025
|
# Fallback to normal extraction
|
1152
1026
|
row_cells.append(row.extract_text().strip())
|
1153
|
-
|
1027
|
+
|
1154
1028
|
table_data.append(row_cells)
|
1155
|
-
|
1029
|
+
|
1156
1030
|
return table_data
|
1157
|
-
|
1158
|
-
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[
|
1031
|
+
|
1032
|
+
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
|
1159
1033
|
"""
|
1160
1034
|
Find the first element in this region matching the selector.
|
1161
|
-
|
1035
|
+
|
1162
1036
|
Args:
|
1163
1037
|
selector: CSS-like selector string
|
1164
1038
|
apply_exclusions: Whether to apply exclusion regions
|
1165
1039
|
**kwargs: Additional parameters for element filtering
|
1166
|
-
|
1040
|
+
|
1167
1041
|
Returns:
|
1168
1042
|
First matching element or None
|
1169
1043
|
"""
|
1170
1044
|
elements = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1171
|
-
return elements
|
1172
|
-
|
1173
|
-
def
|
1045
|
+
return elements.first if elements else None # Use .first property
|
1046
|
+
|
1047
|
+
def find_all(
|
1048
|
+
self, selector: str, apply_exclusions=True, **kwargs
|
1049
|
+
) -> "ElementCollection": # Changed from _find_all
|
1174
1050
|
"""
|
1175
1051
|
Find all elements in this region matching the selector.
|
1176
|
-
|
1052
|
+
|
1177
1053
|
Args:
|
1178
1054
|
selector: CSS-like selector string
|
1179
1055
|
apply_exclusions: Whether to apply exclusion regions
|
1180
1056
|
**kwargs: Additional parameters for element filtering
|
1181
|
-
|
1057
|
+
|
1182
1058
|
Returns:
|
1183
1059
|
ElementCollection with matching elements
|
1184
1060
|
"""
|
1185
1061
|
from natural_pdf.elements.collections import ElementCollection
|
1186
1062
|
|
1187
1063
|
# If we span multiple pages, filter our elements
|
1064
|
+
# TODO: Revisit multi-page region logic
|
1188
1065
|
if self._spans_pages and self._multi_page_elements is not None:
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
page_ranges[element.page] = []
|
1202
|
-
page_ranges[element.page].append(element)
|
1203
|
-
|
1204
|
-
# For each page, use its find_all to match elements, then filter to our collection
|
1205
|
-
for page, page_elements in page_ranges.items():
|
1206
|
-
# Get all matching elements from the page
|
1207
|
-
page_matches = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1208
|
-
|
1209
|
-
# Filter to just the elements that are in our collection
|
1210
|
-
for element in page_matches:
|
1211
|
-
if element in page_elements:
|
1212
|
-
all_matching_elements.append(element)
|
1213
|
-
|
1214
|
-
return ElementCollection(all_matching_elements)
|
1066
|
+
logger.warning("find_all on multi-page regions is not fully implemented.")
|
1067
|
+
# Temporary: Apply filter directly to cached elements
|
1068
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1069
|
+
|
1070
|
+
try:
|
1071
|
+
selector_obj = parse_selector(selector)
|
1072
|
+
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1073
|
+
matching = [el for el in self._multi_page_elements if filter_func(el)]
|
1074
|
+
return ElementCollection(matching)
|
1075
|
+
except Exception as e:
|
1076
|
+
logger.error(f"Error applying selector to multi-page region elements: {e}")
|
1077
|
+
return ElementCollection([])
|
1215
1078
|
|
1216
1079
|
# Otherwise, get elements from the page and filter by selector and region
|
1217
1080
|
page_elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1081
|
+
# Use the precise _is_element_in_region check
|
1218
1082
|
filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
|
1219
1083
|
return ElementCollection(filtered_elements)
|
1220
|
-
|
1221
|
-
def apply_ocr(self, **ocr_params) -> List[
|
1084
|
+
|
1085
|
+
def apply_ocr(self, **ocr_params) -> List["TextElement"]: # Return type hint updated
|
1222
1086
|
"""
|
1223
1087
|
Apply OCR to this region and return the created text elements.
|
1224
|
-
|
1088
|
+
|
1225
1089
|
Args:
|
1226
|
-
**ocr_params: OCR parameters to override defaults
|
1227
|
-
|
1090
|
+
**ocr_params: OCR parameters to override defaults (passed to OCRManager)
|
1091
|
+
|
1228
1092
|
Returns:
|
1229
|
-
List of created
|
1093
|
+
List of created TextElement objects representing OCR words/lines.
|
1230
1094
|
"""
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
if isinstance(ocr_params, dict):
|
1235
|
-
ocr_params["verbose"] = False
|
1236
|
-
else:
|
1237
|
-
ocr_params = {"enabled": True, "verbose": False}
|
1238
|
-
|
1239
|
-
ocr_config = self.page._get_ocr_config(ocr_params)
|
1240
|
-
|
1241
|
-
# Skip if OCR is disabled
|
1242
|
-
if not ocr_config.get('enabled'):
|
1095
|
+
# Ensure OCRManager is available
|
1096
|
+
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1097
|
+
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
1243
1098
|
return []
|
1244
|
-
|
1245
|
-
|
1246
|
-
|
1247
|
-
|
1248
|
-
#
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
#
|
1256
|
-
|
1257
|
-
#
|
1258
|
-
|
1259
|
-
|
1260
|
-
result['bbox'][1] + self.top,
|
1261
|
-
result['bbox'][2] + self.x0,
|
1262
|
-
result['bbox'][3] + self.top
|
1099
|
+
ocr_mgr = self.page._parent._ocr_manager
|
1100
|
+
|
1101
|
+
# Get OCR configuration from kwargs or PDF defaults if needed
|
1102
|
+
# We'll mostly rely on passing ocr_params directly to the manager
|
1103
|
+
# For rendering, use a reasonable default scale
|
1104
|
+
ocr_image_scale = self.page._parent._config.get("ocr_image_scale", 2.0)
|
1105
|
+
|
1106
|
+
logger.debug(
|
1107
|
+
f"Region {self.bbox}: Applying OCR with scale {ocr_image_scale} and params: {ocr_params}"
|
1108
|
+
)
|
1109
|
+
|
1110
|
+
# Render the page region to an image
|
1111
|
+
try:
|
1112
|
+
# Crop the page image to this region's bbox
|
1113
|
+
region_image = self.to_image(
|
1114
|
+
scale=ocr_image_scale, include_highlights=False, crop_only=True
|
1263
1115
|
)
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1116
|
+
if not region_image:
|
1117
|
+
logger.error("Failed to render region to image for OCR.")
|
1118
|
+
return []
|
1119
|
+
logger.debug(f"Region rendered to image size: {region_image.size}")
|
1120
|
+
except Exception as e:
|
1121
|
+
logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
|
1122
|
+
return []
|
1123
|
+
|
1124
|
+
# Run OCR on this region's image using the manager
|
1125
|
+
try:
|
1126
|
+
# Pass the single image and any specific options/kwargs
|
1127
|
+
# The manager handles engine selection based on ocr_params or defaults
|
1128
|
+
results = ocr_mgr.apply_ocr(images=region_image, **ocr_params)
|
1129
|
+
# apply_ocr returns List[Dict] for single image
|
1130
|
+
if not isinstance(results, list):
|
1131
|
+
logger.error(
|
1132
|
+
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
1133
|
+
)
|
1134
|
+
return []
|
1135
|
+
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
1136
|
+
except Exception as e:
|
1137
|
+
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
1138
|
+
return []
|
1139
|
+
|
1140
|
+
# Convert results to TextElements, scaling coordinates relative to the page
|
1141
|
+
# Calculate scaling factors based on the region image vs the region PDF coords
|
1142
|
+
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
1143
|
+
scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
|
1144
|
+
logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
|
1145
|
+
|
1146
|
+
created_elements = []
|
1267
1147
|
for result in results:
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1148
|
+
try:
|
1149
|
+
img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
|
1150
|
+
pdf_height = (img_bottom - img_top) * scale_y
|
1151
|
+
|
1152
|
+
# Convert IMAGE coordinates (relative to region crop) to PAGE coordinates
|
1153
|
+
page_x0 = self.x0 + (img_x0 * scale_x)
|
1154
|
+
page_top = self.top + (img_top * scale_y)
|
1155
|
+
page_x1 = self.x0 + (img_x1 * scale_x)
|
1156
|
+
page_bottom = self.top + (img_bottom * scale_y)
|
1157
|
+
|
1158
|
+
# Create element data using PAGE coordinates
|
1275
1159
|
element_data = {
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
1284
|
-
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1160
|
+
"text": result["text"],
|
1161
|
+
"x0": page_x0,
|
1162
|
+
"top": page_top,
|
1163
|
+
"x1": page_x1,
|
1164
|
+
"bottom": page_bottom,
|
1165
|
+
"width": page_x1 - page_x0,
|
1166
|
+
"height": page_bottom - page_top,
|
1167
|
+
"object_type": "word", # Treat as word
|
1168
|
+
"source": "ocr",
|
1169
|
+
"confidence": float(result.get("confidence", 0.0)),
|
1170
|
+
"fontname": "OCR",
|
1171
|
+
"size": round(pdf_height) if pdf_height > 0 else 10.0, # Size based on height
|
1172
|
+
"page_number": self.page.number,
|
1173
|
+
"bold": False,
|
1174
|
+
"italic": False,
|
1175
|
+
"upright": True,
|
1176
|
+
"doctop": page_top + self.page._page.initial_doctop,
|
1290
1177
|
}
|
1291
|
-
|
1178
|
+
|
1179
|
+
# Create the representative char dict
|
1180
|
+
ocr_char_dict = element_data.copy()
|
1181
|
+
ocr_char_dict["object_type"] = "char"
|
1182
|
+
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
1183
|
+
|
1184
|
+
# Add char dicts to word data
|
1185
|
+
element_data["_char_dicts"] = [ocr_char_dict]
|
1186
|
+
|
1187
|
+
# Create the TextElement word
|
1188
|
+
from natural_pdf.elements.text import TextElement # Local import ok here
|
1189
|
+
|
1292
1190
|
elem = TextElement(element_data, self.page)
|
1293
|
-
|
1294
|
-
|
1295
|
-
# Add to page's
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1191
|
+
created_elements.append(elem)
|
1192
|
+
|
1193
|
+
# Add the element to the page's element manager
|
1194
|
+
self.page._element_mgr.add_element(elem, element_type="words")
|
1195
|
+
# Add the char dict to the manager's char list
|
1196
|
+
self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
|
1197
|
+
|
1198
|
+
except Exception as e:
|
1199
|
+
logger.error(
|
1200
|
+
f"Failed to convert region OCR result to element: {result}. Error: {e}",
|
1201
|
+
exc_info=True,
|
1202
|
+
)
|
1203
|
+
|
1204
|
+
logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
|
1205
|
+
return created_elements
|
1206
|
+
|
1207
|
+
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
1306
1208
|
"""
|
1307
1209
|
Get a section between two elements within this region.
|
1308
|
-
|
1210
|
+
|
1309
1211
|
Args:
|
1310
1212
|
start_element: Element marking the start of the section
|
1311
1213
|
end_element: Element marking the end of the section
|
1312
1214
|
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
1313
|
-
|
1215
|
+
|
1314
1216
|
Returns:
|
1315
1217
|
Region representing the section
|
1316
1218
|
"""
|
1219
|
+
# Get elements only within this region first
|
1317
1220
|
elements = self.get_elements()
|
1318
|
-
|
1319
|
-
# If no elements, return self
|
1221
|
+
|
1222
|
+
# If no elements, return self or empty region?
|
1320
1223
|
if not elements:
|
1321
|
-
|
1322
|
-
|
1224
|
+
logger.warning(
|
1225
|
+
f"get_section_between called on region {self.bbox} with no contained elements."
|
1226
|
+
)
|
1227
|
+
# Return an empty region at the start of the parent region
|
1228
|
+
return Region(self.page, (self.x0, self.top, self.x0, self.top))
|
1229
|
+
|
1323
1230
|
# Sort elements in reading order
|
1324
1231
|
elements.sort(key=lambda e: (e.top, e.x0))
|
1325
|
-
|
1232
|
+
|
1326
1233
|
# Find start index
|
1327
1234
|
start_idx = 0
|
1328
1235
|
if start_element:
|
@@ -1330,8 +1237,12 @@ class Region(DirectionalMixin):
|
|
1330
1237
|
start_idx = elements.index(start_element)
|
1331
1238
|
except ValueError:
|
1332
1239
|
# Start element not in region, use first element
|
1333
|
-
|
1334
|
-
|
1240
|
+
logger.debug("Start element not found in region, using first element.")
|
1241
|
+
start_element = elements[0] # Use the actual first element
|
1242
|
+
start_idx = 0
|
1243
|
+
else:
|
1244
|
+
start_element = elements[0] # Default start is first element
|
1245
|
+
|
1335
1246
|
# Find end index
|
1336
1247
|
end_idx = len(elements) - 1
|
1337
1248
|
if end_element:
|
@@ -1339,218 +1250,231 @@ class Region(DirectionalMixin):
|
|
1339
1250
|
end_idx = elements.index(end_element)
|
1340
1251
|
except ValueError:
|
1341
1252
|
# End element not in region, use last element
|
1342
|
-
|
1343
|
-
|
1253
|
+
logger.debug("End element not found in region, using last element.")
|
1254
|
+
end_element = elements[-1] # Use the actual last element
|
1255
|
+
end_idx = len(elements) - 1
|
1256
|
+
else:
|
1257
|
+
end_element = elements[-1] # Default end is last element
|
1258
|
+
|
1344
1259
|
# Adjust indexes based on boundary inclusion
|
1345
|
-
|
1260
|
+
start_element_for_bbox = start_element
|
1261
|
+
end_element_for_bbox = end_element
|
1262
|
+
|
1263
|
+
if boundary_inclusion == "none":
|
1346
1264
|
start_idx += 1
|
1347
1265
|
end_idx -= 1
|
1348
|
-
|
1266
|
+
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
1267
|
+
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
1268
|
+
elif boundary_inclusion == "start":
|
1349
1269
|
end_idx -= 1
|
1350
|
-
|
1270
|
+
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
1271
|
+
elif boundary_inclusion == "end":
|
1351
1272
|
start_idx += 1
|
1352
|
-
|
1273
|
+
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
1274
|
+
|
1353
1275
|
# Ensure valid indexes
|
1354
1276
|
start_idx = max(0, start_idx)
|
1355
1277
|
end_idx = min(len(elements) - 1, end_idx)
|
1356
|
-
|
1278
|
+
|
1357
1279
|
# If no valid elements in range, return empty region
|
1358
|
-
if start_idx > end_idx:
|
1359
|
-
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1363
|
-
|
1364
|
-
#
|
1280
|
+
if start_idx > end_idx or start_element_for_bbox is None or end_element_for_bbox is None:
|
1281
|
+
logger.debug("No valid elements in range for get_section_between.")
|
1282
|
+
# Return an empty region positioned at the start element boundary
|
1283
|
+
anchor = start_element if start_element else self
|
1284
|
+
return Region(self.page, (anchor.x0, anchor.top, anchor.x0, anchor.top))
|
1285
|
+
|
1286
|
+
# Get elements in range based on adjusted indices
|
1287
|
+
section_elements = elements[start_idx : end_idx + 1]
|
1288
|
+
|
1289
|
+
# Create bounding box around the ELEMENTS included based on indices
|
1365
1290
|
x0 = min(e.x0 for e in section_elements)
|
1366
1291
|
top = min(e.top for e in section_elements)
|
1367
1292
|
x1 = max(e.x1 for e in section_elements)
|
1368
1293
|
bottom = max(e.bottom for e in section_elements)
|
1369
|
-
|
1370
|
-
# Adjust boundaries for better boundary inclusion/exclusion
|
1371
|
-
pixel_adjustment = 2.0 # Amount to adjust for avoiding boundary elements
|
1372
|
-
|
1373
|
-
# Only proceed with adjustments if we have elements in the section
|
1374
|
-
if section_elements:
|
1375
|
-
# Adjust top boundary if start element should be excluded
|
1376
|
-
if start_element and boundary_inclusion not in ('start', 'both') and start_idx > 0:
|
1377
|
-
# If start element is just above the section, move the top down
|
1378
|
-
# Use a larger threshold (10 points) to catch more cases
|
1379
|
-
if abs(top - start_element.bottom) < 10:
|
1380
|
-
top += pixel_adjustment
|
1381
|
-
|
1382
|
-
# Adjust bottom boundary if end element should be excluded
|
1383
|
-
if end_element and boundary_inclusion not in ('end', 'both') and end_idx < len(elements) - 1:
|
1384
|
-
# If end element is just below the section, move the bottom up
|
1385
|
-
# Use a larger threshold (10 points) to catch more cases
|
1386
|
-
if abs(bottom - end_element.top) < 10:
|
1387
|
-
bottom -= pixel_adjustment
|
1388
|
-
|
1389
|
-
# Ensure top is always less than bottom (valid region)
|
1390
|
-
if top >= bottom:
|
1391
|
-
# Reset to original if adjustment would create an invalid region
|
1392
|
-
top = min(e.top for e in section_elements)
|
1393
|
-
bottom = max(e.bottom for e in section_elements)
|
1394
|
-
|
1294
|
+
|
1395
1295
|
# Create new region
|
1396
1296
|
section = Region(self.page, (x0, top, x1, bottom))
|
1397
|
-
|
1398
|
-
section.
|
1399
|
-
|
1297
|
+
# Store the original boundary elements for reference
|
1298
|
+
section.start_element = start_element
|
1299
|
+
section.end_element = end_element
|
1300
|
+
|
1400
1301
|
return section
|
1401
|
-
|
1402
|
-
def get_sections(
|
1302
|
+
|
1303
|
+
def get_sections(
|
1304
|
+
self, start_elements=None, end_elements=None, boundary_inclusion="both"
|
1305
|
+
) -> List["Region"]:
|
1403
1306
|
"""
|
1404
1307
|
Get sections within this region based on start/end elements.
|
1405
|
-
|
1308
|
+
|
1406
1309
|
Args:
|
1407
1310
|
start_elements: Elements or selector string that mark the start of sections
|
1408
1311
|
end_elements: Elements or selector string that mark the end of sections
|
1409
1312
|
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
1410
|
-
|
1313
|
+
|
1411
1314
|
Returns:
|
1412
1315
|
List of Region objects representing the extracted sections
|
1413
1316
|
"""
|
1414
1317
|
from natural_pdf.elements.collections import ElementCollection
|
1415
|
-
|
1416
|
-
# Process string selectors to find elements
|
1318
|
+
|
1319
|
+
# Process string selectors to find elements WITHIN THIS REGION
|
1417
1320
|
if isinstance(start_elements, str):
|
1418
|
-
start_elements = self.find_all(start_elements)
|
1419
|
-
if hasattr(start_elements,
|
1321
|
+
start_elements = self.find_all(start_elements) # Use region's find_all
|
1322
|
+
if hasattr(start_elements, "elements"):
|
1420
1323
|
start_elements = start_elements.elements
|
1421
|
-
|
1324
|
+
|
1422
1325
|
if isinstance(end_elements, str):
|
1423
|
-
end_elements = self.find_all(end_elements)
|
1424
|
-
if hasattr(end_elements,
|
1326
|
+
end_elements = self.find_all(end_elements) # Use region's find_all
|
1327
|
+
if hasattr(end_elements, "elements"):
|
1425
1328
|
end_elements = end_elements.elements
|
1426
|
-
|
1427
|
-
#
|
1329
|
+
|
1330
|
+
# Ensure start_elements is a list (or similar iterable)
|
1331
|
+
if start_elements is None or not hasattr(start_elements, "__iter__"):
|
1332
|
+
logger.warning(
|
1333
|
+
"get_sections requires valid start_elements (selector or list). Returning empty."
|
1334
|
+
)
|
1335
|
+
return []
|
1336
|
+
# Ensure end_elements is a list if provided
|
1337
|
+
if end_elements is not None and not hasattr(end_elements, "__iter__"):
|
1338
|
+
logger.warning("end_elements must be iterable if provided. Ignoring.")
|
1339
|
+
end_elements = []
|
1340
|
+
elif end_elements is None:
|
1341
|
+
end_elements = []
|
1342
|
+
|
1343
|
+
# If no start elements found within the region, return empty list
|
1428
1344
|
if not start_elements:
|
1429
1345
|
return []
|
1430
|
-
|
1431
|
-
# Sort elements in reading order
|
1432
|
-
|
1433
|
-
|
1434
|
-
|
1435
|
-
|
1346
|
+
|
1347
|
+
# Sort all elements within the region in reading order
|
1348
|
+
all_elements_in_region = self.get_elements()
|
1349
|
+
all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
|
1350
|
+
|
1351
|
+
if not all_elements_in_region:
|
1352
|
+
return [] # Cannot create sections if region is empty
|
1353
|
+
|
1354
|
+
# Map elements to their indices in the sorted list
|
1355
|
+
element_to_index = {el: i for i, el in enumerate(all_elements_in_region)}
|
1356
|
+
|
1357
|
+
# Mark section boundaries using indices from the sorted list
|
1436
1358
|
section_boundaries = []
|
1437
|
-
|
1359
|
+
|
1438
1360
|
# Add start element indexes
|
1439
1361
|
for element in start_elements:
|
1440
|
-
|
1441
|
-
|
1442
|
-
section_boundaries.append({
|
1443
|
-
|
1444
|
-
|
1445
|
-
'type': 'start'
|
1446
|
-
})
|
1447
|
-
except ValueError:
|
1448
|
-
# Element not in this region, skip
|
1449
|
-
continue
|
1450
|
-
|
1362
|
+
idx = element_to_index.get(element)
|
1363
|
+
if idx is not None:
|
1364
|
+
section_boundaries.append({"index": idx, "element": element, "type": "start"})
|
1365
|
+
# else: Element found by selector might not be geometrically in region? Log warning?
|
1366
|
+
|
1451
1367
|
# Add end element indexes if provided
|
1452
|
-
|
1453
|
-
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
})
|
1461
|
-
except ValueError:
|
1462
|
-
# Element not in this region, skip
|
1463
|
-
continue
|
1464
|
-
|
1465
|
-
# Sort boundaries by index (document order)
|
1466
|
-
section_boundaries.sort(key=lambda x: x['index'])
|
1467
|
-
|
1368
|
+
for element in end_elements:
|
1369
|
+
idx = element_to_index.get(element)
|
1370
|
+
if idx is not None:
|
1371
|
+
section_boundaries.append({"index": idx, "element": element, "type": "end"})
|
1372
|
+
|
1373
|
+
# Sort boundaries by index (document order within the region)
|
1374
|
+
section_boundaries.sort(key=lambda x: x["index"])
|
1375
|
+
|
1468
1376
|
# Generate sections
|
1469
1377
|
sections = []
|
1470
|
-
|
1471
|
-
|
1378
|
+
current_start_boundary = None
|
1379
|
+
|
1472
1380
|
for i, boundary in enumerate(section_boundaries):
|
1473
1381
|
# If it's a start boundary and we don't have a current start
|
1474
|
-
if boundary[
|
1475
|
-
|
1476
|
-
|
1382
|
+
if boundary["type"] == "start" and current_start_boundary is None:
|
1383
|
+
current_start_boundary = boundary
|
1384
|
+
|
1477
1385
|
# If it's an end boundary and we have a current start
|
1478
|
-
elif boundary[
|
1386
|
+
elif boundary["type"] == "end" and current_start_boundary is not None:
|
1479
1387
|
# Create a section from current_start to this boundary
|
1480
|
-
start_element =
|
1481
|
-
end_element = boundary[
|
1482
|
-
|
1483
|
-
|
1484
|
-
end_element,
|
1485
|
-
boundary_inclusion
|
1486
|
-
)
|
1487
|
-
sections.append(section)
|
1488
|
-
current_start = None
|
1489
|
-
|
1490
|
-
# If it's another start boundary and we have a current start (for splitting by starts only)
|
1491
|
-
elif boundary['type'] == 'start' and current_start is not None and not end_elements:
|
1492
|
-
# Create a section from current_start to just before this boundary
|
1493
|
-
start_element = current_start['element']
|
1494
|
-
end_element = all_elements[boundary['index'] - 1] if boundary['index'] > 0 else None
|
1495
|
-
section = self.get_section_between(
|
1496
|
-
start_element,
|
1497
|
-
end_element,
|
1498
|
-
boundary_inclusion
|
1499
|
-
)
|
1388
|
+
start_element = current_start_boundary["element"]
|
1389
|
+
end_element = boundary["element"]
|
1390
|
+
# Use the helper, ensuring elements are from within the region
|
1391
|
+
section = self.get_section_between(start_element, end_element, boundary_inclusion)
|
1500
1392
|
sections.append(section)
|
1501
|
-
|
1502
|
-
|
1393
|
+
current_start_boundary = None # Reset
|
1394
|
+
|
1395
|
+
# If it's another start boundary and we have a current start (split by starts only)
|
1396
|
+
elif (
|
1397
|
+
boundary["type"] == "start"
|
1398
|
+
and current_start_boundary is not None
|
1399
|
+
and not end_elements
|
1400
|
+
):
|
1401
|
+
# End the previous section just before this start boundary
|
1402
|
+
start_element = current_start_boundary["element"]
|
1403
|
+
# Find the element immediately preceding this start in the sorted list
|
1404
|
+
end_idx = boundary["index"] - 1
|
1405
|
+
if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
|
1406
|
+
end_element = all_elements_in_region[end_idx]
|
1407
|
+
section = self.get_section_between(
|
1408
|
+
start_element, end_element, boundary_inclusion
|
1409
|
+
)
|
1410
|
+
sections.append(section)
|
1411
|
+
# Else: Section started and ended by consecutive start elements? Create empty?
|
1412
|
+
# For now, just reset and start new section
|
1413
|
+
|
1414
|
+
# Start the new section
|
1415
|
+
current_start_boundary = boundary
|
1416
|
+
|
1503
1417
|
# Handle the last section if we have a current start
|
1504
|
-
if
|
1505
|
-
start_element =
|
1506
|
-
#
|
1507
|
-
end_element =
|
1508
|
-
section = self.get_section_between(
|
1509
|
-
start_element,
|
1510
|
-
end_element,
|
1511
|
-
boundary_inclusion
|
1512
|
-
)
|
1418
|
+
if current_start_boundary is not None:
|
1419
|
+
start_element = current_start_boundary["element"]
|
1420
|
+
# End at the last element within the region
|
1421
|
+
end_element = all_elements_in_region[-1]
|
1422
|
+
section = self.get_section_between(start_element, end_element, boundary_inclusion)
|
1513
1423
|
sections.append(section)
|
1514
|
-
|
1424
|
+
|
1515
1425
|
return sections
|
1516
|
-
|
1426
|
+
|
1517
1427
|
def create_cells(self):
|
1518
1428
|
"""
|
1519
1429
|
Create cell regions for a detected table by intersecting its
|
1520
1430
|
row and column regions, and add them to the page.
|
1521
|
-
|
1431
|
+
|
1522
1432
|
Assumes child row and column regions are already present on the page.
|
1523
1433
|
|
1524
1434
|
Returns:
|
1525
1435
|
Self for method chaining.
|
1526
1436
|
"""
|
1527
1437
|
# Ensure this is called on a table region
|
1528
|
-
if self.region_type not in (
|
1529
|
-
|
1530
|
-
|
1438
|
+
if self.region_type not in (
|
1439
|
+
"table",
|
1440
|
+
"tableofcontents",
|
1441
|
+
): # Allow for ToC which might have structure
|
1442
|
+
raise ValueError(
|
1443
|
+
f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'"
|
1444
|
+
)
|
1445
|
+
|
1531
1446
|
# Find rows and columns associated with this page
|
1532
1447
|
# Remove the model-specific filter
|
1533
|
-
rows = self.page.find_all(
|
1534
|
-
columns = self.page.find_all(
|
1535
|
-
|
1448
|
+
rows = self.page.find_all("region[type=table-row]")
|
1449
|
+
columns = self.page.find_all("region[type=table-column]")
|
1450
|
+
|
1536
1451
|
# Filter to only include those that overlap with this table region
|
1537
1452
|
def is_in_table(element):
|
1538
1453
|
# Use a simple overlap check (more robust than just center point)
|
1539
1454
|
# Check if element's bbox overlaps with self.bbox
|
1540
|
-
return (
|
1541
|
-
|
1542
|
-
|
1455
|
+
return (
|
1456
|
+
hasattr(element, "bbox")
|
1457
|
+
and element.x0 < self.x1 # Ensure element has bbox
|
1458
|
+
and element.x1 > self.x0
|
1459
|
+
and element.top < self.bottom
|
1460
|
+
and element.bottom > self.top
|
1461
|
+
)
|
1462
|
+
|
1543
1463
|
table_rows = [r for r in rows if is_in_table(r)]
|
1544
1464
|
table_columns = [c for c in columns if is_in_table(c)]
|
1545
|
-
|
1465
|
+
|
1546
1466
|
if not table_rows or not table_columns:
|
1547
|
-
|
1548
|
-
|
1549
|
-
|
1467
|
+
# Use page's logger if available
|
1468
|
+
logger_instance = getattr(self._page, "logger", logger)
|
1469
|
+
logger_instance.warning(
|
1470
|
+
f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found."
|
1471
|
+
)
|
1472
|
+
return self # Return self even if no cells created
|
1473
|
+
|
1550
1474
|
# Sort rows and columns
|
1551
1475
|
table_rows.sort(key=lambda r: r.top)
|
1552
1476
|
table_columns.sort(key=lambda c: c.x0)
|
1553
|
-
|
1477
|
+
|
1554
1478
|
# Create cells and add them to the page's element manager
|
1555
1479
|
created_count = 0
|
1556
1480
|
for row in table_rows:
|
@@ -1564,41 +1488,49 @@ class Region(DirectionalMixin):
|
|
1564
1488
|
# Only create a cell if the intersection is valid (positive width/height)
|
1565
1489
|
if cell_x1 > cell_x0 and cell_y1 > cell_y0:
|
1566
1490
|
# Create cell region at the intersection
|
1567
|
-
cell = self.page.create_region(
|
1568
|
-
cell_x0, cell_y0, cell_x1, cell_y1
|
1569
|
-
)
|
1491
|
+
cell = self.page.create_region(cell_x0, cell_y0, cell_x1, cell_y1)
|
1570
1492
|
# Set metadata
|
1571
|
-
cell.source =
|
1572
|
-
cell.region_type =
|
1573
|
-
cell.normalized_type =
|
1493
|
+
cell.source = "derived"
|
1494
|
+
cell.region_type = "table-cell" # Explicitly set type
|
1495
|
+
cell.normalized_type = "table-cell" # And normalized type
|
1574
1496
|
# Inherit model from the parent table region
|
1575
|
-
cell.model = self.model
|
1576
|
-
cell.parent_region = self
|
1577
|
-
|
1497
|
+
cell.model = self.model
|
1498
|
+
cell.parent_region = self # Link cell to parent table region
|
1499
|
+
|
1578
1500
|
# Add the cell region to the page's element manager
|
1579
1501
|
self.page._element_mgr.add_region(cell)
|
1580
1502
|
created_count += 1
|
1581
|
-
|
1503
|
+
|
1582
1504
|
# Optional: Add created cells to the table region's children
|
1583
1505
|
# self.child_regions.extend(cells_created_in_this_call) # Needs list management
|
1584
1506
|
|
1585
|
-
|
1507
|
+
logger_instance = getattr(self._page, "logger", logger)
|
1508
|
+
logger_instance.info(
|
1509
|
+
f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions."
|
1510
|
+
)
|
1511
|
+
|
1512
|
+
return self # Return self for chaining
|
1586
1513
|
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1514
|
+
def ask(
|
1515
|
+
self,
|
1516
|
+
question: str,
|
1517
|
+
min_confidence: float = 0.1,
|
1518
|
+
model: str = None,
|
1519
|
+
debug: bool = False,
|
1520
|
+
**kwargs,
|
1521
|
+
) -> Dict[str, Any]:
|
1590
1522
|
"""
|
1591
1523
|
Ask a question about the region content using document QA.
|
1592
|
-
|
1524
|
+
|
1593
1525
|
This method uses a document question answering model to extract answers from the region content.
|
1594
1526
|
It leverages both textual content and layout information for better understanding.
|
1595
|
-
|
1527
|
+
|
1596
1528
|
Args:
|
1597
1529
|
question: The question to ask about the region content
|
1598
1530
|
min_confidence: Minimum confidence threshold for answers (0.0-1.0)
|
1599
1531
|
model: Optional model name to use for QA (if None, uses default model)
|
1600
1532
|
**kwargs: Additional parameters to pass to the QA engine
|
1601
|
-
|
1533
|
+
|
1602
1534
|
Returns:
|
1603
1535
|
Dictionary with answer details: {
|
1604
1536
|
"answer": extracted text,
|
@@ -1609,112 +1541,151 @@ class Region(DirectionalMixin):
|
|
1609
1541
|
"source_elements": list of elements that contain the answer (if found)
|
1610
1542
|
}
|
1611
1543
|
"""
|
1612
|
-
|
1613
|
-
|
1544
|
+
try:
|
1545
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
1546
|
+
except ImportError:
|
1547
|
+
logger.error(
|
1548
|
+
"Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
|
1549
|
+
)
|
1550
|
+
return {
|
1551
|
+
"answer": None,
|
1552
|
+
"confidence": 0.0,
|
1553
|
+
"found": False,
|
1554
|
+
"page_num": self.page.number,
|
1555
|
+
"source_elements": [],
|
1556
|
+
"region": self,
|
1557
|
+
}
|
1558
|
+
|
1614
1559
|
# Get or initialize QA engine with specified model
|
1615
|
-
|
1616
|
-
|
1560
|
+
try:
|
1561
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
1562
|
+
except Exception as e:
|
1563
|
+
logger.error(f"Failed to initialize QA engine (model: {model}): {e}", exc_info=True)
|
1564
|
+
return {
|
1565
|
+
"answer": None,
|
1566
|
+
"confidence": 0.0,
|
1567
|
+
"found": False,
|
1568
|
+
"page_num": self.page.number,
|
1569
|
+
"source_elements": [],
|
1570
|
+
"region": self,
|
1571
|
+
}
|
1572
|
+
|
1617
1573
|
# Ask the question using the QA engine
|
1618
|
-
|
1574
|
+
try:
|
1575
|
+
return qa_engine.ask_pdf_region(
|
1576
|
+
self, question, min_confidence=min_confidence, debug=debug, **kwargs
|
1577
|
+
)
|
1578
|
+
except Exception as e:
|
1579
|
+
logger.error(f"Error during qa_engine.ask_pdf_region: {e}", exc_info=True)
|
1580
|
+
return {
|
1581
|
+
"answer": None,
|
1582
|
+
"confidence": 0.0,
|
1583
|
+
"found": False,
|
1584
|
+
"page_num": self.page.number,
|
1585
|
+
"source_elements": [],
|
1586
|
+
"region": self,
|
1587
|
+
}
|
1619
1588
|
|
1620
1589
|
def add_child(self, child):
|
1621
1590
|
"""
|
1622
1591
|
Add a child region to this region.
|
1623
|
-
|
1592
|
+
|
1624
1593
|
Used for hierarchical document structure when using models like Docling
|
1625
1594
|
that understand document hierarchy.
|
1626
|
-
|
1595
|
+
|
1627
1596
|
Args:
|
1628
1597
|
child: Region object to add as a child
|
1629
|
-
|
1598
|
+
|
1630
1599
|
Returns:
|
1631
1600
|
Self for method chaining
|
1632
1601
|
"""
|
1633
1602
|
self.child_regions.append(child)
|
1634
1603
|
child.parent_region = self
|
1635
1604
|
return self
|
1636
|
-
|
1605
|
+
|
1637
1606
|
def get_children(self, selector=None):
|
1638
1607
|
"""
|
1639
1608
|
Get immediate child regions, optionally filtered by selector.
|
1640
|
-
|
1609
|
+
|
1641
1610
|
Args:
|
1642
1611
|
selector: Optional selector to filter children
|
1643
|
-
|
1612
|
+
|
1644
1613
|
Returns:
|
1645
1614
|
List of child regions matching the selector
|
1646
1615
|
"""
|
1647
1616
|
import logging
|
1617
|
+
|
1648
1618
|
logger = logging.getLogger("natural_pdf.elements.region")
|
1649
|
-
|
1619
|
+
|
1650
1620
|
if selector is None:
|
1651
1621
|
return self.child_regions
|
1652
|
-
|
1622
|
+
|
1653
1623
|
# Use existing selector parser to filter
|
1654
|
-
from natural_pdf.selectors.parser import
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1624
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1625
|
+
|
1626
|
+
try:
|
1627
|
+
selector_obj = parse_selector(selector)
|
1628
|
+
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
1629
|
+
matched = [child for child in self.child_regions if filter_func(child)]
|
1630
|
+
logger.debug(
|
1631
|
+
f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'"
|
1632
|
+
)
|
1633
|
+
return matched
|
1634
|
+
except Exception as e:
|
1635
|
+
logger.error(f"Error applying selector in get_children: {e}", exc_info=True)
|
1636
|
+
return [] # Return empty list on error
|
1637
|
+
|
1659
1638
|
def get_descendants(self, selector=None):
|
1660
1639
|
"""
|
1661
1640
|
Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
|
1662
|
-
|
1641
|
+
|
1663
1642
|
Args:
|
1664
1643
|
selector: Optional selector to filter descendants
|
1665
|
-
|
1644
|
+
|
1666
1645
|
Returns:
|
1667
1646
|
List of descendant regions matching the selector
|
1668
1647
|
"""
|
1669
1648
|
import logging
|
1649
|
+
|
1670
1650
|
logger = logging.getLogger("natural_pdf.elements.region")
|
1671
|
-
|
1651
|
+
|
1672
1652
|
all_descendants = []
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1679
|
-
|
1680
|
-
|
1653
|
+
queue = list(self.child_regions) # Start with direct children
|
1654
|
+
|
1655
|
+
while queue:
|
1656
|
+
current = queue.pop(0)
|
1657
|
+
all_descendants.append(current)
|
1658
|
+
# Add current's children to the queue for processing
|
1659
|
+
if hasattr(current, "child_regions"):
|
1660
|
+
queue.extend(current.child_regions)
|
1661
|
+
|
1681
1662
|
logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
|
1682
|
-
|
1663
|
+
|
1683
1664
|
# Filter by selector if provided
|
1684
1665
|
if selector is not None:
|
1685
|
-
from natural_pdf.selectors.parser import
|
1686
|
-
|
1687
|
-
|
1688
|
-
|
1689
|
-
|
1666
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1667
|
+
|
1668
|
+
try:
|
1669
|
+
selector_obj = parse_selector(selector)
|
1670
|
+
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
1671
|
+
matched = [desc for desc in all_descendants if filter_func(desc)]
|
1672
|
+
logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
|
1673
|
+
return matched
|
1674
|
+
except Exception as e:
|
1675
|
+
logger.error(f"Error applying selector in get_descendants: {e}", exc_info=True)
|
1676
|
+
return [] # Return empty list on error
|
1677
|
+
|
1690
1678
|
return all_descendants
|
1691
|
-
|
1692
|
-
|
1693
|
-
|
1694
|
-
|
1695
|
-
|
1696
|
-
|
1697
|
-
|
1698
|
-
|
1699
|
-
|
1700
|
-
|
1701
|
-
|
1702
|
-
|
1703
|
-
""
|
1704
|
-
# Get direct matches
|
1705
|
-
direct_matches = self._find_all(selector, region=self, **kwargs)
|
1706
|
-
|
1707
|
-
if not recursive or not self.child_regions:
|
1708
|
-
return direct_matches
|
1709
|
-
|
1710
|
-
# Get recursive matches from children
|
1711
|
-
from natural_pdf.elements.collections import ElementCollection
|
1712
|
-
all_matches = list(direct_matches)
|
1713
|
-
|
1714
|
-
for child in self.child_regions:
|
1715
|
-
child_matches = child.find_all(selector, recursive=True, **kwargs)
|
1716
|
-
for match in child_matches:
|
1717
|
-
if match not in all_matches:
|
1718
|
-
all_matches.append(match)
|
1719
|
-
|
1720
|
-
return ElementCollection(all_matches)
|
1679
|
+
|
1680
|
+
# Removed recursive=True, find_all on region shouldn't be recursive by default
|
1681
|
+
# Renamed _find_all back to find_all
|
1682
|
+
# def find_all(self, selector, apply_exclusions=True, **kwargs):
|
1683
|
+
# See implementation above near get_elements
|
1684
|
+
|
1685
|
+
def __repr__(self) -> str:
|
1686
|
+
"""String representation of the region."""
|
1687
|
+
poly_info = " (Polygon)" if self.has_polygon else ""
|
1688
|
+
name_info = f" name='{self.name}'" if self.name else ""
|
1689
|
+
type_info = f" type='{self.region_type}'" if self.region_type else ""
|
1690
|
+
source_info = f" source='{self.source}'" if self.source else ""
|
1691
|
+
return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
|