natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1,6 +1,18 @@
|
|
1
|
-
|
1
|
+
import logging
|
2
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
3
|
+
|
4
|
+
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
|
+
|
6
|
+
# New Imports
|
7
|
+
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
8
|
+
|
2
9
|
from natural_pdf.elements.base import DirectionalMixin
|
3
10
|
|
11
|
+
# Import new utils
|
12
|
+
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
13
|
+
|
14
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
15
|
+
|
4
16
|
if TYPE_CHECKING:
|
5
17
|
from natural_pdf.core.page import Page
|
6
18
|
from natural_pdf.elements.text import TextElement
|
@@ -12,22 +24,29 @@ except ImportError:
|
|
12
24
|
# OCRManager will be imported directly in methods that use it
|
13
25
|
pass
|
14
26
|
|
27
|
+
logger = logging.getLogger(__name__)
|
28
|
+
|
15
29
|
|
16
30
|
class Region(DirectionalMixin):
|
17
31
|
"""
|
18
32
|
Represents a rectangular region on a page.
|
19
33
|
"""
|
20
|
-
|
21
|
-
def __init__(
|
34
|
+
|
35
|
+
def __init__(
|
36
|
+
self,
|
37
|
+
page: "Page",
|
38
|
+
bbox: Tuple[float, float, float, float],
|
39
|
+
polygon: List[Tuple[float, float]] = None,
|
40
|
+
parent=None,
|
41
|
+
):
|
22
42
|
"""
|
23
43
|
Initialize a region.
|
24
|
-
|
44
|
+
|
25
45
|
Args:
|
26
46
|
page: Parent page
|
27
47
|
bbox: Bounding box as (x0, top, x1, bottom)
|
28
48
|
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
|
29
49
|
parent: Optional parent region (for hierarchical document structure)
|
30
|
-
label: Optional label for the region (e.g., for exclusions)
|
31
50
|
"""
|
32
51
|
self._page = page
|
33
52
|
self._bbox = bbox
|
@@ -37,30 +56,36 @@ class Region(DirectionalMixin):
|
|
37
56
|
self._page_range = None
|
38
57
|
self.start_element = None
|
39
58
|
self.end_element = None
|
40
|
-
|
59
|
+
|
41
60
|
# Standard attributes for all elements
|
42
|
-
self.object_type =
|
43
|
-
|
61
|
+
self.object_type = "region" # For selector compatibility
|
62
|
+
|
44
63
|
# Layout detection attributes
|
45
64
|
self.region_type = None
|
46
65
|
self.normalized_type = None
|
47
66
|
self.confidence = None
|
48
67
|
self.model = None
|
49
|
-
|
68
|
+
|
50
69
|
# Region management attributes
|
51
70
|
self.name = None
|
52
71
|
self.source = None # Will be set by creation methods
|
53
|
-
|
54
|
-
|
72
|
+
|
55
73
|
# Hierarchy support for nested document structure
|
56
74
|
self.parent_region = parent
|
57
75
|
self.child_regions = []
|
58
76
|
self.text_content = None # Direct text content (e.g., from Docling)
|
59
77
|
self.associated_text_elements = [] # Native text elements that overlap with this region
|
60
|
-
|
61
|
-
def _direction(
|
62
|
-
|
63
|
-
|
78
|
+
|
79
|
+
def _direction(
|
80
|
+
self,
|
81
|
+
direction: str,
|
82
|
+
size: Optional[float] = None,
|
83
|
+
cross_size: str = "full",
|
84
|
+
include_element: bool = False,
|
85
|
+
until: Optional[str] = None,
|
86
|
+
include_endpoint: bool = True,
|
87
|
+
**kwargs,
|
88
|
+
) -> "Region":
|
64
89
|
"""
|
65
90
|
Protected helper method to create a region in a specified direction relative to this region.
|
66
91
|
|
@@ -76,11 +101,11 @@ class Region(DirectionalMixin):
|
|
76
101
|
Returns:
|
77
102
|
Region object
|
78
103
|
"""
|
79
|
-
import math
|
104
|
+
import math # Use math.inf for infinity
|
80
105
|
|
81
|
-
is_horizontal = direction in (
|
82
|
-
is_positive = direction in (
|
83
|
-
pixel_offset = 1
|
106
|
+
is_horizontal = direction in ("left", "right")
|
107
|
+
is_positive = direction in ("right", "below") # right/below are positive directions
|
108
|
+
pixel_offset = 1 # Offset for excluding elements/endpoints
|
84
109
|
|
85
110
|
# 1. Determine initial boundaries based on direction and include_element
|
86
111
|
if is_horizontal:
|
@@ -89,38 +114,44 @@ class Region(DirectionalMixin):
|
|
89
114
|
y1 = self.page.height if cross_size == "full" else self.bottom
|
90
115
|
|
91
116
|
# Initial primary boundaries (horizontal)
|
92
|
-
if is_positive:
|
117
|
+
if is_positive: # right
|
93
118
|
x0_initial = self.x0 if include_element else self.x1 + pixel_offset
|
94
|
-
x1_initial = self.x1
|
95
|
-
else:
|
96
|
-
x0_initial = self.x0
|
119
|
+
x1_initial = self.x1 # This edge moves
|
120
|
+
else: # left
|
121
|
+
x0_initial = self.x0 # This edge moves
|
97
122
|
x1_initial = self.x1 if include_element else self.x0 - pixel_offset
|
98
|
-
else:
|
123
|
+
else: # Vertical
|
99
124
|
# Initial cross-boundaries (horizontal)
|
100
125
|
x0 = 0 if cross_size == "full" else self.x0
|
101
126
|
x1 = self.page.width if cross_size == "full" else self.x1
|
102
127
|
|
103
128
|
# Initial primary boundaries (vertical)
|
104
|
-
if is_positive:
|
129
|
+
if is_positive: # below
|
105
130
|
y0_initial = self.top if include_element else self.bottom + pixel_offset
|
106
|
-
y1_initial = self.bottom
|
107
|
-
else:
|
108
|
-
y0_initial = self.top
|
131
|
+
y1_initial = self.bottom # This edge moves
|
132
|
+
else: # above
|
133
|
+
y0_initial = self.top # This edge moves
|
109
134
|
y1_initial = self.bottom if include_element else self.top - pixel_offset
|
110
135
|
|
111
136
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
112
137
|
if is_horizontal:
|
113
|
-
if is_positive:
|
114
|
-
x1_final = min(
|
138
|
+
if is_positive: # right
|
139
|
+
x1_final = min(
|
140
|
+
self.page.width,
|
141
|
+
x1_initial + (size if size is not None else (self.page.width - x1_initial)),
|
142
|
+
)
|
115
143
|
x0_final = x0_initial
|
116
|
-
else:
|
144
|
+
else: # left
|
117
145
|
x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
|
118
146
|
x1_final = x1_initial
|
119
|
-
else:
|
120
|
-
if is_positive:
|
121
|
-
y1_final = min(
|
147
|
+
else: # Vertical
|
148
|
+
if is_positive: # below
|
149
|
+
y1_final = min(
|
150
|
+
self.page.height,
|
151
|
+
y1_initial + (size if size is not None else (self.page.height - y1_initial)),
|
152
|
+
)
|
122
153
|
y0_final = y0_initial
|
123
|
-
else:
|
154
|
+
else: # above
|
124
155
|
y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
|
125
156
|
y1_final = y1_initial
|
126
157
|
|
@@ -131,16 +162,16 @@ class Region(DirectionalMixin):
|
|
131
162
|
matches_in_direction = []
|
132
163
|
|
133
164
|
# Filter and sort matches based on direction
|
134
|
-
if direction ==
|
165
|
+
if direction == "above":
|
135
166
|
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
136
167
|
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
137
|
-
elif direction ==
|
168
|
+
elif direction == "below":
|
138
169
|
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
139
170
|
matches_in_direction.sort(key=lambda e: e.top)
|
140
|
-
elif direction ==
|
171
|
+
elif direction == "left":
|
141
172
|
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
142
173
|
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
143
|
-
elif direction ==
|
174
|
+
elif direction == "right":
|
144
175
|
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
145
176
|
matches_in_direction.sort(key=lambda e: e.x0)
|
146
177
|
|
@@ -149,25 +180,29 @@ class Region(DirectionalMixin):
|
|
149
180
|
|
150
181
|
# Adjust the primary boundary based on the target
|
151
182
|
if is_horizontal:
|
152
|
-
if is_positive:
|
183
|
+
if is_positive: # right
|
153
184
|
x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
|
154
|
-
else:
|
185
|
+
else: # left
|
155
186
|
x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
|
156
|
-
else:
|
157
|
-
if is_positive:
|
187
|
+
else: # Vertical
|
188
|
+
if is_positive: # below
|
158
189
|
y1_final = target.bottom if include_endpoint else target.top - pixel_offset
|
159
|
-
else:
|
190
|
+
else: # above
|
160
191
|
y0_final = target.top if include_endpoint else target.bottom + pixel_offset
|
161
192
|
|
162
193
|
# Adjust cross boundaries if cross_size is 'element'
|
163
194
|
if cross_size == "element":
|
164
|
-
if is_horizontal:
|
165
|
-
target_y0 =
|
195
|
+
if is_horizontal: # Adjust y0, y1
|
196
|
+
target_y0 = (
|
197
|
+
target.top if include_endpoint else target.bottom
|
198
|
+
) # Use opposite boundary if excluding
|
166
199
|
target_y1 = target.bottom if include_endpoint else target.top
|
167
200
|
y0 = min(y0, target_y0)
|
168
201
|
y1 = max(y1, target_y1)
|
169
|
-
else:
|
170
|
-
target_x0 =
|
202
|
+
else: # Adjust x0, x1
|
203
|
+
target_x0 = (
|
204
|
+
target.x0 if include_endpoint else target.x1
|
205
|
+
) # Use opposite boundary if excluding
|
171
206
|
target_x1 = target.x1 if include_endpoint else target.x0
|
172
207
|
x0 = min(x0, target_x0)
|
173
208
|
x1 = max(x1, target_x1)
|
@@ -195,11 +230,18 @@ class Region(DirectionalMixin):
|
|
195
230
|
|
196
231
|
return region
|
197
232
|
|
198
|
-
def above(
|
199
|
-
|
233
|
+
def above(
|
234
|
+
self,
|
235
|
+
height: Optional[float] = None,
|
236
|
+
width: str = "full",
|
237
|
+
include_element: bool = False,
|
238
|
+
until: Optional[str] = None,
|
239
|
+
include_endpoint: bool = True,
|
240
|
+
**kwargs,
|
241
|
+
) -> "Region":
|
200
242
|
"""
|
201
243
|
Select region above this region.
|
202
|
-
|
244
|
+
|
203
245
|
Args:
|
204
246
|
height: Height of the region above, in points
|
205
247
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -207,25 +249,32 @@ class Region(DirectionalMixin):
|
|
207
249
|
until: Optional selector string to specify an upper boundary element
|
208
250
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
209
251
|
**kwargs: Additional parameters
|
210
|
-
|
252
|
+
|
211
253
|
Returns:
|
212
254
|
Region object representing the area above
|
213
255
|
"""
|
214
256
|
return self._direction(
|
215
|
-
direction=
|
257
|
+
direction="above",
|
216
258
|
size=height,
|
217
259
|
cross_size=width,
|
218
260
|
include_element=include_element,
|
219
261
|
until=until,
|
220
262
|
include_endpoint=include_endpoint,
|
221
|
-
**kwargs
|
263
|
+
**kwargs,
|
222
264
|
)
|
223
265
|
|
224
|
-
def below(
|
225
|
-
|
266
|
+
def below(
|
267
|
+
self,
|
268
|
+
height: Optional[float] = None,
|
269
|
+
width: str = "full",
|
270
|
+
include_element: bool = False,
|
271
|
+
until: Optional[str] = None,
|
272
|
+
include_endpoint: bool = True,
|
273
|
+
**kwargs,
|
274
|
+
) -> "Region":
|
226
275
|
"""
|
227
276
|
Select region below this region.
|
228
|
-
|
277
|
+
|
229
278
|
Args:
|
230
279
|
height: Height of the region below, in points
|
231
280
|
width: Width mode - "full" for full page width or "element" for element width
|
@@ -233,25 +282,32 @@ class Region(DirectionalMixin):
|
|
233
282
|
until: Optional selector string to specify a lower boundary element
|
234
283
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
235
284
|
**kwargs: Additional parameters
|
236
|
-
|
285
|
+
|
237
286
|
Returns:
|
238
287
|
Region object representing the area below
|
239
288
|
"""
|
240
289
|
return self._direction(
|
241
|
-
direction=
|
290
|
+
direction="below",
|
242
291
|
size=height,
|
243
292
|
cross_size=width,
|
244
293
|
include_element=include_element,
|
245
294
|
until=until,
|
246
295
|
include_endpoint=include_endpoint,
|
247
|
-
**kwargs
|
296
|
+
**kwargs,
|
248
297
|
)
|
249
298
|
|
250
|
-
def left(
|
251
|
-
|
299
|
+
def left(
|
300
|
+
self,
|
301
|
+
width: Optional[float] = None,
|
302
|
+
height: str = "full",
|
303
|
+
include_element: bool = False,
|
304
|
+
until: Optional[str] = None,
|
305
|
+
include_endpoint: bool = True,
|
306
|
+
**kwargs,
|
307
|
+
) -> "Region":
|
252
308
|
"""
|
253
309
|
Select region to the left of this region.
|
254
|
-
|
310
|
+
|
255
311
|
Args:
|
256
312
|
width: Width of the region to the left, in points
|
257
313
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -259,25 +315,32 @@ class Region(DirectionalMixin):
|
|
259
315
|
until: Optional selector string to specify a left boundary element
|
260
316
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
261
317
|
**kwargs: Additional parameters
|
262
|
-
|
318
|
+
|
263
319
|
Returns:
|
264
320
|
Region object representing the area to the left
|
265
321
|
"""
|
266
322
|
return self._direction(
|
267
|
-
direction=
|
323
|
+
direction="left",
|
268
324
|
size=width,
|
269
325
|
cross_size=height,
|
270
326
|
include_element=include_element,
|
271
327
|
until=until,
|
272
328
|
include_endpoint=include_endpoint,
|
273
|
-
**kwargs
|
329
|
+
**kwargs,
|
274
330
|
)
|
275
331
|
|
276
|
-
def right(
|
277
|
-
|
332
|
+
def right(
|
333
|
+
self,
|
334
|
+
width: Optional[float] = None,
|
335
|
+
height: str = "full",
|
336
|
+
include_element: bool = False,
|
337
|
+
until: Optional[str] = None,
|
338
|
+
include_endpoint: bool = True,
|
339
|
+
**kwargs,
|
340
|
+
) -> "Region":
|
278
341
|
"""
|
279
342
|
Select region to the right of this region.
|
280
|
-
|
343
|
+
|
281
344
|
Args:
|
282
345
|
width: Width of the region to the right, in points
|
283
346
|
height: Height mode - "full" for full page height or "element" for element height
|
@@ -285,72 +348,72 @@ class Region(DirectionalMixin):
|
|
285
348
|
until: Optional selector string to specify a right boundary element
|
286
349
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
287
350
|
**kwargs: Additional parameters
|
288
|
-
|
351
|
+
|
289
352
|
Returns:
|
290
353
|
Region object representing the area to the right
|
291
354
|
"""
|
292
355
|
return self._direction(
|
293
|
-
direction=
|
356
|
+
direction="right",
|
294
357
|
size=width,
|
295
358
|
cross_size=height,
|
296
359
|
include_element=include_element,
|
297
360
|
until=until,
|
298
361
|
include_endpoint=include_endpoint,
|
299
|
-
**kwargs
|
362
|
+
**kwargs,
|
300
363
|
)
|
301
|
-
|
364
|
+
|
302
365
|
@property
|
303
366
|
def type(self) -> str:
|
304
367
|
"""Element type."""
|
305
368
|
# Return the specific type if detected (e.g., from layout analysis)
|
306
369
|
# or 'region' as a default.
|
307
|
-
return self.region_type or
|
308
|
-
|
370
|
+
return self.region_type or "region" # Prioritize specific region_type if set
|
371
|
+
|
309
372
|
@property
|
310
|
-
def page(self) ->
|
373
|
+
def page(self) -> "Page":
|
311
374
|
"""Get the parent page."""
|
312
375
|
return self._page
|
313
|
-
|
376
|
+
|
314
377
|
@property
|
315
378
|
def bbox(self) -> Tuple[float, float, float, float]:
|
316
379
|
"""Get the bounding box as (x0, top, x1, bottom)."""
|
317
380
|
return self._bbox
|
318
|
-
|
381
|
+
|
319
382
|
@property
|
320
383
|
def x0(self) -> float:
|
321
384
|
"""Get the left coordinate."""
|
322
385
|
return self._bbox[0]
|
323
|
-
|
386
|
+
|
324
387
|
@property
|
325
388
|
def top(self) -> float:
|
326
389
|
"""Get the top coordinate."""
|
327
390
|
return self._bbox[1]
|
328
|
-
|
391
|
+
|
329
392
|
@property
|
330
393
|
def x1(self) -> float:
|
331
394
|
"""Get the right coordinate."""
|
332
395
|
return self._bbox[2]
|
333
|
-
|
396
|
+
|
334
397
|
@property
|
335
398
|
def bottom(self) -> float:
|
336
399
|
"""Get the bottom coordinate."""
|
337
400
|
return self._bbox[3]
|
338
|
-
|
401
|
+
|
339
402
|
@property
|
340
403
|
def width(self) -> float:
|
341
404
|
"""Get the width of the region."""
|
342
405
|
return self.x1 - self.x0
|
343
|
-
|
406
|
+
|
344
407
|
@property
|
345
408
|
def height(self) -> float:
|
346
409
|
"""Get the height of the region."""
|
347
410
|
return self.bottom - self.top
|
348
|
-
|
411
|
+
|
349
412
|
@property
|
350
413
|
def has_polygon(self) -> bool:
|
351
414
|
"""Check if this region has polygon coordinates."""
|
352
415
|
return self._polygon is not None and len(self._polygon) >= 3
|
353
|
-
|
416
|
+
|
354
417
|
@property
|
355
418
|
def polygon(self) -> List[Tuple[float, float]]:
|
356
419
|
"""Get polygon coordinates if available, otherwise return rectangle corners."""
|
@@ -359,141 +422,122 @@ class Region(DirectionalMixin):
|
|
359
422
|
else:
|
360
423
|
# Create rectangle corners from bbox as fallback
|
361
424
|
return [
|
362
|
-
(self.x0, self.top),
|
363
|
-
(self.x1, self.top),
|
364
|
-
(self.x1, self.bottom),
|
365
|
-
(self.x0, self.bottom)
|
425
|
+
(self.x0, self.top), # top-left
|
426
|
+
(self.x1, self.top), # top-right
|
427
|
+
(self.x1, self.bottom), # bottom-right
|
428
|
+
(self.x0, self.bottom), # bottom-left
|
366
429
|
]
|
367
|
-
|
430
|
+
|
368
431
|
def _is_point_in_polygon(self, x: float, y: float) -> bool:
|
369
432
|
"""
|
370
433
|
Check if a point is inside the polygon using ray casting algorithm.
|
371
|
-
|
434
|
+
|
372
435
|
Args:
|
373
436
|
x: X coordinate of the point
|
374
437
|
y: Y coordinate of the point
|
375
|
-
|
438
|
+
|
376
439
|
Returns:
|
377
440
|
bool: True if the point is inside the polygon
|
378
441
|
"""
|
379
442
|
if not self.has_polygon:
|
380
443
|
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
381
|
-
|
444
|
+
|
382
445
|
# Ray casting algorithm
|
383
446
|
inside = False
|
384
447
|
j = len(self.polygon) - 1
|
385
|
-
|
448
|
+
|
386
449
|
for i in range(len(self.polygon)):
|
387
|
-
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and
|
388
|
-
|
389
|
-
(self.polygon[j][
|
450
|
+
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
|
451
|
+
x
|
452
|
+
< (self.polygon[j][0] - self.polygon[i][0])
|
453
|
+
* (y - self.polygon[i][1])
|
454
|
+
/ (self.polygon[j][1] - self.polygon[i][1])
|
455
|
+
+ self.polygon[i][0]
|
456
|
+
):
|
390
457
|
inside = not inside
|
391
458
|
j = i
|
392
|
-
|
459
|
+
|
393
460
|
return inside
|
394
461
|
|
395
462
|
def is_point_inside(self, x: float, y: float) -> bool:
|
396
463
|
"""
|
397
464
|
Check if a point is inside this region using ray casting algorithm for polygons.
|
398
|
-
|
465
|
+
|
399
466
|
Args:
|
400
467
|
x: X coordinate of the point
|
401
468
|
y: Y coordinate of the point
|
402
|
-
|
469
|
+
|
403
470
|
Returns:
|
404
471
|
bool: True if the point is inside the region
|
405
472
|
"""
|
406
473
|
if not self.has_polygon:
|
407
474
|
return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
|
408
|
-
|
475
|
+
|
409
476
|
# Ray casting algorithm
|
410
477
|
inside = False
|
411
478
|
j = len(self.polygon) - 1
|
412
|
-
|
479
|
+
|
413
480
|
for i in range(len(self.polygon)):
|
414
|
-
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and
|
415
|
-
|
416
|
-
(self.polygon[j][
|
481
|
+
if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
|
482
|
+
x
|
483
|
+
< (self.polygon[j][0] - self.polygon[i][0])
|
484
|
+
* (y - self.polygon[i][1])
|
485
|
+
/ (self.polygon[j][1] - self.polygon[i][1])
|
486
|
+
+ self.polygon[i][0]
|
487
|
+
):
|
417
488
|
inside = not inside
|
418
489
|
j = i
|
419
|
-
|
490
|
+
|
420
491
|
return inside
|
421
492
|
|
422
|
-
def _is_element_in_region(self, element:
|
493
|
+
def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
|
423
494
|
"""
|
424
495
|
Check if an element is within this region.
|
425
|
-
|
496
|
+
|
426
497
|
Args:
|
427
498
|
element: Element to check
|
428
499
|
use_boundary_tolerance: Whether to apply a small tolerance for boundary elements
|
429
|
-
|
500
|
+
|
430
501
|
Returns:
|
431
502
|
True if the element is in the region, False otherwise
|
432
503
|
"""
|
433
504
|
# If we have multi-page elements cached, check if the element is in the list
|
434
505
|
if self._spans_pages and self._multi_page_elements is not None:
|
435
506
|
return element in self._multi_page_elements
|
436
|
-
|
507
|
+
|
437
508
|
# Check if element is on the same page
|
438
|
-
if element.page != self._page:
|
509
|
+
if not hasattr(element, "page") or element.page != self._page:
|
439
510
|
return False
|
440
|
-
|
511
|
+
|
441
512
|
# Calculate element center
|
513
|
+
# Ensure element has necessary attributes
|
514
|
+
if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
|
515
|
+
return False # Cannot determine position
|
516
|
+
|
442
517
|
element_center_x = (element.x0 + element.x1) / 2
|
443
518
|
element_center_y = (element.top + element.bottom) / 2
|
444
|
-
|
445
|
-
#
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
# For elements right at the boundary, be more conservative
|
458
|
-
return False
|
459
|
-
|
460
|
-
# If the element itself has a polygon, check if ANY corner is in this region
|
461
|
-
if hasattr(element, 'has_polygon') and element.has_polygon:
|
462
|
-
for point in element.polygon:
|
463
|
-
if self.is_point_inside(point[0], point[1]):
|
464
|
-
return True
|
465
|
-
# If no point is inside, check if the center is inside
|
466
|
-
return self.is_point_inside(element_center_x, element_center_y)
|
467
|
-
|
468
|
-
# For regular elements, check if center is in the region
|
469
|
-
# Add a small tolerance (1 pixel) to avoid including elements that are exactly on the boundary
|
470
|
-
# This ensures consistent behavior with the below() and above() method fixes
|
471
|
-
tolerance = 1.0 if use_boundary_tolerance else 0.0
|
472
|
-
|
473
|
-
# Check if within region with the tolerance applied
|
474
|
-
if self.has_polygon:
|
475
|
-
return self.is_point_inside(element_center_x, element_center_y)
|
476
|
-
else:
|
477
|
-
# For rectangular regions, apply tolerance to all sides
|
478
|
-
return (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
|
479
|
-
self.top + tolerance <= element_center_y <= self.bottom - tolerance)
|
480
|
-
|
481
|
-
def highlight(self,
|
482
|
-
label: Optional[str] = None,
|
483
|
-
color: Optional[Union[Tuple, str]] = None,
|
484
|
-
use_color_cycling: bool = False,
|
485
|
-
include_attrs: Optional[List[str]] = None,
|
486
|
-
existing: str = 'append') -> 'Region':
|
519
|
+
|
520
|
+
# Check if center point is inside the region's geometry
|
521
|
+
return self.is_point_inside(element_center_x, element_center_y)
|
522
|
+
|
523
|
+
def highlight(
|
524
|
+
self,
|
525
|
+
label: Optional[str] = None,
|
526
|
+
color: Optional[Union[Tuple, str]] = None,
|
527
|
+
use_color_cycling: bool = False,
|
528
|
+
include_attrs: Optional[List[str]] = None,
|
529
|
+
existing: str = "append",
|
530
|
+
) -> "Region":
|
487
531
|
"""
|
488
532
|
Highlight this region on the page.
|
489
|
-
|
533
|
+
|
490
534
|
Args:
|
491
535
|
label: Optional label for the highlight
|
492
536
|
color: Color tuple/string for the highlight, or None to use automatic color
|
493
537
|
use_color_cycling: Force color cycling even with no label (default: False)
|
494
538
|
include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
495
539
|
existing: How to handle existing highlights ('append' or 'replace').
|
496
|
-
|
540
|
+
|
497
541
|
Returns:
|
498
542
|
Self for method chaining
|
499
543
|
"""
|
@@ -508,7 +552,7 @@ class Region(DirectionalMixin):
|
|
508
552
|
"use_color_cycling": use_color_cycling,
|
509
553
|
"element": self, # Pass the region itself so attributes can be accessed
|
510
554
|
"include_attrs": include_attrs,
|
511
|
-
"existing": existing
|
555
|
+
"existing": existing,
|
512
556
|
}
|
513
557
|
|
514
558
|
# Call the appropriate service method
|
@@ -520,59 +564,68 @@ class Region(DirectionalMixin):
|
|
520
564
|
highlighter.add(**highlight_args)
|
521
565
|
|
522
566
|
return self
|
523
|
-
|
524
|
-
def to_image(
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
567
|
+
|
568
|
+
def to_image(
|
569
|
+
self,
|
570
|
+
scale: float = 2.0,
|
571
|
+
resolution: float = 150,
|
572
|
+
crop_only: bool = False,
|
573
|
+
include_highlights: bool = True,
|
574
|
+
**kwargs,
|
575
|
+
) -> "Image.Image":
|
530
576
|
"""
|
531
577
|
Generate an image of just this region.
|
532
|
-
|
578
|
+
|
533
579
|
Args:
|
534
580
|
resolution: Resolution in DPI for rendering (default: 150)
|
535
581
|
crop_only: If True, only crop the region without highlighting its boundaries
|
536
582
|
include_highlights: Whether to include existing highlights (default: True)
|
537
583
|
**kwargs: Additional parameters for page.to_image()
|
538
|
-
|
584
|
+
|
539
585
|
Returns:
|
540
586
|
PIL Image of just this region
|
541
587
|
"""
|
542
588
|
# First get the full page image with highlights if requested
|
543
|
-
page_image = self._page.to_image(
|
544
|
-
|
589
|
+
page_image = self._page.to_image(
|
590
|
+
scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
|
591
|
+
)
|
592
|
+
|
545
593
|
# Calculate the crop coordinates - apply resolution scaling factor
|
546
594
|
# PDF coordinates are in points (1/72 inch), but image is scaled by resolution
|
547
|
-
scale_factor =
|
548
|
-
|
595
|
+
scale_factor = resolution / 72.0 # Scale based on DPI
|
596
|
+
|
549
597
|
# Apply scaling to the coordinates
|
550
598
|
x0 = int(self.x0 * scale_factor)
|
551
599
|
top = int(self.top * scale_factor)
|
552
600
|
x1 = int(self.x1 * scale_factor)
|
553
601
|
bottom = int(self.bottom * scale_factor)
|
554
|
-
|
602
|
+
|
555
603
|
# Crop the image to just this region
|
556
604
|
region_image = page_image.crop((x0, top, x1, bottom))
|
557
|
-
|
605
|
+
|
558
606
|
# If not crop_only, add a border to highlight the region boundaries
|
559
607
|
if not crop_only:
|
560
608
|
from PIL import ImageDraw
|
561
|
-
|
609
|
+
|
562
610
|
# Create a 1px border around the region
|
563
611
|
draw = ImageDraw.Draw(region_image)
|
564
|
-
draw.rectangle(
|
565
|
-
|
566
|
-
|
612
|
+
draw.rectangle(
|
613
|
+
(0, 0, region_image.width - 1, region_image.height - 1),
|
614
|
+
outline=(255, 0, 0),
|
615
|
+
width=1,
|
616
|
+
)
|
617
|
+
|
567
618
|
return region_image
|
568
|
-
|
569
|
-
def show(
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
619
|
+
|
620
|
+
def show(
|
621
|
+
self,
|
622
|
+
scale: float = 2.0,
|
623
|
+
labels: bool = True,
|
624
|
+
legend_position: str = "right",
|
625
|
+
# Add a default color for standalone show
|
626
|
+
color: Optional[Union[Tuple, str]] = "blue",
|
627
|
+
label: Optional[str] = None,
|
628
|
+
) -> "Image.Image":
|
576
629
|
"""
|
577
630
|
Show the page with just this region highlighted temporarily.
|
578
631
|
|
@@ -593,16 +646,18 @@ class Region(DirectionalMixin):
|
|
593
646
|
service = self._page._highlighter
|
594
647
|
|
595
648
|
# Determine the label if not provided
|
596
|
-
display_label =
|
649
|
+
display_label = (
|
650
|
+
label if label is not None else f"Region ({self.type})" if self.type else "Region"
|
651
|
+
)
|
597
652
|
|
598
653
|
# Prepare temporary highlight data for just this region
|
599
654
|
temp_highlight_data = {
|
600
655
|
"page_index": self._page.index,
|
601
656
|
"bbox": self.bbox,
|
602
657
|
"polygon": self.polygon if self.has_polygon else None,
|
603
|
-
"color": color,
|
658
|
+
"color": color, # Use provided or default color
|
604
659
|
"label": display_label,
|
605
|
-
"use_color_cycling": False
|
660
|
+
"use_color_cycling": False, # Explicitly false for single preview
|
606
661
|
}
|
607
662
|
|
608
663
|
# Use render_preview to show only this highlight
|
@@ -611,452 +666,271 @@ class Region(DirectionalMixin):
|
|
611
666
|
temporary_highlights=[temp_highlight_data],
|
612
667
|
scale=scale,
|
613
668
|
labels=labels,
|
614
|
-
legend_position=legend_position
|
669
|
+
legend_position=legend_position,
|
615
670
|
)
|
616
671
|
|
617
|
-
def save(
|
618
|
-
|
619
|
-
|
620
|
-
labels: bool = True,
|
621
|
-
legend_position: str = 'right') -> 'Region':
|
672
|
+
def save(
|
673
|
+
self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
|
674
|
+
) -> "Region":
|
622
675
|
"""
|
623
676
|
Save the page with this region highlighted to an image file.
|
624
|
-
|
677
|
+
|
625
678
|
Args:
|
626
679
|
filename: Path to save the image to
|
627
680
|
scale: Scale factor for rendering
|
628
681
|
labels: Whether to include a legend for labels
|
629
682
|
legend_position: Position of the legend
|
630
|
-
|
683
|
+
|
631
684
|
Returns:
|
632
685
|
Self for method chaining
|
633
686
|
"""
|
634
687
|
# Highlight this region if not already highlighted
|
635
688
|
self.highlight()
|
636
|
-
|
689
|
+
|
637
690
|
# Save the highlighted image
|
638
691
|
self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
|
639
692
|
return self
|
640
|
-
|
641
|
-
def save_image(
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
693
|
+
|
694
|
+
def save_image(
|
695
|
+
self,
|
696
|
+
filename: str,
|
697
|
+
resolution: float = 150,
|
698
|
+
crop_only: bool = False,
|
699
|
+
include_highlights: bool = True,
|
700
|
+
**kwargs,
|
701
|
+
) -> "Region":
|
647
702
|
"""
|
648
703
|
Save an image of just this region to a file.
|
649
|
-
|
704
|
+
|
650
705
|
Args:
|
651
706
|
filename: Path to save the image to
|
652
707
|
resolution: Resolution in DPI for rendering (default: 150)
|
653
708
|
crop_only: If True, only crop the region without highlighting its boundaries
|
654
709
|
include_highlights: Whether to include existing highlights (default: True)
|
655
710
|
**kwargs: Additional parameters for page.to_image()
|
656
|
-
|
711
|
+
|
657
712
|
Returns:
|
658
713
|
Self for method chaining
|
659
714
|
"""
|
660
715
|
# Get the region image
|
661
716
|
image = self.to_image(
|
662
|
-
resolution=resolution,
|
663
|
-
crop_only=crop_only,
|
717
|
+
resolution=resolution,
|
718
|
+
crop_only=crop_only,
|
664
719
|
include_highlights=include_highlights,
|
665
|
-
**kwargs
|
720
|
+
**kwargs,
|
666
721
|
)
|
667
|
-
|
722
|
+
|
668
723
|
# Save the image
|
669
724
|
image.save(filename)
|
670
725
|
return self
|
671
|
-
|
672
|
-
def get_elements(
|
726
|
+
|
727
|
+
def get_elements(
|
728
|
+
self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
|
729
|
+
) -> List["Element"]:
|
673
730
|
"""
|
674
731
|
Get all elements within this region.
|
675
|
-
|
732
|
+
|
676
733
|
Args:
|
677
734
|
selector: Optional selector to filter elements
|
678
735
|
apply_exclusions: Whether to apply exclusion regions
|
679
736
|
**kwargs: Additional parameters for element filtering
|
680
|
-
|
737
|
+
|
681
738
|
Returns:
|
682
739
|
List of elements in the region
|
683
740
|
"""
|
684
741
|
# If we have multi-page elements, return those
|
685
742
|
if self._spans_pages and self._multi_page_elements is not None:
|
743
|
+
# TODO: Apply selector to multi-page elements if needed
|
686
744
|
return self._multi_page_elements
|
687
|
-
|
745
|
+
|
688
746
|
# Otherwise, get elements from the page
|
689
747
|
if selector:
|
690
|
-
elements
|
748
|
+
# Find elements on the page matching the selector
|
749
|
+
page_elements = self.page.find_all(
|
750
|
+
selector, apply_exclusions=apply_exclusions, **kwargs
|
751
|
+
)
|
752
|
+
# Filter those elements to only include ones within this region
|
753
|
+
return [e for e in page_elements if self._is_element_in_region(e)]
|
691
754
|
else:
|
692
|
-
elements
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
def extract_text(self,
|
755
|
+
# Get all elements from the page
|
756
|
+
page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
|
757
|
+
# Filter to elements in this region
|
758
|
+
return [e for e in page_elements if self._is_element_in_region(e)]
|
759
|
+
|
760
|
+
def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
|
698
761
|
"""
|
699
|
-
Extract text from this region using pdfplumber's
|
700
|
-
|
701
|
-
|
702
|
-
1. Associated text elements from the PDF (if available)
|
703
|
-
2. Direct text content from Docling (if available)
|
704
|
-
3. Fall back to standard pdfplumber extraction
|
705
|
-
|
762
|
+
Extract text from this region, respecting page exclusions and using pdfplumber's
|
763
|
+
layout engine (chars_to_textmap).
|
764
|
+
|
706
765
|
Args:
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
766
|
+
apply_exclusions: Whether to apply exclusion regions defined on the parent page.
|
767
|
+
debug: Enable verbose debugging output for filtering steps.
|
768
|
+
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
769
|
+
`chars_to_textmap` function (e.g., layout, x_density, y_density).
|
770
|
+
See Page.extract_text docstring for more.
|
771
|
+
|
714
772
|
Returns:
|
715
|
-
Extracted text as string
|
773
|
+
Extracted text as string, potentially with layout-based spacing.
|
716
774
|
"""
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
if self._spans_pages and self._multi_page_elements is not None:
|
745
|
-
# Sort elements in reading order - only include text-like elements
|
746
|
-
text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
|
747
|
-
|
748
|
-
# Sort in reading order (by page, then top-to-bottom, left-to-right)
|
749
|
-
sorted_elements = sorted(text_elements, key=lambda e: (e.page.index, e.top, e.x0))
|
750
|
-
|
751
|
-
# Extract text directly from elements to avoid recursion
|
752
|
-
texts = []
|
753
|
-
for element in sorted_elements:
|
754
|
-
if hasattr(element, 'text'):
|
755
|
-
texts.append(element.text)
|
756
|
-
|
757
|
-
text_result = " ".join(texts)
|
758
|
-
return text_result
|
759
|
-
|
760
|
-
# Check if we have exclusions to apply
|
775
|
+
# Allow 'debug_exclusions' for backward compatibility
|
776
|
+
debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
|
777
|
+
logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
|
778
|
+
|
779
|
+
# --- Handle Docling source (priority) --- DEPRECATED or Adapt?
|
780
|
+
# For now, let's bypass this and always use the standard extraction flow
|
781
|
+
# based on contained elements to ensure consistency.
|
782
|
+
# if self.model == 'docling' or hasattr(self, 'text_content'): ...
|
783
|
+
|
784
|
+
# 1. Get Word Elements potentially within this region (initial broad phase)
|
785
|
+
# Optimization: Could use spatial query if page elements were indexed
|
786
|
+
page_words = self.page.words # Get all words from the page
|
787
|
+
|
788
|
+
# 2. Gather all character dicts from words potentially in region
|
789
|
+
# We filter precisely in filter_chars_spatially
|
790
|
+
all_char_dicts = []
|
791
|
+
for word in page_words:
|
792
|
+
# Quick bbox check to avoid processing words clearly outside
|
793
|
+
if get_bbox_overlap(self.bbox, word.bbox) is not None:
|
794
|
+
all_char_dicts.extend(getattr(word, "_char_dicts", []))
|
795
|
+
|
796
|
+
if not all_char_dicts:
|
797
|
+
logger.debug(f"Region {self.bbox}: No character dicts found overlapping region bbox.")
|
798
|
+
return ""
|
799
|
+
|
800
|
+
# 3. Get Relevant Exclusions (overlapping this region)
|
801
|
+
apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
|
761
802
|
exclusion_regions = []
|
762
|
-
if
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
has_intersection = False
|
772
|
-
for i, exclusion in enumerate(exclusion_regions):
|
773
|
-
# Use a simple bbox overlap check
|
774
|
-
overlap = (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
|
775
|
-
self.top < exclusion.bottom and self.bottom > exclusion.top)
|
776
|
-
|
777
|
-
if overlap:
|
778
|
-
has_intersection = True
|
779
|
-
if debug:
|
780
|
-
logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
|
781
|
-
break
|
782
|
-
|
783
|
-
# If no intersection, process without exclusions
|
784
|
-
if not has_intersection:
|
785
|
-
if debug:
|
786
|
-
logger.debug(f" No intersection with any exclusion, ignoring exclusions")
|
787
|
-
apply_exclusions = False
|
788
|
-
exclusion_regions = []
|
789
|
-
|
790
|
-
# IMPROVEMENT 2: If rectangular region + full-width exclusions (headers/footers),
|
791
|
-
# we can use the simpler cropping approach
|
792
|
-
# Only use crop for simple cases
|
793
|
-
can_use_crop = not self.has_polygon
|
794
|
-
result = "" # Default empty result
|
795
|
-
if can_use_crop and apply_exclusions and exclusion_regions:
|
796
|
-
# We'll keep track of exclusions that are full-width horizontal bands (headers/footers)
|
797
|
-
# and those that are not
|
798
|
-
footer_header_exclusions = []
|
799
|
-
other_exclusions = []
|
800
|
-
|
801
|
-
for i, exclusion in enumerate(exclusion_regions):
|
802
|
-
# Check if exclusion spans the full width of the page
|
803
|
-
# and is either at the top or bottom
|
804
|
-
full_width = (abs(exclusion.x0) < 5 and
|
805
|
-
abs(exclusion.x1 - self.page.width) < 5)
|
806
|
-
|
807
|
-
if debug:
|
808
|
-
logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
|
809
|
-
|
810
|
-
if full_width:
|
811
|
-
footer_header_exclusions.append(exclusion)
|
812
|
-
else:
|
813
|
-
other_exclusions.append(exclusion)
|
814
|
-
|
815
|
-
# If we have only header/footer exclusions, we can use the cropping approach
|
816
|
-
all_are_bands = len(other_exclusions) == 0 and len(footer_header_exclusions) > 0
|
817
|
-
|
818
|
-
if all_are_bands:
|
819
|
-
# Find the actual content area after excluding header/footer
|
820
|
-
top_bound = self.top
|
821
|
-
bottom_bound = self.bottom
|
822
|
-
|
823
|
-
if debug:
|
824
|
-
logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
825
|
-
|
826
|
-
# Process only header/footer exclusions for cropping
|
827
|
-
for exclusion in footer_header_exclusions:
|
828
|
-
# If exclusion is at the top of our region
|
829
|
-
if exclusion.bottom > self.top and exclusion.top <= self.top:
|
830
|
-
# Move top bound to exclude the header
|
831
|
-
top_bound = max(top_bound, exclusion.bottom)
|
832
|
-
if debug:
|
833
|
-
logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
|
834
|
-
|
835
|
-
# If exclusion is at the bottom of our region
|
836
|
-
if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
|
837
|
-
# Move bottom bound to exclude the footer
|
838
|
-
bottom_bound = min(bottom_bound, exclusion.top)
|
839
|
-
if debug:
|
840
|
-
logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
|
841
|
-
|
842
|
-
|
843
|
-
if debug:
|
844
|
-
logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
845
|
-
|
846
|
-
# If we still have a valid region after exclusions
|
847
|
-
if top_bound < bottom_bound:
|
848
|
-
# Use direct crop with adjusted bounds
|
849
|
-
crop_bbox = (self.x0, top_bound, self.x1, bottom_bound)
|
850
|
-
cropped = self.page._page.crop(crop_bbox)
|
851
|
-
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
852
|
-
|
853
|
-
if debug:
|
854
|
-
logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
|
855
|
-
|
856
|
-
# Skip the complex filtering approach
|
857
|
-
return result
|
858
|
-
else:
|
859
|
-
# This would only happen if the region is entirely inside an exclusion zone
|
860
|
-
# or if both top and bottom of the region are excluded leaving no valid area
|
861
|
-
logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
|
862
|
-
return ""
|
863
|
-
# We have exclusions, but not all are headers/footers,
|
864
|
-
# or we have a non-rectangular region
|
865
|
-
else:
|
866
|
-
if debug:
|
867
|
-
logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
|
868
|
-
|
869
|
-
# Don't use crop for mixed exclusion types
|
870
|
-
can_use_crop = False
|
871
|
-
|
872
|
-
# If we got a result from header/footer cropping, return it
|
873
|
-
if result:
|
874
|
-
return result
|
875
|
-
|
876
|
-
# For single-page regions without exclusions, or when exclusions don't apply, use direct cropping
|
877
|
-
if can_use_crop and not apply_exclusions:
|
878
|
-
# Simple case: use direct crop
|
879
|
-
crop_bbox = self.bbox
|
880
|
-
cropped = self.page._page.crop(crop_bbox)
|
881
|
-
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
882
|
-
return result
|
883
|
-
|
884
|
-
# For all other cases (complex exclusions, polygons), we use element filtering
|
885
|
-
if debug:
|
886
|
-
logger.debug(f"Using element filtering approach for region {self.bbox}")
|
887
|
-
|
888
|
-
# Get only word elements in this region first (instead of ALL elements)
|
889
|
-
# This prevents duplication from joining both char and word text
|
890
|
-
all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
|
891
|
-
|
892
|
-
if apply_exclusions and exclusion_regions:
|
893
|
-
if debug:
|
894
|
-
logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
|
895
|
-
|
896
|
-
# Filter out elements in exclusion zones
|
897
|
-
filtered_elements = []
|
898
|
-
for elem in all_elements:
|
899
|
-
in_exclusion = False
|
900
|
-
# For each element, check if it's in any exclusion zone
|
901
|
-
element_center_x = (elem.x0 + elem.x1) / 2
|
902
|
-
element_center_y = (elem.top + elem.bottom) / 2
|
903
|
-
|
904
|
-
for exclusion in exclusion_regions:
|
905
|
-
if (exclusion.x0 <= element_center_x <= exclusion.x1 and
|
906
|
-
exclusion.top <= element_center_y <= exclusion.bottom):
|
907
|
-
in_exclusion = True
|
908
|
-
break
|
909
|
-
|
910
|
-
if not in_exclusion:
|
911
|
-
filtered_elements.append(elem)
|
912
|
-
else:
|
913
|
-
# No exclusions, use all elements
|
914
|
-
filtered_elements = all_elements
|
915
|
-
|
916
|
-
# Now extract text from the filtered elements
|
917
|
-
if filtered_elements:
|
918
|
-
from natural_pdf.elements.collections import ElementCollection
|
919
|
-
collection = ElementCollection(filtered_elements)
|
920
|
-
# Sort in reading order
|
921
|
-
collection = collection.sort(key=lambda e: (e.top, e.x0))
|
922
|
-
# Extract text
|
923
|
-
result = " ".join(e.text for e in collection if hasattr(e, 'text'))
|
924
|
-
|
925
|
-
if debug:
|
926
|
-
logger.debug(f"Got {len(result)} characters from element-based extraction")
|
927
|
-
|
928
|
-
# Return the result
|
929
|
-
return result
|
930
|
-
else:
|
803
|
+
if apply_exclusions_flag and self._page._exclusions:
|
804
|
+
all_page_exclusions = self._page._get_exclusion_regions(
|
805
|
+
include_callable=True, debug=debug
|
806
|
+
)
|
807
|
+
overlapping_exclusions = []
|
808
|
+
for excl in all_page_exclusions:
|
809
|
+
if get_bbox_overlap(self.bbox, excl.bbox) is not None:
|
810
|
+
overlapping_exclusions.append(excl)
|
811
|
+
exclusion_regions = overlapping_exclusions
|
931
812
|
if debug:
|
932
|
-
logger.debug(
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
|
939
|
-
#
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
filtered_ocr.append(element)
|
956
|
-
else:
|
957
|
-
filtered_ocr = ocr_elements
|
958
|
-
|
959
|
-
# Extract text from OCR elements
|
960
|
-
from natural_pdf.elements.collections import ElementCollection
|
961
|
-
ocr_collection = ElementCollection(filtered_ocr)
|
962
|
-
ocr_text = ocr_collection.extract_text(preserve_whitespace=keep_blank_chars, **kwargs)
|
963
|
-
|
964
|
-
# Use OCR text if it's not empty
|
965
|
-
if ocr_text.strip():
|
966
|
-
return ocr_text
|
967
|
-
|
813
|
+
logger.debug(
|
814
|
+
f"Region {self.bbox}: Applying {len(exclusion_regions)} overlapping exclusions."
|
815
|
+
)
|
816
|
+
elif debug:
|
817
|
+
logger.debug(f"Region {self.bbox}: Not applying exclusions.")
|
818
|
+
|
819
|
+
# 4. Spatially Filter Characters using Utility
|
820
|
+
# Pass self as the target_region for precise polygon checks etc.
|
821
|
+
filtered_chars = filter_chars_spatially(
|
822
|
+
char_dicts=all_char_dicts,
|
823
|
+
exclusion_regions=exclusion_regions,
|
824
|
+
target_region=self, # Pass self!
|
825
|
+
debug=debug,
|
826
|
+
)
|
827
|
+
|
828
|
+
# 5. Generate Text Layout using Utility
|
829
|
+
result = generate_text_layout(
|
830
|
+
char_dicts=filtered_chars,
|
831
|
+
layout_context_bbox=self.bbox, # Use region's bbox for context
|
832
|
+
user_kwargs=kwargs,
|
833
|
+
)
|
834
|
+
|
835
|
+
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
968
836
|
return result
|
969
|
-
|
970
|
-
def extract_table(
|
971
|
-
|
837
|
+
|
838
|
+
def extract_table(
|
839
|
+
self,
|
840
|
+
method: str = None,
|
841
|
+
table_settings: dict = None,
|
842
|
+
use_ocr: bool = False,
|
843
|
+
ocr_config: dict = None,
|
844
|
+
) -> List[List[str]]:
|
972
845
|
"""
|
973
846
|
Extract a table from this region.
|
974
|
-
|
847
|
+
|
975
848
|
Args:
|
976
849
|
method: Method to use for extraction ('tatr', 'plumber', or None for auto-detection)
|
977
850
|
table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
|
978
851
|
use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
|
979
852
|
ocr_config: OCR configuration parameters
|
980
|
-
|
853
|
+
|
981
854
|
Returns:
|
982
855
|
Table data as a list of rows, where each row is a list of cell values
|
983
856
|
"""
|
984
857
|
# Default settings if none provided
|
985
858
|
if table_settings is None:
|
986
859
|
table_settings = {}
|
987
|
-
|
860
|
+
|
988
861
|
# Auto-detect method if not specified
|
989
862
|
if method is None:
|
990
863
|
# If this is a TATR-detected region, use TATR method
|
991
|
-
if hasattr(self,
|
992
|
-
method =
|
864
|
+
if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
|
865
|
+
method = "tatr"
|
993
866
|
else:
|
994
|
-
method =
|
995
|
-
|
867
|
+
method = "plumber"
|
868
|
+
|
996
869
|
# Use the selected method
|
997
|
-
if method ==
|
870
|
+
if method == "tatr":
|
998
871
|
return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
|
999
872
|
else: # Default to pdfplumber
|
1000
873
|
return self._extract_table_plumber(table_settings)
|
1001
|
-
|
874
|
+
|
1002
875
|
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
|
1003
876
|
"""
|
1004
877
|
Extract table using pdfplumber's table extraction.
|
1005
|
-
|
878
|
+
|
1006
879
|
Args:
|
1007
880
|
table_settings: Settings for pdfplumber table extraction
|
1008
|
-
|
881
|
+
|
1009
882
|
Returns:
|
1010
883
|
Table data as a list of rows, where each row is a list of cell values
|
1011
884
|
"""
|
1012
885
|
# Create a crop of the page for this region
|
1013
886
|
cropped = self.page._page.crop(self.bbox)
|
1014
|
-
|
887
|
+
|
1015
888
|
# Extract table from the cropped area
|
1016
889
|
tables = cropped.extract_tables(table_settings)
|
1017
|
-
|
890
|
+
|
1018
891
|
# Return the first table or an empty list if none found
|
1019
892
|
if tables:
|
1020
893
|
return tables[0]
|
1021
894
|
return []
|
1022
|
-
|
895
|
+
|
1023
896
|
def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
|
1024
897
|
"""
|
1025
898
|
Extract table using TATR structure detection.
|
1026
|
-
|
899
|
+
|
1027
900
|
Args:
|
1028
901
|
use_ocr: Whether to apply OCR to each cell for better text extraction
|
1029
902
|
ocr_config: Optional OCR configuration parameters
|
1030
|
-
|
903
|
+
|
1031
904
|
Returns:
|
1032
905
|
Table data as a list of rows, where each row is a list of cell values
|
1033
906
|
"""
|
1034
907
|
# Find all rows and headers in this table
|
1035
|
-
rows = self.page.find_all(f
|
1036
|
-
headers = self.page.find_all(f
|
1037
|
-
columns = self.page.find_all(f
|
1038
|
-
|
908
|
+
rows = self.page.find_all(f"region[type=table-row][model=tatr]")
|
909
|
+
headers = self.page.find_all(f"region[type=table-column-header][model=tatr]")
|
910
|
+
columns = self.page.find_all(f"region[type=table-column][model=tatr]")
|
911
|
+
|
1039
912
|
# Filter to only include rows/headers/columns that overlap with this table region
|
1040
913
|
def is_in_table(region):
|
1041
914
|
# Check for overlap - simplifying to center point for now
|
1042
915
|
region_center_x = (region.x0 + region.x1) / 2
|
1043
916
|
region_center_y = (region.top + region.bottom) / 2
|
1044
|
-
return (
|
1045
|
-
|
1046
|
-
|
917
|
+
return (
|
918
|
+
self.x0 <= region_center_x <= self.x1 and self.top <= region_center_y <= self.bottom
|
919
|
+
)
|
920
|
+
|
1047
921
|
rows = [row for row in rows if is_in_table(row)]
|
1048
922
|
headers = [header for header in headers if is_in_table(header)]
|
1049
923
|
columns = [column for column in columns if is_in_table(column)]
|
1050
|
-
|
924
|
+
|
1051
925
|
# Sort rows by vertical position (top to bottom)
|
1052
926
|
rows.sort(key=lambda r: r.top)
|
1053
|
-
|
927
|
+
|
1054
928
|
# Sort columns by horizontal position (left to right)
|
1055
929
|
columns.sort(key=lambda c: c.x0)
|
1056
|
-
|
930
|
+
|
1057
931
|
# Create table data structure
|
1058
932
|
table_data = []
|
1059
|
-
|
933
|
+
|
1060
934
|
# Prepare OCR config if needed
|
1061
935
|
if use_ocr:
|
1062
936
|
# Default OCR config focuses on small text with low confidence
|
@@ -1065,16 +939,20 @@ class Region(DirectionalMixin):
|
|
1065
939
|
"min_confidence": 0.1, # Lower than default to catch more text
|
1066
940
|
"detection_params": {
|
1067
941
|
"text_threshold": 0.1, # Lower threshold for low-contrast text
|
1068
|
-
"link_threshold": 0.1 # Lower threshold for connecting text components
|
1069
|
-
}
|
942
|
+
"link_threshold": 0.1, # Lower threshold for connecting text components
|
943
|
+
},
|
1070
944
|
}
|
1071
|
-
|
945
|
+
|
1072
946
|
# Merge with provided config if any
|
1073
947
|
if ocr_config:
|
1074
948
|
if isinstance(ocr_config, dict):
|
1075
949
|
# Update default config with provided values
|
1076
950
|
for key, value in ocr_config.items():
|
1077
|
-
if
|
951
|
+
if (
|
952
|
+
isinstance(value, dict)
|
953
|
+
and key in default_ocr_config
|
954
|
+
and isinstance(default_ocr_config[key], dict)
|
955
|
+
):
|
1078
956
|
# Merge nested dicts
|
1079
957
|
default_ocr_config[key].update(value)
|
1080
958
|
else:
|
@@ -1083,10 +961,10 @@ class Region(DirectionalMixin):
|
|
1083
961
|
else:
|
1084
962
|
# Not a dict, use as is
|
1085
963
|
default_ocr_config = ocr_config
|
1086
|
-
|
964
|
+
|
1087
965
|
# Use the merged config
|
1088
966
|
ocr_config = default_ocr_config
|
1089
|
-
|
967
|
+
|
1090
968
|
# Add header row if headers were detected
|
1091
969
|
if headers:
|
1092
970
|
header_texts = []
|
@@ -1099,30 +977,28 @@ class Region(DirectionalMixin):
|
|
1099
977
|
if ocr_text:
|
1100
978
|
header_texts.append(ocr_text)
|
1101
979
|
continue
|
1102
|
-
|
980
|
+
|
1103
981
|
# Fallback to normal extraction
|
1104
982
|
header_texts.append(header.extract_text().strip())
|
1105
983
|
table_data.append(header_texts)
|
1106
|
-
|
984
|
+
|
1107
985
|
# Process rows
|
1108
986
|
for row in rows:
|
1109
987
|
row_cells = []
|
1110
|
-
|
988
|
+
|
1111
989
|
# If we have columns, use them to extract cells
|
1112
990
|
if columns:
|
1113
991
|
for column in columns:
|
1114
992
|
# Create a cell region at the intersection of row and column
|
1115
|
-
cell_bbox = (
|
1116
|
-
|
1117
|
-
row.top,
|
1118
|
-
column.x1,
|
1119
|
-
row.bottom
|
1120
|
-
)
|
1121
|
-
|
993
|
+
cell_bbox = (column.x0, row.top, column.x1, row.bottom)
|
994
|
+
|
1122
995
|
# Create a region for this cell
|
1123
|
-
from natural_pdf.elements.region import
|
996
|
+
from natural_pdf.elements.region import ( # Import here to avoid circular imports
|
997
|
+
Region,
|
998
|
+
)
|
999
|
+
|
1124
1000
|
cell_region = Region(self.page, cell_bbox)
|
1125
|
-
|
1001
|
+
|
1126
1002
|
# Extract text from the cell
|
1127
1003
|
if use_ocr:
|
1128
1004
|
# Apply OCR to the cell
|
@@ -1133,7 +1009,7 @@ class Region(DirectionalMixin):
|
|
1133
1009
|
if ocr_text:
|
1134
1010
|
row_cells.append(ocr_text)
|
1135
1011
|
continue
|
1136
|
-
|
1012
|
+
|
1137
1013
|
# Fallback to normal extraction
|
1138
1014
|
cell_text = cell_region.extract_text().strip()
|
1139
1015
|
row_cells.append(cell_text)
|
@@ -1147,182 +1023,212 @@ class Region(DirectionalMixin):
|
|
1147
1023
|
if ocr_text:
|
1148
1024
|
row_cells.append(ocr_text)
|
1149
1025
|
continue
|
1150
|
-
|
1026
|
+
|
1151
1027
|
# Fallback to normal extraction
|
1152
1028
|
row_cells.append(row.extract_text().strip())
|
1153
|
-
|
1029
|
+
|
1154
1030
|
table_data.append(row_cells)
|
1155
|
-
|
1031
|
+
|
1156
1032
|
return table_data
|
1157
|
-
|
1158
|
-
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[
|
1033
|
+
|
1034
|
+
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
|
1159
1035
|
"""
|
1160
1036
|
Find the first element in this region matching the selector.
|
1161
|
-
|
1037
|
+
|
1162
1038
|
Args:
|
1163
1039
|
selector: CSS-like selector string
|
1164
1040
|
apply_exclusions: Whether to apply exclusion regions
|
1165
1041
|
**kwargs: Additional parameters for element filtering
|
1166
|
-
|
1042
|
+
|
1167
1043
|
Returns:
|
1168
1044
|
First matching element or None
|
1169
1045
|
"""
|
1170
1046
|
elements = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1171
|
-
return elements
|
1172
|
-
|
1173
|
-
def
|
1047
|
+
return elements.first if elements else None # Use .first property
|
1048
|
+
|
1049
|
+
def find_all(
|
1050
|
+
self, selector: str, apply_exclusions=True, **kwargs
|
1051
|
+
) -> "ElementCollection": # Changed from _find_all
|
1174
1052
|
"""
|
1175
1053
|
Find all elements in this region matching the selector.
|
1176
|
-
|
1054
|
+
|
1177
1055
|
Args:
|
1178
1056
|
selector: CSS-like selector string
|
1179
1057
|
apply_exclusions: Whether to apply exclusion regions
|
1180
1058
|
**kwargs: Additional parameters for element filtering
|
1181
|
-
|
1059
|
+
|
1182
1060
|
Returns:
|
1183
1061
|
ElementCollection with matching elements
|
1184
1062
|
"""
|
1185
1063
|
from natural_pdf.elements.collections import ElementCollection
|
1186
1064
|
|
1187
1065
|
# If we span multiple pages, filter our elements
|
1066
|
+
# TODO: Revisit multi-page region logic
|
1188
1067
|
if self._spans_pages and self._multi_page_elements is not None:
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
page_ranges[element.page] = []
|
1202
|
-
page_ranges[element.page].append(element)
|
1203
|
-
|
1204
|
-
# For each page, use its find_all to match elements, then filter to our collection
|
1205
|
-
for page, page_elements in page_ranges.items():
|
1206
|
-
# Get all matching elements from the page
|
1207
|
-
page_matches = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1208
|
-
|
1209
|
-
# Filter to just the elements that are in our collection
|
1210
|
-
for element in page_matches:
|
1211
|
-
if element in page_elements:
|
1212
|
-
all_matching_elements.append(element)
|
1213
|
-
|
1214
|
-
return ElementCollection(all_matching_elements)
|
1068
|
+
logger.warning("find_all on multi-page regions is not fully implemented.")
|
1069
|
+
# Temporary: Apply filter directly to cached elements
|
1070
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1071
|
+
|
1072
|
+
try:
|
1073
|
+
selector_obj = parse_selector(selector)
|
1074
|
+
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1075
|
+
matching = [el for el in self._multi_page_elements if filter_func(el)]
|
1076
|
+
return ElementCollection(matching)
|
1077
|
+
except Exception as e:
|
1078
|
+
logger.error(f"Error applying selector to multi-page region elements: {e}")
|
1079
|
+
return ElementCollection([])
|
1215
1080
|
|
1216
1081
|
# Otherwise, get elements from the page and filter by selector and region
|
1217
1082
|
page_elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1083
|
+
# Use the precise _is_element_in_region check
|
1218
1084
|
filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
|
1219
1085
|
return ElementCollection(filtered_elements)
|
1220
|
-
|
1221
|
-
def apply_ocr(self, **ocr_params) ->
|
1086
|
+
|
1087
|
+
def apply_ocr(self, **ocr_params) -> "Region":
|
1222
1088
|
"""
|
1223
1089
|
Apply OCR to this region and return the created text elements.
|
1224
|
-
|
1090
|
+
|
1225
1091
|
Args:
|
1226
|
-
**ocr_params:
|
1227
|
-
|
1092
|
+
**ocr_params: Keyword arguments passed to the OCR Manager.
|
1093
|
+
Common parameters like `engine`, `languages`, `min_confidence`,
|
1094
|
+
`device`, and `resolution` (for image rendering) should be
|
1095
|
+
provided here. **The `languages` list must contain codes
|
1096
|
+
understood by the specific engine selected.** No mapping
|
1097
|
+
is performed. Engine-specific settings can be passed in
|
1098
|
+
an `options` object (e.g., `options=EasyOCROptions(...)`).
|
1099
|
+
|
1228
1100
|
Returns:
|
1229
|
-
List of created
|
1101
|
+
List of created TextElement objects representing OCR words/lines.
|
1230
1102
|
"""
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
if isinstance(ocr_params, dict):
|
1235
|
-
ocr_params["verbose"] = False
|
1236
|
-
else:
|
1237
|
-
ocr_params = {"enabled": True, "verbose": False}
|
1238
|
-
|
1239
|
-
ocr_config = self.page._get_ocr_config(ocr_params)
|
1240
|
-
|
1241
|
-
# Skip if OCR is disabled
|
1242
|
-
if not ocr_config.get('enabled'):
|
1103
|
+
# Ensure OCRManager is available
|
1104
|
+
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1105
|
+
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
1243
1106
|
return []
|
1244
|
-
|
1245
|
-
|
1246
|
-
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
result['bbox'][1] + self.top,
|
1261
|
-
result['bbox'][2] + self.x0,
|
1262
|
-
result['bbox'][3] + self.top
|
1107
|
+
ocr_mgr = self.page._parent._ocr_manager
|
1108
|
+
|
1109
|
+
# Determine rendering resolution from parameters
|
1110
|
+
final_resolution = ocr_params.get("resolution")
|
1111
|
+
if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
|
1112
|
+
final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
|
1113
|
+
elif final_resolution is None:
|
1114
|
+
final_resolution = 150
|
1115
|
+
logger.debug(
|
1116
|
+
f"Region {self.bbox}: Applying OCR with resolution {final_resolution} DPI and params: {ocr_params}"
|
1117
|
+
)
|
1118
|
+
|
1119
|
+
# Render the page region to an image using the determined resolution
|
1120
|
+
try:
|
1121
|
+
region_image = self.to_image(
|
1122
|
+
resolution=final_resolution, include_highlights=False, crop_only=True
|
1263
1123
|
)
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1124
|
+
if not region_image:
|
1125
|
+
logger.error("Failed to render region to image for OCR.")
|
1126
|
+
return []
|
1127
|
+
logger.debug(f"Region rendered to image size: {region_image.size}")
|
1128
|
+
except Exception as e:
|
1129
|
+
logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
|
1130
|
+
return []
|
1131
|
+
|
1132
|
+
# Prepare args for the OCR Manager
|
1133
|
+
manager_args = {
|
1134
|
+
"images": region_image,
|
1135
|
+
"engine": ocr_params.get("engine"),
|
1136
|
+
"languages": ocr_params.get("languages"),
|
1137
|
+
"min_confidence": ocr_params.get("min_confidence"),
|
1138
|
+
"device": ocr_params.get("device"),
|
1139
|
+
"options": ocr_params.get("options"),
|
1140
|
+
"detect_only": ocr_params.get("detect_only"),
|
1141
|
+
}
|
1142
|
+
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
1143
|
+
|
1144
|
+
# Run OCR on this region's image using the manager
|
1145
|
+
try:
|
1146
|
+
results = ocr_mgr.apply_ocr(**manager_args)
|
1147
|
+
if not isinstance(results, list):
|
1148
|
+
logger.error(
|
1149
|
+
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
1150
|
+
)
|
1151
|
+
return []
|
1152
|
+
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
1153
|
+
except Exception as e:
|
1154
|
+
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
1155
|
+
return []
|
1156
|
+
|
1157
|
+
# Convert results to TextElements
|
1158
|
+
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
1159
|
+
scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
|
1160
|
+
logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
|
1161
|
+
created_elements = []
|
1267
1162
|
for result in results:
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1163
|
+
try:
|
1164
|
+
img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
|
1165
|
+
pdf_height = (img_bottom - img_top) * scale_y
|
1166
|
+
page_x0 = self.x0 + (img_x0 * scale_x)
|
1167
|
+
page_top = self.top + (img_top * scale_y)
|
1168
|
+
page_x1 = self.x0 + (img_x1 * scale_x)
|
1169
|
+
page_bottom = self.top + (img_bottom * scale_y)
|
1275
1170
|
element_data = {
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
1284
|
-
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1171
|
+
"text": result["text"],
|
1172
|
+
"x0": page_x0,
|
1173
|
+
"top": page_top,
|
1174
|
+
"x1": page_x1,
|
1175
|
+
"bottom": page_bottom,
|
1176
|
+
"width": page_x1 - page_x0,
|
1177
|
+
"height": page_bottom - page_top,
|
1178
|
+
"object_type": "word",
|
1179
|
+
"source": "ocr",
|
1180
|
+
"confidence": float(result.get("confidence", 0.0)),
|
1181
|
+
"fontname": "OCR",
|
1182
|
+
"size": round(pdf_height) if pdf_height > 0 else 10.0,
|
1183
|
+
"page_number": self.page.number,
|
1184
|
+
"bold": False,
|
1185
|
+
"italic": False,
|
1186
|
+
"upright": True,
|
1187
|
+
"doctop": page_top + self.page._page.initial_doctop,
|
1290
1188
|
}
|
1291
|
-
|
1189
|
+
ocr_char_dict = element_data.copy()
|
1190
|
+
ocr_char_dict["object_type"] = "char"
|
1191
|
+
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
1192
|
+
element_data["_char_dicts"] = [ocr_char_dict]
|
1193
|
+
from natural_pdf.elements.text import TextElement
|
1292
1194
|
elem = TextElement(element_data, self.page)
|
1293
|
-
|
1294
|
-
|
1295
|
-
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
|
1195
|
+
created_elements.append(elem)
|
1196
|
+
self.page._element_mgr.add_element(elem, element_type="words")
|
1197
|
+
self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
|
1198
|
+
except Exception as e:
|
1199
|
+
logger.error(
|
1200
|
+
f"Failed to convert region OCR result to element: {result}. Error: {e}",
|
1201
|
+
exc_info=True,
|
1202
|
+
)
|
1203
|
+
logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
|
1204
|
+
return self
|
1205
|
+
|
1206
|
+
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
1306
1207
|
"""
|
1307
1208
|
Get a section between two elements within this region.
|
1308
|
-
|
1209
|
+
|
1309
1210
|
Args:
|
1310
1211
|
start_element: Element marking the start of the section
|
1311
1212
|
end_element: Element marking the end of the section
|
1312
1213
|
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
1313
|
-
|
1214
|
+
|
1314
1215
|
Returns:
|
1315
1216
|
Region representing the section
|
1316
1217
|
"""
|
1218
|
+
# Get elements only within this region first
|
1317
1219
|
elements = self.get_elements()
|
1318
|
-
|
1319
|
-
# If no elements, return self
|
1220
|
+
|
1221
|
+
# If no elements, return self or empty region?
|
1320
1222
|
if not elements:
|
1321
|
-
|
1322
|
-
|
1223
|
+
logger.warning(
|
1224
|
+
f"get_section_between called on region {self.bbox} with no contained elements."
|
1225
|
+
)
|
1226
|
+
# Return an empty region at the start of the parent region
|
1227
|
+
return Region(self.page, (self.x0, self.top, self.x0, self.top))
|
1228
|
+
|
1323
1229
|
# Sort elements in reading order
|
1324
1230
|
elements.sort(key=lambda e: (e.top, e.x0))
|
1325
|
-
|
1231
|
+
|
1326
1232
|
# Find start index
|
1327
1233
|
start_idx = 0
|
1328
1234
|
if start_element:
|
@@ -1330,8 +1236,12 @@ class Region(DirectionalMixin):
|
|
1330
1236
|
start_idx = elements.index(start_element)
|
1331
1237
|
except ValueError:
|
1332
1238
|
# Start element not in region, use first element
|
1333
|
-
|
1334
|
-
|
1239
|
+
logger.debug("Start element not found in region, using first element.")
|
1240
|
+
start_element = elements[0] # Use the actual first element
|
1241
|
+
start_idx = 0
|
1242
|
+
else:
|
1243
|
+
start_element = elements[0] # Default start is first element
|
1244
|
+
|
1335
1245
|
# Find end index
|
1336
1246
|
end_idx = len(elements) - 1
|
1337
1247
|
if end_element:
|
@@ -1339,218 +1249,231 @@ class Region(DirectionalMixin):
|
|
1339
1249
|
end_idx = elements.index(end_element)
|
1340
1250
|
except ValueError:
|
1341
1251
|
# End element not in region, use last element
|
1342
|
-
|
1343
|
-
|
1252
|
+
logger.debug("End element not found in region, using last element.")
|
1253
|
+
end_element = elements[-1] # Use the actual last element
|
1254
|
+
end_idx = len(elements) - 1
|
1255
|
+
else:
|
1256
|
+
end_element = elements[-1] # Default end is last element
|
1257
|
+
|
1344
1258
|
# Adjust indexes based on boundary inclusion
|
1345
|
-
|
1259
|
+
start_element_for_bbox = start_element
|
1260
|
+
end_element_for_bbox = end_element
|
1261
|
+
|
1262
|
+
if boundary_inclusion == "none":
|
1346
1263
|
start_idx += 1
|
1347
1264
|
end_idx -= 1
|
1348
|
-
|
1265
|
+
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
1266
|
+
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
1267
|
+
elif boundary_inclusion == "start":
|
1349
1268
|
end_idx -= 1
|
1350
|
-
|
1269
|
+
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
1270
|
+
elif boundary_inclusion == "end":
|
1351
1271
|
start_idx += 1
|
1352
|
-
|
1272
|
+
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
1273
|
+
|
1353
1274
|
# Ensure valid indexes
|
1354
1275
|
start_idx = max(0, start_idx)
|
1355
1276
|
end_idx = min(len(elements) - 1, end_idx)
|
1356
|
-
|
1277
|
+
|
1357
1278
|
# If no valid elements in range, return empty region
|
1358
|
-
if start_idx > end_idx:
|
1359
|
-
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1363
|
-
|
1364
|
-
#
|
1279
|
+
if start_idx > end_idx or start_element_for_bbox is None or end_element_for_bbox is None:
|
1280
|
+
logger.debug("No valid elements in range for get_section_between.")
|
1281
|
+
# Return an empty region positioned at the start element boundary
|
1282
|
+
anchor = start_element if start_element else self
|
1283
|
+
return Region(self.page, (anchor.x0, anchor.top, anchor.x0, anchor.top))
|
1284
|
+
|
1285
|
+
# Get elements in range based on adjusted indices
|
1286
|
+
section_elements = elements[start_idx : end_idx + 1]
|
1287
|
+
|
1288
|
+
# Create bounding box around the ELEMENTS included based on indices
|
1365
1289
|
x0 = min(e.x0 for e in section_elements)
|
1366
1290
|
top = min(e.top for e in section_elements)
|
1367
1291
|
x1 = max(e.x1 for e in section_elements)
|
1368
1292
|
bottom = max(e.bottom for e in section_elements)
|
1369
|
-
|
1370
|
-
# Adjust boundaries for better boundary inclusion/exclusion
|
1371
|
-
pixel_adjustment = 2.0 # Amount to adjust for avoiding boundary elements
|
1372
|
-
|
1373
|
-
# Only proceed with adjustments if we have elements in the section
|
1374
|
-
if section_elements:
|
1375
|
-
# Adjust top boundary if start element should be excluded
|
1376
|
-
if start_element and boundary_inclusion not in ('start', 'both') and start_idx > 0:
|
1377
|
-
# If start element is just above the section, move the top down
|
1378
|
-
# Use a larger threshold (10 points) to catch more cases
|
1379
|
-
if abs(top - start_element.bottom) < 10:
|
1380
|
-
top += pixel_adjustment
|
1381
|
-
|
1382
|
-
# Adjust bottom boundary if end element should be excluded
|
1383
|
-
if end_element and boundary_inclusion not in ('end', 'both') and end_idx < len(elements) - 1:
|
1384
|
-
# If end element is just below the section, move the bottom up
|
1385
|
-
# Use a larger threshold (10 points) to catch more cases
|
1386
|
-
if abs(bottom - end_element.top) < 10:
|
1387
|
-
bottom -= pixel_adjustment
|
1388
|
-
|
1389
|
-
# Ensure top is always less than bottom (valid region)
|
1390
|
-
if top >= bottom:
|
1391
|
-
# Reset to original if adjustment would create an invalid region
|
1392
|
-
top = min(e.top for e in section_elements)
|
1393
|
-
bottom = max(e.bottom for e in section_elements)
|
1394
|
-
|
1293
|
+
|
1395
1294
|
# Create new region
|
1396
1295
|
section = Region(self.page, (x0, top, x1, bottom))
|
1397
|
-
|
1398
|
-
section.
|
1399
|
-
|
1296
|
+
# Store the original boundary elements for reference
|
1297
|
+
section.start_element = start_element
|
1298
|
+
section.end_element = end_element
|
1299
|
+
|
1400
1300
|
return section
|
1401
|
-
|
1402
|
-
def get_sections(
|
1301
|
+
|
1302
|
+
def get_sections(
|
1303
|
+
self, start_elements=None, end_elements=None, boundary_inclusion="both"
|
1304
|
+
) -> List["Region"]:
|
1403
1305
|
"""
|
1404
1306
|
Get sections within this region based on start/end elements.
|
1405
|
-
|
1307
|
+
|
1406
1308
|
Args:
|
1407
1309
|
start_elements: Elements or selector string that mark the start of sections
|
1408
1310
|
end_elements: Elements or selector string that mark the end of sections
|
1409
1311
|
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
1410
|
-
|
1312
|
+
|
1411
1313
|
Returns:
|
1412
1314
|
List of Region objects representing the extracted sections
|
1413
1315
|
"""
|
1414
1316
|
from natural_pdf.elements.collections import ElementCollection
|
1415
|
-
|
1416
|
-
# Process string selectors to find elements
|
1317
|
+
|
1318
|
+
# Process string selectors to find elements WITHIN THIS REGION
|
1417
1319
|
if isinstance(start_elements, str):
|
1418
|
-
start_elements = self.find_all(start_elements)
|
1419
|
-
if hasattr(start_elements,
|
1320
|
+
start_elements = self.find_all(start_elements) # Use region's find_all
|
1321
|
+
if hasattr(start_elements, "elements"):
|
1420
1322
|
start_elements = start_elements.elements
|
1421
|
-
|
1323
|
+
|
1422
1324
|
if isinstance(end_elements, str):
|
1423
|
-
end_elements = self.find_all(end_elements)
|
1424
|
-
if hasattr(end_elements,
|
1325
|
+
end_elements = self.find_all(end_elements) # Use region's find_all
|
1326
|
+
if hasattr(end_elements, "elements"):
|
1425
1327
|
end_elements = end_elements.elements
|
1426
|
-
|
1427
|
-
#
|
1328
|
+
|
1329
|
+
# Ensure start_elements is a list (or similar iterable)
|
1330
|
+
if start_elements is None or not hasattr(start_elements, "__iter__"):
|
1331
|
+
logger.warning(
|
1332
|
+
"get_sections requires valid start_elements (selector or list). Returning empty."
|
1333
|
+
)
|
1334
|
+
return []
|
1335
|
+
# Ensure end_elements is a list if provided
|
1336
|
+
if end_elements is not None and not hasattr(end_elements, "__iter__"):
|
1337
|
+
logger.warning("end_elements must be iterable if provided. Ignoring.")
|
1338
|
+
end_elements = []
|
1339
|
+
elif end_elements is None:
|
1340
|
+
end_elements = []
|
1341
|
+
|
1342
|
+
# If no start elements found within the region, return empty list
|
1428
1343
|
if not start_elements:
|
1429
1344
|
return []
|
1430
|
-
|
1431
|
-
# Sort elements in reading order
|
1432
|
-
|
1433
|
-
|
1434
|
-
|
1435
|
-
|
1345
|
+
|
1346
|
+
# Sort all elements within the region in reading order
|
1347
|
+
all_elements_in_region = self.get_elements()
|
1348
|
+
all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
|
1349
|
+
|
1350
|
+
if not all_elements_in_region:
|
1351
|
+
return [] # Cannot create sections if region is empty
|
1352
|
+
|
1353
|
+
# Map elements to their indices in the sorted list
|
1354
|
+
element_to_index = {el: i for i, el in enumerate(all_elements_in_region)}
|
1355
|
+
|
1356
|
+
# Mark section boundaries using indices from the sorted list
|
1436
1357
|
section_boundaries = []
|
1437
|
-
|
1358
|
+
|
1438
1359
|
# Add start element indexes
|
1439
1360
|
for element in start_elements:
|
1440
|
-
|
1441
|
-
|
1442
|
-
section_boundaries.append({
|
1443
|
-
|
1444
|
-
|
1445
|
-
'type': 'start'
|
1446
|
-
})
|
1447
|
-
except ValueError:
|
1448
|
-
# Element not in this region, skip
|
1449
|
-
continue
|
1450
|
-
|
1361
|
+
idx = element_to_index.get(element)
|
1362
|
+
if idx is not None:
|
1363
|
+
section_boundaries.append({"index": idx, "element": element, "type": "start"})
|
1364
|
+
# else: Element found by selector might not be geometrically in region? Log warning?
|
1365
|
+
|
1451
1366
|
# Add end element indexes if provided
|
1452
|
-
|
1453
|
-
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
})
|
1461
|
-
except ValueError:
|
1462
|
-
# Element not in this region, skip
|
1463
|
-
continue
|
1464
|
-
|
1465
|
-
# Sort boundaries by index (document order)
|
1466
|
-
section_boundaries.sort(key=lambda x: x['index'])
|
1467
|
-
|
1367
|
+
for element in end_elements:
|
1368
|
+
idx = element_to_index.get(element)
|
1369
|
+
if idx is not None:
|
1370
|
+
section_boundaries.append({"index": idx, "element": element, "type": "end"})
|
1371
|
+
|
1372
|
+
# Sort boundaries by index (document order within the region)
|
1373
|
+
section_boundaries.sort(key=lambda x: x["index"])
|
1374
|
+
|
1468
1375
|
# Generate sections
|
1469
1376
|
sections = []
|
1470
|
-
|
1471
|
-
|
1377
|
+
current_start_boundary = None
|
1378
|
+
|
1472
1379
|
for i, boundary in enumerate(section_boundaries):
|
1473
1380
|
# If it's a start boundary and we don't have a current start
|
1474
|
-
if boundary[
|
1475
|
-
|
1476
|
-
|
1381
|
+
if boundary["type"] == "start" and current_start_boundary is None:
|
1382
|
+
current_start_boundary = boundary
|
1383
|
+
|
1477
1384
|
# If it's an end boundary and we have a current start
|
1478
|
-
elif boundary[
|
1385
|
+
elif boundary["type"] == "end" and current_start_boundary is not None:
|
1479
1386
|
# Create a section from current_start to this boundary
|
1480
|
-
start_element =
|
1481
|
-
end_element = boundary[
|
1482
|
-
|
1483
|
-
|
1484
|
-
end_element,
|
1485
|
-
boundary_inclusion
|
1486
|
-
)
|
1487
|
-
sections.append(section)
|
1488
|
-
current_start = None
|
1489
|
-
|
1490
|
-
# If it's another start boundary and we have a current start (for splitting by starts only)
|
1491
|
-
elif boundary['type'] == 'start' and current_start is not None and not end_elements:
|
1492
|
-
# Create a section from current_start to just before this boundary
|
1493
|
-
start_element = current_start['element']
|
1494
|
-
end_element = all_elements[boundary['index'] - 1] if boundary['index'] > 0 else None
|
1495
|
-
section = self.get_section_between(
|
1496
|
-
start_element,
|
1497
|
-
end_element,
|
1498
|
-
boundary_inclusion
|
1499
|
-
)
|
1387
|
+
start_element = current_start_boundary["element"]
|
1388
|
+
end_element = boundary["element"]
|
1389
|
+
# Use the helper, ensuring elements are from within the region
|
1390
|
+
section = self.get_section_between(start_element, end_element, boundary_inclusion)
|
1500
1391
|
sections.append(section)
|
1501
|
-
|
1502
|
-
|
1392
|
+
current_start_boundary = None # Reset
|
1393
|
+
|
1394
|
+
# If it's another start boundary and we have a current start (split by starts only)
|
1395
|
+
elif (
|
1396
|
+
boundary["type"] == "start"
|
1397
|
+
and current_start_boundary is not None
|
1398
|
+
and not end_elements
|
1399
|
+
):
|
1400
|
+
# End the previous section just before this start boundary
|
1401
|
+
start_element = current_start_boundary["element"]
|
1402
|
+
# Find the element immediately preceding this start in the sorted list
|
1403
|
+
end_idx = boundary["index"] - 1
|
1404
|
+
if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
|
1405
|
+
end_element = all_elements_in_region[end_idx]
|
1406
|
+
section = self.get_section_between(
|
1407
|
+
start_element, end_element, boundary_inclusion
|
1408
|
+
)
|
1409
|
+
sections.append(section)
|
1410
|
+
# Else: Section started and ended by consecutive start elements? Create empty?
|
1411
|
+
# For now, just reset and start new section
|
1412
|
+
|
1413
|
+
# Start the new section
|
1414
|
+
current_start_boundary = boundary
|
1415
|
+
|
1503
1416
|
# Handle the last section if we have a current start
|
1504
|
-
if
|
1505
|
-
start_element =
|
1506
|
-
#
|
1507
|
-
end_element =
|
1508
|
-
section = self.get_section_between(
|
1509
|
-
start_element,
|
1510
|
-
end_element,
|
1511
|
-
boundary_inclusion
|
1512
|
-
)
|
1417
|
+
if current_start_boundary is not None:
|
1418
|
+
start_element = current_start_boundary["element"]
|
1419
|
+
# End at the last element within the region
|
1420
|
+
end_element = all_elements_in_region[-1]
|
1421
|
+
section = self.get_section_between(start_element, end_element, boundary_inclusion)
|
1513
1422
|
sections.append(section)
|
1514
|
-
|
1423
|
+
|
1515
1424
|
return sections
|
1516
|
-
|
1425
|
+
|
1517
1426
|
def create_cells(self):
|
1518
1427
|
"""
|
1519
1428
|
Create cell regions for a detected table by intersecting its
|
1520
1429
|
row and column regions, and add them to the page.
|
1521
|
-
|
1430
|
+
|
1522
1431
|
Assumes child row and column regions are already present on the page.
|
1523
1432
|
|
1524
1433
|
Returns:
|
1525
1434
|
Self for method chaining.
|
1526
1435
|
"""
|
1527
1436
|
# Ensure this is called on a table region
|
1528
|
-
if self.region_type not in (
|
1529
|
-
|
1530
|
-
|
1437
|
+
if self.region_type not in (
|
1438
|
+
"table",
|
1439
|
+
"tableofcontents",
|
1440
|
+
): # Allow for ToC which might have structure
|
1441
|
+
raise ValueError(
|
1442
|
+
f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'"
|
1443
|
+
)
|
1444
|
+
|
1531
1445
|
# Find rows and columns associated with this page
|
1532
1446
|
# Remove the model-specific filter
|
1533
|
-
rows = self.page.find_all(
|
1534
|
-
columns = self.page.find_all(
|
1535
|
-
|
1447
|
+
rows = self.page.find_all("region[type=table-row]")
|
1448
|
+
columns = self.page.find_all("region[type=table-column]")
|
1449
|
+
|
1536
1450
|
# Filter to only include those that overlap with this table region
|
1537
1451
|
def is_in_table(element):
|
1538
1452
|
# Use a simple overlap check (more robust than just center point)
|
1539
1453
|
# Check if element's bbox overlaps with self.bbox
|
1540
|
-
return (
|
1541
|
-
|
1542
|
-
|
1454
|
+
return (
|
1455
|
+
hasattr(element, "bbox")
|
1456
|
+
and element.x0 < self.x1 # Ensure element has bbox
|
1457
|
+
and element.x1 > self.x0
|
1458
|
+
and element.top < self.bottom
|
1459
|
+
and element.bottom > self.top
|
1460
|
+
)
|
1461
|
+
|
1543
1462
|
table_rows = [r for r in rows if is_in_table(r)]
|
1544
1463
|
table_columns = [c for c in columns if is_in_table(c)]
|
1545
|
-
|
1464
|
+
|
1546
1465
|
if not table_rows or not table_columns:
|
1547
|
-
|
1548
|
-
|
1549
|
-
|
1466
|
+
# Use page's logger if available
|
1467
|
+
logger_instance = getattr(self._page, "logger", logger)
|
1468
|
+
logger_instance.warning(
|
1469
|
+
f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found."
|
1470
|
+
)
|
1471
|
+
return self # Return self even if no cells created
|
1472
|
+
|
1550
1473
|
# Sort rows and columns
|
1551
1474
|
table_rows.sort(key=lambda r: r.top)
|
1552
1475
|
table_columns.sort(key=lambda c: c.x0)
|
1553
|
-
|
1476
|
+
|
1554
1477
|
# Create cells and add them to the page's element manager
|
1555
1478
|
created_count = 0
|
1556
1479
|
for row in table_rows:
|
@@ -1564,41 +1487,49 @@ class Region(DirectionalMixin):
|
|
1564
1487
|
# Only create a cell if the intersection is valid (positive width/height)
|
1565
1488
|
if cell_x1 > cell_x0 and cell_y1 > cell_y0:
|
1566
1489
|
# Create cell region at the intersection
|
1567
|
-
cell = self.page.create_region(
|
1568
|
-
cell_x0, cell_y0, cell_x1, cell_y1
|
1569
|
-
)
|
1490
|
+
cell = self.page.create_region(cell_x0, cell_y0, cell_x1, cell_y1)
|
1570
1491
|
# Set metadata
|
1571
|
-
cell.source =
|
1572
|
-
cell.region_type =
|
1573
|
-
cell.normalized_type =
|
1492
|
+
cell.source = "derived"
|
1493
|
+
cell.region_type = "table-cell" # Explicitly set type
|
1494
|
+
cell.normalized_type = "table-cell" # And normalized type
|
1574
1495
|
# Inherit model from the parent table region
|
1575
|
-
cell.model = self.model
|
1576
|
-
cell.parent_region = self
|
1577
|
-
|
1496
|
+
cell.model = self.model
|
1497
|
+
cell.parent_region = self # Link cell to parent table region
|
1498
|
+
|
1578
1499
|
# Add the cell region to the page's element manager
|
1579
1500
|
self.page._element_mgr.add_region(cell)
|
1580
1501
|
created_count += 1
|
1581
|
-
|
1502
|
+
|
1582
1503
|
# Optional: Add created cells to the table region's children
|
1583
1504
|
# self.child_regions.extend(cells_created_in_this_call) # Needs list management
|
1584
1505
|
|
1585
|
-
|
1506
|
+
logger_instance = getattr(self._page, "logger", logger)
|
1507
|
+
logger_instance.info(
|
1508
|
+
f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions."
|
1509
|
+
)
|
1586
1510
|
|
1587
|
-
return self
|
1588
|
-
|
1589
|
-
def ask(
|
1511
|
+
return self # Return self for chaining
|
1512
|
+
|
1513
|
+
def ask(
|
1514
|
+
self,
|
1515
|
+
question: str,
|
1516
|
+
min_confidence: float = 0.1,
|
1517
|
+
model: str = None,
|
1518
|
+
debug: bool = False,
|
1519
|
+
**kwargs,
|
1520
|
+
) -> Dict[str, Any]:
|
1590
1521
|
"""
|
1591
1522
|
Ask a question about the region content using document QA.
|
1592
|
-
|
1523
|
+
|
1593
1524
|
This method uses a document question answering model to extract answers from the region content.
|
1594
1525
|
It leverages both textual content and layout information for better understanding.
|
1595
|
-
|
1526
|
+
|
1596
1527
|
Args:
|
1597
1528
|
question: The question to ask about the region content
|
1598
1529
|
min_confidence: Minimum confidence threshold for answers (0.0-1.0)
|
1599
1530
|
model: Optional model name to use for QA (if None, uses default model)
|
1600
1531
|
**kwargs: Additional parameters to pass to the QA engine
|
1601
|
-
|
1532
|
+
|
1602
1533
|
Returns:
|
1603
1534
|
Dictionary with answer details: {
|
1604
1535
|
"answer": extracted text,
|
@@ -1609,112 +1540,191 @@ class Region(DirectionalMixin):
|
|
1609
1540
|
"source_elements": list of elements that contain the answer (if found)
|
1610
1541
|
}
|
1611
1542
|
"""
|
1612
|
-
|
1613
|
-
|
1543
|
+
try:
|
1544
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
1545
|
+
except ImportError:
|
1546
|
+
logger.error(
|
1547
|
+
"Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
|
1548
|
+
)
|
1549
|
+
return {
|
1550
|
+
"answer": None,
|
1551
|
+
"confidence": 0.0,
|
1552
|
+
"found": False,
|
1553
|
+
"page_num": self.page.number,
|
1554
|
+
"source_elements": [],
|
1555
|
+
"region": self,
|
1556
|
+
}
|
1557
|
+
|
1614
1558
|
# Get or initialize QA engine with specified model
|
1615
|
-
|
1616
|
-
|
1559
|
+
try:
|
1560
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
1561
|
+
except Exception as e:
|
1562
|
+
logger.error(f"Failed to initialize QA engine (model: {model}): {e}", exc_info=True)
|
1563
|
+
return {
|
1564
|
+
"answer": None,
|
1565
|
+
"confidence": 0.0,
|
1566
|
+
"found": False,
|
1567
|
+
"page_num": self.page.number,
|
1568
|
+
"source_elements": [],
|
1569
|
+
"region": self,
|
1570
|
+
}
|
1571
|
+
|
1617
1572
|
# Ask the question using the QA engine
|
1618
|
-
|
1573
|
+
try:
|
1574
|
+
return qa_engine.ask_pdf_region(
|
1575
|
+
self, question, min_confidence=min_confidence, debug=debug, **kwargs
|
1576
|
+
)
|
1577
|
+
except Exception as e:
|
1578
|
+
logger.error(f"Error during qa_engine.ask_pdf_region: {e}", exc_info=True)
|
1579
|
+
return {
|
1580
|
+
"answer": None,
|
1581
|
+
"confidence": 0.0,
|
1582
|
+
"found": False,
|
1583
|
+
"page_num": self.page.number,
|
1584
|
+
"source_elements": [],
|
1585
|
+
"region": self,
|
1586
|
+
}
|
1619
1587
|
|
1620
1588
|
def add_child(self, child):
|
1621
1589
|
"""
|
1622
1590
|
Add a child region to this region.
|
1623
|
-
|
1591
|
+
|
1624
1592
|
Used for hierarchical document structure when using models like Docling
|
1625
1593
|
that understand document hierarchy.
|
1626
|
-
|
1594
|
+
|
1627
1595
|
Args:
|
1628
1596
|
child: Region object to add as a child
|
1629
|
-
|
1597
|
+
|
1630
1598
|
Returns:
|
1631
1599
|
Self for method chaining
|
1632
1600
|
"""
|
1633
1601
|
self.child_regions.append(child)
|
1634
1602
|
child.parent_region = self
|
1635
1603
|
return self
|
1636
|
-
|
1604
|
+
|
1637
1605
|
def get_children(self, selector=None):
|
1638
1606
|
"""
|
1639
1607
|
Get immediate child regions, optionally filtered by selector.
|
1640
|
-
|
1608
|
+
|
1641
1609
|
Args:
|
1642
1610
|
selector: Optional selector to filter children
|
1643
|
-
|
1611
|
+
|
1644
1612
|
Returns:
|
1645
1613
|
List of child regions matching the selector
|
1646
1614
|
"""
|
1647
1615
|
import logging
|
1616
|
+
|
1648
1617
|
logger = logging.getLogger("natural_pdf.elements.region")
|
1649
|
-
|
1618
|
+
|
1650
1619
|
if selector is None:
|
1651
1620
|
return self.child_regions
|
1652
|
-
|
1621
|
+
|
1653
1622
|
# Use existing selector parser to filter
|
1654
|
-
from natural_pdf.selectors.parser import
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1623
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1624
|
+
|
1625
|
+
try:
|
1626
|
+
selector_obj = parse_selector(selector)
|
1627
|
+
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
1628
|
+
matched = [child for child in self.child_regions if filter_func(child)]
|
1629
|
+
logger.debug(
|
1630
|
+
f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'"
|
1631
|
+
)
|
1632
|
+
return matched
|
1633
|
+
except Exception as e:
|
1634
|
+
logger.error(f"Error applying selector in get_children: {e}", exc_info=True)
|
1635
|
+
return [] # Return empty list on error
|
1636
|
+
|
1659
1637
|
def get_descendants(self, selector=None):
|
1660
1638
|
"""
|
1661
1639
|
Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
|
1662
|
-
|
1640
|
+
|
1663
1641
|
Args:
|
1664
1642
|
selector: Optional selector to filter descendants
|
1665
|
-
|
1643
|
+
|
1666
1644
|
Returns:
|
1667
1645
|
List of descendant regions matching the selector
|
1668
1646
|
"""
|
1669
1647
|
import logging
|
1648
|
+
|
1670
1649
|
logger = logging.getLogger("natural_pdf.elements.region")
|
1671
|
-
|
1650
|
+
|
1672
1651
|
all_descendants = []
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1679
|
-
|
1680
|
-
|
1652
|
+
queue = list(self.child_regions) # Start with direct children
|
1653
|
+
|
1654
|
+
while queue:
|
1655
|
+
current = queue.pop(0)
|
1656
|
+
all_descendants.append(current)
|
1657
|
+
# Add current's children to the queue for processing
|
1658
|
+
if hasattr(current, "child_regions"):
|
1659
|
+
queue.extend(current.child_regions)
|
1660
|
+
|
1681
1661
|
logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
|
1682
|
-
|
1662
|
+
|
1683
1663
|
# Filter by selector if provided
|
1684
1664
|
if selector is not None:
|
1685
|
-
from natural_pdf.selectors.parser import
|
1686
|
-
|
1687
|
-
|
1688
|
-
|
1689
|
-
|
1665
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1666
|
+
|
1667
|
+
try:
|
1668
|
+
selector_obj = parse_selector(selector)
|
1669
|
+
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
1670
|
+
matched = [desc for desc in all_descendants if filter_func(desc)]
|
1671
|
+
logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
|
1672
|
+
return matched
|
1673
|
+
except Exception as e:
|
1674
|
+
logger.error(f"Error applying selector in get_descendants: {e}", exc_info=True)
|
1675
|
+
return [] # Return empty list on error
|
1676
|
+
|
1690
1677
|
return all_descendants
|
1691
|
-
|
1692
|
-
|
1678
|
+
|
1679
|
+
# Removed recursive=True, find_all on region shouldn't be recursive by default
|
1680
|
+
# Renamed _find_all back to find_all
|
1681
|
+
# def find_all(self, selector, apply_exclusions=True, **kwargs):
|
1682
|
+
# See implementation above near get_elements
|
1683
|
+
|
1684
|
+
def __repr__(self) -> str:
|
1685
|
+
"""String representation of the region."""
|
1686
|
+
poly_info = " (Polygon)" if self.has_polygon else ""
|
1687
|
+
name_info = f" name='{self.name}'" if self.name else ""
|
1688
|
+
type_info = f" type='{self.region_type}'" if self.region_type else ""
|
1689
|
+
source_info = f" source='{self.source}'" if self.source else ""
|
1690
|
+
return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
|
1691
|
+
|
1692
|
+
def correct_ocr(
|
1693
|
+
self,
|
1694
|
+
correction_callback: Callable[[Any], Optional[str]],
|
1695
|
+
) -> "Region": # Return self for chaining
|
1693
1696
|
"""
|
1694
|
-
|
1695
|
-
|
1697
|
+
Applies corrections to OCR-generated text elements within this region
|
1698
|
+
using a user-provided callback function.
|
1699
|
+
|
1700
|
+
Finds text elements within this region whose 'source' attribute starts
|
1701
|
+
with 'ocr' and calls the `correction_callback` for each, passing the
|
1702
|
+
element itself.
|
1703
|
+
|
1704
|
+
The `correction_callback` should contain the logic to:
|
1705
|
+
1. Determine if the element needs correction.
|
1706
|
+
2. Perform the correction (e.g., call an LLM).
|
1707
|
+
3. Return the new text (`str`) or `None`.
|
1708
|
+
|
1709
|
+
If the callback returns a string, the element's `.text` is updated.
|
1710
|
+
Metadata updates (source, confidence, etc.) should happen within the callback.
|
1711
|
+
|
1696
1712
|
Args:
|
1697
|
-
|
1698
|
-
|
1699
|
-
|
1700
|
-
|
1713
|
+
correction_callback: A function accepting an element and returning
|
1714
|
+
`Optional[str]` (new text or None).
|
1715
|
+
|
1701
1716
|
Returns:
|
1702
|
-
|
1717
|
+
Self for method chaining.
|
1703
1718
|
"""
|
1704
|
-
#
|
1705
|
-
|
1706
|
-
|
1707
|
-
|
1708
|
-
|
1709
|
-
|
1710
|
-
|
1711
|
-
|
1712
|
-
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1716
|
-
for match in child_matches:
|
1717
|
-
if match not in all_matches:
|
1718
|
-
all_matches.append(match)
|
1719
|
-
|
1720
|
-
return ElementCollection(all_matches)
|
1719
|
+
# Find OCR elements specifically within this region
|
1720
|
+
# Note: We typically want to correct even if the element falls in an excluded area
|
1721
|
+
target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
|
1722
|
+
|
1723
|
+
# Delegate to the utility function
|
1724
|
+
_apply_ocr_correction_to_elements(
|
1725
|
+
elements=target_elements, # Pass the ElementCollection directly
|
1726
|
+
correction_callback=correction_callback,
|
1727
|
+
caller_info=f"Region({self.bbox})", # Pass caller info
|
1728
|
+
)
|
1729
|
+
|
1730
|
+
return self # Return self for chaining
|