natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,18 @@
1
- from typing import Optional, Union, List, Dict, Tuple, Any, Callable, TYPE_CHECKING
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
3
+
4
+ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
+
6
+ # New Imports
7
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
+
2
9
  from natural_pdf.elements.base import DirectionalMixin
3
10
 
11
+ # Import new utils
12
+ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
+
14
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
+
4
16
  if TYPE_CHECKING:
5
17
  from natural_pdf.core.page import Page
6
18
  from natural_pdf.elements.text import TextElement
@@ -12,22 +24,29 @@ except ImportError:
12
24
  # OCRManager will be imported directly in methods that use it
13
25
  pass
14
26
 
27
+ logger = logging.getLogger(__name__)
28
+
15
29
 
16
30
  class Region(DirectionalMixin):
17
31
  """
18
32
  Represents a rectangular region on a page.
19
33
  """
20
-
21
- def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None, label: Optional[str] = None):
34
+
35
+ def __init__(
36
+ self,
37
+ page: "Page",
38
+ bbox: Tuple[float, float, float, float],
39
+ polygon: List[Tuple[float, float]] = None,
40
+ parent=None,
41
+ ):
22
42
  """
23
43
  Initialize a region.
24
-
44
+
25
45
  Args:
26
46
  page: Parent page
27
47
  bbox: Bounding box as (x0, top, x1, bottom)
28
48
  polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
29
49
  parent: Optional parent region (for hierarchical document structure)
30
- label: Optional label for the region (e.g., for exclusions)
31
50
  """
32
51
  self._page = page
33
52
  self._bbox = bbox
@@ -37,30 +56,36 @@ class Region(DirectionalMixin):
37
56
  self._page_range = None
38
57
  self.start_element = None
39
58
  self.end_element = None
40
-
59
+
41
60
  # Standard attributes for all elements
42
- self.object_type = 'region' # For selector compatibility
43
-
61
+ self.object_type = "region" # For selector compatibility
62
+
44
63
  # Layout detection attributes
45
64
  self.region_type = None
46
65
  self.normalized_type = None
47
66
  self.confidence = None
48
67
  self.model = None
49
-
68
+
50
69
  # Region management attributes
51
70
  self.name = None
52
71
  self.source = None # Will be set by creation methods
53
- self.label = label
54
-
72
+
55
73
  # Hierarchy support for nested document structure
56
74
  self.parent_region = parent
57
75
  self.child_regions = []
58
76
  self.text_content = None # Direct text content (e.g., from Docling)
59
77
  self.associated_text_elements = [] # Native text elements that overlap with this region
60
-
61
- def _direction(self, direction: str, size: Optional[float] = None,
62
- cross_size: str = "full", include_element: bool = False,
63
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
78
+
79
+ def _direction(
80
+ self,
81
+ direction: str,
82
+ size: Optional[float] = None,
83
+ cross_size: str = "full",
84
+ include_element: bool = False,
85
+ until: Optional[str] = None,
86
+ include_endpoint: bool = True,
87
+ **kwargs,
88
+ ) -> "Region":
64
89
  """
65
90
  Protected helper method to create a region in a specified direction relative to this region.
66
91
 
@@ -76,11 +101,11 @@ class Region(DirectionalMixin):
76
101
  Returns:
77
102
  Region object
78
103
  """
79
- import math # Use math.inf for infinity
104
+ import math # Use math.inf for infinity
80
105
 
81
- is_horizontal = direction in ('left', 'right')
82
- is_positive = direction in ('right', 'below') # right/below are positive directions
83
- pixel_offset = 1 # Offset for excluding elements/endpoints
106
+ is_horizontal = direction in ("left", "right")
107
+ is_positive = direction in ("right", "below") # right/below are positive directions
108
+ pixel_offset = 1 # Offset for excluding elements/endpoints
84
109
 
85
110
  # 1. Determine initial boundaries based on direction and include_element
86
111
  if is_horizontal:
@@ -89,38 +114,44 @@ class Region(DirectionalMixin):
89
114
  y1 = self.page.height if cross_size == "full" else self.bottom
90
115
 
91
116
  # Initial primary boundaries (horizontal)
92
- if is_positive: # right
117
+ if is_positive: # right
93
118
  x0_initial = self.x0 if include_element else self.x1 + pixel_offset
94
- x1_initial = self.x1 # This edge moves
95
- else: # left
96
- x0_initial = self.x0 # This edge moves
119
+ x1_initial = self.x1 # This edge moves
120
+ else: # left
121
+ x0_initial = self.x0 # This edge moves
97
122
  x1_initial = self.x1 if include_element else self.x0 - pixel_offset
98
- else: # Vertical
123
+ else: # Vertical
99
124
  # Initial cross-boundaries (horizontal)
100
125
  x0 = 0 if cross_size == "full" else self.x0
101
126
  x1 = self.page.width if cross_size == "full" else self.x1
102
127
 
103
128
  # Initial primary boundaries (vertical)
104
- if is_positive: # below
129
+ if is_positive: # below
105
130
  y0_initial = self.top if include_element else self.bottom + pixel_offset
106
- y1_initial = self.bottom # This edge moves
107
- else: # above
108
- y0_initial = self.top # This edge moves
131
+ y1_initial = self.bottom # This edge moves
132
+ else: # above
133
+ y0_initial = self.top # This edge moves
109
134
  y1_initial = self.bottom if include_element else self.top - pixel_offset
110
135
 
111
136
  # 2. Calculate the final primary boundary, considering 'size' or page limits
112
137
  if is_horizontal:
113
- if is_positive: # right
114
- x1_final = min(self.page.width, x1_initial + (size if size is not None else (self.page.width - x1_initial)))
138
+ if is_positive: # right
139
+ x1_final = min(
140
+ self.page.width,
141
+ x1_initial + (size if size is not None else (self.page.width - x1_initial)),
142
+ )
115
143
  x0_final = x0_initial
116
- else: # left
144
+ else: # left
117
145
  x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
118
146
  x1_final = x1_initial
119
- else: # Vertical
120
- if is_positive: # below
121
- y1_final = min(self.page.height, y1_initial + (size if size is not None else (self.page.height - y1_initial)))
147
+ else: # Vertical
148
+ if is_positive: # below
149
+ y1_final = min(
150
+ self.page.height,
151
+ y1_initial + (size if size is not None else (self.page.height - y1_initial)),
152
+ )
122
153
  y0_final = y0_initial
123
- else: # above
154
+ else: # above
124
155
  y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
125
156
  y1_final = y1_initial
126
157
 
@@ -131,16 +162,16 @@ class Region(DirectionalMixin):
131
162
  matches_in_direction = []
132
163
 
133
164
  # Filter and sort matches based on direction
134
- if direction == 'above':
165
+ if direction == "above":
135
166
  matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
136
167
  matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
137
- elif direction == 'below':
168
+ elif direction == "below":
138
169
  matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
139
170
  matches_in_direction.sort(key=lambda e: e.top)
140
- elif direction == 'left':
171
+ elif direction == "left":
141
172
  matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
142
173
  matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
143
- elif direction == 'right':
174
+ elif direction == "right":
144
175
  matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
145
176
  matches_in_direction.sort(key=lambda e: e.x0)
146
177
 
@@ -149,25 +180,29 @@ class Region(DirectionalMixin):
149
180
 
150
181
  # Adjust the primary boundary based on the target
151
182
  if is_horizontal:
152
- if is_positive: # right
183
+ if is_positive: # right
153
184
  x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
154
- else: # left
185
+ else: # left
155
186
  x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
156
- else: # Vertical
157
- if is_positive: # below
187
+ else: # Vertical
188
+ if is_positive: # below
158
189
  y1_final = target.bottom if include_endpoint else target.top - pixel_offset
159
- else: # above
190
+ else: # above
160
191
  y0_final = target.top if include_endpoint else target.bottom + pixel_offset
161
192
 
162
193
  # Adjust cross boundaries if cross_size is 'element'
163
194
  if cross_size == "element":
164
- if is_horizontal: # Adjust y0, y1
165
- target_y0 = target.top if include_endpoint else target.bottom # Use opposite boundary if excluding
195
+ if is_horizontal: # Adjust y0, y1
196
+ target_y0 = (
197
+ target.top if include_endpoint else target.bottom
198
+ ) # Use opposite boundary if excluding
166
199
  target_y1 = target.bottom if include_endpoint else target.top
167
200
  y0 = min(y0, target_y0)
168
201
  y1 = max(y1, target_y1)
169
- else: # Adjust x0, x1
170
- target_x0 = target.x0 if include_endpoint else target.x1 # Use opposite boundary if excluding
202
+ else: # Adjust x0, x1
203
+ target_x0 = (
204
+ target.x0 if include_endpoint else target.x1
205
+ ) # Use opposite boundary if excluding
171
206
  target_x1 = target.x1 if include_endpoint else target.x0
172
207
  x0 = min(x0, target_x0)
173
208
  x1 = max(x1, target_x1)
@@ -195,11 +230,18 @@ class Region(DirectionalMixin):
195
230
 
196
231
  return region
197
232
 
198
- def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
199
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
233
+ def above(
234
+ self,
235
+ height: Optional[float] = None,
236
+ width: str = "full",
237
+ include_element: bool = False,
238
+ until: Optional[str] = None,
239
+ include_endpoint: bool = True,
240
+ **kwargs,
241
+ ) -> "Region":
200
242
  """
201
243
  Select region above this region.
202
-
244
+
203
245
  Args:
204
246
  height: Height of the region above, in points
205
247
  width: Width mode - "full" for full page width or "element" for element width
@@ -207,25 +249,32 @@ class Region(DirectionalMixin):
207
249
  until: Optional selector string to specify an upper boundary element
208
250
  include_endpoint: Whether to include the boundary element in the region (default: True)
209
251
  **kwargs: Additional parameters
210
-
252
+
211
253
  Returns:
212
254
  Region object representing the area above
213
255
  """
214
256
  return self._direction(
215
- direction='above',
257
+ direction="above",
216
258
  size=height,
217
259
  cross_size=width,
218
260
  include_element=include_element,
219
261
  until=until,
220
262
  include_endpoint=include_endpoint,
221
- **kwargs
263
+ **kwargs,
222
264
  )
223
265
 
224
- def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
225
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
266
+ def below(
267
+ self,
268
+ height: Optional[float] = None,
269
+ width: str = "full",
270
+ include_element: bool = False,
271
+ until: Optional[str] = None,
272
+ include_endpoint: bool = True,
273
+ **kwargs,
274
+ ) -> "Region":
226
275
  """
227
276
  Select region below this region.
228
-
277
+
229
278
  Args:
230
279
  height: Height of the region below, in points
231
280
  width: Width mode - "full" for full page width or "element" for element width
@@ -233,25 +282,32 @@ class Region(DirectionalMixin):
233
282
  until: Optional selector string to specify a lower boundary element
234
283
  include_endpoint: Whether to include the boundary element in the region (default: True)
235
284
  **kwargs: Additional parameters
236
-
285
+
237
286
  Returns:
238
287
  Region object representing the area below
239
288
  """
240
289
  return self._direction(
241
- direction='below',
290
+ direction="below",
242
291
  size=height,
243
292
  cross_size=width,
244
293
  include_element=include_element,
245
294
  until=until,
246
295
  include_endpoint=include_endpoint,
247
- **kwargs
296
+ **kwargs,
248
297
  )
249
298
 
250
- def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
251
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
299
+ def left(
300
+ self,
301
+ width: Optional[float] = None,
302
+ height: str = "full",
303
+ include_element: bool = False,
304
+ until: Optional[str] = None,
305
+ include_endpoint: bool = True,
306
+ **kwargs,
307
+ ) -> "Region":
252
308
  """
253
309
  Select region to the left of this region.
254
-
310
+
255
311
  Args:
256
312
  width: Width of the region to the left, in points
257
313
  height: Height mode - "full" for full page height or "element" for element height
@@ -259,25 +315,32 @@ class Region(DirectionalMixin):
259
315
  until: Optional selector string to specify a left boundary element
260
316
  include_endpoint: Whether to include the boundary element in the region (default: True)
261
317
  **kwargs: Additional parameters
262
-
318
+
263
319
  Returns:
264
320
  Region object representing the area to the left
265
321
  """
266
322
  return self._direction(
267
- direction='left',
323
+ direction="left",
268
324
  size=width,
269
325
  cross_size=height,
270
326
  include_element=include_element,
271
327
  until=until,
272
328
  include_endpoint=include_endpoint,
273
- **kwargs
329
+ **kwargs,
274
330
  )
275
331
 
276
- def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
277
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
332
+ def right(
333
+ self,
334
+ width: Optional[float] = None,
335
+ height: str = "full",
336
+ include_element: bool = False,
337
+ until: Optional[str] = None,
338
+ include_endpoint: bool = True,
339
+ **kwargs,
340
+ ) -> "Region":
278
341
  """
279
342
  Select region to the right of this region.
280
-
343
+
281
344
  Args:
282
345
  width: Width of the region to the right, in points
283
346
  height: Height mode - "full" for full page height or "element" for element height
@@ -285,72 +348,72 @@ class Region(DirectionalMixin):
285
348
  until: Optional selector string to specify a right boundary element
286
349
  include_endpoint: Whether to include the boundary element in the region (default: True)
287
350
  **kwargs: Additional parameters
288
-
351
+
289
352
  Returns:
290
353
  Region object representing the area to the right
291
354
  """
292
355
  return self._direction(
293
- direction='right',
356
+ direction="right",
294
357
  size=width,
295
358
  cross_size=height,
296
359
  include_element=include_element,
297
360
  until=until,
298
361
  include_endpoint=include_endpoint,
299
- **kwargs
362
+ **kwargs,
300
363
  )
301
-
364
+
302
365
  @property
303
366
  def type(self) -> str:
304
367
  """Element type."""
305
368
  # Return the specific type if detected (e.g., from layout analysis)
306
369
  # or 'region' as a default.
307
- return self.region_type or 'region' # Prioritize specific region_type if set
308
-
370
+ return self.region_type or "region" # Prioritize specific region_type if set
371
+
309
372
  @property
310
- def page(self) -> 'Page':
373
+ def page(self) -> "Page":
311
374
  """Get the parent page."""
312
375
  return self._page
313
-
376
+
314
377
  @property
315
378
  def bbox(self) -> Tuple[float, float, float, float]:
316
379
  """Get the bounding box as (x0, top, x1, bottom)."""
317
380
  return self._bbox
318
-
381
+
319
382
  @property
320
383
  def x0(self) -> float:
321
384
  """Get the left coordinate."""
322
385
  return self._bbox[0]
323
-
386
+
324
387
  @property
325
388
  def top(self) -> float:
326
389
  """Get the top coordinate."""
327
390
  return self._bbox[1]
328
-
391
+
329
392
  @property
330
393
  def x1(self) -> float:
331
394
  """Get the right coordinate."""
332
395
  return self._bbox[2]
333
-
396
+
334
397
  @property
335
398
  def bottom(self) -> float:
336
399
  """Get the bottom coordinate."""
337
400
  return self._bbox[3]
338
-
401
+
339
402
  @property
340
403
  def width(self) -> float:
341
404
  """Get the width of the region."""
342
405
  return self.x1 - self.x0
343
-
406
+
344
407
  @property
345
408
  def height(self) -> float:
346
409
  """Get the height of the region."""
347
410
  return self.bottom - self.top
348
-
411
+
349
412
  @property
350
413
  def has_polygon(self) -> bool:
351
414
  """Check if this region has polygon coordinates."""
352
415
  return self._polygon is not None and len(self._polygon) >= 3
353
-
416
+
354
417
  @property
355
418
  def polygon(self) -> List[Tuple[float, float]]:
356
419
  """Get polygon coordinates if available, otherwise return rectangle corners."""
@@ -359,141 +422,122 @@ class Region(DirectionalMixin):
359
422
  else:
360
423
  # Create rectangle corners from bbox as fallback
361
424
  return [
362
- (self.x0, self.top), # top-left
363
- (self.x1, self.top), # top-right
364
- (self.x1, self.bottom), # bottom-right
365
- (self.x0, self.bottom) # bottom-left
425
+ (self.x0, self.top), # top-left
426
+ (self.x1, self.top), # top-right
427
+ (self.x1, self.bottom), # bottom-right
428
+ (self.x0, self.bottom), # bottom-left
366
429
  ]
367
-
430
+
368
431
  def _is_point_in_polygon(self, x: float, y: float) -> bool:
369
432
  """
370
433
  Check if a point is inside the polygon using ray casting algorithm.
371
-
434
+
372
435
  Args:
373
436
  x: X coordinate of the point
374
437
  y: Y coordinate of the point
375
-
438
+
376
439
  Returns:
377
440
  bool: True if the point is inside the polygon
378
441
  """
379
442
  if not self.has_polygon:
380
443
  return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
381
-
444
+
382
445
  # Ray casting algorithm
383
446
  inside = False
384
447
  j = len(self.polygon) - 1
385
-
448
+
386
449
  for i in range(len(self.polygon)):
387
- if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and \
388
- (x < (self.polygon[j][0] - self.polygon[i][0]) * (y - self.polygon[i][1]) / \
389
- (self.polygon[j][1] - self.polygon[i][1]) + self.polygon[i][0]):
450
+ if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
451
+ x
452
+ < (self.polygon[j][0] - self.polygon[i][0])
453
+ * (y - self.polygon[i][1])
454
+ / (self.polygon[j][1] - self.polygon[i][1])
455
+ + self.polygon[i][0]
456
+ ):
390
457
  inside = not inside
391
458
  j = i
392
-
459
+
393
460
  return inside
394
461
 
395
462
  def is_point_inside(self, x: float, y: float) -> bool:
396
463
  """
397
464
  Check if a point is inside this region using ray casting algorithm for polygons.
398
-
465
+
399
466
  Args:
400
467
  x: X coordinate of the point
401
468
  y: Y coordinate of the point
402
-
469
+
403
470
  Returns:
404
471
  bool: True if the point is inside the region
405
472
  """
406
473
  if not self.has_polygon:
407
474
  return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
408
-
475
+
409
476
  # Ray casting algorithm
410
477
  inside = False
411
478
  j = len(self.polygon) - 1
412
-
479
+
413
480
  for i in range(len(self.polygon)):
414
- if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and \
415
- (x < (self.polygon[j][0] - self.polygon[i][0]) * (y - self.polygon[i][1]) / \
416
- (self.polygon[j][1] - self.polygon[i][1]) + self.polygon[i][0]):
481
+ if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
482
+ x
483
+ < (self.polygon[j][0] - self.polygon[i][0])
484
+ * (y - self.polygon[i][1])
485
+ / (self.polygon[j][1] - self.polygon[i][1])
486
+ + self.polygon[i][0]
487
+ ):
417
488
  inside = not inside
418
489
  j = i
419
-
490
+
420
491
  return inside
421
492
 
422
- def _is_element_in_region(self, element: 'Element', use_boundary_tolerance=True) -> bool:
493
+ def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
423
494
  """
424
495
  Check if an element is within this region.
425
-
496
+
426
497
  Args:
427
498
  element: Element to check
428
499
  use_boundary_tolerance: Whether to apply a small tolerance for boundary elements
429
-
500
+
430
501
  Returns:
431
502
  True if the element is in the region, False otherwise
432
503
  """
433
504
  # If we have multi-page elements cached, check if the element is in the list
434
505
  if self._spans_pages and self._multi_page_elements is not None:
435
506
  return element in self._multi_page_elements
436
-
507
+
437
508
  # Check if element is on the same page
438
- if element.page != self._page:
509
+ if not hasattr(element, "page") or element.page != self._page:
439
510
  return False
440
-
511
+
441
512
  # Calculate element center
513
+ # Ensure element has necessary attributes
514
+ if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
515
+ return False # Cannot determine position
516
+
442
517
  element_center_x = (element.x0 + element.x1) / 2
443
518
  element_center_y = (element.top + element.bottom) / 2
444
-
445
- # If this is a boundary region with exclusions, apply strict boundary checking
446
- # This helps enforce boundary_inclusion behavior in get_sections
447
- if hasattr(self, 'start_element') or hasattr(self, 'end_element'):
448
- # Apply a small tolerance to avoid border cases
449
- # When an element is right at the border, we want to be more strict
450
- tolerance = 2.0 if use_boundary_tolerance else 0.0
451
-
452
- # Check if element center is strictly within the region (not just on border)
453
- if (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
454
- self.top + tolerance <= element_center_y <= self.bottom - tolerance):
455
- return True
456
-
457
- # For elements right at the boundary, be more conservative
458
- return False
459
-
460
- # If the element itself has a polygon, check if ANY corner is in this region
461
- if hasattr(element, 'has_polygon') and element.has_polygon:
462
- for point in element.polygon:
463
- if self.is_point_inside(point[0], point[1]):
464
- return True
465
- # If no point is inside, check if the center is inside
466
- return self.is_point_inside(element_center_x, element_center_y)
467
-
468
- # For regular elements, check if center is in the region
469
- # Add a small tolerance (1 pixel) to avoid including elements that are exactly on the boundary
470
- # This ensures consistent behavior with the below() and above() method fixes
471
- tolerance = 1.0 if use_boundary_tolerance else 0.0
472
-
473
- # Check if within region with the tolerance applied
474
- if self.has_polygon:
475
- return self.is_point_inside(element_center_x, element_center_y)
476
- else:
477
- # For rectangular regions, apply tolerance to all sides
478
- return (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
479
- self.top + tolerance <= element_center_y <= self.bottom - tolerance)
480
-
481
- def highlight(self,
482
- label: Optional[str] = None,
483
- color: Optional[Union[Tuple, str]] = None,
484
- use_color_cycling: bool = False,
485
- include_attrs: Optional[List[str]] = None,
486
- existing: str = 'append') -> 'Region':
519
+
520
+ # Check if center point is inside the region's geometry
521
+ return self.is_point_inside(element_center_x, element_center_y)
522
+
523
+ def highlight(
524
+ self,
525
+ label: Optional[str] = None,
526
+ color: Optional[Union[Tuple, str]] = None,
527
+ use_color_cycling: bool = False,
528
+ include_attrs: Optional[List[str]] = None,
529
+ existing: str = "append",
530
+ ) -> "Region":
487
531
  """
488
532
  Highlight this region on the page.
489
-
533
+
490
534
  Args:
491
535
  label: Optional label for the highlight
492
536
  color: Color tuple/string for the highlight, or None to use automatic color
493
537
  use_color_cycling: Force color cycling even with no label (default: False)
494
538
  include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
495
539
  existing: How to handle existing highlights ('append' or 'replace').
496
-
540
+
497
541
  Returns:
498
542
  Self for method chaining
499
543
  """
@@ -508,7 +552,7 @@ class Region(DirectionalMixin):
508
552
  "use_color_cycling": use_color_cycling,
509
553
  "element": self, # Pass the region itself so attributes can be accessed
510
554
  "include_attrs": include_attrs,
511
- "existing": existing
555
+ "existing": existing,
512
556
  }
513
557
 
514
558
  # Call the appropriate service method
@@ -520,59 +564,68 @@ class Region(DirectionalMixin):
520
564
  highlighter.add(**highlight_args)
521
565
 
522
566
  return self
523
-
524
- def to_image(self,
525
- scale: float = 2.0,
526
- resolution: float = 150,
527
- crop_only: bool = False,
528
- include_highlights: bool = True,
529
- **kwargs) -> 'Image.Image':
567
+
568
+ def to_image(
569
+ self,
570
+ scale: float = 2.0,
571
+ resolution: float = 150,
572
+ crop_only: bool = False,
573
+ include_highlights: bool = True,
574
+ **kwargs,
575
+ ) -> "Image.Image":
530
576
  """
531
577
  Generate an image of just this region.
532
-
578
+
533
579
  Args:
534
580
  resolution: Resolution in DPI for rendering (default: 150)
535
581
  crop_only: If True, only crop the region without highlighting its boundaries
536
582
  include_highlights: Whether to include existing highlights (default: True)
537
583
  **kwargs: Additional parameters for page.to_image()
538
-
584
+
539
585
  Returns:
540
586
  PIL Image of just this region
541
587
  """
542
588
  # First get the full page image with highlights if requested
543
- page_image = self._page.to_image(scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs)
544
-
589
+ page_image = self._page.to_image(
590
+ scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
591
+ )
592
+
545
593
  # Calculate the crop coordinates - apply resolution scaling factor
546
594
  # PDF coordinates are in points (1/72 inch), but image is scaled by resolution
547
- scale_factor = scale
548
-
595
+ scale_factor = resolution / 72.0 # Scale based on DPI
596
+
549
597
  # Apply scaling to the coordinates
550
598
  x0 = int(self.x0 * scale_factor)
551
599
  top = int(self.top * scale_factor)
552
600
  x1 = int(self.x1 * scale_factor)
553
601
  bottom = int(self.bottom * scale_factor)
554
-
602
+
555
603
  # Crop the image to just this region
556
604
  region_image = page_image.crop((x0, top, x1, bottom))
557
-
605
+
558
606
  # If not crop_only, add a border to highlight the region boundaries
559
607
  if not crop_only:
560
608
  from PIL import ImageDraw
561
-
609
+
562
610
  # Create a 1px border around the region
563
611
  draw = ImageDraw.Draw(region_image)
564
- draw.rectangle((0, 0, region_image.width-1, region_image.height-1),
565
- outline=(255, 0, 0), width=1)
566
-
612
+ draw.rectangle(
613
+ (0, 0, region_image.width - 1, region_image.height - 1),
614
+ outline=(255, 0, 0),
615
+ width=1,
616
+ )
617
+
567
618
  return region_image
568
-
569
- def show(self,
570
- scale: float = 2.0,
571
- labels: bool = True,
572
- legend_position: str = 'right',
573
- # Add a default color for standalone show
574
- color: Optional[Union[Tuple, str]] = "blue",
575
- label: Optional[str] = None) -> 'Image.Image':
619
+
620
+ def show(
621
+ self,
622
+ scale: float = 2.0,
623
+ labels: bool = True,
624
+ legend_position: str = "right",
625
+ # Add a default color for standalone show
626
+ color: Optional[Union[Tuple, str]] = "blue",
627
+ label: Optional[str] = None,
628
+ ) -> "Image.Image":
576
629
  """
577
630
  Show the page with just this region highlighted temporarily.
578
631
 
@@ -593,16 +646,18 @@ class Region(DirectionalMixin):
593
646
  service = self._page._highlighter
594
647
 
595
648
  # Determine the label if not provided
596
- display_label = label if label is not None else f"Region ({self.type})" if self.type else "Region"
649
+ display_label = (
650
+ label if label is not None else f"Region ({self.type})" if self.type else "Region"
651
+ )
597
652
 
598
653
  # Prepare temporary highlight data for just this region
599
654
  temp_highlight_data = {
600
655
  "page_index": self._page.index,
601
656
  "bbox": self.bbox,
602
657
  "polygon": self.polygon if self.has_polygon else None,
603
- "color": color, # Use provided or default color
658
+ "color": color, # Use provided or default color
604
659
  "label": display_label,
605
- "use_color_cycling": False # Explicitly false for single preview
660
+ "use_color_cycling": False, # Explicitly false for single preview
606
661
  }
607
662
 
608
663
  # Use render_preview to show only this highlight
@@ -611,452 +666,271 @@ class Region(DirectionalMixin):
611
666
  temporary_highlights=[temp_highlight_data],
612
667
  scale=scale,
613
668
  labels=labels,
614
- legend_position=legend_position
669
+ legend_position=legend_position,
615
670
  )
616
671
 
617
- def save(self,
618
- filename: str,
619
- scale: float = 2.0,
620
- labels: bool = True,
621
- legend_position: str = 'right') -> 'Region':
672
+ def save(
673
+ self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
674
+ ) -> "Region":
622
675
  """
623
676
  Save the page with this region highlighted to an image file.
624
-
677
+
625
678
  Args:
626
679
  filename: Path to save the image to
627
680
  scale: Scale factor for rendering
628
681
  labels: Whether to include a legend for labels
629
682
  legend_position: Position of the legend
630
-
683
+
631
684
  Returns:
632
685
  Self for method chaining
633
686
  """
634
687
  # Highlight this region if not already highlighted
635
688
  self.highlight()
636
-
689
+
637
690
  # Save the highlighted image
638
691
  self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
639
692
  return self
640
-
641
- def save_image(self,
642
- filename: str,
643
- resolution: float = 150,
644
- crop_only: bool = False,
645
- include_highlights: bool = True,
646
- **kwargs) -> 'Region':
693
+
694
+ def save_image(
695
+ self,
696
+ filename: str,
697
+ resolution: float = 150,
698
+ crop_only: bool = False,
699
+ include_highlights: bool = True,
700
+ **kwargs,
701
+ ) -> "Region":
647
702
  """
648
703
  Save an image of just this region to a file.
649
-
704
+
650
705
  Args:
651
706
  filename: Path to save the image to
652
707
  resolution: Resolution in DPI for rendering (default: 150)
653
708
  crop_only: If True, only crop the region without highlighting its boundaries
654
709
  include_highlights: Whether to include existing highlights (default: True)
655
710
  **kwargs: Additional parameters for page.to_image()
656
-
711
+
657
712
  Returns:
658
713
  Self for method chaining
659
714
  """
660
715
  # Get the region image
661
716
  image = self.to_image(
662
- resolution=resolution,
663
- crop_only=crop_only,
717
+ resolution=resolution,
718
+ crop_only=crop_only,
664
719
  include_highlights=include_highlights,
665
- **kwargs
720
+ **kwargs,
666
721
  )
667
-
722
+
668
723
  # Save the image
669
724
  image.save(filename)
670
725
  return self
671
-
672
- def get_elements(self, selector: Optional[str] = None, apply_exclusions=True, **kwargs) -> List['Element']:
726
+
727
+ def get_elements(
728
+ self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
729
+ ) -> List["Element"]:
673
730
  """
674
731
  Get all elements within this region.
675
-
732
+
676
733
  Args:
677
734
  selector: Optional selector to filter elements
678
735
  apply_exclusions: Whether to apply exclusion regions
679
736
  **kwargs: Additional parameters for element filtering
680
-
737
+
681
738
  Returns:
682
739
  List of elements in the region
683
740
  """
684
741
  # If we have multi-page elements, return those
685
742
  if self._spans_pages and self._multi_page_elements is not None:
743
+ # TODO: Apply selector to multi-page elements if needed
686
744
  return self._multi_page_elements
687
-
745
+
688
746
  # Otherwise, get elements from the page
689
747
  if selector:
690
- elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
748
+ # Find elements on the page matching the selector
749
+ page_elements = self.page.find_all(
750
+ selector, apply_exclusions=apply_exclusions, **kwargs
751
+ )
752
+ # Filter those elements to only include ones within this region
753
+ return [e for e in page_elements if self._is_element_in_region(e)]
691
754
  else:
692
- elements = self.page.get_elements(apply_exclusions=apply_exclusions)
693
-
694
- # Filter to elements in this region
695
- return [e for e in elements if self._is_element_in_region(e)]
696
-
697
- def extract_text(self, keep_blank_chars=True, apply_exclusions=True, ocr=None, preserve_whitespace=None, debug=False, **kwargs) -> str:
755
+ # Get all elements from the page
756
+ page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
757
+ # Filter to elements in this region
758
+ return [e for e in page_elements if self._is_element_in_region(e)]
759
+
760
+ def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
698
761
  """
699
- Extract text from this region using pdfplumber's native functionality.
700
-
701
- For regions created by Docling, this will first try to use:
702
- 1. Associated text elements from the PDF (if available)
703
- 2. Direct text content from Docling (if available)
704
- 3. Fall back to standard pdfplumber extraction
705
-
762
+ Extract text from this region, respecting page exclusions and using pdfplumber's
763
+ layout engine (chars_to_textmap).
764
+
706
765
  Args:
707
- keep_blank_chars: Whether to keep blank characters (legacy parameter)
708
- apply_exclusions: Whether to apply exclusion regions
709
- ocr: OCR configuration. If None, uses PDF settings
710
- preserve_whitespace: Synonym for keep_blank_chars (for compatibility with page.extract_text)
711
- debug: Enable verbose debugging for exclusion handling
712
- **kwargs: Additional parameters for text extraction
713
-
766
+ apply_exclusions: Whether to apply exclusion regions defined on the parent page.
767
+ debug: Enable verbose debugging output for filtering steps.
768
+ **kwargs: Additional layout parameters passed directly to pdfplumber's
769
+ `chars_to_textmap` function (e.g., layout, x_density, y_density).
770
+ See Page.extract_text docstring for more.
771
+
714
772
  Returns:
715
- Extracted text as string
773
+ Extracted text as string, potentially with layout-based spacing.
716
774
  """
717
- import logging
718
- logger = logging.getLogger("natural_pdf.elements.region")
719
-
720
- # Check for Docling model or if we have direct text content
721
- if self.model == 'docling' or hasattr(self, 'text_content'):
722
- # First priority: check if we have associated native text elements
723
- if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
724
- source_count = len(self.associated_text_elements)
725
- logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
726
- # Sort elements in reading order
727
- sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
728
- # Extract and join their text
729
- text_result = " ".join(elem.text for elem in sorted_elements)
730
- return text_result
731
-
732
- # Second priority: use direct text content from Docling
733
- elif self.text_content:
734
- logger.info(f"Region {self.region_type}: Using Docling OCR text content")
735
- return self.text_content
736
-
737
- logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
738
-
739
- # Handle preserve_whitespace parameter for consistency with Page.extract_text
740
- if preserve_whitespace is not None:
741
- keep_blank_chars = preserve_whitespace
742
-
743
- # If we span multiple pages, use the original implementation
744
- if self._spans_pages and self._multi_page_elements is not None:
745
- # Sort elements in reading order - only include text-like elements
746
- text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
747
-
748
- # Sort in reading order (by page, then top-to-bottom, left-to-right)
749
- sorted_elements = sorted(text_elements, key=lambda e: (e.page.index, e.top, e.x0))
750
-
751
- # Extract text directly from elements to avoid recursion
752
- texts = []
753
- for element in sorted_elements:
754
- if hasattr(element, 'text'):
755
- texts.append(element.text)
756
-
757
- text_result = " ".join(texts)
758
- return text_result
759
-
760
- # Check if we have exclusions to apply
775
+ # Allow 'debug_exclusions' for backward compatibility
776
+ debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
777
+ logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
778
+
779
+ # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
780
+ # For now, let's bypass this and always use the standard extraction flow
781
+ # based on contained elements to ensure consistency.
782
+ # if self.model == 'docling' or hasattr(self, 'text_content'): ...
783
+
784
+ # 1. Get Word Elements potentially within this region (initial broad phase)
785
+ # Optimization: Could use spatial query if page elements were indexed
786
+ page_words = self.page.words # Get all words from the page
787
+
788
+ # 2. Gather all character dicts from words potentially in region
789
+ # We filter precisely in filter_chars_spatially
790
+ all_char_dicts = []
791
+ for word in page_words:
792
+ # Quick bbox check to avoid processing words clearly outside
793
+ if get_bbox_overlap(self.bbox, word.bbox) is not None:
794
+ all_char_dicts.extend(getattr(word, "_char_dicts", []))
795
+
796
+ if not all_char_dicts:
797
+ logger.debug(f"Region {self.bbox}: No character dicts found overlapping region bbox.")
798
+ return ""
799
+
800
+ # 3. Get Relevant Exclusions (overlapping this region)
801
+ apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
761
802
  exclusion_regions = []
762
- if apply_exclusions and self._page._exclusions:
763
- exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
764
-
765
- if debug:
766
- logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
767
-
768
- # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
769
- # If not, ignore exclusions entirely
770
- if exclusion_regions:
771
- has_intersection = False
772
- for i, exclusion in enumerate(exclusion_regions):
773
- # Use a simple bbox overlap check
774
- overlap = (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
775
- self.top < exclusion.bottom and self.bottom > exclusion.top)
776
-
777
- if overlap:
778
- has_intersection = True
779
- if debug:
780
- logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
781
- break
782
-
783
- # If no intersection, process without exclusions
784
- if not has_intersection:
785
- if debug:
786
- logger.debug(f" No intersection with any exclusion, ignoring exclusions")
787
- apply_exclusions = False
788
- exclusion_regions = []
789
-
790
- # IMPROVEMENT 2: If rectangular region + full-width exclusions (headers/footers),
791
- # we can use the simpler cropping approach
792
- # Only use crop for simple cases
793
- can_use_crop = not self.has_polygon
794
- result = "" # Default empty result
795
- if can_use_crop and apply_exclusions and exclusion_regions:
796
- # We'll keep track of exclusions that are full-width horizontal bands (headers/footers)
797
- # and those that are not
798
- footer_header_exclusions = []
799
- other_exclusions = []
800
-
801
- for i, exclusion in enumerate(exclusion_regions):
802
- # Check if exclusion spans the full width of the page
803
- # and is either at the top or bottom
804
- full_width = (abs(exclusion.x0) < 5 and
805
- abs(exclusion.x1 - self.page.width) < 5)
806
-
807
- if debug:
808
- logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
809
-
810
- if full_width:
811
- footer_header_exclusions.append(exclusion)
812
- else:
813
- other_exclusions.append(exclusion)
814
-
815
- # If we have only header/footer exclusions, we can use the cropping approach
816
- all_are_bands = len(other_exclusions) == 0 and len(footer_header_exclusions) > 0
817
-
818
- if all_are_bands:
819
- # Find the actual content area after excluding header/footer
820
- top_bound = self.top
821
- bottom_bound = self.bottom
822
-
823
- if debug:
824
- logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
825
-
826
- # Process only header/footer exclusions for cropping
827
- for exclusion in footer_header_exclusions:
828
- # If exclusion is at the top of our region
829
- if exclusion.bottom > self.top and exclusion.top <= self.top:
830
- # Move top bound to exclude the header
831
- top_bound = max(top_bound, exclusion.bottom)
832
- if debug:
833
- logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
834
-
835
- # If exclusion is at the bottom of our region
836
- if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
837
- # Move bottom bound to exclude the footer
838
- bottom_bound = min(bottom_bound, exclusion.top)
839
- if debug:
840
- logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
841
-
842
-
843
- if debug:
844
- logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
845
-
846
- # If we still have a valid region after exclusions
847
- if top_bound < bottom_bound:
848
- # Use direct crop with adjusted bounds
849
- crop_bbox = (self.x0, top_bound, self.x1, bottom_bound)
850
- cropped = self.page._page.crop(crop_bbox)
851
- result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
852
-
853
- if debug:
854
- logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
855
-
856
- # Skip the complex filtering approach
857
- return result
858
- else:
859
- # This would only happen if the region is entirely inside an exclusion zone
860
- # or if both top and bottom of the region are excluded leaving no valid area
861
- logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
862
- return ""
863
- # We have exclusions, but not all are headers/footers,
864
- # or we have a non-rectangular region
865
- else:
866
- if debug:
867
- logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
868
-
869
- # Don't use crop for mixed exclusion types
870
- can_use_crop = False
871
-
872
- # If we got a result from header/footer cropping, return it
873
- if result:
874
- return result
875
-
876
- # For single-page regions without exclusions, or when exclusions don't apply, use direct cropping
877
- if can_use_crop and not apply_exclusions:
878
- # Simple case: use direct crop
879
- crop_bbox = self.bbox
880
- cropped = self.page._page.crop(crop_bbox)
881
- result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
882
- return result
883
-
884
- # For all other cases (complex exclusions, polygons), we use element filtering
885
- if debug:
886
- logger.debug(f"Using element filtering approach for region {self.bbox}")
887
-
888
- # Get only word elements in this region first (instead of ALL elements)
889
- # This prevents duplication from joining both char and word text
890
- all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
891
-
892
- if apply_exclusions and exclusion_regions:
893
- if debug:
894
- logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
895
-
896
- # Filter out elements in exclusion zones
897
- filtered_elements = []
898
- for elem in all_elements:
899
- in_exclusion = False
900
- # For each element, check if it's in any exclusion zone
901
- element_center_x = (elem.x0 + elem.x1) / 2
902
- element_center_y = (elem.top + elem.bottom) / 2
903
-
904
- for exclusion in exclusion_regions:
905
- if (exclusion.x0 <= element_center_x <= exclusion.x1 and
906
- exclusion.top <= element_center_y <= exclusion.bottom):
907
- in_exclusion = True
908
- break
909
-
910
- if not in_exclusion:
911
- filtered_elements.append(elem)
912
- else:
913
- # No exclusions, use all elements
914
- filtered_elements = all_elements
915
-
916
- # Now extract text from the filtered elements
917
- if filtered_elements:
918
- from natural_pdf.elements.collections import ElementCollection
919
- collection = ElementCollection(filtered_elements)
920
- # Sort in reading order
921
- collection = collection.sort(key=lambda e: (e.top, e.x0))
922
- # Extract text
923
- result = " ".join(e.text for e in collection if hasattr(e, 'text'))
924
-
925
- if debug:
926
- logger.debug(f"Got {len(result)} characters from element-based extraction")
927
-
928
- # Return the result
929
- return result
930
- else:
803
+ if apply_exclusions_flag and self._page._exclusions:
804
+ all_page_exclusions = self._page._get_exclusion_regions(
805
+ include_callable=True, debug=debug
806
+ )
807
+ overlapping_exclusions = []
808
+ for excl in all_page_exclusions:
809
+ if get_bbox_overlap(self.bbox, excl.bbox) is not None:
810
+ overlapping_exclusions.append(excl)
811
+ exclusion_regions = overlapping_exclusions
931
812
  if debug:
932
- logger.debug(f"No elements found after filtering")
933
- return ""
934
-
935
- # Handle OCR if needed
936
- use_ocr = ocr is True or (isinstance(ocr, dict) and ocr.get('enabled', False))
937
- auto_ocr = ocr is None and self.page._parent._ocr_config.get('enabled') == 'auto'
938
-
939
- # Run OCR if explicitly requested or if in auto mode and no text found
940
- if use_ocr or (auto_ocr and not result.strip()):
941
- ocr_config = self.page._get_ocr_config(ocr or {}) if use_ocr else self.page._get_ocr_config({'enabled': 'auto'})
942
- ocr_elements = self.apply_ocr(**ocr_config)
943
-
944
- if ocr_elements:
945
- # Filter OCR elements by exclusions if needed
946
- if apply_exclusions and exclusion_regions:
947
- filtered_ocr = []
948
- for element in ocr_elements:
949
- exclude = False
950
- for region in exclusion_regions:
951
- if region._is_element_in_region(element):
952
- exclude = True
953
- break
954
- if not exclude:
955
- filtered_ocr.append(element)
956
- else:
957
- filtered_ocr = ocr_elements
958
-
959
- # Extract text from OCR elements
960
- from natural_pdf.elements.collections import ElementCollection
961
- ocr_collection = ElementCollection(filtered_ocr)
962
- ocr_text = ocr_collection.extract_text(preserve_whitespace=keep_blank_chars, **kwargs)
963
-
964
- # Use OCR text if it's not empty
965
- if ocr_text.strip():
966
- return ocr_text
967
-
813
+ logger.debug(
814
+ f"Region {self.bbox}: Applying {len(exclusion_regions)} overlapping exclusions."
815
+ )
816
+ elif debug:
817
+ logger.debug(f"Region {self.bbox}: Not applying exclusions.")
818
+
819
+ # 4. Spatially Filter Characters using Utility
820
+ # Pass self as the target_region for precise polygon checks etc.
821
+ filtered_chars = filter_chars_spatially(
822
+ char_dicts=all_char_dicts,
823
+ exclusion_regions=exclusion_regions,
824
+ target_region=self, # Pass self!
825
+ debug=debug,
826
+ )
827
+
828
+ # 5. Generate Text Layout using Utility
829
+ result = generate_text_layout(
830
+ char_dicts=filtered_chars,
831
+ layout_context_bbox=self.bbox, # Use region's bbox for context
832
+ user_kwargs=kwargs,
833
+ )
834
+
835
+ logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
968
836
  return result
969
-
970
- def extract_table(self, method: str = None, table_settings: dict = None,
971
- use_ocr: bool = False, ocr_config: dict = None) -> List[List[str]]:
837
+
838
+ def extract_table(
839
+ self,
840
+ method: str = None,
841
+ table_settings: dict = None,
842
+ use_ocr: bool = False,
843
+ ocr_config: dict = None,
844
+ ) -> List[List[str]]:
972
845
  """
973
846
  Extract a table from this region.
974
-
847
+
975
848
  Args:
976
849
  method: Method to use for extraction ('tatr', 'plumber', or None for auto-detection)
977
850
  table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
978
851
  use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
979
852
  ocr_config: OCR configuration parameters
980
-
853
+
981
854
  Returns:
982
855
  Table data as a list of rows, where each row is a list of cell values
983
856
  """
984
857
  # Default settings if none provided
985
858
  if table_settings is None:
986
859
  table_settings = {}
987
-
860
+
988
861
  # Auto-detect method if not specified
989
862
  if method is None:
990
863
  # If this is a TATR-detected region, use TATR method
991
- if hasattr(self, 'model') and self.model == 'tatr' and self.region_type == 'table':
992
- method = 'tatr'
864
+ if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
865
+ method = "tatr"
993
866
  else:
994
- method = 'plumber'
995
-
867
+ method = "plumber"
868
+
996
869
  # Use the selected method
997
- if method == 'tatr':
870
+ if method == "tatr":
998
871
  return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
999
872
  else: # Default to pdfplumber
1000
873
  return self._extract_table_plumber(table_settings)
1001
-
874
+
1002
875
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1003
876
  """
1004
877
  Extract table using pdfplumber's table extraction.
1005
-
878
+
1006
879
  Args:
1007
880
  table_settings: Settings for pdfplumber table extraction
1008
-
881
+
1009
882
  Returns:
1010
883
  Table data as a list of rows, where each row is a list of cell values
1011
884
  """
1012
885
  # Create a crop of the page for this region
1013
886
  cropped = self.page._page.crop(self.bbox)
1014
-
887
+
1015
888
  # Extract table from the cropped area
1016
889
  tables = cropped.extract_tables(table_settings)
1017
-
890
+
1018
891
  # Return the first table or an empty list if none found
1019
892
  if tables:
1020
893
  return tables[0]
1021
894
  return []
1022
-
895
+
1023
896
  def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
1024
897
  """
1025
898
  Extract table using TATR structure detection.
1026
-
899
+
1027
900
  Args:
1028
901
  use_ocr: Whether to apply OCR to each cell for better text extraction
1029
902
  ocr_config: Optional OCR configuration parameters
1030
-
903
+
1031
904
  Returns:
1032
905
  Table data as a list of rows, where each row is a list of cell values
1033
906
  """
1034
907
  # Find all rows and headers in this table
1035
- rows = self.page.find_all(f'region[type=table-row][model=tatr]')
1036
- headers = self.page.find_all(f'region[type=table-column-header][model=tatr]')
1037
- columns = self.page.find_all(f'region[type=table-column][model=tatr]')
1038
-
908
+ rows = self.page.find_all(f"region[type=table-row][model=tatr]")
909
+ headers = self.page.find_all(f"region[type=table-column-header][model=tatr]")
910
+ columns = self.page.find_all(f"region[type=table-column][model=tatr]")
911
+
1039
912
  # Filter to only include rows/headers/columns that overlap with this table region
1040
913
  def is_in_table(region):
1041
914
  # Check for overlap - simplifying to center point for now
1042
915
  region_center_x = (region.x0 + region.x1) / 2
1043
916
  region_center_y = (region.top + region.bottom) / 2
1044
- return (self.x0 <= region_center_x <= self.x1 and
1045
- self.top <= region_center_y <= self.bottom)
1046
-
917
+ return (
918
+ self.x0 <= region_center_x <= self.x1 and self.top <= region_center_y <= self.bottom
919
+ )
920
+
1047
921
  rows = [row for row in rows if is_in_table(row)]
1048
922
  headers = [header for header in headers if is_in_table(header)]
1049
923
  columns = [column for column in columns if is_in_table(column)]
1050
-
924
+
1051
925
  # Sort rows by vertical position (top to bottom)
1052
926
  rows.sort(key=lambda r: r.top)
1053
-
927
+
1054
928
  # Sort columns by horizontal position (left to right)
1055
929
  columns.sort(key=lambda c: c.x0)
1056
-
930
+
1057
931
  # Create table data structure
1058
932
  table_data = []
1059
-
933
+
1060
934
  # Prepare OCR config if needed
1061
935
  if use_ocr:
1062
936
  # Default OCR config focuses on small text with low confidence
@@ -1065,16 +939,20 @@ class Region(DirectionalMixin):
1065
939
  "min_confidence": 0.1, # Lower than default to catch more text
1066
940
  "detection_params": {
1067
941
  "text_threshold": 0.1, # Lower threshold for low-contrast text
1068
- "link_threshold": 0.1 # Lower threshold for connecting text components
1069
- }
942
+ "link_threshold": 0.1, # Lower threshold for connecting text components
943
+ },
1070
944
  }
1071
-
945
+
1072
946
  # Merge with provided config if any
1073
947
  if ocr_config:
1074
948
  if isinstance(ocr_config, dict):
1075
949
  # Update default config with provided values
1076
950
  for key, value in ocr_config.items():
1077
- if isinstance(value, dict) and key in default_ocr_config and isinstance(default_ocr_config[key], dict):
951
+ if (
952
+ isinstance(value, dict)
953
+ and key in default_ocr_config
954
+ and isinstance(default_ocr_config[key], dict)
955
+ ):
1078
956
  # Merge nested dicts
1079
957
  default_ocr_config[key].update(value)
1080
958
  else:
@@ -1083,10 +961,10 @@ class Region(DirectionalMixin):
1083
961
  else:
1084
962
  # Not a dict, use as is
1085
963
  default_ocr_config = ocr_config
1086
-
964
+
1087
965
  # Use the merged config
1088
966
  ocr_config = default_ocr_config
1089
-
967
+
1090
968
  # Add header row if headers were detected
1091
969
  if headers:
1092
970
  header_texts = []
@@ -1099,30 +977,28 @@ class Region(DirectionalMixin):
1099
977
  if ocr_text:
1100
978
  header_texts.append(ocr_text)
1101
979
  continue
1102
-
980
+
1103
981
  # Fallback to normal extraction
1104
982
  header_texts.append(header.extract_text().strip())
1105
983
  table_data.append(header_texts)
1106
-
984
+
1107
985
  # Process rows
1108
986
  for row in rows:
1109
987
  row_cells = []
1110
-
988
+
1111
989
  # If we have columns, use them to extract cells
1112
990
  if columns:
1113
991
  for column in columns:
1114
992
  # Create a cell region at the intersection of row and column
1115
- cell_bbox = (
1116
- column.x0,
1117
- row.top,
1118
- column.x1,
1119
- row.bottom
1120
- )
1121
-
993
+ cell_bbox = (column.x0, row.top, column.x1, row.bottom)
994
+
1122
995
  # Create a region for this cell
1123
- from natural_pdf.elements.region import Region # Import here to avoid circular imports
996
+ from natural_pdf.elements.region import ( # Import here to avoid circular imports
997
+ Region,
998
+ )
999
+
1124
1000
  cell_region = Region(self.page, cell_bbox)
1125
-
1001
+
1126
1002
  # Extract text from the cell
1127
1003
  if use_ocr:
1128
1004
  # Apply OCR to the cell
@@ -1133,7 +1009,7 @@ class Region(DirectionalMixin):
1133
1009
  if ocr_text:
1134
1010
  row_cells.append(ocr_text)
1135
1011
  continue
1136
-
1012
+
1137
1013
  # Fallback to normal extraction
1138
1014
  cell_text = cell_region.extract_text().strip()
1139
1015
  row_cells.append(cell_text)
@@ -1147,182 +1023,212 @@ class Region(DirectionalMixin):
1147
1023
  if ocr_text:
1148
1024
  row_cells.append(ocr_text)
1149
1025
  continue
1150
-
1026
+
1151
1027
  # Fallback to normal extraction
1152
1028
  row_cells.append(row.extract_text().strip())
1153
-
1029
+
1154
1030
  table_data.append(row_cells)
1155
-
1031
+
1156
1032
  return table_data
1157
-
1158
- def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
1033
+
1034
+ def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
1159
1035
  """
1160
1036
  Find the first element in this region matching the selector.
1161
-
1037
+
1162
1038
  Args:
1163
1039
  selector: CSS-like selector string
1164
1040
  apply_exclusions: Whether to apply exclusion regions
1165
1041
  **kwargs: Additional parameters for element filtering
1166
-
1042
+
1167
1043
  Returns:
1168
1044
  First matching element or None
1169
1045
  """
1170
1046
  elements = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1171
- return elements[0] if elements else None
1172
-
1173
- def _find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
1047
+ return elements.first if elements else None # Use .first property
1048
+
1049
+ def find_all(
1050
+ self, selector: str, apply_exclusions=True, **kwargs
1051
+ ) -> "ElementCollection": # Changed from _find_all
1174
1052
  """
1175
1053
  Find all elements in this region matching the selector.
1176
-
1054
+
1177
1055
  Args:
1178
1056
  selector: CSS-like selector string
1179
1057
  apply_exclusions: Whether to apply exclusion regions
1180
1058
  **kwargs: Additional parameters for element filtering
1181
-
1059
+
1182
1060
  Returns:
1183
1061
  ElementCollection with matching elements
1184
1062
  """
1185
1063
  from natural_pdf.elements.collections import ElementCollection
1186
1064
 
1187
1065
  # If we span multiple pages, filter our elements
1066
+ # TODO: Revisit multi-page region logic
1188
1067
  if self._spans_pages and self._multi_page_elements is not None:
1189
- # Parse the selector
1190
- from natural_pdf.selectors.parser import parse_selector
1191
- selector_obj = parse_selector(selector)
1192
-
1193
- # Rather than using matches_selector, let each page's find_all handle the matching
1194
- # since that method is already properly implemented
1195
- all_matching_elements = []
1196
- page_ranges = {}
1197
-
1198
- # Group elements by page
1199
- for element in self._multi_page_elements:
1200
- if element.page not in page_ranges:
1201
- page_ranges[element.page] = []
1202
- page_ranges[element.page].append(element)
1203
-
1204
- # For each page, use its find_all to match elements, then filter to our collection
1205
- for page, page_elements in page_ranges.items():
1206
- # Get all matching elements from the page
1207
- page_matches = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1208
-
1209
- # Filter to just the elements that are in our collection
1210
- for element in page_matches:
1211
- if element in page_elements:
1212
- all_matching_elements.append(element)
1213
-
1214
- return ElementCollection(all_matching_elements)
1068
+ logger.warning("find_all on multi-page regions is not fully implemented.")
1069
+ # Temporary: Apply filter directly to cached elements
1070
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1071
+
1072
+ try:
1073
+ selector_obj = parse_selector(selector)
1074
+ filter_func = selector_to_filter_func(selector_obj, **kwargs)
1075
+ matching = [el for el in self._multi_page_elements if filter_func(el)]
1076
+ return ElementCollection(matching)
1077
+ except Exception as e:
1078
+ logger.error(f"Error applying selector to multi-page region elements: {e}")
1079
+ return ElementCollection([])
1215
1080
 
1216
1081
  # Otherwise, get elements from the page and filter by selector and region
1217
1082
  page_elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1083
+ # Use the precise _is_element_in_region check
1218
1084
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1219
1085
  return ElementCollection(filtered_elements)
1220
-
1221
- def apply_ocr(self, **ocr_params) -> List['TextElement']:
1086
+
1087
+ def apply_ocr(self, **ocr_params) -> "Region":
1222
1088
  """
1223
1089
  Apply OCR to this region and return the created text elements.
1224
-
1090
+
1225
1091
  Args:
1226
- **ocr_params: OCR parameters to override defaults
1227
-
1092
+ **ocr_params: Keyword arguments passed to the OCR Manager.
1093
+ Common parameters like `engine`, `languages`, `min_confidence`,
1094
+ `device`, and `resolution` (for image rendering) should be
1095
+ provided here. **The `languages` list must contain codes
1096
+ understood by the specific engine selected.** No mapping
1097
+ is performed. Engine-specific settings can be passed in
1098
+ an `options` object (e.g., `options=EasyOCROptions(...)`).
1099
+
1228
1100
  Returns:
1229
- List of created text elements
1101
+ List of created TextElement objects representing OCR words/lines.
1230
1102
  """
1231
- from natural_pdf.ocr import OCRManager
1232
-
1233
- # Get OCR configuration but suppress verbose output
1234
- if isinstance(ocr_params, dict):
1235
- ocr_params["verbose"] = False
1236
- else:
1237
- ocr_params = {"enabled": True, "verbose": False}
1238
-
1239
- ocr_config = self.page._get_ocr_config(ocr_params)
1240
-
1241
- # Skip if OCR is disabled
1242
- if not ocr_config.get('enabled'):
1103
+ # Ensure OCRManager is available
1104
+ if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1105
+ logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
1243
1106
  return []
1244
-
1245
- # Render the page
1246
- page_image = self.page.to_image()
1247
-
1248
- # Crop to this region
1249
- region_image = page_image.crop((self.x0, self.top, self.x1, self.bottom))
1250
-
1251
- # Run OCR on this region
1252
- ocr_mgr = OCRManager.get_instance()
1253
- results = ocr_mgr.recognize_region(region_image, ocr_config)
1254
-
1255
- # Adjust coordinates to be relative to the page
1256
- for result in results:
1257
- # Calculate bbox in page coordinates
1258
- result['bbox'] = (
1259
- result['bbox'][0] + self.x0,
1260
- result['bbox'][1] + self.top,
1261
- result['bbox'][2] + self.x0,
1262
- result['bbox'][3] + self.top
1107
+ ocr_mgr = self.page._parent._ocr_manager
1108
+
1109
+ # Determine rendering resolution from parameters
1110
+ final_resolution = ocr_params.get("resolution")
1111
+ if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
1112
+ final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
1113
+ elif final_resolution is None:
1114
+ final_resolution = 150
1115
+ logger.debug(
1116
+ f"Region {self.bbox}: Applying OCR with resolution {final_resolution} DPI and params: {ocr_params}"
1117
+ )
1118
+
1119
+ # Render the page region to an image using the determined resolution
1120
+ try:
1121
+ region_image = self.to_image(
1122
+ resolution=final_resolution, include_highlights=False, crop_only=True
1263
1123
  )
1264
-
1265
- # Create text elements with adjusted coordinates
1266
- elements = []
1124
+ if not region_image:
1125
+ logger.error("Failed to render region to image for OCR.")
1126
+ return []
1127
+ logger.debug(f"Region rendered to image size: {region_image.size}")
1128
+ except Exception as e:
1129
+ logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1130
+ return []
1131
+
1132
+ # Prepare args for the OCR Manager
1133
+ manager_args = {
1134
+ "images": region_image,
1135
+ "engine": ocr_params.get("engine"),
1136
+ "languages": ocr_params.get("languages"),
1137
+ "min_confidence": ocr_params.get("min_confidence"),
1138
+ "device": ocr_params.get("device"),
1139
+ "options": ocr_params.get("options"),
1140
+ "detect_only": ocr_params.get("detect_only"),
1141
+ }
1142
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
1143
+
1144
+ # Run OCR on this region's image using the manager
1145
+ try:
1146
+ results = ocr_mgr.apply_ocr(**manager_args)
1147
+ if not isinstance(results, list):
1148
+ logger.error(
1149
+ f"OCRManager returned unexpected type for single region image: {type(results)}"
1150
+ )
1151
+ return []
1152
+ logger.debug(f"Region OCR processing returned {len(results)} results.")
1153
+ except Exception as e:
1154
+ logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1155
+ return []
1156
+
1157
+ # Convert results to TextElements
1158
+ scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
1159
+ scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
1160
+ logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
1161
+ created_elements = []
1267
1162
  for result in results:
1268
- # Only include results that are fully within the region
1269
- if (result['bbox'][0] >= self.x0 and
1270
- result['bbox'][1] >= self.top and
1271
- result['bbox'][2] <= self.x1 and
1272
- result['bbox'][3] <= self.bottom):
1273
- # Create a TextElement object with the appropriate fields
1274
- from natural_pdf.elements.text import TextElement
1163
+ try:
1164
+ img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
1165
+ pdf_height = (img_bottom - img_top) * scale_y
1166
+ page_x0 = self.x0 + (img_x0 * scale_x)
1167
+ page_top = self.top + (img_top * scale_y)
1168
+ page_x1 = self.x0 + (img_x1 * scale_x)
1169
+ page_bottom = self.top + (img_bottom * scale_y)
1275
1170
  element_data = {
1276
- 'text': result['text'],
1277
- 'x0': result['bbox'][0],
1278
- 'top': result['bbox'][1],
1279
- 'x1': result['bbox'][2],
1280
- 'bottom': result['bbox'][3],
1281
- 'width': result['bbox'][2] - result['bbox'][0],
1282
- 'height': result['bbox'][3] - result['bbox'][1],
1283
- 'object_type': 'text',
1284
- 'source': 'ocr',
1285
- 'confidence': result['confidence'],
1286
- # Add default font information to work with existing expectations
1287
- 'fontname': 'OCR-detected',
1288
- 'size': 10.0,
1289
- 'page_number': self.page.number
1171
+ "text": result["text"],
1172
+ "x0": page_x0,
1173
+ "top": page_top,
1174
+ "x1": page_x1,
1175
+ "bottom": page_bottom,
1176
+ "width": page_x1 - page_x0,
1177
+ "height": page_bottom - page_top,
1178
+ "object_type": "word",
1179
+ "source": "ocr",
1180
+ "confidence": float(result.get("confidence", 0.0)),
1181
+ "fontname": "OCR",
1182
+ "size": round(pdf_height) if pdf_height > 0 else 10.0,
1183
+ "page_number": self.page.number,
1184
+ "bold": False,
1185
+ "italic": False,
1186
+ "upright": True,
1187
+ "doctop": page_top + self.page._page.initial_doctop,
1290
1188
  }
1291
-
1189
+ ocr_char_dict = element_data.copy()
1190
+ ocr_char_dict["object_type"] = "char"
1191
+ ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1192
+ element_data["_char_dicts"] = [ocr_char_dict]
1193
+ from natural_pdf.elements.text import TextElement
1292
1194
  elem = TextElement(element_data, self.page)
1293
- elements.append(elem)
1294
-
1295
- # Add to page's elements
1296
- if hasattr(self.page, '_elements') and self.page._elements is not None:
1297
- # Add to words list to make it accessible via standard API
1298
- if 'words' in self.page._elements:
1299
- self.page._elements['words'].append(elem)
1300
- else:
1301
- self.page._elements['words'] = [elem]
1302
-
1303
- return elements
1304
-
1305
- def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
1195
+ created_elements.append(elem)
1196
+ self.page._element_mgr.add_element(elem, element_type="words")
1197
+ self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
1198
+ except Exception as e:
1199
+ logger.error(
1200
+ f"Failed to convert region OCR result to element: {result}. Error: {e}",
1201
+ exc_info=True,
1202
+ )
1203
+ logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
1204
+ return self
1205
+
1206
+ def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
1306
1207
  """
1307
1208
  Get a section between two elements within this region.
1308
-
1209
+
1309
1210
  Args:
1310
1211
  start_element: Element marking the start of the section
1311
1212
  end_element: Element marking the end of the section
1312
1213
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
1313
-
1214
+
1314
1215
  Returns:
1315
1216
  Region representing the section
1316
1217
  """
1218
+ # Get elements only within this region first
1317
1219
  elements = self.get_elements()
1318
-
1319
- # If no elements, return self
1220
+
1221
+ # If no elements, return self or empty region?
1320
1222
  if not elements:
1321
- return self
1322
-
1223
+ logger.warning(
1224
+ f"get_section_between called on region {self.bbox} with no contained elements."
1225
+ )
1226
+ # Return an empty region at the start of the parent region
1227
+ return Region(self.page, (self.x0, self.top, self.x0, self.top))
1228
+
1323
1229
  # Sort elements in reading order
1324
1230
  elements.sort(key=lambda e: (e.top, e.x0))
1325
-
1231
+
1326
1232
  # Find start index
1327
1233
  start_idx = 0
1328
1234
  if start_element:
@@ -1330,8 +1236,12 @@ class Region(DirectionalMixin):
1330
1236
  start_idx = elements.index(start_element)
1331
1237
  except ValueError:
1332
1238
  # Start element not in region, use first element
1333
- pass
1334
-
1239
+ logger.debug("Start element not found in region, using first element.")
1240
+ start_element = elements[0] # Use the actual first element
1241
+ start_idx = 0
1242
+ else:
1243
+ start_element = elements[0] # Default start is first element
1244
+
1335
1245
  # Find end index
1336
1246
  end_idx = len(elements) - 1
1337
1247
  if end_element:
@@ -1339,218 +1249,231 @@ class Region(DirectionalMixin):
1339
1249
  end_idx = elements.index(end_element)
1340
1250
  except ValueError:
1341
1251
  # End element not in region, use last element
1342
- pass
1343
-
1252
+ logger.debug("End element not found in region, using last element.")
1253
+ end_element = elements[-1] # Use the actual last element
1254
+ end_idx = len(elements) - 1
1255
+ else:
1256
+ end_element = elements[-1] # Default end is last element
1257
+
1344
1258
  # Adjust indexes based on boundary inclusion
1345
- if boundary_inclusion == 'none':
1259
+ start_element_for_bbox = start_element
1260
+ end_element_for_bbox = end_element
1261
+
1262
+ if boundary_inclusion == "none":
1346
1263
  start_idx += 1
1347
1264
  end_idx -= 1
1348
- elif boundary_inclusion == 'start':
1265
+ start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
1266
+ end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
1267
+ elif boundary_inclusion == "start":
1349
1268
  end_idx -= 1
1350
- elif boundary_inclusion == 'end':
1269
+ end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
1270
+ elif boundary_inclusion == "end":
1351
1271
  start_idx += 1
1352
-
1272
+ start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
1273
+
1353
1274
  # Ensure valid indexes
1354
1275
  start_idx = max(0, start_idx)
1355
1276
  end_idx = min(len(elements) - 1, end_idx)
1356
-
1277
+
1357
1278
  # If no valid elements in range, return empty region
1358
- if start_idx > end_idx:
1359
- return Region(self.page, (0, 0, 0, 0))
1360
-
1361
- # Get elements in range
1362
- section_elements = elements[start_idx:end_idx+1]
1363
-
1364
- # Create bounding box around elements
1279
+ if start_idx > end_idx or start_element_for_bbox is None or end_element_for_bbox is None:
1280
+ logger.debug("No valid elements in range for get_section_between.")
1281
+ # Return an empty region positioned at the start element boundary
1282
+ anchor = start_element if start_element else self
1283
+ return Region(self.page, (anchor.x0, anchor.top, anchor.x0, anchor.top))
1284
+
1285
+ # Get elements in range based on adjusted indices
1286
+ section_elements = elements[start_idx : end_idx + 1]
1287
+
1288
+ # Create bounding box around the ELEMENTS included based on indices
1365
1289
  x0 = min(e.x0 for e in section_elements)
1366
1290
  top = min(e.top for e in section_elements)
1367
1291
  x1 = max(e.x1 for e in section_elements)
1368
1292
  bottom = max(e.bottom for e in section_elements)
1369
-
1370
- # Adjust boundaries for better boundary inclusion/exclusion
1371
- pixel_adjustment = 2.0 # Amount to adjust for avoiding boundary elements
1372
-
1373
- # Only proceed with adjustments if we have elements in the section
1374
- if section_elements:
1375
- # Adjust top boundary if start element should be excluded
1376
- if start_element and boundary_inclusion not in ('start', 'both') and start_idx > 0:
1377
- # If start element is just above the section, move the top down
1378
- # Use a larger threshold (10 points) to catch more cases
1379
- if abs(top - start_element.bottom) < 10:
1380
- top += pixel_adjustment
1381
-
1382
- # Adjust bottom boundary if end element should be excluded
1383
- if end_element and boundary_inclusion not in ('end', 'both') and end_idx < len(elements) - 1:
1384
- # If end element is just below the section, move the bottom up
1385
- # Use a larger threshold (10 points) to catch more cases
1386
- if abs(bottom - end_element.top) < 10:
1387
- bottom -= pixel_adjustment
1388
-
1389
- # Ensure top is always less than bottom (valid region)
1390
- if top >= bottom:
1391
- # Reset to original if adjustment would create an invalid region
1392
- top = min(e.top for e in section_elements)
1393
- bottom = max(e.bottom for e in section_elements)
1394
-
1293
+
1395
1294
  # Create new region
1396
1295
  section = Region(self.page, (x0, top, x1, bottom))
1397
- section.start_element = start_element if boundary_inclusion in ('start', 'both') else None
1398
- section.end_element = end_element if boundary_inclusion in ('end', 'both') else None
1399
-
1296
+ # Store the original boundary elements for reference
1297
+ section.start_element = start_element
1298
+ section.end_element = end_element
1299
+
1400
1300
  return section
1401
-
1402
- def get_sections(self, start_elements=None, end_elements=None, boundary_inclusion='both') -> List['Region']:
1301
+
1302
+ def get_sections(
1303
+ self, start_elements=None, end_elements=None, boundary_inclusion="both"
1304
+ ) -> List["Region"]:
1403
1305
  """
1404
1306
  Get sections within this region based on start/end elements.
1405
-
1307
+
1406
1308
  Args:
1407
1309
  start_elements: Elements or selector string that mark the start of sections
1408
1310
  end_elements: Elements or selector string that mark the end of sections
1409
1311
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
1410
-
1312
+
1411
1313
  Returns:
1412
1314
  List of Region objects representing the extracted sections
1413
1315
  """
1414
1316
  from natural_pdf.elements.collections import ElementCollection
1415
-
1416
- # Process string selectors to find elements
1317
+
1318
+ # Process string selectors to find elements WITHIN THIS REGION
1417
1319
  if isinstance(start_elements, str):
1418
- start_elements = self.find_all(start_elements)
1419
- if hasattr(start_elements, 'elements'):
1320
+ start_elements = self.find_all(start_elements) # Use region's find_all
1321
+ if hasattr(start_elements, "elements"):
1420
1322
  start_elements = start_elements.elements
1421
-
1323
+
1422
1324
  if isinstance(end_elements, str):
1423
- end_elements = self.find_all(end_elements)
1424
- if hasattr(end_elements, 'elements'):
1325
+ end_elements = self.find_all(end_elements) # Use region's find_all
1326
+ if hasattr(end_elements, "elements"):
1425
1327
  end_elements = end_elements.elements
1426
-
1427
- # If no start elements, return empty list
1328
+
1329
+ # Ensure start_elements is a list (or similar iterable)
1330
+ if start_elements is None or not hasattr(start_elements, "__iter__"):
1331
+ logger.warning(
1332
+ "get_sections requires valid start_elements (selector or list). Returning empty."
1333
+ )
1334
+ return []
1335
+ # Ensure end_elements is a list if provided
1336
+ if end_elements is not None and not hasattr(end_elements, "__iter__"):
1337
+ logger.warning("end_elements must be iterable if provided. Ignoring.")
1338
+ end_elements = []
1339
+ elif end_elements is None:
1340
+ end_elements = []
1341
+
1342
+ # If no start elements found within the region, return empty list
1428
1343
  if not start_elements:
1429
1344
  return []
1430
-
1431
- # Sort elements in reading order
1432
- all_elements = self.get_elements()
1433
- all_elements.sort(key=lambda e: (e.top, e.x0))
1434
-
1435
- # Get all indexes in the sorted list
1345
+
1346
+ # Sort all elements within the region in reading order
1347
+ all_elements_in_region = self.get_elements()
1348
+ all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
1349
+
1350
+ if not all_elements_in_region:
1351
+ return [] # Cannot create sections if region is empty
1352
+
1353
+ # Map elements to their indices in the sorted list
1354
+ element_to_index = {el: i for i, el in enumerate(all_elements_in_region)}
1355
+
1356
+ # Mark section boundaries using indices from the sorted list
1436
1357
  section_boundaries = []
1437
-
1358
+
1438
1359
  # Add start element indexes
1439
1360
  for element in start_elements:
1440
- try:
1441
- idx = all_elements.index(element)
1442
- section_boundaries.append({
1443
- 'index': idx,
1444
- 'element': element,
1445
- 'type': 'start'
1446
- })
1447
- except ValueError:
1448
- # Element not in this region, skip
1449
- continue
1450
-
1361
+ idx = element_to_index.get(element)
1362
+ if idx is not None:
1363
+ section_boundaries.append({"index": idx, "element": element, "type": "start"})
1364
+ # else: Element found by selector might not be geometrically in region? Log warning?
1365
+
1451
1366
  # Add end element indexes if provided
1452
- if end_elements:
1453
- for element in end_elements:
1454
- try:
1455
- idx = all_elements.index(element)
1456
- section_boundaries.append({
1457
- 'index': idx,
1458
- 'element': element,
1459
- 'type': 'end'
1460
- })
1461
- except ValueError:
1462
- # Element not in this region, skip
1463
- continue
1464
-
1465
- # Sort boundaries by index (document order)
1466
- section_boundaries.sort(key=lambda x: x['index'])
1467
-
1367
+ for element in end_elements:
1368
+ idx = element_to_index.get(element)
1369
+ if idx is not None:
1370
+ section_boundaries.append({"index": idx, "element": element, "type": "end"})
1371
+
1372
+ # Sort boundaries by index (document order within the region)
1373
+ section_boundaries.sort(key=lambda x: x["index"])
1374
+
1468
1375
  # Generate sections
1469
1376
  sections = []
1470
- current_start = None
1471
-
1377
+ current_start_boundary = None
1378
+
1472
1379
  for i, boundary in enumerate(section_boundaries):
1473
1380
  # If it's a start boundary and we don't have a current start
1474
- if boundary['type'] == 'start' and current_start is None:
1475
- current_start = boundary
1476
-
1381
+ if boundary["type"] == "start" and current_start_boundary is None:
1382
+ current_start_boundary = boundary
1383
+
1477
1384
  # If it's an end boundary and we have a current start
1478
- elif boundary['type'] == 'end' and current_start is not None:
1385
+ elif boundary["type"] == "end" and current_start_boundary is not None:
1479
1386
  # Create a section from current_start to this boundary
1480
- start_element = current_start['element']
1481
- end_element = boundary['element']
1482
- section = self.get_section_between(
1483
- start_element,
1484
- end_element,
1485
- boundary_inclusion
1486
- )
1487
- sections.append(section)
1488
- current_start = None
1489
-
1490
- # If it's another start boundary and we have a current start (for splitting by starts only)
1491
- elif boundary['type'] == 'start' and current_start is not None and not end_elements:
1492
- # Create a section from current_start to just before this boundary
1493
- start_element = current_start['element']
1494
- end_element = all_elements[boundary['index'] - 1] if boundary['index'] > 0 else None
1495
- section = self.get_section_between(
1496
- start_element,
1497
- end_element,
1498
- boundary_inclusion
1499
- )
1387
+ start_element = current_start_boundary["element"]
1388
+ end_element = boundary["element"]
1389
+ # Use the helper, ensuring elements are from within the region
1390
+ section = self.get_section_between(start_element, end_element, boundary_inclusion)
1500
1391
  sections.append(section)
1501
- current_start = boundary
1502
-
1392
+ current_start_boundary = None # Reset
1393
+
1394
+ # If it's another start boundary and we have a current start (split by starts only)
1395
+ elif (
1396
+ boundary["type"] == "start"
1397
+ and current_start_boundary is not None
1398
+ and not end_elements
1399
+ ):
1400
+ # End the previous section just before this start boundary
1401
+ start_element = current_start_boundary["element"]
1402
+ # Find the element immediately preceding this start in the sorted list
1403
+ end_idx = boundary["index"] - 1
1404
+ if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
1405
+ end_element = all_elements_in_region[end_idx]
1406
+ section = self.get_section_between(
1407
+ start_element, end_element, boundary_inclusion
1408
+ )
1409
+ sections.append(section)
1410
+ # Else: Section started and ended by consecutive start elements? Create empty?
1411
+ # For now, just reset and start new section
1412
+
1413
+ # Start the new section
1414
+ current_start_boundary = boundary
1415
+
1503
1416
  # Handle the last section if we have a current start
1504
- if current_start is not None:
1505
- start_element = current_start['element']
1506
- # Use the last element in the region as the end
1507
- end_element = all_elements[-1] if all_elements else None
1508
- section = self.get_section_between(
1509
- start_element,
1510
- end_element,
1511
- boundary_inclusion
1512
- )
1417
+ if current_start_boundary is not None:
1418
+ start_element = current_start_boundary["element"]
1419
+ # End at the last element within the region
1420
+ end_element = all_elements_in_region[-1]
1421
+ section = self.get_section_between(start_element, end_element, boundary_inclusion)
1513
1422
  sections.append(section)
1514
-
1423
+
1515
1424
  return sections
1516
-
1425
+
1517
1426
  def create_cells(self):
1518
1427
  """
1519
1428
  Create cell regions for a detected table by intersecting its
1520
1429
  row and column regions, and add them to the page.
1521
-
1430
+
1522
1431
  Assumes child row and column regions are already present on the page.
1523
1432
 
1524
1433
  Returns:
1525
1434
  Self for method chaining.
1526
1435
  """
1527
1436
  # Ensure this is called on a table region
1528
- if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
1529
- raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
1530
-
1437
+ if self.region_type not in (
1438
+ "table",
1439
+ "tableofcontents",
1440
+ ): # Allow for ToC which might have structure
1441
+ raise ValueError(
1442
+ f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'"
1443
+ )
1444
+
1531
1445
  # Find rows and columns associated with this page
1532
1446
  # Remove the model-specific filter
1533
- rows = self.page.find_all('region[type=table-row]')
1534
- columns = self.page.find_all('region[type=table-column]')
1535
-
1447
+ rows = self.page.find_all("region[type=table-row]")
1448
+ columns = self.page.find_all("region[type=table-column]")
1449
+
1536
1450
  # Filter to only include those that overlap with this table region
1537
1451
  def is_in_table(element):
1538
1452
  # Use a simple overlap check (more robust than just center point)
1539
1453
  # Check if element's bbox overlaps with self.bbox
1540
- return (element.x0 < self.x1 and element.x1 > self.x0 and
1541
- element.top < self.bottom and element.bottom > self.top)
1542
-
1454
+ return (
1455
+ hasattr(element, "bbox")
1456
+ and element.x0 < self.x1 # Ensure element has bbox
1457
+ and element.x1 > self.x0
1458
+ and element.top < self.bottom
1459
+ and element.bottom > self.top
1460
+ )
1461
+
1543
1462
  table_rows = [r for r in rows if is_in_table(r)]
1544
1463
  table_columns = [c for c in columns if is_in_table(c)]
1545
-
1464
+
1546
1465
  if not table_rows or not table_columns:
1547
- self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
1548
- return self # Return self even if no cells created
1549
-
1466
+ # Use page's logger if available
1467
+ logger_instance = getattr(self._page, "logger", logger)
1468
+ logger_instance.warning(
1469
+ f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found."
1470
+ )
1471
+ return self # Return self even if no cells created
1472
+
1550
1473
  # Sort rows and columns
1551
1474
  table_rows.sort(key=lambda r: r.top)
1552
1475
  table_columns.sort(key=lambda c: c.x0)
1553
-
1476
+
1554
1477
  # Create cells and add them to the page's element manager
1555
1478
  created_count = 0
1556
1479
  for row in table_rows:
@@ -1564,41 +1487,49 @@ class Region(DirectionalMixin):
1564
1487
  # Only create a cell if the intersection is valid (positive width/height)
1565
1488
  if cell_x1 > cell_x0 and cell_y1 > cell_y0:
1566
1489
  # Create cell region at the intersection
1567
- cell = self.page.create_region(
1568
- cell_x0, cell_y0, cell_x1, cell_y1
1569
- )
1490
+ cell = self.page.create_region(cell_x0, cell_y0, cell_x1, cell_y1)
1570
1491
  # Set metadata
1571
- cell.source = 'derived'
1572
- cell.region_type = 'table-cell' # Explicitly set type
1573
- cell.normalized_type = 'table-cell' # And normalized type
1492
+ cell.source = "derived"
1493
+ cell.region_type = "table-cell" # Explicitly set type
1494
+ cell.normalized_type = "table-cell" # And normalized type
1574
1495
  # Inherit model from the parent table region
1575
- cell.model = self.model
1576
- cell.parent_region = self # Link cell to parent table region
1577
-
1496
+ cell.model = self.model
1497
+ cell.parent_region = self # Link cell to parent table region
1498
+
1578
1499
  # Add the cell region to the page's element manager
1579
1500
  self.page._element_mgr.add_region(cell)
1580
1501
  created_count += 1
1581
-
1502
+
1582
1503
  # Optional: Add created cells to the table region's children
1583
1504
  # self.child_regions.extend(cells_created_in_this_call) # Needs list management
1584
1505
 
1585
- self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
1506
+ logger_instance = getattr(self._page, "logger", logger)
1507
+ logger_instance.info(
1508
+ f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions."
1509
+ )
1586
1510
 
1587
- return self # Return self for chaining
1588
-
1589
- def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1511
+ return self # Return self for chaining
1512
+
1513
def ask(
    self,
    question: str,
    min_confidence: float = 0.1,
    model: Optional[str] = None,
    debug: bool = False,
    **kwargs,
) -> Dict[str, Any]:
    """
    Ask a question about the region content using document QA.

    The question is handed to the document question answering engine via
    ``qa_engine.ask_pdf_region``. Every failure mode (optional QA
    dependencies missing, engine initialisation error, error while
    answering) is logged and reported through a "not found" result
    instead of an exception.

    Args:
        question: The question to ask about the region content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA (if None, uses default model)
        debug: Forwarded unchanged to the QA engine as ``debug``
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        On success, whatever ``qa_engine.ask_pdf_region`` returns
        (presumably a dictionary with the same keys as below; confirm
        against the QA engine). On any failure, the dictionary: {
            "answer": None,
            "confidence": 0.0,
            "found": False,
            "page_num": number of the page this region belongs to,
            "source_elements": [],
            "region": this region
        }
    """

    def _no_answer() -> Dict[str, Any]:
        # Built on demand so each failure path hands back its own fresh
        # dict and list; callers may mutate the result freely.
        return {
            "answer": None,
            "confidence": 0.0,
            "found": False,
            "page_num": self.page.number,
            "source_elements": [],
            "region": self,
        }

    # The QA stack is an optional extra; import lazily so the rest of the
    # library keeps working without it.
    try:
        from natural_pdf.qa.document_qa import get_qa_engine
    except ImportError:
        logger.error(
            "Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
        )
        return _no_answer()

    # Get or initialize QA engine with specified model
    try:
        qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
    except Exception as e:
        logger.error(f"Failed to initialize QA engine (model: {model}): {e}", exc_info=True)
        return _no_answer()

    # Ask the question using the QA engine
    try:
        return qa_engine.ask_pdf_region(
            self, question, min_confidence=min_confidence, debug=debug, **kwargs
        )
    except Exception as e:
        logger.error(f"Error during qa_engine.ask_pdf_region: {e}", exc_info=True)
        return _no_answer()
1619
1587
 
1620
1588
  def add_child(self, child):
1621
1589
  """
1622
1590
  Add a child region to this region.
1623
-
1591
+
1624
1592
  Used for hierarchical document structure when using models like Docling
1625
1593
  that understand document hierarchy.
1626
-
1594
+
1627
1595
  Args:
1628
1596
  child: Region object to add as a child
1629
-
1597
+
1630
1598
  Returns:
1631
1599
  Self for method chaining
1632
1600
  """
1633
1601
  self.child_regions.append(child)
1634
1602
  child.parent_region = self
1635
1603
  return self
1636
-
1604
+
1637
1605
  def get_children(self, selector=None):
1638
1606
  """
1639
1607
  Get immediate child regions, optionally filtered by selector.
1640
-
1608
+
1641
1609
  Args:
1642
1610
  selector: Optional selector to filter children
1643
-
1611
+
1644
1612
  Returns:
1645
1613
  List of child regions matching the selector
1646
1614
  """
1647
1615
  import logging
1616
+
1648
1617
  logger = logging.getLogger("natural_pdf.elements.region")
1649
-
1618
+
1650
1619
  if selector is None:
1651
1620
  return self.child_regions
1652
-
1621
+
1653
1622
  # Use existing selector parser to filter
1654
- from natural_pdf.selectors.parser import match_elements_with_selector
1655
- matched = match_elements_with_selector(self.child_regions, selector)
1656
- logger.debug(f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'")
1657
- return matched
1658
-
1623
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1624
+
1625
+ try:
1626
+ selector_obj = parse_selector(selector)
1627
+ filter_func = selector_to_filter_func(selector_obj) # Removed region=self
1628
+ matched = [child for child in self.child_regions if filter_func(child)]
1629
+ logger.debug(
1630
+ f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'"
1631
+ )
1632
+ return matched
1633
+ except Exception as e:
1634
+ logger.error(f"Error applying selector in get_children: {e}", exc_info=True)
1635
+ return [] # Return empty list on error
1636
+
1659
1637
  def get_descendants(self, selector=None):
1660
1638
  """
1661
1639
  Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
1662
-
1640
+
1663
1641
  Args:
1664
1642
  selector: Optional selector to filter descendants
1665
-
1643
+
1666
1644
  Returns:
1667
1645
  List of descendant regions matching the selector
1668
1646
  """
1669
1647
  import logging
1648
+
1670
1649
  logger = logging.getLogger("natural_pdf.elements.region")
1671
-
1650
+
1672
1651
  all_descendants = []
1673
-
1674
- # First add direct children
1675
- all_descendants.extend(self.child_regions)
1676
-
1677
- # Then recursively add their descendants
1678
- for child in self.child_regions:
1679
- all_descendants.extend(child.get_descendants())
1680
-
1652
+ queue = list(self.child_regions) # Start with direct children
1653
+
1654
+ while queue:
1655
+ current = queue.pop(0)
1656
+ all_descendants.append(current)
1657
+ # Add current's children to the queue for processing
1658
+ if hasattr(current, "child_regions"):
1659
+ queue.extend(current.child_regions)
1660
+
1681
1661
  logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
1682
-
1662
+
1683
1663
  # Filter by selector if provided
1684
1664
  if selector is not None:
1685
- from natural_pdf.selectors.parser import match_elements_with_selector
1686
- matched = match_elements_with_selector(all_descendants, selector)
1687
- logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
1688
- return matched
1689
-
1665
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1666
+
1667
+ try:
1668
+ selector_obj = parse_selector(selector)
1669
+ filter_func = selector_to_filter_func(selector_obj) # Removed region=self
1670
+ matched = [desc for desc in all_descendants if filter_func(desc)]
1671
+ logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
1672
+ return matched
1673
+ except Exception as e:
1674
+ logger.error(f"Error applying selector in get_descendants: {e}", exc_info=True)
1675
+ return [] # Return empty list on error
1676
+
1690
1677
  return all_descendants
1691
-
1692
- def find_all(self, selector, recursive=True, **kwargs):
1678
+
1679
+ # Removed the recursive=True parameter: find_all on a Region should not recurse into child regions by default.
1680
+ # Renamed _find_all back to find_all
1681
+ # def find_all(self, selector, apply_exclusions=True, **kwargs):
1682
+ # See implementation above near get_elements
1683
+
1684
def __repr__(self) -> str:
    """
    Return a compact ``<Region ...>`` summary for debugging.

    ``name``, ``type`` and ``source`` appear only when the corresponding
    attribute is truthy; `` (Polygon)`` is appended after the bounding box
    when the region has a polygon.
    """
    polygon_suffix = " (Polygon)" if self.has_polygon else ""
    labelled = (
        ("name", self.name),
        ("type", self.region_type),
        ("source", self.source),
    )
    details = "".join(f" {label}='{value}'" for label, value in labelled if value)
    return f"<Region{details} bbox={self.bbox}{polygon_suffix}>"
1691
+
1692
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
) -> "Region":
    """
    Run a user-supplied correction callback over the OCR text in this region.

    Text elements inside this region are selected with
    ``text[source^=ocr]`` (i.e. their ``source`` starts with ``ocr``),
    with exclusions deliberately ignored, and handed to the shared
    ``_apply_ocr_correction_to_elements`` helper together with the
    callback.

    The `correction_callback` receives an element and should:
        1. Determine if the element needs correction.
        2. Perform the correction (e.g., call an LLM).
        3. Return the new text (`str`) or `None`.

    Per the helper's contract, a returned string replaces the element's
    `.text`; metadata updates (source, confidence, etc.) should happen
    within the callback.

    Args:
        correction_callback: A function accepting an element and returning
                             `Optional[str]` (new text or None).

    Returns:
        Self for method chaining.
    """
    ocr_text_selector = "text[source^=ocr]"

    # Exclusions are bypassed on purpose: OCR text should be corrected
    # even when it sits inside an excluded area.
    ocr_elements = self.find_all(selector=ocr_text_selector, apply_exclusions=False)

    # The ElementCollection is passed through untouched; the helper does the work.
    helper_kwargs = {
        "elements": ocr_elements,
        "correction_callback": correction_callback,
        "caller_info": f"Region({self.bbox})",
    }
    _apply_ocr_correction_to_elements(**helper_kwargs)

    return self