natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,16 @@
1
- from typing import Optional, Union, List, Dict, Tuple, Any, Callable, TYPE_CHECKING
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
3
+
4
+ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
+
6
+ # New Imports
7
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
+
2
9
  from natural_pdf.elements.base import DirectionalMixin
3
10
 
11
+ # Import new utils
12
+ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
+
4
14
  if TYPE_CHECKING:
5
15
  from natural_pdf.core.page import Page
6
16
  from natural_pdf.elements.text import TextElement
@@ -12,22 +22,29 @@ except ImportError:
12
22
  # OCRManager will be imported directly in methods that use it
13
23
  pass
14
24
 
25
+ logger = logging.getLogger(__name__)
26
+
15
27
 
16
28
  class Region(DirectionalMixin):
17
29
  """
18
30
  Represents a rectangular region on a page.
19
31
  """
20
-
21
- def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None, label: Optional[str] = None):
32
+
33
+ def __init__(
34
+ self,
35
+ page: "Page",
36
+ bbox: Tuple[float, float, float, float],
37
+ polygon: List[Tuple[float, float]] = None,
38
+ parent=None,
39
+ ):
22
40
  """
23
41
  Initialize a region.
24
-
42
+
25
43
  Args:
26
44
  page: Parent page
27
45
  bbox: Bounding box as (x0, top, x1, bottom)
28
46
  polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
29
47
  parent: Optional parent region (for hierarchical document structure)
30
- label: Optional label for the region (e.g., for exclusions)
31
48
  """
32
49
  self._page = page
33
50
  self._bbox = bbox
@@ -37,30 +54,36 @@ class Region(DirectionalMixin):
37
54
  self._page_range = None
38
55
  self.start_element = None
39
56
  self.end_element = None
40
-
57
+
41
58
  # Standard attributes for all elements
42
- self.object_type = 'region' # For selector compatibility
43
-
59
+ self.object_type = "region" # For selector compatibility
60
+
44
61
  # Layout detection attributes
45
62
  self.region_type = None
46
63
  self.normalized_type = None
47
64
  self.confidence = None
48
65
  self.model = None
49
-
66
+
50
67
  # Region management attributes
51
68
  self.name = None
52
69
  self.source = None # Will be set by creation methods
53
- self.label = label
54
-
70
+
55
71
  # Hierarchy support for nested document structure
56
72
  self.parent_region = parent
57
73
  self.child_regions = []
58
74
  self.text_content = None # Direct text content (e.g., from Docling)
59
75
  self.associated_text_elements = [] # Native text elements that overlap with this region
60
-
61
- def _direction(self, direction: str, size: Optional[float] = None,
62
- cross_size: str = "full", include_element: bool = False,
63
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
76
+
77
+ def _direction(
78
+ self,
79
+ direction: str,
80
+ size: Optional[float] = None,
81
+ cross_size: str = "full",
82
+ include_element: bool = False,
83
+ until: Optional[str] = None,
84
+ include_endpoint: bool = True,
85
+ **kwargs,
86
+ ) -> "Region":
64
87
  """
65
88
  Protected helper method to create a region in a specified direction relative to this region.
66
89
 
@@ -76,11 +99,11 @@ class Region(DirectionalMixin):
76
99
  Returns:
77
100
  Region object
78
101
  """
79
- import math # Use math.inf for infinity
102
+ import math # Use math.inf for infinity
80
103
 
81
- is_horizontal = direction in ('left', 'right')
82
- is_positive = direction in ('right', 'below') # right/below are positive directions
83
- pixel_offset = 1 # Offset for excluding elements/endpoints
104
+ is_horizontal = direction in ("left", "right")
105
+ is_positive = direction in ("right", "below") # right/below are positive directions
106
+ pixel_offset = 1 # Offset for excluding elements/endpoints
84
107
 
85
108
  # 1. Determine initial boundaries based on direction and include_element
86
109
  if is_horizontal:
@@ -89,38 +112,44 @@ class Region(DirectionalMixin):
89
112
  y1 = self.page.height if cross_size == "full" else self.bottom
90
113
 
91
114
  # Initial primary boundaries (horizontal)
92
- if is_positive: # right
115
+ if is_positive: # right
93
116
  x0_initial = self.x0 if include_element else self.x1 + pixel_offset
94
- x1_initial = self.x1 # This edge moves
95
- else: # left
96
- x0_initial = self.x0 # This edge moves
117
+ x1_initial = self.x1 # This edge moves
118
+ else: # left
119
+ x0_initial = self.x0 # This edge moves
97
120
  x1_initial = self.x1 if include_element else self.x0 - pixel_offset
98
- else: # Vertical
121
+ else: # Vertical
99
122
  # Initial cross-boundaries (horizontal)
100
123
  x0 = 0 if cross_size == "full" else self.x0
101
124
  x1 = self.page.width if cross_size == "full" else self.x1
102
125
 
103
126
  # Initial primary boundaries (vertical)
104
- if is_positive: # below
127
+ if is_positive: # below
105
128
  y0_initial = self.top if include_element else self.bottom + pixel_offset
106
- y1_initial = self.bottom # This edge moves
107
- else: # above
108
- y0_initial = self.top # This edge moves
129
+ y1_initial = self.bottom # This edge moves
130
+ else: # above
131
+ y0_initial = self.top # This edge moves
109
132
  y1_initial = self.bottom if include_element else self.top - pixel_offset
110
133
 
111
134
  # 2. Calculate the final primary boundary, considering 'size' or page limits
112
135
  if is_horizontal:
113
- if is_positive: # right
114
- x1_final = min(self.page.width, x1_initial + (size if size is not None else (self.page.width - x1_initial)))
136
+ if is_positive: # right
137
+ x1_final = min(
138
+ self.page.width,
139
+ x1_initial + (size if size is not None else (self.page.width - x1_initial)),
140
+ )
115
141
  x0_final = x0_initial
116
- else: # left
142
+ else: # left
117
143
  x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
118
144
  x1_final = x1_initial
119
- else: # Vertical
120
- if is_positive: # below
121
- y1_final = min(self.page.height, y1_initial + (size if size is not None else (self.page.height - y1_initial)))
145
+ else: # Vertical
146
+ if is_positive: # below
147
+ y1_final = min(
148
+ self.page.height,
149
+ y1_initial + (size if size is not None else (self.page.height - y1_initial)),
150
+ )
122
151
  y0_final = y0_initial
123
- else: # above
152
+ else: # above
124
153
  y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
125
154
  y1_final = y1_initial
126
155
 
@@ -131,16 +160,16 @@ class Region(DirectionalMixin):
131
160
  matches_in_direction = []
132
161
 
133
162
  # Filter and sort matches based on direction
134
- if direction == 'above':
163
+ if direction == "above":
135
164
  matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
136
165
  matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
137
- elif direction == 'below':
166
+ elif direction == "below":
138
167
  matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
139
168
  matches_in_direction.sort(key=lambda e: e.top)
140
- elif direction == 'left':
169
+ elif direction == "left":
141
170
  matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
142
171
  matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
143
- elif direction == 'right':
172
+ elif direction == "right":
144
173
  matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
145
174
  matches_in_direction.sort(key=lambda e: e.x0)
146
175
 
@@ -149,25 +178,29 @@ class Region(DirectionalMixin):
149
178
 
150
179
  # Adjust the primary boundary based on the target
151
180
  if is_horizontal:
152
- if is_positive: # right
181
+ if is_positive: # right
153
182
  x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
154
- else: # left
183
+ else: # left
155
184
  x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
156
- else: # Vertical
157
- if is_positive: # below
185
+ else: # Vertical
186
+ if is_positive: # below
158
187
  y1_final = target.bottom if include_endpoint else target.top - pixel_offset
159
- else: # above
188
+ else: # above
160
189
  y0_final = target.top if include_endpoint else target.bottom + pixel_offset
161
190
 
162
191
  # Adjust cross boundaries if cross_size is 'element'
163
192
  if cross_size == "element":
164
- if is_horizontal: # Adjust y0, y1
165
- target_y0 = target.top if include_endpoint else target.bottom # Use opposite boundary if excluding
193
+ if is_horizontal: # Adjust y0, y1
194
+ target_y0 = (
195
+ target.top if include_endpoint else target.bottom
196
+ ) # Use opposite boundary if excluding
166
197
  target_y1 = target.bottom if include_endpoint else target.top
167
198
  y0 = min(y0, target_y0)
168
199
  y1 = max(y1, target_y1)
169
- else: # Adjust x0, x1
170
- target_x0 = target.x0 if include_endpoint else target.x1 # Use opposite boundary if excluding
200
+ else: # Adjust x0, x1
201
+ target_x0 = (
202
+ target.x0 if include_endpoint else target.x1
203
+ ) # Use opposite boundary if excluding
171
204
  target_x1 = target.x1 if include_endpoint else target.x0
172
205
  x0 = min(x0, target_x0)
173
206
  x1 = max(x1, target_x1)
@@ -195,11 +228,18 @@ class Region(DirectionalMixin):
195
228
 
196
229
  return region
197
230
 
198
- def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
199
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
231
+ def above(
232
+ self,
233
+ height: Optional[float] = None,
234
+ width: str = "full",
235
+ include_element: bool = False,
236
+ until: Optional[str] = None,
237
+ include_endpoint: bool = True,
238
+ **kwargs,
239
+ ) -> "Region":
200
240
  """
201
241
  Select region above this region.
202
-
242
+
203
243
  Args:
204
244
  height: Height of the region above, in points
205
245
  width: Width mode - "full" for full page width or "element" for element width
@@ -207,25 +247,32 @@ class Region(DirectionalMixin):
207
247
  until: Optional selector string to specify an upper boundary element
208
248
  include_endpoint: Whether to include the boundary element in the region (default: True)
209
249
  **kwargs: Additional parameters
210
-
250
+
211
251
  Returns:
212
252
  Region object representing the area above
213
253
  """
214
254
  return self._direction(
215
- direction='above',
255
+ direction="above",
216
256
  size=height,
217
257
  cross_size=width,
218
258
  include_element=include_element,
219
259
  until=until,
220
260
  include_endpoint=include_endpoint,
221
- **kwargs
261
+ **kwargs,
222
262
  )
223
263
 
224
- def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
225
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
264
+ def below(
265
+ self,
266
+ height: Optional[float] = None,
267
+ width: str = "full",
268
+ include_element: bool = False,
269
+ until: Optional[str] = None,
270
+ include_endpoint: bool = True,
271
+ **kwargs,
272
+ ) -> "Region":
226
273
  """
227
274
  Select region below this region.
228
-
275
+
229
276
  Args:
230
277
  height: Height of the region below, in points
231
278
  width: Width mode - "full" for full page width or "element" for element width
@@ -233,25 +280,32 @@ class Region(DirectionalMixin):
233
280
  until: Optional selector string to specify a lower boundary element
234
281
  include_endpoint: Whether to include the boundary element in the region (default: True)
235
282
  **kwargs: Additional parameters
236
-
283
+
237
284
  Returns:
238
285
  Region object representing the area below
239
286
  """
240
287
  return self._direction(
241
- direction='below',
288
+ direction="below",
242
289
  size=height,
243
290
  cross_size=width,
244
291
  include_element=include_element,
245
292
  until=until,
246
293
  include_endpoint=include_endpoint,
247
- **kwargs
294
+ **kwargs,
248
295
  )
249
296
 
250
- def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
251
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
297
+ def left(
298
+ self,
299
+ width: Optional[float] = None,
300
+ height: str = "full",
301
+ include_element: bool = False,
302
+ until: Optional[str] = None,
303
+ include_endpoint: bool = True,
304
+ **kwargs,
305
+ ) -> "Region":
252
306
  """
253
307
  Select region to the left of this region.
254
-
308
+
255
309
  Args:
256
310
  width: Width of the region to the left, in points
257
311
  height: Height mode - "full" for full page height or "element" for element height
@@ -259,25 +313,32 @@ class Region(DirectionalMixin):
259
313
  until: Optional selector string to specify a left boundary element
260
314
  include_endpoint: Whether to include the boundary element in the region (default: True)
261
315
  **kwargs: Additional parameters
262
-
316
+
263
317
  Returns:
264
318
  Region object representing the area to the left
265
319
  """
266
320
  return self._direction(
267
- direction='left',
321
+ direction="left",
268
322
  size=width,
269
323
  cross_size=height,
270
324
  include_element=include_element,
271
325
  until=until,
272
326
  include_endpoint=include_endpoint,
273
- **kwargs
327
+ **kwargs,
274
328
  )
275
329
 
276
- def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
277
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
330
+ def right(
331
+ self,
332
+ width: Optional[float] = None,
333
+ height: str = "full",
334
+ include_element: bool = False,
335
+ until: Optional[str] = None,
336
+ include_endpoint: bool = True,
337
+ **kwargs,
338
+ ) -> "Region":
278
339
  """
279
340
  Select region to the right of this region.
280
-
341
+
281
342
  Args:
282
343
  width: Width of the region to the right, in points
283
344
  height: Height mode - "full" for full page height or "element" for element height
@@ -285,72 +346,72 @@ class Region(DirectionalMixin):
285
346
  until: Optional selector string to specify a right boundary element
286
347
  include_endpoint: Whether to include the boundary element in the region (default: True)
287
348
  **kwargs: Additional parameters
288
-
349
+
289
350
  Returns:
290
351
  Region object representing the area to the right
291
352
  """
292
353
  return self._direction(
293
- direction='right',
354
+ direction="right",
294
355
  size=width,
295
356
  cross_size=height,
296
357
  include_element=include_element,
297
358
  until=until,
298
359
  include_endpoint=include_endpoint,
299
- **kwargs
360
+ **kwargs,
300
361
  )
301
-
362
+
302
363
  @property
303
364
  def type(self) -> str:
304
365
  """Element type."""
305
366
  # Return the specific type if detected (e.g., from layout analysis)
306
367
  # or 'region' as a default.
307
- return self.region_type or 'region' # Prioritize specific region_type if set
308
-
368
+ return self.region_type or "region" # Prioritize specific region_type if set
369
+
309
370
  @property
310
- def page(self) -> 'Page':
371
+ def page(self) -> "Page":
311
372
  """Get the parent page."""
312
373
  return self._page
313
-
374
+
314
375
  @property
315
376
  def bbox(self) -> Tuple[float, float, float, float]:
316
377
  """Get the bounding box as (x0, top, x1, bottom)."""
317
378
  return self._bbox
318
-
379
+
319
380
  @property
320
381
  def x0(self) -> float:
321
382
  """Get the left coordinate."""
322
383
  return self._bbox[0]
323
-
384
+
324
385
  @property
325
386
  def top(self) -> float:
326
387
  """Get the top coordinate."""
327
388
  return self._bbox[1]
328
-
389
+
329
390
  @property
330
391
  def x1(self) -> float:
331
392
  """Get the right coordinate."""
332
393
  return self._bbox[2]
333
-
394
+
334
395
  @property
335
396
  def bottom(self) -> float:
336
397
  """Get the bottom coordinate."""
337
398
  return self._bbox[3]
338
-
399
+
339
400
  @property
340
401
  def width(self) -> float:
341
402
  """Get the width of the region."""
342
403
  return self.x1 - self.x0
343
-
404
+
344
405
  @property
345
406
  def height(self) -> float:
346
407
  """Get the height of the region."""
347
408
  return self.bottom - self.top
348
-
409
+
349
410
  @property
350
411
  def has_polygon(self) -> bool:
351
412
  """Check if this region has polygon coordinates."""
352
413
  return self._polygon is not None and len(self._polygon) >= 3
353
-
414
+
354
415
  @property
355
416
  def polygon(self) -> List[Tuple[float, float]]:
356
417
  """Get polygon coordinates if available, otherwise return rectangle corners."""
@@ -359,141 +420,122 @@ class Region(DirectionalMixin):
359
420
  else:
360
421
  # Create rectangle corners from bbox as fallback
361
422
  return [
362
- (self.x0, self.top), # top-left
363
- (self.x1, self.top), # top-right
364
- (self.x1, self.bottom), # bottom-right
365
- (self.x0, self.bottom) # bottom-left
423
+ (self.x0, self.top), # top-left
424
+ (self.x1, self.top), # top-right
425
+ (self.x1, self.bottom), # bottom-right
426
+ (self.x0, self.bottom), # bottom-left
366
427
  ]
367
-
428
+
368
429
  def _is_point_in_polygon(self, x: float, y: float) -> bool:
369
430
  """
370
431
  Check if a point is inside the polygon using ray casting algorithm.
371
-
432
+
372
433
  Args:
373
434
  x: X coordinate of the point
374
435
  y: Y coordinate of the point
375
-
436
+
376
437
  Returns:
377
438
  bool: True if the point is inside the polygon
378
439
  """
379
440
  if not self.has_polygon:
380
441
  return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
381
-
442
+
382
443
  # Ray casting algorithm
383
444
  inside = False
384
445
  j = len(self.polygon) - 1
385
-
446
+
386
447
  for i in range(len(self.polygon)):
387
- if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and \
388
- (x < (self.polygon[j][0] - self.polygon[i][0]) * (y - self.polygon[i][1]) / \
389
- (self.polygon[j][1] - self.polygon[i][1]) + self.polygon[i][0]):
448
+ if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
449
+ x
450
+ < (self.polygon[j][0] - self.polygon[i][0])
451
+ * (y - self.polygon[i][1])
452
+ / (self.polygon[j][1] - self.polygon[i][1])
453
+ + self.polygon[i][0]
454
+ ):
390
455
  inside = not inside
391
456
  j = i
392
-
457
+
393
458
  return inside
394
459
 
395
460
  def is_point_inside(self, x: float, y: float) -> bool:
396
461
  """
397
462
  Check if a point is inside this region using ray casting algorithm for polygons.
398
-
463
+
399
464
  Args:
400
465
  x: X coordinate of the point
401
466
  y: Y coordinate of the point
402
-
467
+
403
468
  Returns:
404
469
  bool: True if the point is inside the region
405
470
  """
406
471
  if not self.has_polygon:
407
472
  return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
408
-
473
+
409
474
  # Ray casting algorithm
410
475
  inside = False
411
476
  j = len(self.polygon) - 1
412
-
477
+
413
478
  for i in range(len(self.polygon)):
414
- if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and \
415
- (x < (self.polygon[j][0] - self.polygon[i][0]) * (y - self.polygon[i][1]) / \
416
- (self.polygon[j][1] - self.polygon[i][1]) + self.polygon[i][0]):
479
+ if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and (
480
+ x
481
+ < (self.polygon[j][0] - self.polygon[i][0])
482
+ * (y - self.polygon[i][1])
483
+ / (self.polygon[j][1] - self.polygon[i][1])
484
+ + self.polygon[i][0]
485
+ ):
417
486
  inside = not inside
418
487
  j = i
419
-
488
+
420
489
  return inside
421
490
 
422
- def _is_element_in_region(self, element: 'Element', use_boundary_tolerance=True) -> bool:
491
+ def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
423
492
  """
424
493
  Check if an element is within this region.
425
-
494
+
426
495
  Args:
427
496
  element: Element to check
428
497
  use_boundary_tolerance: Whether to apply a small tolerance for boundary elements
429
-
498
+
430
499
  Returns:
431
500
  True if the element is in the region, False otherwise
432
501
  """
433
502
  # If we have multi-page elements cached, check if the element is in the list
434
503
  if self._spans_pages and self._multi_page_elements is not None:
435
504
  return element in self._multi_page_elements
436
-
505
+
437
506
  # Check if element is on the same page
438
- if element.page != self._page:
507
+ if not hasattr(element, "page") or element.page != self._page:
439
508
  return False
440
-
509
+
441
510
  # Calculate element center
511
+ # Ensure element has necessary attributes
512
+ if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
513
+ return False # Cannot determine position
514
+
442
515
  element_center_x = (element.x0 + element.x1) / 2
443
516
  element_center_y = (element.top + element.bottom) / 2
444
-
445
- # If this is a boundary region with exclusions, apply strict boundary checking
446
- # This helps enforce boundary_inclusion behavior in get_sections
447
- if hasattr(self, 'start_element') or hasattr(self, 'end_element'):
448
- # Apply a small tolerance to avoid border cases
449
- # When an element is right at the border, we want to be more strict
450
- tolerance = 2.0 if use_boundary_tolerance else 0.0
451
-
452
- # Check if element center is strictly within the region (not just on border)
453
- if (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
454
- self.top + tolerance <= element_center_y <= self.bottom - tolerance):
455
- return True
456
-
457
- # For elements right at the boundary, be more conservative
458
- return False
459
-
460
- # If the element itself has a polygon, check if ANY corner is in this region
461
- if hasattr(element, 'has_polygon') and element.has_polygon:
462
- for point in element.polygon:
463
- if self.is_point_inside(point[0], point[1]):
464
- return True
465
- # If no point is inside, check if the center is inside
466
- return self.is_point_inside(element_center_x, element_center_y)
467
-
468
- # For regular elements, check if center is in the region
469
- # Add a small tolerance (1 pixel) to avoid including elements that are exactly on the boundary
470
- # This ensures consistent behavior with the below() and above() method fixes
471
- tolerance = 1.0 if use_boundary_tolerance else 0.0
472
-
473
- # Check if within region with the tolerance applied
474
- if self.has_polygon:
475
- return self.is_point_inside(element_center_x, element_center_y)
476
- else:
477
- # For rectangular regions, apply tolerance to all sides
478
- return (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
479
- self.top + tolerance <= element_center_y <= self.bottom - tolerance)
480
-
481
- def highlight(self,
482
- label: Optional[str] = None,
483
- color: Optional[Union[Tuple, str]] = None,
484
- use_color_cycling: bool = False,
485
- include_attrs: Optional[List[str]] = None,
486
- existing: str = 'append') -> 'Region':
517
+
518
+ # Check if center point is inside the region's geometry
519
+ return self.is_point_inside(element_center_x, element_center_y)
520
+
521
+ def highlight(
522
+ self,
523
+ label: Optional[str] = None,
524
+ color: Optional[Union[Tuple, str]] = None,
525
+ use_color_cycling: bool = False,
526
+ include_attrs: Optional[List[str]] = None,
527
+ existing: str = "append",
528
+ ) -> "Region":
487
529
  """
488
530
  Highlight this region on the page.
489
-
531
+
490
532
  Args:
491
533
  label: Optional label for the highlight
492
534
  color: Color tuple/string for the highlight, or None to use automatic color
493
535
  use_color_cycling: Force color cycling even with no label (default: False)
494
536
  include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
495
537
  existing: How to handle existing highlights ('append' or 'replace').
496
-
538
+
497
539
  Returns:
498
540
  Self for method chaining
499
541
  """
@@ -508,7 +550,7 @@ class Region(DirectionalMixin):
508
550
  "use_color_cycling": use_color_cycling,
509
551
  "element": self, # Pass the region itself so attributes can be accessed
510
552
  "include_attrs": include_attrs,
511
- "existing": existing
553
+ "existing": existing,
512
554
  }
513
555
 
514
556
  # Call the appropriate service method
@@ -520,59 +562,68 @@ class Region(DirectionalMixin):
520
562
  highlighter.add(**highlight_args)
521
563
 
522
564
  return self
523
-
524
- def to_image(self,
525
- scale: float = 2.0,
526
- resolution: float = 150,
527
- crop_only: bool = False,
528
- include_highlights: bool = True,
529
- **kwargs) -> 'Image.Image':
565
+
566
+ def to_image(
567
+ self,
568
+ scale: float = 2.0,
569
+ resolution: float = 150,
570
+ crop_only: bool = False,
571
+ include_highlights: bool = True,
572
+ **kwargs,
573
+ ) -> "Image.Image":
530
574
  """
531
575
  Generate an image of just this region.
532
-
576
+
533
577
  Args:
534
578
  resolution: Resolution in DPI for rendering (default: 150)
535
579
  crop_only: If True, only crop the region without highlighting its boundaries
536
580
  include_highlights: Whether to include existing highlights (default: True)
537
581
  **kwargs: Additional parameters for page.to_image()
538
-
582
+
539
583
  Returns:
540
584
  PIL Image of just this region
541
585
  """
542
586
  # First get the full page image with highlights if requested
543
- page_image = self._page.to_image(scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs)
544
-
587
+ page_image = self._page.to_image(
588
+ scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
589
+ )
590
+
545
591
  # Calculate the crop coordinates - apply resolution scaling factor
546
592
  # PDF coordinates are in points (1/72 inch), but image is scaled by resolution
547
- scale_factor = scale
548
-
593
+ scale_factor = resolution / 72.0 # Scale based on DPI
594
+
549
595
  # Apply scaling to the coordinates
550
596
  x0 = int(self.x0 * scale_factor)
551
597
  top = int(self.top * scale_factor)
552
598
  x1 = int(self.x1 * scale_factor)
553
599
  bottom = int(self.bottom * scale_factor)
554
-
600
+
555
601
  # Crop the image to just this region
556
602
  region_image = page_image.crop((x0, top, x1, bottom))
557
-
603
+
558
604
  # If not crop_only, add a border to highlight the region boundaries
559
605
  if not crop_only:
560
606
  from PIL import ImageDraw
561
-
607
+
562
608
  # Create a 1px border around the region
563
609
  draw = ImageDraw.Draw(region_image)
564
- draw.rectangle((0, 0, region_image.width-1, region_image.height-1),
565
- outline=(255, 0, 0), width=1)
566
-
610
+ draw.rectangle(
611
+ (0, 0, region_image.width - 1, region_image.height - 1),
612
+ outline=(255, 0, 0),
613
+ width=1,
614
+ )
615
+
567
616
  return region_image
568
-
569
- def show(self,
570
- scale: float = 2.0,
571
- labels: bool = True,
572
- legend_position: str = 'right',
573
- # Add a default color for standalone show
574
- color: Optional[Union[Tuple, str]] = "blue",
575
- label: Optional[str] = None) -> 'Image.Image':
617
+
618
+ def show(
619
+ self,
620
+ scale: float = 2.0,
621
+ labels: bool = True,
622
+ legend_position: str = "right",
623
+ # Add a default color for standalone show
624
+ color: Optional[Union[Tuple, str]] = "blue",
625
+ label: Optional[str] = None,
626
+ ) -> "Image.Image":
576
627
  """
577
628
  Show the page with just this region highlighted temporarily.
578
629
 
@@ -593,16 +644,18 @@ class Region(DirectionalMixin):
593
644
  service = self._page._highlighter
594
645
 
595
646
  # Determine the label if not provided
596
- display_label = label if label is not None else f"Region ({self.type})" if self.type else "Region"
647
+ display_label = (
648
+ label if label is not None else f"Region ({self.type})" if self.type else "Region"
649
+ )
597
650
 
598
651
  # Prepare temporary highlight data for just this region
599
652
  temp_highlight_data = {
600
653
  "page_index": self._page.index,
601
654
  "bbox": self.bbox,
602
655
  "polygon": self.polygon if self.has_polygon else None,
603
- "color": color, # Use provided or default color
656
+ "color": color, # Use provided or default color
604
657
  "label": display_label,
605
- "use_color_cycling": False # Explicitly false for single preview
658
+ "use_color_cycling": False, # Explicitly false for single preview
606
659
  }
607
660
 
608
661
  # Use render_preview to show only this highlight
@@ -611,452 +664,271 @@ class Region(DirectionalMixin):
611
664
  temporary_highlights=[temp_highlight_data],
612
665
  scale=scale,
613
666
  labels=labels,
614
- legend_position=legend_position
667
+ legend_position=legend_position,
615
668
  )
616
669
 
617
- def save(self,
618
- filename: str,
619
- scale: float = 2.0,
620
- labels: bool = True,
621
- legend_position: str = 'right') -> 'Region':
670
+ def save(
671
+ self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
672
+ ) -> "Region":
622
673
  """
623
674
  Save the page with this region highlighted to an image file.
624
-
675
+
625
676
  Args:
626
677
  filename: Path to save the image to
627
678
  scale: Scale factor for rendering
628
679
  labels: Whether to include a legend for labels
629
680
  legend_position: Position of the legend
630
-
681
+
631
682
  Returns:
632
683
  Self for method chaining
633
684
  """
634
685
  # Highlight this region if not already highlighted
635
686
  self.highlight()
636
-
687
+
637
688
  # Save the highlighted image
638
689
  self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
639
690
  return self
640
-
641
- def save_image(self,
642
- filename: str,
643
- resolution: float = 150,
644
- crop_only: bool = False,
645
- include_highlights: bool = True,
646
- **kwargs) -> 'Region':
691
+
692
+ def save_image(
693
+ self,
694
+ filename: str,
695
+ resolution: float = 150,
696
+ crop_only: bool = False,
697
+ include_highlights: bool = True,
698
+ **kwargs,
699
+ ) -> "Region":
647
700
  """
648
701
  Save an image of just this region to a file.
649
-
702
+
650
703
  Args:
651
704
  filename: Path to save the image to
652
705
  resolution: Resolution in DPI for rendering (default: 150)
653
706
  crop_only: If True, only crop the region without highlighting its boundaries
654
707
  include_highlights: Whether to include existing highlights (default: True)
655
708
  **kwargs: Additional parameters for page.to_image()
656
-
709
+
657
710
  Returns:
658
711
  Self for method chaining
659
712
  """
660
713
  # Get the region image
661
714
  image = self.to_image(
662
- resolution=resolution,
663
- crop_only=crop_only,
715
+ resolution=resolution,
716
+ crop_only=crop_only,
664
717
  include_highlights=include_highlights,
665
- **kwargs
718
+ **kwargs,
666
719
  )
667
-
720
+
668
721
  # Save the image
669
722
  image.save(filename)
670
723
  return self
671
-
672
- def get_elements(self, selector: Optional[str] = None, apply_exclusions=True, **kwargs) -> List['Element']:
724
+
725
+ def get_elements(
726
+ self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
727
+ ) -> List["Element"]:
673
728
  """
674
729
  Get all elements within this region.
675
-
730
+
676
731
  Args:
677
732
  selector: Optional selector to filter elements
678
733
  apply_exclusions: Whether to apply exclusion regions
679
734
  **kwargs: Additional parameters for element filtering
680
-
735
+
681
736
  Returns:
682
737
  List of elements in the region
683
738
  """
684
739
  # If we have multi-page elements, return those
685
740
  if self._spans_pages and self._multi_page_elements is not None:
741
+ # TODO: Apply selector to multi-page elements if needed
686
742
  return self._multi_page_elements
687
-
743
+
688
744
  # Otherwise, get elements from the page
689
745
  if selector:
690
- elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
746
+ # Find elements on the page matching the selector
747
+ page_elements = self.page.find_all(
748
+ selector, apply_exclusions=apply_exclusions, **kwargs
749
+ )
750
+ # Filter those elements to only include ones within this region
751
+ return [e for e in page_elements if self._is_element_in_region(e)]
691
752
  else:
692
- elements = self.page.get_elements(apply_exclusions=apply_exclusions)
693
-
694
- # Filter to elements in this region
695
- return [e for e in elements if self._is_element_in_region(e)]
696
-
697
- def extract_text(self, keep_blank_chars=True, apply_exclusions=True, ocr=None, preserve_whitespace=None, debug=False, **kwargs) -> str:
753
+ # Get all elements from the page
754
+ page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
755
+ # Filter to elements in this region
756
+ return [e for e in page_elements if self._is_element_in_region(e)]
757
+
758
+ def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
698
759
  """
699
- Extract text from this region using pdfplumber's native functionality.
700
-
701
- For regions created by Docling, this will first try to use:
702
- 1. Associated text elements from the PDF (if available)
703
- 2. Direct text content from Docling (if available)
704
- 3. Fall back to standard pdfplumber extraction
705
-
760
+ Extract text from this region, respecting page exclusions and using pdfplumber's
761
+ layout engine (chars_to_textmap).
762
+
706
763
  Args:
707
- keep_blank_chars: Whether to keep blank characters (legacy parameter)
708
- apply_exclusions: Whether to apply exclusion regions
709
- ocr: OCR configuration. If None, uses PDF settings
710
- preserve_whitespace: Synonym for keep_blank_chars (for compatibility with page.extract_text)
711
- debug: Enable verbose debugging for exclusion handling
712
- **kwargs: Additional parameters for text extraction
713
-
764
+ apply_exclusions: Whether to apply exclusion regions defined on the parent page.
765
+ debug: Enable verbose debugging output for filtering steps.
766
+ **kwargs: Additional layout parameters passed directly to pdfplumber's
767
+ `chars_to_textmap` function (e.g., layout, x_density, y_density).
768
+ See Page.extract_text docstring for more.
769
+
714
770
  Returns:
715
- Extracted text as string
771
+ Extracted text as string, potentially with layout-based spacing.
716
772
  """
717
- import logging
718
- logger = logging.getLogger("natural_pdf.elements.region")
719
-
720
- # Check for Docling model or if we have direct text content
721
- if self.model == 'docling' or hasattr(self, 'text_content'):
722
- # First priority: check if we have associated native text elements
723
- if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
724
- source_count = len(self.associated_text_elements)
725
- logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
726
- # Sort elements in reading order
727
- sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
728
- # Extract and join their text
729
- text_result = " ".join(elem.text for elem in sorted_elements)
730
- return text_result
731
-
732
- # Second priority: use direct text content from Docling
733
- elif self.text_content:
734
- logger.info(f"Region {self.region_type}: Using Docling OCR text content")
735
- return self.text_content
736
-
737
- logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
738
-
739
- # Handle preserve_whitespace parameter for consistency with Page.extract_text
740
- if preserve_whitespace is not None:
741
- keep_blank_chars = preserve_whitespace
742
-
743
- # If we span multiple pages, use the original implementation
744
- if self._spans_pages and self._multi_page_elements is not None:
745
- # Sort elements in reading order - only include text-like elements
746
- text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
747
-
748
- # Sort in reading order (by page, then top-to-bottom, left-to-right)
749
- sorted_elements = sorted(text_elements, key=lambda e: (e.page.index, e.top, e.x0))
750
-
751
- # Extract text directly from elements to avoid recursion
752
- texts = []
753
- for element in sorted_elements:
754
- if hasattr(element, 'text'):
755
- texts.append(element.text)
756
-
757
- text_result = " ".join(texts)
758
- return text_result
759
-
760
- # Check if we have exclusions to apply
773
+ # Allow 'debug_exclusions' for backward compatibility
774
+ debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
775
+ logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
776
+
777
+ # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
778
+ # For now, let's bypass this and always use the standard extraction flow
779
+ # based on contained elements to ensure consistency.
780
+ # if self.model == 'docling' or hasattr(self, 'text_content'): ...
781
+
782
+ # 1. Get Word Elements potentially within this region (initial broad phase)
783
+ # Optimization: Could use spatial query if page elements were indexed
784
+ page_words = self.page.words # Get all words from the page
785
+
786
+ # 2. Gather all character dicts from words potentially in region
787
+ # We filter precisely in filter_chars_spatially
788
+ all_char_dicts = []
789
+ for word in page_words:
790
+ # Quick bbox check to avoid processing words clearly outside
791
+ if get_bbox_overlap(self.bbox, word.bbox) is not None:
792
+ all_char_dicts.extend(getattr(word, "_char_dicts", []))
793
+
794
+ if not all_char_dicts:
795
+ logger.debug(f"Region {self.bbox}: No character dicts found overlapping region bbox.")
796
+ return ""
797
+
798
+ # 3. Get Relevant Exclusions (overlapping this region)
799
+ apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
761
800
  exclusion_regions = []
762
- if apply_exclusions and self._page._exclusions:
763
- exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
764
-
765
- if debug:
766
- logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
767
-
768
- # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
769
- # If not, ignore exclusions entirely
770
- if exclusion_regions:
771
- has_intersection = False
772
- for i, exclusion in enumerate(exclusion_regions):
773
- # Use a simple bbox overlap check
774
- overlap = (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
775
- self.top < exclusion.bottom and self.bottom > exclusion.top)
776
-
777
- if overlap:
778
- has_intersection = True
779
- if debug:
780
- logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
781
- break
782
-
783
- # If no intersection, process without exclusions
784
- if not has_intersection:
785
- if debug:
786
- logger.debug(f" No intersection with any exclusion, ignoring exclusions")
787
- apply_exclusions = False
788
- exclusion_regions = []
789
-
790
- # IMPROVEMENT 2: If rectangular region + full-width exclusions (headers/footers),
791
- # we can use the simpler cropping approach
792
- # Only use crop for simple cases
793
- can_use_crop = not self.has_polygon
794
- result = "" # Default empty result
795
- if can_use_crop and apply_exclusions and exclusion_regions:
796
- # We'll keep track of exclusions that are full-width horizontal bands (headers/footers)
797
- # and those that are not
798
- footer_header_exclusions = []
799
- other_exclusions = []
800
-
801
- for i, exclusion in enumerate(exclusion_regions):
802
- # Check if exclusion spans the full width of the page
803
- # and is either at the top or bottom
804
- full_width = (abs(exclusion.x0) < 5 and
805
- abs(exclusion.x1 - self.page.width) < 5)
806
-
807
- if debug:
808
- logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
809
-
810
- if full_width:
811
- footer_header_exclusions.append(exclusion)
812
- else:
813
- other_exclusions.append(exclusion)
814
-
815
- # If we have only header/footer exclusions, we can use the cropping approach
816
- all_are_bands = len(other_exclusions) == 0 and len(footer_header_exclusions) > 0
817
-
818
- if all_are_bands:
819
- # Find the actual content area after excluding header/footer
820
- top_bound = self.top
821
- bottom_bound = self.bottom
822
-
823
- if debug:
824
- logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
825
-
826
- # Process only header/footer exclusions for cropping
827
- for exclusion in footer_header_exclusions:
828
- # If exclusion is at the top of our region
829
- if exclusion.bottom > self.top and exclusion.top <= self.top:
830
- # Move top bound to exclude the header
831
- top_bound = max(top_bound, exclusion.bottom)
832
- if debug:
833
- logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
834
-
835
- # If exclusion is at the bottom of our region
836
- if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
837
- # Move bottom bound to exclude the footer
838
- bottom_bound = min(bottom_bound, exclusion.top)
839
- if debug:
840
- logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
841
-
842
-
843
- if debug:
844
- logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
845
-
846
- # If we still have a valid region after exclusions
847
- if top_bound < bottom_bound:
848
- # Use direct crop with adjusted bounds
849
- crop_bbox = (self.x0, top_bound, self.x1, bottom_bound)
850
- cropped = self.page._page.crop(crop_bbox)
851
- result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
852
-
853
- if debug:
854
- logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
855
-
856
- # Skip the complex filtering approach
857
- return result
858
- else:
859
- # This would only happen if the region is entirely inside an exclusion zone
860
- # or if both top and bottom of the region are excluded leaving no valid area
861
- logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
862
- return ""
863
- # We have exclusions, but not all are headers/footers,
864
- # or we have a non-rectangular region
865
- else:
866
- if debug:
867
- logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
868
-
869
- # Don't use crop for mixed exclusion types
870
- can_use_crop = False
871
-
872
- # If we got a result from header/footer cropping, return it
873
- if result:
874
- return result
875
-
876
- # For single-page regions without exclusions, or when exclusions don't apply, use direct cropping
877
- if can_use_crop and not apply_exclusions:
878
- # Simple case: use direct crop
879
- crop_bbox = self.bbox
880
- cropped = self.page._page.crop(crop_bbox)
881
- result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
882
- return result
883
-
884
- # For all other cases (complex exclusions, polygons), we use element filtering
885
- if debug:
886
- logger.debug(f"Using element filtering approach for region {self.bbox}")
887
-
888
- # Get only word elements in this region first (instead of ALL elements)
889
- # This prevents duplication from joining both char and word text
890
- all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
891
-
892
- if apply_exclusions and exclusion_regions:
893
- if debug:
894
- logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
895
-
896
- # Filter out elements in exclusion zones
897
- filtered_elements = []
898
- for elem in all_elements:
899
- in_exclusion = False
900
- # For each element, check if it's in any exclusion zone
901
- element_center_x = (elem.x0 + elem.x1) / 2
902
- element_center_y = (elem.top + elem.bottom) / 2
903
-
904
- for exclusion in exclusion_regions:
905
- if (exclusion.x0 <= element_center_x <= exclusion.x1 and
906
- exclusion.top <= element_center_y <= exclusion.bottom):
907
- in_exclusion = True
908
- break
909
-
910
- if not in_exclusion:
911
- filtered_elements.append(elem)
912
- else:
913
- # No exclusions, use all elements
914
- filtered_elements = all_elements
915
-
916
- # Now extract text from the filtered elements
917
- if filtered_elements:
918
- from natural_pdf.elements.collections import ElementCollection
919
- collection = ElementCollection(filtered_elements)
920
- # Sort in reading order
921
- collection = collection.sort(key=lambda e: (e.top, e.x0))
922
- # Extract text
923
- result = " ".join(e.text for e in collection if hasattr(e, 'text'))
924
-
925
- if debug:
926
- logger.debug(f"Got {len(result)} characters from element-based extraction")
927
-
928
- # Return the result
929
- return result
930
- else:
801
+ if apply_exclusions_flag and self._page._exclusions:
802
+ all_page_exclusions = self._page._get_exclusion_regions(
803
+ include_callable=True, debug=debug
804
+ )
805
+ overlapping_exclusions = []
806
+ for excl in all_page_exclusions:
807
+ if get_bbox_overlap(self.bbox, excl.bbox) is not None:
808
+ overlapping_exclusions.append(excl)
809
+ exclusion_regions = overlapping_exclusions
931
810
  if debug:
932
- logger.debug(f"No elements found after filtering")
933
- return ""
934
-
935
- # Handle OCR if needed
936
- use_ocr = ocr is True or (isinstance(ocr, dict) and ocr.get('enabled', False))
937
- auto_ocr = ocr is None and self.page._parent._ocr_config.get('enabled') == 'auto'
938
-
939
- # Run OCR if explicitly requested or if in auto mode and no text found
940
- if use_ocr or (auto_ocr and not result.strip()):
941
- ocr_config = self.page._get_ocr_config(ocr or {}) if use_ocr else self.page._get_ocr_config({'enabled': 'auto'})
942
- ocr_elements = self.apply_ocr(**ocr_config)
943
-
944
- if ocr_elements:
945
- # Filter OCR elements by exclusions if needed
946
- if apply_exclusions and exclusion_regions:
947
- filtered_ocr = []
948
- for element in ocr_elements:
949
- exclude = False
950
- for region in exclusion_regions:
951
- if region._is_element_in_region(element):
952
- exclude = True
953
- break
954
- if not exclude:
955
- filtered_ocr.append(element)
956
- else:
957
- filtered_ocr = ocr_elements
958
-
959
- # Extract text from OCR elements
960
- from natural_pdf.elements.collections import ElementCollection
961
- ocr_collection = ElementCollection(filtered_ocr)
962
- ocr_text = ocr_collection.extract_text(preserve_whitespace=keep_blank_chars, **kwargs)
963
-
964
- # Use OCR text if it's not empty
965
- if ocr_text.strip():
966
- return ocr_text
967
-
811
+ logger.debug(
812
+ f"Region {self.bbox}: Applying {len(exclusion_regions)} overlapping exclusions."
813
+ )
814
+ elif debug:
815
+ logger.debug(f"Region {self.bbox}: Not applying exclusions.")
816
+
817
+ # 4. Spatially Filter Characters using Utility
818
+ # Pass self as the target_region for precise polygon checks etc.
819
+ filtered_chars = filter_chars_spatially(
820
+ char_dicts=all_char_dicts,
821
+ exclusion_regions=exclusion_regions,
822
+ target_region=self, # Pass self!
823
+ debug=debug,
824
+ )
825
+
826
+ # 5. Generate Text Layout using Utility
827
+ result = generate_text_layout(
828
+ char_dicts=filtered_chars,
829
+ layout_context_bbox=self.bbox, # Use region's bbox for context
830
+ user_kwargs=kwargs,
831
+ )
832
+
833
+ logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
968
834
  return result
969
-
970
- def extract_table(self, method: str = None, table_settings: dict = None,
971
- use_ocr: bool = False, ocr_config: dict = None) -> List[List[str]]:
835
+
836
+ def extract_table(
837
+ self,
838
+ method: str = None,
839
+ table_settings: dict = None,
840
+ use_ocr: bool = False,
841
+ ocr_config: dict = None,
842
+ ) -> List[List[str]]:
972
843
  """
973
844
  Extract a table from this region.
974
-
845
+
975
846
  Args:
976
847
  method: Method to use for extraction ('tatr', 'plumber', or None for auto-detection)
977
848
  table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
978
849
  use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
979
850
  ocr_config: OCR configuration parameters
980
-
851
+
981
852
  Returns:
982
853
  Table data as a list of rows, where each row is a list of cell values
983
854
  """
984
855
  # Default settings if none provided
985
856
  if table_settings is None:
986
857
  table_settings = {}
987
-
858
+
988
859
  # Auto-detect method if not specified
989
860
  if method is None:
990
861
  # If this is a TATR-detected region, use TATR method
991
- if hasattr(self, 'model') and self.model == 'tatr' and self.region_type == 'table':
992
- method = 'tatr'
862
+ if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
863
+ method = "tatr"
993
864
  else:
994
- method = 'plumber'
995
-
865
+ method = "plumber"
866
+
996
867
  # Use the selected method
997
- if method == 'tatr':
868
+ if method == "tatr":
998
869
  return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
999
870
  else: # Default to pdfplumber
1000
871
  return self._extract_table_plumber(table_settings)
1001
-
872
+
1002
873
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1003
874
  """
1004
875
  Extract table using pdfplumber's table extraction.
1005
-
876
+
1006
877
  Args:
1007
878
  table_settings: Settings for pdfplumber table extraction
1008
-
879
+
1009
880
  Returns:
1010
881
  Table data as a list of rows, where each row is a list of cell values
1011
882
  """
1012
883
  # Create a crop of the page for this region
1013
884
  cropped = self.page._page.crop(self.bbox)
1014
-
885
+
1015
886
  # Extract table from the cropped area
1016
887
  tables = cropped.extract_tables(table_settings)
1017
-
888
+
1018
889
  # Return the first table or an empty list if none found
1019
890
  if tables:
1020
891
  return tables[0]
1021
892
  return []
1022
-
893
+
1023
894
  def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
1024
895
  """
1025
896
  Extract table using TATR structure detection.
1026
-
897
+
1027
898
  Args:
1028
899
  use_ocr: Whether to apply OCR to each cell for better text extraction
1029
900
  ocr_config: Optional OCR configuration parameters
1030
-
901
+
1031
902
  Returns:
1032
903
  Table data as a list of rows, where each row is a list of cell values
1033
904
  """
1034
905
  # Find all rows and headers in this table
1035
- rows = self.page.find_all(f'region[type=table-row][model=tatr]')
1036
- headers = self.page.find_all(f'region[type=table-column-header][model=tatr]')
1037
- columns = self.page.find_all(f'region[type=table-column][model=tatr]')
1038
-
906
+ rows = self.page.find_all(f"region[type=table-row][model=tatr]")
907
+ headers = self.page.find_all(f"region[type=table-column-header][model=tatr]")
908
+ columns = self.page.find_all(f"region[type=table-column][model=tatr]")
909
+
1039
910
  # Filter to only include rows/headers/columns that overlap with this table region
1040
911
  def is_in_table(region):
1041
912
  # Check for overlap - simplifying to center point for now
1042
913
  region_center_x = (region.x0 + region.x1) / 2
1043
914
  region_center_y = (region.top + region.bottom) / 2
1044
- return (self.x0 <= region_center_x <= self.x1 and
1045
- self.top <= region_center_y <= self.bottom)
1046
-
915
+ return (
916
+ self.x0 <= region_center_x <= self.x1 and self.top <= region_center_y <= self.bottom
917
+ )
918
+
1047
919
  rows = [row for row in rows if is_in_table(row)]
1048
920
  headers = [header for header in headers if is_in_table(header)]
1049
921
  columns = [column for column in columns if is_in_table(column)]
1050
-
922
+
1051
923
  # Sort rows by vertical position (top to bottom)
1052
924
  rows.sort(key=lambda r: r.top)
1053
-
925
+
1054
926
  # Sort columns by horizontal position (left to right)
1055
927
  columns.sort(key=lambda c: c.x0)
1056
-
928
+
1057
929
  # Create table data structure
1058
930
  table_data = []
1059
-
931
+
1060
932
  # Prepare OCR config if needed
1061
933
  if use_ocr:
1062
934
  # Default OCR config focuses on small text with low confidence
@@ -1065,16 +937,20 @@ class Region(DirectionalMixin):
1065
937
  "min_confidence": 0.1, # Lower than default to catch more text
1066
938
  "detection_params": {
1067
939
  "text_threshold": 0.1, # Lower threshold for low-contrast text
1068
- "link_threshold": 0.1 # Lower threshold for connecting text components
1069
- }
940
+ "link_threshold": 0.1, # Lower threshold for connecting text components
941
+ },
1070
942
  }
1071
-
943
+
1072
944
  # Merge with provided config if any
1073
945
  if ocr_config:
1074
946
  if isinstance(ocr_config, dict):
1075
947
  # Update default config with provided values
1076
948
  for key, value in ocr_config.items():
1077
- if isinstance(value, dict) and key in default_ocr_config and isinstance(default_ocr_config[key], dict):
949
+ if (
950
+ isinstance(value, dict)
951
+ and key in default_ocr_config
952
+ and isinstance(default_ocr_config[key], dict)
953
+ ):
1078
954
  # Merge nested dicts
1079
955
  default_ocr_config[key].update(value)
1080
956
  else:
@@ -1083,10 +959,10 @@ class Region(DirectionalMixin):
1083
959
  else:
1084
960
  # Not a dict, use as is
1085
961
  default_ocr_config = ocr_config
1086
-
962
+
1087
963
  # Use the merged config
1088
964
  ocr_config = default_ocr_config
1089
-
965
+
1090
966
  # Add header row if headers were detected
1091
967
  if headers:
1092
968
  header_texts = []
@@ -1099,30 +975,28 @@ class Region(DirectionalMixin):
1099
975
  if ocr_text:
1100
976
  header_texts.append(ocr_text)
1101
977
  continue
1102
-
978
+
1103
979
  # Fallback to normal extraction
1104
980
  header_texts.append(header.extract_text().strip())
1105
981
  table_data.append(header_texts)
1106
-
982
+
1107
983
  # Process rows
1108
984
  for row in rows:
1109
985
  row_cells = []
1110
-
986
+
1111
987
  # If we have columns, use them to extract cells
1112
988
  if columns:
1113
989
  for column in columns:
1114
990
  # Create a cell region at the intersection of row and column
1115
- cell_bbox = (
1116
- column.x0,
1117
- row.top,
1118
- column.x1,
1119
- row.bottom
1120
- )
1121
-
991
+ cell_bbox = (column.x0, row.top, column.x1, row.bottom)
992
+
1122
993
  # Create a region for this cell
1123
- from natural_pdf.elements.region import Region # Import here to avoid circular imports
994
+ from natural_pdf.elements.region import ( # Import here to avoid circular imports
995
+ Region,
996
+ )
997
+
1124
998
  cell_region = Region(self.page, cell_bbox)
1125
-
999
+
1126
1000
  # Extract text from the cell
1127
1001
  if use_ocr:
1128
1002
  # Apply OCR to the cell
@@ -1133,7 +1007,7 @@ class Region(DirectionalMixin):
1133
1007
  if ocr_text:
1134
1008
  row_cells.append(ocr_text)
1135
1009
  continue
1136
-
1010
+
1137
1011
  # Fallback to normal extraction
1138
1012
  cell_text = cell_region.extract_text().strip()
1139
1013
  row_cells.append(cell_text)
@@ -1147,182 +1021,215 @@ class Region(DirectionalMixin):
1147
1021
  if ocr_text:
1148
1022
  row_cells.append(ocr_text)
1149
1023
  continue
1150
-
1024
+
1151
1025
  # Fallback to normal extraction
1152
1026
  row_cells.append(row.extract_text().strip())
1153
-
1027
+
1154
1028
  table_data.append(row_cells)
1155
-
1029
+
1156
1030
  return table_data
1157
-
1158
- def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
1031
+
1032
+ def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
1159
1033
  """
1160
1034
  Find the first element in this region matching the selector.
1161
-
1035
+
1162
1036
  Args:
1163
1037
  selector: CSS-like selector string
1164
1038
  apply_exclusions: Whether to apply exclusion regions
1165
1039
  **kwargs: Additional parameters for element filtering
1166
-
1040
+
1167
1041
  Returns:
1168
1042
  First matching element or None
1169
1043
  """
1170
1044
  elements = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1171
- return elements[0] if elements else None
1172
-
1173
- def _find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
1045
+ return elements.first if elements else None # Use .first property
1046
+
1047
+ def find_all(
1048
+ self, selector: str, apply_exclusions=True, **kwargs
1049
+ ) -> "ElementCollection": # Changed from _find_all
1174
1050
  """
1175
1051
  Find all elements in this region matching the selector.
1176
-
1052
+
1177
1053
  Args:
1178
1054
  selector: CSS-like selector string
1179
1055
  apply_exclusions: Whether to apply exclusion regions
1180
1056
  **kwargs: Additional parameters for element filtering
1181
-
1057
+
1182
1058
  Returns:
1183
1059
  ElementCollection with matching elements
1184
1060
  """
1185
1061
  from natural_pdf.elements.collections import ElementCollection
1186
1062
 
1187
1063
  # If we span multiple pages, filter our elements
1064
+ # TODO: Revisit multi-page region logic
1188
1065
  if self._spans_pages and self._multi_page_elements is not None:
1189
- # Parse the selector
1190
- from natural_pdf.selectors.parser import parse_selector
1191
- selector_obj = parse_selector(selector)
1192
-
1193
- # Rather than using matches_selector, let each page's find_all handle the matching
1194
- # since that method is already properly implemented
1195
- all_matching_elements = []
1196
- page_ranges = {}
1197
-
1198
- # Group elements by page
1199
- for element in self._multi_page_elements:
1200
- if element.page not in page_ranges:
1201
- page_ranges[element.page] = []
1202
- page_ranges[element.page].append(element)
1203
-
1204
- # For each page, use its find_all to match elements, then filter to our collection
1205
- for page, page_elements in page_ranges.items():
1206
- # Get all matching elements from the page
1207
- page_matches = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1208
-
1209
- # Filter to just the elements that are in our collection
1210
- for element in page_matches:
1211
- if element in page_elements:
1212
- all_matching_elements.append(element)
1213
-
1214
- return ElementCollection(all_matching_elements)
1066
+ logger.warning("find_all on multi-page regions is not fully implemented.")
1067
+ # Temporary: Apply filter directly to cached elements
1068
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1069
+
1070
+ try:
1071
+ selector_obj = parse_selector(selector)
1072
+ filter_func = selector_to_filter_func(selector_obj, **kwargs)
1073
+ matching = [el for el in self._multi_page_elements if filter_func(el)]
1074
+ return ElementCollection(matching)
1075
+ except Exception as e:
1076
+ logger.error(f"Error applying selector to multi-page region elements: {e}")
1077
+ return ElementCollection([])
1215
1078
 
1216
1079
  # Otherwise, get elements from the page and filter by selector and region
1217
1080
  page_elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1081
+ # Use the precise _is_element_in_region check
1218
1082
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1219
1083
  return ElementCollection(filtered_elements)
1220
-
1221
- def apply_ocr(self, **ocr_params) -> List['TextElement']:
1084
+
1085
+ def apply_ocr(self, **ocr_params) -> List["TextElement"]: # Return type hint updated
1222
1086
  """
1223
1087
  Apply OCR to this region and return the created text elements.
1224
-
1088
+
1225
1089
  Args:
1226
- **ocr_params: OCR parameters to override defaults
1227
-
1090
+ **ocr_params: OCR parameters to override defaults (passed to OCRManager)
1091
+
1228
1092
  Returns:
1229
- List of created text elements
1093
+ List of created TextElement objects representing OCR words/lines.
1230
1094
  """
1231
- from natural_pdf.ocr import OCRManager
1232
-
1233
- # Get OCR configuration but suppress verbose output
1234
- if isinstance(ocr_params, dict):
1235
- ocr_params["verbose"] = False
1236
- else:
1237
- ocr_params = {"enabled": True, "verbose": False}
1238
-
1239
- ocr_config = self.page._get_ocr_config(ocr_params)
1240
-
1241
- # Skip if OCR is disabled
1242
- if not ocr_config.get('enabled'):
1095
+ # Ensure OCRManager is available
1096
+ if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1097
+ logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
1243
1098
  return []
1244
-
1245
- # Render the page
1246
- page_image = self.page.to_image()
1247
-
1248
- # Crop to this region
1249
- region_image = page_image.crop((self.x0, self.top, self.x1, self.bottom))
1250
-
1251
- # Run OCR on this region
1252
- ocr_mgr = OCRManager.get_instance()
1253
- results = ocr_mgr.recognize_region(region_image, ocr_config)
1254
-
1255
- # Adjust coordinates to be relative to the page
1256
- for result in results:
1257
- # Calculate bbox in page coordinates
1258
- result['bbox'] = (
1259
- result['bbox'][0] + self.x0,
1260
- result['bbox'][1] + self.top,
1261
- result['bbox'][2] + self.x0,
1262
- result['bbox'][3] + self.top
1099
+ ocr_mgr = self.page._parent._ocr_manager
1100
+
1101
+ # Get OCR configuration from kwargs or PDF defaults if needed
1102
+ # We'll mostly rely on passing ocr_params directly to the manager
1103
+ # For rendering, use a reasonable default scale
1104
+ ocr_image_scale = self.page._parent._config.get("ocr_image_scale", 2.0)
1105
+
1106
+ logger.debug(
1107
+ f"Region {self.bbox}: Applying OCR with scale {ocr_image_scale} and params: {ocr_params}"
1108
+ )
1109
+
1110
+ # Render the page region to an image
1111
+ try:
1112
+ # Crop the page image to this region's bbox
1113
+ region_image = self.to_image(
1114
+ scale=ocr_image_scale, include_highlights=False, crop_only=True
1263
1115
  )
1264
-
1265
- # Create text elements with adjusted coordinates
1266
- elements = []
1116
+ if not region_image:
1117
+ logger.error("Failed to render region to image for OCR.")
1118
+ return []
1119
+ logger.debug(f"Region rendered to image size: {region_image.size}")
1120
+ except Exception as e:
1121
+ logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1122
+ return []
1123
+
1124
+ # Run OCR on this region's image using the manager
1125
+ try:
1126
+ # Pass the single image and any specific options/kwargs
1127
+ # The manager handles engine selection based on ocr_params or defaults
1128
+ results = ocr_mgr.apply_ocr(images=region_image, **ocr_params)
1129
+ # apply_ocr returns List[Dict] for single image
1130
+ if not isinstance(results, list):
1131
+ logger.error(
1132
+ f"OCRManager returned unexpected type for single region image: {type(results)}"
1133
+ )
1134
+ return []
1135
+ logger.debug(f"Region OCR processing returned {len(results)} results.")
1136
+ except Exception as e:
1137
+ logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1138
+ return []
1139
+
1140
+ # Convert results to TextElements, scaling coordinates relative to the page
1141
+ # Calculate scaling factors based on the region image vs the region PDF coords
1142
+ scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
1143
+ scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
1144
+ logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
1145
+
1146
+ created_elements = []
1267
1147
  for result in results:
1268
- # Only include results that are fully within the region
1269
- if (result['bbox'][0] >= self.x0 and
1270
- result['bbox'][1] >= self.top and
1271
- result['bbox'][2] <= self.x1 and
1272
- result['bbox'][3] <= self.bottom):
1273
- # Create a TextElement object with the appropriate fields
1274
- from natural_pdf.elements.text import TextElement
1148
+ try:
1149
+ img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
1150
+ pdf_height = (img_bottom - img_top) * scale_y
1151
+
1152
+ # Convert IMAGE coordinates (relative to region crop) to PAGE coordinates
1153
+ page_x0 = self.x0 + (img_x0 * scale_x)
1154
+ page_top = self.top + (img_top * scale_y)
1155
+ page_x1 = self.x0 + (img_x1 * scale_x)
1156
+ page_bottom = self.top + (img_bottom * scale_y)
1157
+
1158
+ # Create element data using PAGE coordinates
1275
1159
  element_data = {
1276
- 'text': result['text'],
1277
- 'x0': result['bbox'][0],
1278
- 'top': result['bbox'][1],
1279
- 'x1': result['bbox'][2],
1280
- 'bottom': result['bbox'][3],
1281
- 'width': result['bbox'][2] - result['bbox'][0],
1282
- 'height': result['bbox'][3] - result['bbox'][1],
1283
- 'object_type': 'text',
1284
- 'source': 'ocr',
1285
- 'confidence': result['confidence'],
1286
- # Add default font information to work with existing expectations
1287
- 'fontname': 'OCR-detected',
1288
- 'size': 10.0,
1289
- 'page_number': self.page.number
1160
+ "text": result["text"],
1161
+ "x0": page_x0,
1162
+ "top": page_top,
1163
+ "x1": page_x1,
1164
+ "bottom": page_bottom,
1165
+ "width": page_x1 - page_x0,
1166
+ "height": page_bottom - page_top,
1167
+ "object_type": "word", # Treat as word
1168
+ "source": "ocr",
1169
+ "confidence": float(result.get("confidence", 0.0)),
1170
+ "fontname": "OCR",
1171
+ "size": round(pdf_height) if pdf_height > 0 else 10.0, # Size based on height
1172
+ "page_number": self.page.number,
1173
+ "bold": False,
1174
+ "italic": False,
1175
+ "upright": True,
1176
+ "doctop": page_top + self.page._page.initial_doctop,
1290
1177
  }
1291
-
1178
+
1179
+ # Create the representative char dict
1180
+ ocr_char_dict = element_data.copy()
1181
+ ocr_char_dict["object_type"] = "char"
1182
+ ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1183
+
1184
+ # Add char dicts to word data
1185
+ element_data["_char_dicts"] = [ocr_char_dict]
1186
+
1187
+ # Create the TextElement word
1188
+ from natural_pdf.elements.text import TextElement # Local import ok here
1189
+
1292
1190
  elem = TextElement(element_data, self.page)
1293
- elements.append(elem)
1294
-
1295
- # Add to page's elements
1296
- if hasattr(self.page, '_elements') and self.page._elements is not None:
1297
- # Add to words list to make it accessible via standard API
1298
- if 'words' in self.page._elements:
1299
- self.page._elements['words'].append(elem)
1300
- else:
1301
- self.page._elements['words'] = [elem]
1302
-
1303
- return elements
1304
-
1305
- def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
1191
+ created_elements.append(elem)
1192
+
1193
+ # Add the element to the page's element manager
1194
+ self.page._element_mgr.add_element(elem, element_type="words")
1195
+ # Add the char dict to the manager's char list
1196
+ self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
1197
+
1198
+ except Exception as e:
1199
+ logger.error(
1200
+ f"Failed to convert region OCR result to element: {result}. Error: {e}",
1201
+ exc_info=True,
1202
+ )
1203
+
1204
+ logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
1205
+ return created_elements
1206
+
1207
+ def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
1306
1208
  """
1307
1209
  Get a section between two elements within this region.
1308
-
1210
+
1309
1211
  Args:
1310
1212
  start_element: Element marking the start of the section
1311
1213
  end_element: Element marking the end of the section
1312
1214
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
1313
-
1215
+
1314
1216
  Returns:
1315
1217
  Region representing the section
1316
1218
  """
1219
+ # Get elements only within this region first
1317
1220
  elements = self.get_elements()
1318
-
1319
- # If no elements, return self
1221
+
1222
+ # If no elements, return self or empty region?
1320
1223
  if not elements:
1321
- return self
1322
-
1224
+ logger.warning(
1225
+ f"get_section_between called on region {self.bbox} with no contained elements."
1226
+ )
1227
+ # Return an empty region at the start of the parent region
1228
+ return Region(self.page, (self.x0, self.top, self.x0, self.top))
1229
+
1323
1230
  # Sort elements in reading order
1324
1231
  elements.sort(key=lambda e: (e.top, e.x0))
1325
-
1232
+
1326
1233
  # Find start index
1327
1234
  start_idx = 0
1328
1235
  if start_element:
@@ -1330,8 +1237,12 @@ class Region(DirectionalMixin):
1330
1237
  start_idx = elements.index(start_element)
1331
1238
  except ValueError:
1332
1239
  # Start element not in region, use first element
1333
- pass
1334
-
1240
+ logger.debug("Start element not found in region, using first element.")
1241
+ start_element = elements[0] # Use the actual first element
1242
+ start_idx = 0
1243
+ else:
1244
+ start_element = elements[0] # Default start is first element
1245
+
1335
1246
  # Find end index
1336
1247
  end_idx = len(elements) - 1
1337
1248
  if end_element:
@@ -1339,218 +1250,231 @@ class Region(DirectionalMixin):
1339
1250
  end_idx = elements.index(end_element)
1340
1251
  except ValueError:
1341
1252
  # End element not in region, use last element
1342
- pass
1343
-
1253
+ logger.debug("End element not found in region, using last element.")
1254
+ end_element = elements[-1] # Use the actual last element
1255
+ end_idx = len(elements) - 1
1256
+ else:
1257
+ end_element = elements[-1] # Default end is last element
1258
+
1344
1259
  # Adjust indexes based on boundary inclusion
1345
- if boundary_inclusion == 'none':
1260
+ start_element_for_bbox = start_element
1261
+ end_element_for_bbox = end_element
1262
+
1263
+ if boundary_inclusion == "none":
1346
1264
  start_idx += 1
1347
1265
  end_idx -= 1
1348
- elif boundary_inclusion == 'start':
1266
+ start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
1267
+ end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
1268
+ elif boundary_inclusion == "start":
1349
1269
  end_idx -= 1
1350
- elif boundary_inclusion == 'end':
1270
+ end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
1271
+ elif boundary_inclusion == "end":
1351
1272
  start_idx += 1
1352
-
1273
+ start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
1274
+
1353
1275
  # Ensure valid indexes
1354
1276
  start_idx = max(0, start_idx)
1355
1277
  end_idx = min(len(elements) - 1, end_idx)
1356
-
1278
+
1357
1279
  # If no valid elements in range, return empty region
1358
- if start_idx > end_idx:
1359
- return Region(self.page, (0, 0, 0, 0))
1360
-
1361
- # Get elements in range
1362
- section_elements = elements[start_idx:end_idx+1]
1363
-
1364
- # Create bounding box around elements
1280
+ if start_idx > end_idx or start_element_for_bbox is None or end_element_for_bbox is None:
1281
+ logger.debug("No valid elements in range for get_section_between.")
1282
+ # Return an empty region positioned at the start element boundary
1283
+ anchor = start_element if start_element else self
1284
+ return Region(self.page, (anchor.x0, anchor.top, anchor.x0, anchor.top))
1285
+
1286
+ # Get elements in range based on adjusted indices
1287
+ section_elements = elements[start_idx : end_idx + 1]
1288
+
1289
+ # Create bounding box around the ELEMENTS included based on indices
1365
1290
  x0 = min(e.x0 for e in section_elements)
1366
1291
  top = min(e.top for e in section_elements)
1367
1292
  x1 = max(e.x1 for e in section_elements)
1368
1293
  bottom = max(e.bottom for e in section_elements)
1369
-
1370
- # Adjust boundaries for better boundary inclusion/exclusion
1371
- pixel_adjustment = 2.0 # Amount to adjust for avoiding boundary elements
1372
-
1373
- # Only proceed with adjustments if we have elements in the section
1374
- if section_elements:
1375
- # Adjust top boundary if start element should be excluded
1376
- if start_element and boundary_inclusion not in ('start', 'both') and start_idx > 0:
1377
- # If start element is just above the section, move the top down
1378
- # Use a larger threshold (10 points) to catch more cases
1379
- if abs(top - start_element.bottom) < 10:
1380
- top += pixel_adjustment
1381
-
1382
- # Adjust bottom boundary if end element should be excluded
1383
- if end_element and boundary_inclusion not in ('end', 'both') and end_idx < len(elements) - 1:
1384
- # If end element is just below the section, move the bottom up
1385
- # Use a larger threshold (10 points) to catch more cases
1386
- if abs(bottom - end_element.top) < 10:
1387
- bottom -= pixel_adjustment
1388
-
1389
- # Ensure top is always less than bottom (valid region)
1390
- if top >= bottom:
1391
- # Reset to original if adjustment would create an invalid region
1392
- top = min(e.top for e in section_elements)
1393
- bottom = max(e.bottom for e in section_elements)
1394
-
1294
+
1395
1295
  # Create new region
1396
1296
  section = Region(self.page, (x0, top, x1, bottom))
1397
- section.start_element = start_element if boundary_inclusion in ('start', 'both') else None
1398
- section.end_element = end_element if boundary_inclusion in ('end', 'both') else None
1399
-
1297
+ # Store the original boundary elements for reference
1298
+ section.start_element = start_element
1299
+ section.end_element = end_element
1300
+
1400
1301
  return section
1401
-
1402
- def get_sections(self, start_elements=None, end_elements=None, boundary_inclusion='both') -> List['Region']:
1302
+
1303
+ def get_sections(
1304
+ self, start_elements=None, end_elements=None, boundary_inclusion="both"
1305
+ ) -> List["Region"]:
1403
1306
  """
1404
1307
  Get sections within this region based on start/end elements.
1405
-
1308
+
1406
1309
  Args:
1407
1310
  start_elements: Elements or selector string that mark the start of sections
1408
1311
  end_elements: Elements or selector string that mark the end of sections
1409
1312
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
1410
-
1313
+
1411
1314
  Returns:
1412
1315
  List of Region objects representing the extracted sections
1413
1316
  """
1414
1317
  from natural_pdf.elements.collections import ElementCollection
1415
-
1416
- # Process string selectors to find elements
1318
+
1319
+ # Process string selectors to find elements WITHIN THIS REGION
1417
1320
  if isinstance(start_elements, str):
1418
- start_elements = self.find_all(start_elements)
1419
- if hasattr(start_elements, 'elements'):
1321
+ start_elements = self.find_all(start_elements) # Use region's find_all
1322
+ if hasattr(start_elements, "elements"):
1420
1323
  start_elements = start_elements.elements
1421
-
1324
+
1422
1325
  if isinstance(end_elements, str):
1423
- end_elements = self.find_all(end_elements)
1424
- if hasattr(end_elements, 'elements'):
1326
+ end_elements = self.find_all(end_elements) # Use region's find_all
1327
+ if hasattr(end_elements, "elements"):
1425
1328
  end_elements = end_elements.elements
1426
-
1427
- # If no start elements, return empty list
1329
+
1330
+ # Ensure start_elements is a list (or similar iterable)
1331
+ if start_elements is None or not hasattr(start_elements, "__iter__"):
1332
+ logger.warning(
1333
+ "get_sections requires valid start_elements (selector or list). Returning empty."
1334
+ )
1335
+ return []
1336
+ # Ensure end_elements is a list if provided
1337
+ if end_elements is not None and not hasattr(end_elements, "__iter__"):
1338
+ logger.warning("end_elements must be iterable if provided. Ignoring.")
1339
+ end_elements = []
1340
+ elif end_elements is None:
1341
+ end_elements = []
1342
+
1343
+ # If no start elements found within the region, return empty list
1428
1344
  if not start_elements:
1429
1345
  return []
1430
-
1431
- # Sort elements in reading order
1432
- all_elements = self.get_elements()
1433
- all_elements.sort(key=lambda e: (e.top, e.x0))
1434
-
1435
- # Get all indexes in the sorted list
1346
+
1347
+ # Sort all elements within the region in reading order
1348
+ all_elements_in_region = self.get_elements()
1349
+ all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
1350
+
1351
+ if not all_elements_in_region:
1352
+ return [] # Cannot create sections if region is empty
1353
+
1354
+ # Map elements to their indices in the sorted list
1355
+ element_to_index = {el: i for i, el in enumerate(all_elements_in_region)}
1356
+
1357
+ # Mark section boundaries using indices from the sorted list
1436
1358
  section_boundaries = []
1437
-
1359
+
1438
1360
  # Add start element indexes
1439
1361
  for element in start_elements:
1440
- try:
1441
- idx = all_elements.index(element)
1442
- section_boundaries.append({
1443
- 'index': idx,
1444
- 'element': element,
1445
- 'type': 'start'
1446
- })
1447
- except ValueError:
1448
- # Element not in this region, skip
1449
- continue
1450
-
1362
+ idx = element_to_index.get(element)
1363
+ if idx is not None:
1364
+ section_boundaries.append({"index": idx, "element": element, "type": "start"})
1365
+ # else: Element found by selector might not be geometrically in region? Log warning?
1366
+
1451
1367
  # Add end element indexes if provided
1452
- if end_elements:
1453
- for element in end_elements:
1454
- try:
1455
- idx = all_elements.index(element)
1456
- section_boundaries.append({
1457
- 'index': idx,
1458
- 'element': element,
1459
- 'type': 'end'
1460
- })
1461
- except ValueError:
1462
- # Element not in this region, skip
1463
- continue
1464
-
1465
- # Sort boundaries by index (document order)
1466
- section_boundaries.sort(key=lambda x: x['index'])
1467
-
1368
+ for element in end_elements:
1369
+ idx = element_to_index.get(element)
1370
+ if idx is not None:
1371
+ section_boundaries.append({"index": idx, "element": element, "type": "end"})
1372
+
1373
+ # Sort boundaries by index (document order within the region)
1374
+ section_boundaries.sort(key=lambda x: x["index"])
1375
+
1468
1376
  # Generate sections
1469
1377
  sections = []
1470
- current_start = None
1471
-
1378
+ current_start_boundary = None
1379
+
1472
1380
  for i, boundary in enumerate(section_boundaries):
1473
1381
  # If it's a start boundary and we don't have a current start
1474
- if boundary['type'] == 'start' and current_start is None:
1475
- current_start = boundary
1476
-
1382
+ if boundary["type"] == "start" and current_start_boundary is None:
1383
+ current_start_boundary = boundary
1384
+
1477
1385
  # If it's an end boundary and we have a current start
1478
- elif boundary['type'] == 'end' and current_start is not None:
1386
+ elif boundary["type"] == "end" and current_start_boundary is not None:
1479
1387
  # Create a section from current_start to this boundary
1480
- start_element = current_start['element']
1481
- end_element = boundary['element']
1482
- section = self.get_section_between(
1483
- start_element,
1484
- end_element,
1485
- boundary_inclusion
1486
- )
1487
- sections.append(section)
1488
- current_start = None
1489
-
1490
- # If it's another start boundary and we have a current start (for splitting by starts only)
1491
- elif boundary['type'] == 'start' and current_start is not None and not end_elements:
1492
- # Create a section from current_start to just before this boundary
1493
- start_element = current_start['element']
1494
- end_element = all_elements[boundary['index'] - 1] if boundary['index'] > 0 else None
1495
- section = self.get_section_between(
1496
- start_element,
1497
- end_element,
1498
- boundary_inclusion
1499
- )
1388
+ start_element = current_start_boundary["element"]
1389
+ end_element = boundary["element"]
1390
+ # Use the helper, ensuring elements are from within the region
1391
+ section = self.get_section_between(start_element, end_element, boundary_inclusion)
1500
1392
  sections.append(section)
1501
- current_start = boundary
1502
-
1393
+ current_start_boundary = None # Reset
1394
+
1395
+ # If it's another start boundary and we have a current start (split by starts only)
1396
+ elif (
1397
+ boundary["type"] == "start"
1398
+ and current_start_boundary is not None
1399
+ and not end_elements
1400
+ ):
1401
+ # End the previous section just before this start boundary
1402
+ start_element = current_start_boundary["element"]
1403
+ # Find the element immediately preceding this start in the sorted list
1404
+ end_idx = boundary["index"] - 1
1405
+ if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
1406
+ end_element = all_elements_in_region[end_idx]
1407
+ section = self.get_section_between(
1408
+ start_element, end_element, boundary_inclusion
1409
+ )
1410
+ sections.append(section)
1411
+ # Else: Section started and ended by consecutive start elements? Create empty?
1412
+ # For now, just reset and start new section
1413
+
1414
+ # Start the new section
1415
+ current_start_boundary = boundary
1416
+
1503
1417
  # Handle the last section if we have a current start
1504
- if current_start is not None:
1505
- start_element = current_start['element']
1506
- # Use the last element in the region as the end
1507
- end_element = all_elements[-1] if all_elements else None
1508
- section = self.get_section_between(
1509
- start_element,
1510
- end_element,
1511
- boundary_inclusion
1512
- )
1418
+ if current_start_boundary is not None:
1419
+ start_element = current_start_boundary["element"]
1420
+ # End at the last element within the region
1421
+ end_element = all_elements_in_region[-1]
1422
+ section = self.get_section_between(start_element, end_element, boundary_inclusion)
1513
1423
  sections.append(section)
1514
-
1424
+
1515
1425
  return sections
1516
-
1426
+
1517
1427
  def create_cells(self):
1518
1428
  """
1519
1429
  Create cell regions for a detected table by intersecting its
1520
1430
  row and column regions, and add them to the page.
1521
-
1431
+
1522
1432
  Assumes child row and column regions are already present on the page.
1523
1433
 
1524
1434
  Returns:
1525
1435
  Self for method chaining.
1526
1436
  """
1527
1437
  # Ensure this is called on a table region
1528
- if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
1529
- raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
1530
-
1438
+ if self.region_type not in (
1439
+ "table",
1440
+ "tableofcontents",
1441
+ ): # Allow for ToC which might have structure
1442
+ raise ValueError(
1443
+ f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'"
1444
+ )
1445
+
1531
1446
  # Find rows and columns associated with this page
1532
1447
  # Remove the model-specific filter
1533
- rows = self.page.find_all('region[type=table-row]')
1534
- columns = self.page.find_all('region[type=table-column]')
1535
-
1448
+ rows = self.page.find_all("region[type=table-row]")
1449
+ columns = self.page.find_all("region[type=table-column]")
1450
+
1536
1451
  # Filter to only include those that overlap with this table region
1537
1452
  def is_in_table(element):
1538
1453
  # Use a simple overlap check (more robust than just center point)
1539
1454
  # Check if element's bbox overlaps with self.bbox
1540
- return (element.x0 < self.x1 and element.x1 > self.x0 and
1541
- element.top < self.bottom and element.bottom > self.top)
1542
-
1455
+ return (
1456
+ hasattr(element, "bbox")
1457
+ and element.x0 < self.x1 # Ensure element has bbox
1458
+ and element.x1 > self.x0
1459
+ and element.top < self.bottom
1460
+ and element.bottom > self.top
1461
+ )
1462
+
1543
1463
  table_rows = [r for r in rows if is_in_table(r)]
1544
1464
  table_columns = [c for c in columns if is_in_table(c)]
1545
-
1465
+
1546
1466
  if not table_rows or not table_columns:
1547
- self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
1548
- return self # Return self even if no cells created
1549
-
1467
+ # Use page's logger if available
1468
+ logger_instance = getattr(self._page, "logger", logger)
1469
+ logger_instance.warning(
1470
+ f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found."
1471
+ )
1472
+ return self # Return self even if no cells created
1473
+
1550
1474
  # Sort rows and columns
1551
1475
  table_rows.sort(key=lambda r: r.top)
1552
1476
  table_columns.sort(key=lambda c: c.x0)
1553
-
1477
+
1554
1478
  # Create cells and add them to the page's element manager
1555
1479
  created_count = 0
1556
1480
  for row in table_rows:
@@ -1564,41 +1488,49 @@ class Region(DirectionalMixin):
1564
1488
  # Only create a cell if the intersection is valid (positive width/height)
1565
1489
  if cell_x1 > cell_x0 and cell_y1 > cell_y0:
1566
1490
  # Create cell region at the intersection
1567
- cell = self.page.create_region(
1568
- cell_x0, cell_y0, cell_x1, cell_y1
1569
- )
1491
+ cell = self.page.create_region(cell_x0, cell_y0, cell_x1, cell_y1)
1570
1492
  # Set metadata
1571
- cell.source = 'derived'
1572
- cell.region_type = 'table-cell' # Explicitly set type
1573
- cell.normalized_type = 'table-cell' # And normalized type
1493
+ cell.source = "derived"
1494
+ cell.region_type = "table-cell" # Explicitly set type
1495
+ cell.normalized_type = "table-cell" # And normalized type
1574
1496
  # Inherit model from the parent table region
1575
- cell.model = self.model
1576
- cell.parent_region = self # Link cell to parent table region
1577
-
1497
+ cell.model = self.model
1498
+ cell.parent_region = self # Link cell to parent table region
1499
+
1578
1500
  # Add the cell region to the page's element manager
1579
1501
  self.page._element_mgr.add_region(cell)
1580
1502
  created_count += 1
1581
-
1503
+
1582
1504
  # Optional: Add created cells to the table region's children
1583
1505
  # self.child_regions.extend(cells_created_in_this_call) # Needs list management
1584
1506
 
1585
- self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
1507
+ logger_instance = getattr(self._page, "logger", logger)
1508
+ logger_instance.info(
1509
+ f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions."
1510
+ )
1511
+
1512
+ return self # Return self for chaining
1586
1513
 
1587
- return self # Return self for chaining
1588
-
1589
- def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1514
+ def ask(
1515
+ self,
1516
+ question: str,
1517
+ min_confidence: float = 0.1,
1518
+ model: str = None,
1519
+ debug: bool = False,
1520
+ **kwargs,
1521
+ ) -> Dict[str, Any]:
1590
1522
  """
1591
1523
  Ask a question about the region content using document QA.
1592
-
1524
+
1593
1525
  This method uses a document question answering model to extract answers from the region content.
1594
1526
  It leverages both textual content and layout information for better understanding.
1595
-
1527
+
1596
1528
  Args:
1597
1529
  question: The question to ask about the region content
1598
1530
  min_confidence: Minimum confidence threshold for answers (0.0-1.0)
1599
1531
  model: Optional model name to use for QA (if None, uses default model)
1600
1532
  **kwargs: Additional parameters to pass to the QA engine
1601
-
1533
+
1602
1534
  Returns:
1603
1535
  Dictionary with answer details: {
1604
1536
  "answer": extracted text,
@@ -1609,112 +1541,151 @@ class Region(DirectionalMixin):
1609
1541
  "source_elements": list of elements that contain the answer (if found)
1610
1542
  }
1611
1543
  """
1612
- from natural_pdf.qa.document_qa import get_qa_engine
1613
-
1544
+ try:
1545
+ from natural_pdf.qa.document_qa import get_qa_engine
1546
+ except ImportError:
1547
+ logger.error(
1548
+ "Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
1549
+ )
1550
+ return {
1551
+ "answer": None,
1552
+ "confidence": 0.0,
1553
+ "found": False,
1554
+ "page_num": self.page.number,
1555
+ "source_elements": [],
1556
+ "region": self,
1557
+ }
1558
+
1614
1559
  # Get or initialize QA engine with specified model
1615
- qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1616
-
1560
+ try:
1561
+ qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1562
+ except Exception as e:
1563
+ logger.error(f"Failed to initialize QA engine (model: {model}): {e}", exc_info=True)
1564
+ return {
1565
+ "answer": None,
1566
+ "confidence": 0.0,
1567
+ "found": False,
1568
+ "page_num": self.page.number,
1569
+ "source_elements": [],
1570
+ "region": self,
1571
+ }
1572
+
1617
1573
  # Ask the question using the QA engine
1618
- return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
1574
+ try:
1575
+ return qa_engine.ask_pdf_region(
1576
+ self, question, min_confidence=min_confidence, debug=debug, **kwargs
1577
+ )
1578
+ except Exception as e:
1579
+ logger.error(f"Error during qa_engine.ask_pdf_region: {e}", exc_info=True)
1580
+ return {
1581
+ "answer": None,
1582
+ "confidence": 0.0,
1583
+ "found": False,
1584
+ "page_num": self.page.number,
1585
+ "source_elements": [],
1586
+ "region": self,
1587
+ }
1619
1588
 
1620
1589
  def add_child(self, child):
1621
1590
  """
1622
1591
  Add a child region to this region.
1623
-
1592
+
1624
1593
  Used for hierarchical document structure when using models like Docling
1625
1594
  that understand document hierarchy.
1626
-
1595
+
1627
1596
  Args:
1628
1597
  child: Region object to add as a child
1629
-
1598
+
1630
1599
  Returns:
1631
1600
  Self for method chaining
1632
1601
  """
1633
1602
  self.child_regions.append(child)
1634
1603
  child.parent_region = self
1635
1604
  return self
1636
-
1605
+
1637
1606
  def get_children(self, selector=None):
1638
1607
  """
1639
1608
  Get immediate child regions, optionally filtered by selector.
1640
-
1609
+
1641
1610
  Args:
1642
1611
  selector: Optional selector to filter children
1643
-
1612
+
1644
1613
  Returns:
1645
1614
  List of child regions matching the selector
1646
1615
  """
1647
1616
  import logging
1617
+
1648
1618
  logger = logging.getLogger("natural_pdf.elements.region")
1649
-
1619
+
1650
1620
  if selector is None:
1651
1621
  return self.child_regions
1652
-
1622
+
1653
1623
  # Use existing selector parser to filter
1654
- from natural_pdf.selectors.parser import match_elements_with_selector
1655
- matched = match_elements_with_selector(self.child_regions, selector)
1656
- logger.debug(f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'")
1657
- return matched
1658
-
1624
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1625
+
1626
+ try:
1627
+ selector_obj = parse_selector(selector)
1628
+ filter_func = selector_to_filter_func(selector_obj) # Removed region=self
1629
+ matched = [child for child in self.child_regions if filter_func(child)]
1630
+ logger.debug(
1631
+ f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'"
1632
+ )
1633
+ return matched
1634
+ except Exception as e:
1635
+ logger.error(f"Error applying selector in get_children: {e}", exc_info=True)
1636
+ return [] # Return empty list on error
1637
+
1659
1638
  def get_descendants(self, selector=None):
1660
1639
  """
1661
1640
  Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
1662
-
1641
+
1663
1642
  Args:
1664
1643
  selector: Optional selector to filter descendants
1665
-
1644
+
1666
1645
  Returns:
1667
1646
  List of descendant regions matching the selector
1668
1647
  """
1669
1648
  import logging
1649
+
1670
1650
  logger = logging.getLogger("natural_pdf.elements.region")
1671
-
1651
+
1672
1652
  all_descendants = []
1673
-
1674
- # First add direct children
1675
- all_descendants.extend(self.child_regions)
1676
-
1677
- # Then recursively add their descendants
1678
- for child in self.child_regions:
1679
- all_descendants.extend(child.get_descendants())
1680
-
1653
+ queue = list(self.child_regions) # Start with direct children
1654
+
1655
+ while queue:
1656
+ current = queue.pop(0)
1657
+ all_descendants.append(current)
1658
+ # Add current's children to the queue for processing
1659
+ if hasattr(current, "child_regions"):
1660
+ queue.extend(current.child_regions)
1661
+
1681
1662
  logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
1682
-
1663
+
1683
1664
  # Filter by selector if provided
1684
1665
  if selector is not None:
1685
- from natural_pdf.selectors.parser import match_elements_with_selector
1686
- matched = match_elements_with_selector(all_descendants, selector)
1687
- logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
1688
- return matched
1689
-
1666
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1667
+
1668
+ try:
1669
+ selector_obj = parse_selector(selector)
1670
+ filter_func = selector_to_filter_func(selector_obj) # Removed region=self
1671
+ matched = [desc for desc in all_descendants if filter_func(desc)]
1672
+ logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
1673
+ return matched
1674
+ except Exception as e:
1675
+ logger.error(f"Error applying selector in get_descendants: {e}", exc_info=True)
1676
+ return [] # Return empty list on error
1677
+
1690
1678
  return all_descendants
1691
-
1692
- def find_all(self, selector, recursive=True, **kwargs):
1693
- """
1694
- Find all matching elements within this region, with optional recursion through child regions.
1695
-
1696
- Args:
1697
- selector: The selector to find elements with
1698
- recursive: Whether to search recursively through child regions
1699
- **kwargs: Additional parameters to pass to the selector parser
1700
-
1701
- Returns:
1702
- Collection of matching elements
1703
- """
1704
- # Get direct matches
1705
- direct_matches = self._find_all(selector, region=self, **kwargs)
1706
-
1707
- if not recursive or not self.child_regions:
1708
- return direct_matches
1709
-
1710
- # Get recursive matches from children
1711
- from natural_pdf.elements.collections import ElementCollection
1712
- all_matches = list(direct_matches)
1713
-
1714
- for child in self.child_regions:
1715
- child_matches = child.find_all(selector, recursive=True, **kwargs)
1716
- for match in child_matches:
1717
- if match not in all_matches:
1718
- all_matches.append(match)
1719
-
1720
- return ElementCollection(all_matches)
1679
+
1680
+ # Removed recursive=True, find_all on region shouldn't be recursive by default
1681
+ # Renamed _find_all back to find_all
1682
+ # def find_all(self, selector, apply_exclusions=True, **kwargs):
1683
+ # See implementation above near get_elements
1684
+
1685
+ def __repr__(self) -> str:
1686
+ """String representation of the region."""
1687
+ poly_info = " (Polygon)" if self.has_polygon else ""
1688
+ name_info = f" name='{self.name}'" if self.name else ""
1689
+ type_info = f" type='{self.region_type}'" if self.region_type else ""
1690
+ source_info = f" source='{self.source}'" if self.source else ""
1691
+ return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"