natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1793 @@
1
+ from typing import Optional, Union, List, Dict, Tuple, Any, Callable, TYPE_CHECKING
2
+ from natural_pdf.elements.base import DirectionalMixin
3
+
4
+ if TYPE_CHECKING:
5
+ from natural_pdf.core.page import Page
6
+ from natural_pdf.elements.text import TextElement
7
+
8
+ # Import OCRManager conditionally to avoid circular imports
9
+ try:
10
+ from natural_pdf.ocr import OCRManager
11
+ except ImportError:
12
+ # OCRManager will be imported directly in methods that use it
13
+ pass
14
+
15
+
16
+ class Region(DirectionalMixin):
17
+ """
18
+ Represents a rectangular region on a page.
19
+ """
20
+
21
+ def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
22
+ """
23
+ Initialize a region.
24
+
25
+ Args:
26
+ page: Parent page
27
+ bbox: Bounding box as (x0, top, x1, bottom)
28
+ polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
29
+ parent: Optional parent region (for hierarchical document structure)
30
+ """
31
+ self._page = page
32
+ self._bbox = bbox
33
+ self._polygon = polygon
34
+ self._multi_page_elements = None
35
+ self._spans_pages = False
36
+ self._page_range = None
37
+ self.start_element = None
38
+ self.end_element = None
39
+
40
+ # Standard attributes for all elements
41
+ self.object_type = 'region' # For selector compatibility
42
+
43
+ # Layout detection attributes
44
+ self.region_type = None
45
+ self.normalized_type = None
46
+ self.confidence = None
47
+ self.model = None
48
+
49
+ # Region management attributes
50
+ self.name = None
51
+ self.source = None # Will be set by creation methods
52
+
53
+ # Hierarchy support for nested document structure
54
+ self.parent_region = parent
55
+ self.child_regions = []
56
+ self.text_content = None # Direct text content (e.g., from Docling)
57
+ self.associated_text_elements = [] # Native text elements that overlap with this region
58
+
59
+ def _direction(self, direction: str, size: Optional[float] = None,
60
+ cross_size: str = "full", include_element: bool = False,
61
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
62
+ """
63
+ Protected helper method to create a region in a specified direction relative to this region.
64
+
65
+ Args:
66
+ direction: 'left', 'right', 'above', or 'below'
67
+ size: Size in the primary direction (width for horizontal, height for vertical)
68
+ cross_size: Size in the cross direction ('full' or 'element')
69
+ include_element: Whether to include this region's area in the result
70
+ until: Optional selector string to specify a boundary element
71
+ include_endpoint: Whether to include the boundary element found by 'until'
72
+ **kwargs: Additional parameters for the 'until' selector search
73
+
74
+ Returns:
75
+ Region object
76
+ """
77
+ import math # Use math.inf for infinity
78
+
79
+ is_horizontal = direction in ('left', 'right')
80
+ is_positive = direction in ('right', 'below') # right/below are positive directions
81
+ pixel_offset = 1 # Offset for excluding elements/endpoints
82
+
83
+ # 1. Determine initial boundaries based on direction and include_element
84
+ if is_horizontal:
85
+ # Initial cross-boundaries (vertical)
86
+ y0 = 0 if cross_size == "full" else self.top
87
+ y1 = self.page.height if cross_size == "full" else self.bottom
88
+
89
+ # Initial primary boundaries (horizontal)
90
+ if is_positive: # right
91
+ x0_initial = self.x0 if include_element else self.x1 + pixel_offset
92
+ x1_initial = self.x1 # This edge moves
93
+ else: # left
94
+ x0_initial = self.x0 # This edge moves
95
+ x1_initial = self.x1 if include_element else self.x0 - pixel_offset
96
+ else: # Vertical
97
+ # Initial cross-boundaries (horizontal)
98
+ x0 = 0 if cross_size == "full" else self.x0
99
+ x1 = self.page.width if cross_size == "full" else self.x1
100
+
101
+ # Initial primary boundaries (vertical)
102
+ if is_positive: # below
103
+ y0_initial = self.top if include_element else self.bottom + pixel_offset
104
+ y1_initial = self.bottom # This edge moves
105
+ else: # above
106
+ y0_initial = self.top # This edge moves
107
+ y1_initial = self.bottom if include_element else self.top - pixel_offset
108
+
109
+ # 2. Calculate the final primary boundary, considering 'size' or page limits
110
+ if is_horizontal:
111
+ if is_positive: # right
112
+ x1_final = min(self.page.width, x1_initial + (size if size is not None else (self.page.width - x1_initial)))
113
+ x0_final = x0_initial
114
+ else: # left
115
+ x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
116
+ x1_final = x1_initial
117
+ else: # Vertical
118
+ if is_positive: # below
119
+ y1_final = min(self.page.height, y1_initial + (size if size is not None else (self.page.height - y1_initial)))
120
+ y0_final = y0_initial
121
+ else: # above
122
+ y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
123
+ y1_final = y1_initial
124
+
125
+ # 3. Handle 'until' selector if provided
126
+ target = None
127
+ if until:
128
+ all_matches = self.page.find_all(until, **kwargs)
129
+ matches_in_direction = []
130
+
131
+ # Filter and sort matches based on direction
132
+ if direction == 'above':
133
+ matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
134
+ matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
135
+ elif direction == 'below':
136
+ matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
137
+ matches_in_direction.sort(key=lambda e: e.top)
138
+ elif direction == 'left':
139
+ matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
140
+ matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
141
+ elif direction == 'right':
142
+ matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
143
+ matches_in_direction.sort(key=lambda e: e.x0)
144
+
145
+ if matches_in_direction:
146
+ target = matches_in_direction[0]
147
+
148
+ # Adjust the primary boundary based on the target
149
+ if is_horizontal:
150
+ if is_positive: # right
151
+ x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
152
+ else: # left
153
+ x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
154
+ else: # Vertical
155
+ if is_positive: # below
156
+ y1_final = target.bottom if include_endpoint else target.top - pixel_offset
157
+ else: # above
158
+ y0_final = target.top if include_endpoint else target.bottom + pixel_offset
159
+
160
+ # Adjust cross boundaries if cross_size is 'element'
161
+ if cross_size == "element":
162
+ if is_horizontal: # Adjust y0, y1
163
+ target_y0 = target.top if include_endpoint else target.bottom # Use opposite boundary if excluding
164
+ target_y1 = target.bottom if include_endpoint else target.top
165
+ y0 = min(y0, target_y0)
166
+ y1 = max(y1, target_y1)
167
+ else: # Adjust x0, x1
168
+ target_x0 = target.x0 if include_endpoint else target.x1 # Use opposite boundary if excluding
169
+ target_x1 = target.x1 if include_endpoint else target.x0
170
+ x0 = min(x0, target_x0)
171
+ x1 = max(x1, target_x1)
172
+
173
+ # 4. Finalize bbox coordinates
174
+ if is_horizontal:
175
+ bbox = (x0_final, y0, x1_final, y1)
176
+ else:
177
+ bbox = (x0, y0_final, x1, y1_final)
178
+
179
+ # Ensure valid coordinates (x0 <= x1, y0 <= y1)
180
+ final_x0 = min(bbox[0], bbox[2])
181
+ final_y0 = min(bbox[1], bbox[3])
182
+ final_x1 = max(bbox[0], bbox[2])
183
+ final_y1 = max(bbox[1], bbox[3])
184
+ final_bbox = (final_x0, final_y0, final_x1, final_y1)
185
+
186
+ # 5. Create and return Region
187
+ region = Region(self.page, final_bbox)
188
+ region.source_element = self
189
+ region.includes_source = include_element
190
+ # Optionally store the boundary element if found
191
+ if target:
192
+ region.boundary_element = target
193
+
194
+ return region
195
+
196
def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
          until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
    """
    Return the region located above this one.

    Args:
        height: Height of the resulting region in points; None reaches the top of the page
        width: "full" for the whole page width, "element" for this region's width
        include_element: Whether this region itself is part of the result (default: False)
        until: Optional selector naming the element that bounds the result from above
        include_endpoint: Whether that boundary element is included (default: True)
        **kwargs: Extra parameters forwarded to the 'until' search

    Returns:
        Region covering the area above
    """
    # Arguments follow _direction's order: direction, size, cross_size, ...
    return self._direction('above', height, width, include_element,
                           until, include_endpoint, **kwargs)
221
+
222
def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
          until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
    """
    Return the region located below this one.

    Args:
        height: Height of the resulting region in points; None reaches the bottom of the page
        width: "full" for the whole page width, "element" for this region's width
        include_element: Whether this region itself is part of the result (default: False)
        until: Optional selector naming the element that bounds the result from below
        include_endpoint: Whether that boundary element is included (default: True)
        **kwargs: Extra parameters forwarded to the 'until' search

    Returns:
        Region covering the area below
    """
    # Arguments follow _direction's order: direction, size, cross_size, ...
    return self._direction('below', height, width, include_element,
                           until, include_endpoint, **kwargs)
247
+
248
def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
         until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
    """
    Return the region located to the left of this one.

    Args:
        width: Width of the resulting region in points; None reaches the left page edge
        height: "full" for the whole page height, "element" for this region's height
        include_element: Whether this region itself is part of the result (default: False)
        until: Optional selector naming the element that bounds the result on the left
        include_endpoint: Whether that boundary element is included (default: True)
        **kwargs: Extra parameters forwarded to the 'until' search

    Returns:
        Region covering the area to the left
    """
    # Arguments follow _direction's order: direction, size, cross_size, ...
    return self._direction('left', width, height, include_element,
                           until, include_endpoint, **kwargs)
273
+
274
def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
          until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
    """
    Return the region located to the right of this one.

    Args:
        width: Width of the resulting region in points; None reaches the right page edge
        height: "full" for the whole page height, "element" for this region's height
        include_element: Whether this region itself is part of the result (default: False)
        until: Optional selector naming the element that bounds the result on the right
        include_endpoint: Whether that boundary element is included (default: True)
        **kwargs: Extra parameters forwarded to the 'until' search

    Returns:
        Region covering the area to the right
    """
    # Arguments follow _direction's order: direction, size, cross_size, ...
    return self._direction('right', width, height, include_element,
                           until, include_endpoint, **kwargs)
299
+
300
@property
def type(self) -> str:
    """Element type: the detected ``region_type`` when set, otherwise 'region'."""
    detected = self.region_type
    if detected:
        return detected
    # An unset or empty region_type falls back to the generic name
    return 'region'
306
+
307
@property
def page(self) -> 'Page':
    """Page this region was created on."""
    owner = self._page
    return owner
311
+
312
@property
def bbox(self) -> Tuple[float, float, float, float]:
    """Bounding box in (x0, top, x1, bottom) order, exactly as stored."""
    box = self._bbox
    return box
316
+
317
@property
def x0(self) -> float:
    """Left edge: first entry of the bounding box."""
    left_edge = self._bbox[0]
    return left_edge
321
+
322
@property
def top(self) -> float:
    """Top edge: second entry of the bounding box."""
    top_edge = self._bbox[1]
    return top_edge
326
+
327
@property
def x1(self) -> float:
    """Right edge: third entry of the bounding box."""
    right_edge = self._bbox[2]
    return right_edge
331
+
332
@property
def bottom(self) -> float:
    """Bottom edge: fourth entry of the bounding box."""
    bottom_edge = self._bbox[3]
    return bottom_edge
336
+
337
@property
def width(self) -> float:
    """Horizontal extent of the region (x1 minus x0)."""
    right_edge, left_edge = self.x1, self.x0
    return right_edge - left_edge
341
+
342
@property
def height(self) -> float:
    """Vertical extent of the region (bottom minus top)."""
    lower_edge, upper_edge = self.bottom, self.top
    return lower_edge - upper_edge
346
+
347
@property
def has_polygon(self) -> bool:
    """True when a polygon with at least three vertices is stored."""
    vertices = self._polygon
    if vertices is None:
        return False
    return len(vertices) >= 3
351
+
352
@property
def polygon(self) -> List[Tuple[float, float]]:
    """Stored polygon vertices, or the four bbox corners when none are stored."""
    outline = self._polygon
    if not outline:
        # No polygon: synthesize the rectangle, clockwise from the top-left corner
        left_edge, upper_edge = self.x0, self.top
        right_edge, lower_edge = self.x1, self.bottom
        return [
            (left_edge, upper_edge),
            (right_edge, upper_edge),
            (right_edge, lower_edge),
            (left_edge, lower_edge),
        ]
    return outline
365
+
366
+ def _is_point_in_polygon(self, x: float, y: float) -> bool:
367
+ """
368
+ Check if a point is inside the polygon using ray casting algorithm.
369
+
370
+ Args:
371
+ x: X coordinate of the point
372
+ y: Y coordinate of the point
373
+
374
+ Returns:
375
+ bool: True if the point is inside the polygon
376
+ """
377
+ if not self.has_polygon:
378
+ return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)
379
+
380
+ # Ray casting algorithm
381
+ inside = False
382
+ j = len(self.polygon) - 1
383
+
384
+ for i in range(len(self.polygon)):
385
+ if ((self.polygon[i][1] > y) != (self.polygon[j][1] > y)) and \
386
+ (x < (self.polygon[j][0] - self.polygon[i][0]) * (y - self.polygon[i][1]) / \
387
+ (self.polygon[j][1] - self.polygon[i][1]) + self.polygon[i][0]):
388
+ inside = not inside
389
+ j = i
390
+
391
+ return inside
392
+
393
def is_point_inside(self, x: float, y: float) -> bool:
    """
    Test whether a point lies inside this region.

    Regions with a polygon use ray casting against its edges; all other
    regions use an inclusive bounding-box test.

    Args:
        x: X coordinate of the point
        y: Y coordinate of the point

    Returns:
        bool: True if the point is inside the region
    """
    if not self.has_polygon:
        return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)

    vertices = self.polygon
    inside = False
    prev = vertices[len(vertices) - 1]

    for cur in vertices:
        # Only edges that straddle the horizontal line through y can be crossed;
        # the straddle test also keeps the division below away from zero.
        straddles = (cur[1] > y) != (prev[1] > y)
        if straddles and x < (prev[0] - cur[0]) * (y - cur[1]) / (prev[1] - cur[1]) + cur[0]:
            inside = not inside
        prev = cur

    return inside
419
+
420
+ def _is_element_in_region(self, element: 'Element', use_boundary_tolerance=True) -> bool:
421
+ """
422
+ Check if an element is within this region.
423
+
424
+ Args:
425
+ element: Element to check
426
+ use_boundary_tolerance: Whether to apply a small tolerance for boundary elements
427
+
428
+ Returns:
429
+ True if the element is in the region, False otherwise
430
+ """
431
+ # If we have multi-page elements cached, check if the element is in the list
432
+ if self._spans_pages and self._multi_page_elements is not None:
433
+ return element in self._multi_page_elements
434
+
435
+ # Check if element is on the same page
436
+ if element.page != self._page:
437
+ return False
438
+
439
+ # Calculate element center
440
+ element_center_x = (element.x0 + element.x1) / 2
441
+ element_center_y = (element.top + element.bottom) / 2
442
+
443
+ # If this is a boundary region with exclusions, apply strict boundary checking
444
+ # This helps enforce boundary_inclusion behavior in get_sections
445
+ if hasattr(self, 'start_element') or hasattr(self, 'end_element'):
446
+ # Apply a small tolerance to avoid border cases
447
+ # When an element is right at the border, we want to be more strict
448
+ tolerance = 2.0 if use_boundary_tolerance else 0.0
449
+
450
+ # Check if element center is strictly within the region (not just on border)
451
+ if (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
452
+ self.top + tolerance <= element_center_y <= self.bottom - tolerance):
453
+ return True
454
+
455
+ # For elements right at the boundary, be more conservative
456
+ return False
457
+
458
+ # If the element itself has a polygon, check if ANY corner is in this region
459
+ if hasattr(element, 'has_polygon') and element.has_polygon:
460
+ for point in element.polygon:
461
+ if self.is_point_inside(point[0], point[1]):
462
+ return True
463
+ # If no point is inside, check if the center is inside
464
+ return self.is_point_inside(element_center_x, element_center_y)
465
+
466
+ # For regular elements, check if center is in the region
467
+ # Add a small tolerance (1 pixel) to avoid including elements that are exactly on the boundary
468
+ # This ensures consistent behavior with the below() and above() method fixes
469
+ tolerance = 1.0 if use_boundary_tolerance else 0.0
470
+
471
+ # Check if within region with the tolerance applied
472
+ if self.has_polygon:
473
+ return self.is_point_inside(element_center_x, element_center_y)
474
+ else:
475
+ # For rectangular regions, apply tolerance to all sides
476
+ return (self.x0 + tolerance <= element_center_x <= self.x1 - tolerance and
477
+ self.top + tolerance <= element_center_y <= self.bottom - tolerance)
478
+
479
def highlight(self,
              label: Optional[str] = None,
              color: Optional[Union[Tuple, str]] = None,
              use_color_cycling: bool = False,
              include_attrs: Optional[List[str]] = None,
              existing: str = 'append') -> 'Region':
    """
    Register a highlight for this region with the page's highlighter.

    Args:
        label: Optional label for the highlight
        color: Color tuple/string, or None to let the highlighter choose
        use_color_cycling: Force color cycling even with no label (default: False)
        include_attrs: Attribute names to display on the highlight (e.g., ['confidence', 'type'])
        existing: How to handle existing highlights ('append' or 'replace')

    Returns:
        Self for method chaining
    """
    service = self.page._highlighter

    # Arguments shared by the rectangle and polygon variants
    shared = dict(
        page_index=self.page.index,
        color=color,
        label=label,
        use_color_cycling=use_color_cycling,
        element=self,  # lets the highlighter read attributes off the region
        include_attrs=include_attrs,
        existing=existing,
    )

    # Polygon regions are drawn by outline, all others by bounding box
    if not self.has_polygon:
        service.add(bbox=self.bbox, **shared)
        return self

    service.add_polygon(polygon=self.polygon, **shared)
    return self
521
+
522
def to_image(self,
             scale: float = 2.0,
             resolution: float = 150,
             crop_only: bool = False,
             include_highlights: bool = True,
             **kwargs) -> 'Image.Image':
    """
    Render the page and cut out the part covered by this region.

    Args:
        scale: Factor that converts region coordinates to image pixels for the
            crop; also forwarded to page.to_image() (default: 2.0)
        resolution: Resolution in DPI, forwarded to page.to_image() (default: 150)
        crop_only: If True, return the bare crop without drawing a border
        include_highlights: Whether to include existing highlights (default: True)
        **kwargs: Additional parameters for page.to_image()

    Returns:
        PIL Image of just this region
    """
    # Full-page render first; highlights are drawn by the page when requested
    rendered = self._page.to_image(scale=scale, resolution=resolution,
                                   include_highlights=include_highlights, **kwargs)

    # NOTE(review): the crop box is derived from `scale` only, while `resolution`
    # is also passed to the page renderer - confirm that the rendered image is
    # really sized by `scale`.
    crop_box = tuple(int(edge * scale)
                     for edge in (self.x0, self.top, self.x1, self.bottom))
    cropped = rendered.crop(crop_box)

    if crop_only:
        return cropped

    # Mark the region boundary with a 1px red frame
    from PIL import ImageDraw

    ImageDraw.Draw(cropped).rectangle(
        (0, 0, cropped.width - 1, cropped.height - 1),
        outline=(255, 0, 0), width=1)

    return cropped
566
+
567
def show(self,
         scale: float = 2.0,
         labels: bool = True,
         legend_position: str = 'right',
         # Default color used when the region is shown on its own
         color: Optional[Union[Tuple, str]] = "blue",
         label: Optional[str] = None) -> 'Image.Image':
    """
    Render the page with only this region highlighted, as a temporary preview.

    Args:
        scale: Scale factor for rendering
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        color: Color to highlight this region (default: blue)
        label: Optional label for this region in the legend

    Returns:
        PIL Image of the page with only this region highlighted

    Raises:
        ValueError: If the region has no page
    """
    if not self._page:
        raise ValueError("Region must be associated with a page to show.")

    highlighter = self._page._highlighter

    # An explicit label wins; otherwise derive one from the region type
    if label is not None:
        legend_text = label
    elif self.type:
        legend_text = f"Region ({self.type})"
    else:
        legend_text = "Region"

    outline = self.polygon if self.has_polygon else None
    preview_entry = {
        "page_index": self._page.index,
        "bbox": self.bbox,
        "polygon": outline,
        "color": color,
        "label": legend_text,
        "use_color_cycling": False,  # a single preview never cycles colors
    }

    # render_preview receives only this temporary highlight
    return highlighter.render_preview(
        page_index=self._page.index,
        temporary_highlights=[preview_entry],
        scale=scale,
        labels=labels,
        legend_position=legend_position,
    )
614
+
615
def save(self,
         filename: str,
         scale: float = 2.0,
         labels: bool = True,
         legend_position: str = 'right') -> 'Region':
    """
    Highlight this region, then save the page image to a file.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        labels: Whether to include a legend for labels
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    # highlight() is called unconditionally with its defaults
    self.highlight()

    render_options = {
        "scale": scale,
        "labels": labels,
        "legend_position": legend_position,
    }
    self._page.save_image(filename, **render_options)
    return self
638
+
639
def save_image(self,
               filename: str,
               resolution: float = 150,
               crop_only: bool = False,
               include_highlights: bool = True,
               **kwargs) -> 'Region':
    """
    Render just this region and write the image to a file.

    Args:
        filename: Path to save the image to
        resolution: Resolution in DPI for rendering (default: 150)
        crop_only: If True, only crop the region without highlighting its boundaries
        include_highlights: Whether to include existing highlights (default: True)
        **kwargs: Additional parameters passed on to to_image()

    Returns:
        Self for method chaining
    """
    render_options = dict(kwargs)
    render_options.update(
        resolution=resolution,
        crop_only=crop_only,
        include_highlights=include_highlights,
    )

    # to_image() does the rendering and cropping; the result is saved as-is
    self.to_image(**render_options).save(filename)
    return self
669
+
670
def get_elements(self, selector: Optional[str] = None, apply_exclusions=True, **kwargs) -> List['Element']:
    """
    Collect the elements that fall inside this region.

    Args:
        selector: Optional selector to filter elements
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for the selector search

    Returns:
        List of elements in the region
    """
    # Multi-page regions hand back their cached element list unchanged
    if self._spans_pages and self._multi_page_elements is not None:
        return self._multi_page_elements

    page = self.page
    if not selector:
        candidates = page.get_elements(apply_exclusions=apply_exclusions)
    else:
        candidates = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)

    # Keep only the candidates this region contains
    return list(filter(self._is_element_in_region, candidates))
694
+
695
+ def extract_text(self, keep_blank_chars=True, apply_exclusions=True, ocr=None, preserve_whitespace=None, debug=False, **kwargs) -> str:
696
+ """
697
+ Extract text from this region using pdfplumber's native functionality.
698
+
699
+ For regions created by Docling, this will first try to use:
700
+ 1. Associated text elements from the PDF (if available)
701
+ 2. Direct text content from Docling (if available)
702
+ 3. Fall back to standard pdfplumber extraction
703
+
704
+ Args:
705
+ keep_blank_chars: Whether to keep blank characters (legacy parameter)
706
+ apply_exclusions: Whether to apply exclusion regions
707
+ ocr: OCR configuration. If None, uses PDF settings
708
+ preserve_whitespace: Synonym for keep_blank_chars (for compatibility with page.extract_text)
709
+ debug: Enable verbose debugging for exclusion handling
710
+ **kwargs: Additional parameters for text extraction
711
+
712
+ Returns:
713
+ Extracted text as string
714
+ """
715
+ import logging
716
+ logger = logging.getLogger("natural_pdf.elements.region")
717
+
718
+ # Check for Docling model or if we have direct text content
719
+ if self.model == 'docling' or hasattr(self, 'text_content'):
720
+ # First priority: check if we have associated native text elements
721
+ if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
722
+ source_count = len(self.associated_text_elements)
723
+ logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
724
+ # Sort elements in reading order
725
+ sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
726
+ # Extract and join their text
727
+ text_result = " ".join(elem.text for elem in sorted_elements)
728
+ return text_result
729
+
730
+ # Second priority: use direct text content from Docling
731
+ elif self.text_content:
732
+ logger.info(f"Region {self.region_type}: Using Docling OCR text content")
733
+ return self.text_content
734
+
735
+ logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
736
+
737
+ # Handle preserve_whitespace parameter for consistency with Page.extract_text
738
+ if preserve_whitespace is not None:
739
+ keep_blank_chars = preserve_whitespace
740
+
741
+ # If we span multiple pages, use the original implementation
742
+ if self._spans_pages and self._multi_page_elements is not None:
743
+ # Sort elements in reading order - only include text-like elements
744
+ text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
745
+
746
+ # Sort in reading order (by page, then top-to-bottom, left-to-right)
747
+ sorted_elements = sorted(text_elements, key=lambda e: (e.page.index, e.top, e.x0))
748
+
749
+ # Extract text directly from elements to avoid recursion
750
+ texts = []
751
+ for element in sorted_elements:
752
+ if hasattr(element, 'text'):
753
+ texts.append(element.text)
754
+
755
+ text_result = " ".join(texts)
756
+ return text_result
757
+
758
+ # Check if we have exclusions to apply
759
+ exclusion_regions = []
760
+ if apply_exclusions and self._page._exclusions:
761
+ exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
762
+
763
+ if debug:
764
+ import logging
765
+ logger = logging.getLogger("natural_pdf.elements.region")
766
+ logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
767
+
768
+ # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
769
+ # If not, ignore exclusions entirely
770
+ if exclusion_regions:
771
+ has_intersection = False
772
+ for i, exclusion in enumerate(exclusion_regions):
773
+ # Use a simple bbox overlap check
774
+ overlap = (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
775
+ self.top < exclusion.bottom and self.bottom > exclusion.top)
776
+
777
+ if overlap:
778
+ has_intersection = True
779
+ if debug:
780
+ import logging
781
+ logger = logging.getLogger("natural_pdf.elements.region")
782
+ logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
783
+ break
784
+
785
+ # If no intersection, process without exclusions
786
+ if not has_intersection:
787
+ if debug:
788
+ import logging
789
+ logger = logging.getLogger("natural_pdf.elements.region")
790
+ logger.debug(f" No intersection with any exclusion, ignoring exclusions")
791
+ apply_exclusions = False
792
+ exclusion_regions = []
793
+
794
+ # IMPROVEMENT 2: If rectangular region + full-width exclusions (headers/footers),
795
+ # we can use the simpler cropping approach
796
+ # Only use crop for simple cases
797
+ can_use_crop = not self.has_polygon
798
+ result = "" # Default empty result
799
+ if can_use_crop and apply_exclusions and exclusion_regions:
800
+ # We'll keep track of exclusions that are full-width horizontal bands (headers/footers)
801
+ # and those that are not
802
+ footer_header_exclusions = []
803
+ other_exclusions = []
804
+
805
+ for i, exclusion in enumerate(exclusion_regions):
806
+ # Check if exclusion spans the full width of the page
807
+ # and is either at the top or bottom
808
+ full_width = (abs(exclusion.x0) < 5 and
809
+ abs(exclusion.x1 - self.page.width) < 5)
810
+
811
+ if debug:
812
+ import logging
813
+ logger = logging.getLogger("natural_pdf.elements.region")
814
+ logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
815
+
816
+ if full_width:
817
+ footer_header_exclusions.append(exclusion)
818
+ else:
819
+ other_exclusions.append(exclusion)
820
+
821
+ # If we have only header/footer exclusions, we can use the cropping approach
822
+ all_are_bands = len(other_exclusions) == 0 and len(footer_header_exclusions) > 0
823
+
824
+ if all_are_bands:
825
+ # Find the actual content area after excluding header/footer
826
+ top_bound = self.top
827
+ bottom_bound = self.bottom
828
+
829
+ if debug:
830
+ import logging
831
+ logger = logging.getLogger("natural_pdf.elements.region")
832
+ logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
833
+
834
+ # Process only header/footer exclusions for cropping
835
+ for exclusion in footer_header_exclusions:
836
+ # If exclusion is at the top of our region
837
+ if exclusion.bottom > self.top and exclusion.top <= self.top:
838
+ # Move top bound to exclude the header
839
+ top_bound = max(top_bound, exclusion.bottom)
840
+ if debug:
841
+ import logging
842
+ logger = logging.getLogger("natural_pdf.elements.region")
843
+ logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
844
+
845
+ # If exclusion is at the bottom of our region
846
+ if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
847
+ # Move bottom bound to exclude the footer
848
+ bottom_bound = min(bottom_bound, exclusion.top)
849
+ if debug:
850
+ import logging
851
+ logger = logging.getLogger("natural_pdf.elements.region")
852
+ logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
853
+
854
+
855
+ if debug:
856
+ import logging
857
+ logger = logging.getLogger("natural_pdf.elements.region")
858
+ logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
859
+
860
+ # If we still have a valid region after exclusions
861
+ if top_bound < bottom_bound:
862
+ # Use direct crop with adjusted bounds
863
+ crop_bbox = (self.x0, top_bound, self.x1, bottom_bound)
864
+ cropped = self.page._page.crop(crop_bbox)
865
+ result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
866
+
867
+ if debug:
868
+ import logging
869
+ logger = logging.getLogger("natural_pdf.elements.region")
870
+ logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
871
+
872
+ # Skip the complex filtering approach
873
+ return result
874
+ else:
875
+ # This would only happen if the region is entirely inside an exclusion zone
876
+ # or if both top and bottom of the region are excluded leaving no valid area
877
+ import logging
878
+ logger = logging.getLogger("natural_pdf.elements.region")
879
+ logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
880
+ return ""
881
+ # We have exclusions, but not all are headers/footers,
882
+ # or we have a non-rectangular region
883
+ else:
884
+ if debug:
885
+ import logging
886
+ logger = logging.getLogger("natural_pdf.elements.region")
887
+ logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
888
+
889
+ # Don't use crop for mixed exclusion types
890
+ can_use_crop = False
891
+
892
+ # If we got a result from header/footer cropping, return it
893
+ if result:
894
+ return result
895
+
896
+ # For single-page regions without exclusions, or when exclusions don't apply, use direct cropping
897
+ if can_use_crop and not apply_exclusions:
898
+ # Simple case: use direct crop
899
+ crop_bbox = self.bbox
900
+ cropped = self.page._page.crop(crop_bbox)
901
+ result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
902
+ return result
903
+
904
+ # For all other cases (complex exclusions, polygons), we use element filtering
905
+ import warnings
906
+ import logging
907
+ logger = logging.getLogger("natural_pdf.elements.region")
908
+
909
+ if debug:
910
+ logger.debug(f"Using element filtering approach for region {self.bbox}")
911
+
912
+ # Get all elements in this region first
913
+ all_elements = self.get_elements(apply_exclusions=False)
914
+
915
+ if apply_exclusions and exclusion_regions:
916
+ if debug:
917
+ logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
918
+
919
+ # Filter out elements in exclusion zones
920
+ filtered_elements = []
921
+ for elem in all_elements:
922
+ in_exclusion = False
923
+ # For each element, check if it's in any exclusion zone
924
+ element_center_x = (elem.x0 + elem.x1) / 2
925
+ element_center_y = (elem.top + elem.bottom) / 2
926
+
927
+ for exclusion in exclusion_regions:
928
+ if (exclusion.x0 <= element_center_x <= exclusion.x1 and
929
+ exclusion.top <= element_center_y <= exclusion.bottom):
930
+ in_exclusion = True
931
+ break
932
+
933
+ if not in_exclusion:
934
+ filtered_elements.append(elem)
935
+ else:
936
+ # No exclusions, use all elements
937
+ filtered_elements = all_elements
938
+
939
+ # Now extract text from the filtered elements
940
+ if filtered_elements:
941
+ from natural_pdf.elements.collections import ElementCollection
942
+ collection = ElementCollection(filtered_elements)
943
+ # Sort in reading order
944
+ collection = collection.sort(key=lambda e: (e.top, e.x0))
945
+ # Extract text
946
+ result = " ".join(e.text for e in collection if hasattr(e, 'text'))
947
+
948
+ if debug:
949
+ logger.debug(f"Got {len(result)} characters from element-based extraction")
950
+
951
+ # Return the result
952
+ return result
953
+ else:
954
+ if debug:
955
+ logger.debug(f"No elements found after filtering")
956
+ return ""
957
+
958
+ # Handle OCR if needed
959
+ use_ocr = ocr is True or (isinstance(ocr, dict) and ocr.get('enabled', False))
960
+ auto_ocr = ocr is None and self.page._parent._ocr_config.get('enabled') == 'auto'
961
+
962
+ # Run OCR if explicitly requested or if in auto mode and no text found
963
+ if use_ocr or (auto_ocr and not result.strip()):
964
+ ocr_config = self.page._get_ocr_config(ocr or {}) if use_ocr else self.page._get_ocr_config({'enabled': 'auto'})
965
+ ocr_elements = self.apply_ocr(**ocr_config)
966
+
967
+ if ocr_elements:
968
+ # Filter OCR elements by exclusions if needed
969
+ if apply_exclusions and exclusion_regions:
970
+ filtered_ocr = []
971
+ for element in ocr_elements:
972
+ exclude = False
973
+ for region in exclusion_regions:
974
+ if region._is_element_in_region(element):
975
+ exclude = True
976
+ break
977
+ if not exclude:
978
+ filtered_ocr.append(element)
979
+ else:
980
+ filtered_ocr = ocr_elements
981
+
982
+ # Extract text from OCR elements
983
+ from natural_pdf.elements.collections import ElementCollection
984
+ ocr_collection = ElementCollection(filtered_ocr)
985
+ ocr_text = ocr_collection.extract_text(preserve_whitespace=keep_blank_chars, **kwargs)
986
+
987
+ # Use OCR text if it's not empty
988
+ if ocr_text.strip():
989
+ return ocr_text
990
+
991
+ return result
992
+
993
def extract_table(self, method: str = None, table_settings: dict = None,
                  use_ocr: bool = False, ocr_config: dict = None) -> List[List[str]]:
    """
    Extract a table from this region, dispatching to one of two back ends.

    Args:
        method: 'tatr' to use the TATR structure-based extractor; any other
            non-None value is routed to the pdfplumber extractor. When None,
            'tatr' is picked if this region has ``model == 'tatr'`` and
            ``region_type == 'table'``, otherwise 'plumber'.
        table_settings: Settings forwarded to the pdfplumber extractor
            (an empty dict is used when None). Ignored by the TATR path.
        use_ocr: Forwarded to the TATR extractor only.
        ocr_config: Forwarded to the TATR extractor only.

    Returns:
        Table data as a list of rows, where each row is a list of cell values
    """
    settings = {} if table_settings is None else table_settings

    chosen = method
    if chosen is None:
        # Auto-detection: only a TATR-detected table region gets the TATR path.
        detected_by_tatr = (
            hasattr(self, 'model')
            and self.model == 'tatr'
            and self.region_type == 'table'
        )
        chosen = 'tatr' if detected_by_tatr else 'plumber'

    # Everything that is not explicitly 'tatr' is handled by pdfplumber.
    if chosen != 'tatr':
        return self._extract_table_plumber(settings)
    return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
1024
+
1025
+ def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1026
+ """
1027
+ Extract table using pdfplumber's table extraction.
1028
+
1029
+ Args:
1030
+ table_settings: Settings for pdfplumber table extraction
1031
+
1032
+ Returns:
1033
+ Table data as a list of rows, where each row is a list of cell values
1034
+ """
1035
+ # Create a crop of the page for this region
1036
+ cropped = self.page._page.crop(self.bbox)
1037
+
1038
+ # Extract table from the cropped area
1039
+ tables = cropped.extract_tables(table_settings)
1040
+
1041
+ # Return the first table or an empty list if none found
1042
+ if tables:
1043
+ return tables[0]
1044
+ return []
1045
+
1046
+ def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
1047
+ """
1048
+ Extract table using TATR structure detection.
1049
+
1050
+ Args:
1051
+ use_ocr: Whether to apply OCR to each cell for better text extraction
1052
+ ocr_config: Optional OCR configuration parameters
1053
+
1054
+ Returns:
1055
+ Table data as a list of rows, where each row is a list of cell values
1056
+ """
1057
+ # Find all rows and headers in this table
1058
+ rows = self.page.find_all(f'region[type=table-row][model=tatr]')
1059
+ headers = self.page.find_all(f'region[type=table-column-header][model=tatr]')
1060
+ columns = self.page.find_all(f'region[type=table-column][model=tatr]')
1061
+
1062
+ # Filter to only include rows/headers/columns that overlap with this table region
1063
+ def is_in_table(region):
1064
+ # Check for overlap - simplifying to center point for now
1065
+ region_center_x = (region.x0 + region.x1) / 2
1066
+ region_center_y = (region.top + region.bottom) / 2
1067
+ return (self.x0 <= region_center_x <= self.x1 and
1068
+ self.top <= region_center_y <= self.bottom)
1069
+
1070
+ rows = [row for row in rows if is_in_table(row)]
1071
+ headers = [header for header in headers if is_in_table(header)]
1072
+ columns = [column for column in columns if is_in_table(column)]
1073
+
1074
+ # Sort rows by vertical position (top to bottom)
1075
+ rows.sort(key=lambda r: r.top)
1076
+
1077
+ # Sort columns by horizontal position (left to right)
1078
+ columns.sort(key=lambda c: c.x0)
1079
+
1080
+ # Create table data structure
1081
+ table_data = []
1082
+
1083
+ # Prepare OCR config if needed
1084
+ if use_ocr:
1085
+ # Default OCR config focuses on small text with low confidence
1086
+ default_ocr_config = {
1087
+ "enabled": True,
1088
+ "min_confidence": 0.1, # Lower than default to catch more text
1089
+ "detection_params": {
1090
+ "text_threshold": 0.1, # Lower threshold for low-contrast text
1091
+ "link_threshold": 0.1 # Lower threshold for connecting text components
1092
+ }
1093
+ }
1094
+
1095
+ # Merge with provided config if any
1096
+ if ocr_config:
1097
+ if isinstance(ocr_config, dict):
1098
+ # Update default config with provided values
1099
+ for key, value in ocr_config.items():
1100
+ if isinstance(value, dict) and key in default_ocr_config and isinstance(default_ocr_config[key], dict):
1101
+ # Merge nested dicts
1102
+ default_ocr_config[key].update(value)
1103
+ else:
1104
+ # Replace value
1105
+ default_ocr_config[key] = value
1106
+ else:
1107
+ # Not a dict, use as is
1108
+ default_ocr_config = ocr_config
1109
+
1110
+ # Use the merged config
1111
+ ocr_config = default_ocr_config
1112
+
1113
+ # Add header row if headers were detected
1114
+ if headers:
1115
+ header_texts = []
1116
+ for header in headers:
1117
+ if use_ocr:
1118
+ # Try OCR for better text extraction
1119
+ ocr_elements = header.apply_ocr(**ocr_config)
1120
+ if ocr_elements:
1121
+ ocr_text = " ".join(e.text for e in ocr_elements).strip()
1122
+ if ocr_text:
1123
+ header_texts.append(ocr_text)
1124
+ continue
1125
+
1126
+ # Fallback to normal extraction
1127
+ header_texts.append(header.extract_text().strip())
1128
+ table_data.append(header_texts)
1129
+
1130
+ # Process rows
1131
+ for row in rows:
1132
+ row_cells = []
1133
+
1134
+ # If we have columns, use them to extract cells
1135
+ if columns:
1136
+ for column in columns:
1137
+ # Create a cell region at the intersection of row and column
1138
+ cell_bbox = (
1139
+ column.x0,
1140
+ row.top,
1141
+ column.x1,
1142
+ row.bottom
1143
+ )
1144
+
1145
+ # Create a region for this cell
1146
+ from natural_pdf.elements.region import Region # Import here to avoid circular imports
1147
+ cell_region = Region(self.page, cell_bbox)
1148
+
1149
+ # Extract text from the cell
1150
+ if use_ocr:
1151
+ # Apply OCR to the cell
1152
+ ocr_elements = cell_region.apply_ocr(**ocr_config)
1153
+ if ocr_elements:
1154
+ # Get text from OCR elements
1155
+ ocr_text = " ".join(e.text for e in ocr_elements).strip()
1156
+ if ocr_text:
1157
+ row_cells.append(ocr_text)
1158
+ continue
1159
+
1160
+ # Fallback to normal extraction
1161
+ cell_text = cell_region.extract_text().strip()
1162
+ row_cells.append(cell_text)
1163
+ else:
1164
+ # No column information, just extract the whole row text
1165
+ if use_ocr:
1166
+ # Try OCR on the whole row
1167
+ ocr_elements = row.apply_ocr(**ocr_config)
1168
+ if ocr_elements:
1169
+ ocr_text = " ".join(e.text for e in ocr_elements).strip()
1170
+ if ocr_text:
1171
+ row_cells.append(ocr_text)
1172
+ continue
1173
+
1174
+ # Fallback to normal extraction
1175
+ row_cells.append(row.extract_text().strip())
1176
+
1177
+ table_data.append(row_cells)
1178
+
1179
+ return table_data
1180
+
1181
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
    """
    Return the first element in this region that matches the selector.

    This is a thin wrapper over ``self.find_all`` that takes item 0 of
    its result.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters forwarded to ``find_all``

    Returns:
        First matching element, or None when ``find_all`` yields nothing
    """
    matches = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
    if not matches:
        return None
    return matches[0]
1195
+
1196
def _find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
    """
    Collect every element in this region that matches the selector.

    Single-page regions delegate matching to ``self.page.find_all`` and keep
    the hits for which ``self._is_element_in_region`` is true. Multi-page
    regions delegate to each involved page's ``find_all`` and keep only the
    hits that are already part of ``self._multi_page_elements``.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters forwarded to the page-level search

    Returns:
        ElementCollection with matching elements
    """
    from natural_pdf.elements.collections import ElementCollection

    multi_page = self._spans_pages and self._multi_page_elements is not None
    if not multi_page:
        # Common case: search the page, then restrict to this region.
        on_page = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        return ElementCollection([e for e in on_page if self._is_element_in_region(e)])

    from natural_pdf.selectors.parser import parse_selector
    # The parsed selector is not used below; the call is retained so the
    # selector is still parsed up front (presumably to fail early on a
    # malformed selector - confirm against parse_selector).
    parse_selector(selector)

    # Bucket this region's own elements by the page they live on.
    members_by_page = {}
    for member in self._multi_page_elements:
        members_by_page.setdefault(member.page, []).append(member)

    # Matching is left to each page's find_all; hits are then intersected
    # with the elements that belong to this region.
    matched = []
    for page, members in members_by_page.items():
        for hit in page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs):
            if hit in members:
                matched.append(hit)

    return ElementCollection(matched)
1243
+
1244
def apply_ocr(self, **ocr_params) -> List['TextElement']:
    """
    Run OCR on this region and return the text elements that were created.

    The page is rendered with ``self.page.to_image()``, cropped to this
    region, and passed to ``OCRManager.recognize_region``. Each result is
    expected to be a dict with 'bbox', 'text' and 'confidence' keys; its
    bbox is shifted by this region's (x0, top) into page coordinates (the
    result dict is updated in place). Results whose shifted bbox is not fully
    inside the region are discarded. Every element that is kept is also
    appended to ``self.page._elements['words']`` when the page has an
    ``_elements`` mapping.

    Args:
        **ocr_params: OCR parameters to override defaults. 'verbose' is
            always forced to False.

    Returns:
        List of created text elements (empty when the resolved OCR
        configuration is not enabled)
    """
    from natural_pdf.elements.text import TextElement
    from natural_pdf.ocr import OCRManager

    # **ocr_params is always a dict, so the override is applied directly.
    ocr_params["verbose"] = False
    ocr_config = self.page._get_ocr_config(ocr_params)

    # Skip if OCR is disabled
    if not ocr_config.get('enabled'):
        return []

    # NOTE(review): the crop box is given in PDF coordinates; this assumes
    # to_image() renders at one image pixel per PDF point - confirm.
    region_image = self.page.to_image().crop((self.x0, self.top, self.x1, self.bottom))
    results = OCRManager.get_instance().recognize_region(region_image, ocr_config)

    has_page_elements = hasattr(self.page, '_elements') and self.page._elements is not None

    elements = []
    for result in results:
        raw = result['bbox']
        # Region-relative -> page coordinates.
        x0 = raw[0] + self.x0
        top = raw[1] + self.top
        x1 = raw[2] + self.x0
        bottom = raw[3] + self.top
        result['bbox'] = (x0, top, x1, bottom)

        # Only include results that are fully within the region
        inside = (x0 >= self.x0 and top >= self.top
                  and x1 <= self.x1 and bottom <= self.bottom)
        if not inside:
            continue

        elem = TextElement({
            'text': result['text'],
            'x0': x0,
            'top': top,
            'x1': x1,
            'bottom': bottom,
            'width': x1 - x0,
            'height': bottom - top,
            'object_type': 'text',
            'source': 'ocr',
            'confidence': result['confidence'],
            # Placeholder font information so OCR elements carry the same
            # fields as native text elements
            'fontname': 'OCR-detected',
            'size': 10.0,
            'page_number': self.page.number
        }, self.page)
        elements.append(elem)

        # Register with the page's word list so the element is reachable
        # through the page as well.
        if has_page_elements:
            if 'words' in self.page._elements:
                self.page._elements['words'].append(elem)
            else:
                self.page._elements['words'] = [elem]

    return elements
1327
+
1328
def expand(self,
           left: float = 0,
           right: float = 0,
           top_expand: float = 0,  # Renamed to avoid conflict
           bottom_expand: float = 0,  # Renamed to avoid conflict
           width_factor: float = 1.0,
           height_factor: float = 1.0,
           # Keep original parameter names for backward compatibility
           top: float = None,
           bottom: float = None) -> 'Region':
    """
    Build a new, larger (or smaller) region derived from this one.

    Absolute offsets are applied first; the width/height factors are then
    applied to the already-offset box, scaling it about its centre.

    Args:
        left: Amount to move the left edge outward
        right: Amount to move the right edge outward
        top_expand: Amount to move the top edge upward
        bottom_expand: Amount to move the bottom edge downward
        width_factor: Factor to multiply width by
        height_factor: Factor to multiply height by
        top: (DEPRECATED, use top_expand) overrides top_expand when not None
        bottom: (DEPRECATED, use bottom_expand) overrides bottom_expand when not None

    Returns:
        New expanded Region; multi-page attributes are carried over (by
        reference) when this region spans pages
    """
    # Deprecated names take precedence whenever they are supplied.
    grow_up = top_expand if top is None else top
    grow_down = bottom_expand if bottom is None else bottom

    # Absolute expansion: top decreases to go up, bottom increases to go down.
    x0 = self.x0 - left
    x1 = self.x1 + right
    y0 = self.top - grow_up
    y1 = self.bottom + grow_down

    if not (width_factor == 1.0 and height_factor == 1.0):
        width = x1 - x0
        height = y1 - y0
        extra_width = width * width_factor - width
        extra_height = height * height_factor - height

        # Split the extra size evenly so the centre point stays put.
        x0 -= extra_width / 2
        x1 += extra_width / 2
        y0 -= extra_height / 2
        y1 += extra_height / 2

    expanded = Region(self.page, (x0, y0, x1, y1))

    if self._spans_pages:
        expanded._spans_pages = True
        expanded._multi_page_elements = self._multi_page_elements
        expanded._page_range = self._page_range
        expanded.start_element = self.start_element
        expanded.end_element = self.end_element

    return expanded
1404
+
1405
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
    """
    Get a section between two elements within this region.

    The region's elements are ordered by (top, x0); the section is the
    bounding box of the elements from the start element's position to the
    end element's position in that ordering.

    Args:
        start_element: Element marking the start of the section. When falsy
            or not found among this region's elements, the first element is
            used.
        end_element: Element marking the end of the section. When falsy or
            not found among this region's elements, the last element is used.
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        Region representing the section. Returns ``self`` when the region
        has no elements, and a zero-sized region at (0, 0, 0, 0) when the
        boundary handling leaves no element in range.
    """
    elements = self.get_elements()

    # If no elements, return self
    if not elements:
        return self

    # Reading order. sorted() works on a copy so the list handed back by
    # get_elements() is never reordered behind the caller's back.
    elements = sorted(elements, key=lambda e: (e.top, e.x0))
    last_idx = len(elements) - 1

    start_idx = 0
    if start_element:
        try:
            start_idx = elements.index(start_element)
        except ValueError:
            # Start element not in region, use first element
            pass

    end_idx = last_idx
    if end_element:
        try:
            end_idx = elements.index(end_element)
        except ValueError:
            # End element not in region, use last element
            pass

    # An excluded boundary element is dropped by stepping one index inward.
    if boundary_inclusion in ('none', 'end'):
        start_idx += 1
    if boundary_inclusion in ('none', 'start'):
        end_idx -= 1

    # Ensure valid indexes
    start_idx = max(0, start_idx)
    end_idx = min(last_idx, end_idx)

    # If no valid elements in range, return empty region
    if start_idx > end_idx:
        return Region(self.page, (0, 0, 0, 0))

    # Non-empty by construction, since start_idx <= end_idx here.
    section_elements = elements[start_idx:end_idx + 1]

    # Bounding box around the selected elements
    x0 = min(e.x0 for e in section_elements)
    tight_top = min(e.top for e in section_elements)
    x1 = max(e.x1 for e in section_elements)
    tight_bottom = max(e.bottom for e in section_elements)
    top, bottom = tight_top, tight_bottom

    pixel_adjustment = 2.0  # Nudge applied to pull an edge away from an excluded boundary element
    proximity = 10  # An excluded boundary element counts as adjacent within this distance

    # Pull the top edge down when the excluded start element sits right above it
    if start_element and boundary_inclusion not in ('start', 'both') and start_idx > 0:
        if abs(top - start_element.bottom) < proximity:
            top += pixel_adjustment

    # Pull the bottom edge up when the excluded end element sits right below it
    if end_element and boundary_inclusion not in ('end', 'both') and end_idx < last_idx:
        if abs(bottom - end_element.top) < proximity:
            bottom -= pixel_adjustment

    # If the nudges collapsed the box, fall back to the un-nudged bounds
    if top >= bottom:
        top, bottom = tight_top, tight_bottom

    section = Region(self.page, (x0, top, x1, bottom))
    section.start_element = start_element if boundary_inclusion in ('start', 'both') else None
    section.end_element = end_element if boundary_inclusion in ('end', 'both') else None

    return section
1501
+
1502
def get_sections(self, start_elements=None, end_elements=None, boundary_inclusion='both') -> List['Region']:
    """
    Get sections within this region based on start/end elements.

    Boundaries are located in the region's elements ordered by (top, x0).
    With end elements, each start is paired with the next end that follows
    it (further starts seen before that end are ignored). Without end
    elements, every start closes the previous section at the element just
    before it. A start left open at the end runs to the region's last element.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        List of Region objects representing the extracted sections, as
        produced by ``get_section_between``; empty when there are no start
        elements
    """
    # Selector strings are resolved through find_all; a result exposing
    # ``.elements`` is unwrapped to that underlying sequence.
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements)
        if hasattr(start_elements, 'elements'):
            start_elements = start_elements.elements

    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements)
        if hasattr(end_elements, 'elements'):
            end_elements = end_elements.elements

    # If no start elements, return empty list
    if not start_elements:
        return []

    # Reading order. sorted() leaves the list returned by get_elements()
    # untouched instead of reordering it in place.
    all_elements = sorted(self.get_elements(), key=lambda e: (e.top, e.x0))

    def locate(markers, kind):
        # Position of each marker in reading order; markers that are not
        # among this region's elements are skipped.
        located = []
        for marker in markers:
            try:
                position = all_elements.index(marker)
            except ValueError:
                continue
            located.append({'index': position, 'element': marker, 'type': kind})
        return located

    section_boundaries = locate(start_elements, 'start')
    if end_elements:
        section_boundaries.extend(locate(end_elements, 'end'))

    # Document order; the sort is stable, so a start precedes an end that
    # shares its index.
    section_boundaries.sort(key=lambda b: b['index'])

    sections = []
    current_start = None

    for boundary in section_boundaries:
        kind = boundary['type']

        if kind == 'start' and current_start is None:
            current_start = boundary

        elif kind == 'end' and current_start is not None:
            # Close the open section at this end boundary
            sections.append(self.get_section_between(
                current_start['element'],
                boundary['element'],
                boundary_inclusion
            ))
            current_start = None

        elif kind == 'start' and current_start is not None and not end_elements:
            # Splitting by starts only: close the open section at the
            # element just before this new start
            previous = all_elements[boundary['index'] - 1] if boundary['index'] > 0 else None
            sections.append(self.get_section_between(
                current_start['element'],
                previous,
                boundary_inclusion
            ))
            current_start = boundary

    # A start that was never closed runs to the last element of the region
    if current_start is not None:
        final_element = all_elements[-1] if all_elements else None
        sections.append(self.get_section_between(
            current_start['element'],
            final_element,
            boundary_inclusion
        ))

    return sections
1616
+
1617
def create_cells(self):
    """
    Build one cell region per (row, column) pair of a TATR-detected table.

    TATR row and column regions are looked up on the page and kept when
    their centre point lies inside this table's bounding box. Cells are
    produced row by row (top to bottom), and left to right within a row,
    via ``self.page.create_region``.

    Returns:
        List of cell regions, each tagged with source='derived',
        region_type='table-cell' and model='tatr'

    Raises:
        ValueError: If this region is not a TATR-detected table
    """
    if self.region_type != 'table' or self.model != 'tatr':
        raise ValueError("Only works for TATR-detected table regions")

    def centre_inside(candidate):
        # A row/column belongs to this table when its centre is within the bbox.
        centre_x = (candidate.x0 + candidate.x1) / 2
        centre_y = (candidate.top + candidate.bottom) / 2
        return (self.x0 <= centre_x <= self.x1
                and self.top <= centre_y <= self.bottom)

    found_rows = self.page.find_all('region[type=table-row][model=tatr]')
    found_columns = self.page.find_all('region[type=table-column][model=tatr]')

    ordered_rows = sorted((r for r in found_rows if centre_inside(r)), key=lambda r: r.top)
    ordered_columns = sorted((c for c in found_columns if centre_inside(c)), key=lambda c: c.x0)

    cells = []
    for row in ordered_rows:
        for column in ordered_columns:
            # Horizontal extent from the column, vertical extent from the row.
            cell = self.page.create_region(column.x0, row.top, column.x1, row.bottom)
            cell.source = 'derived'
            cell.region_type = 'table-cell'
            cell.model = 'tatr'
            cells.append(cell)

    return cells
1661
+
1662
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Answer a question about this region's content via the document QA engine.

    The call is delegated to the engine returned by
    ``natural_pdf.qa.document_qa.get_qa_engine``; this method adds no
    processing of its own.

    Args:
        question: The question to ask about the region content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA; when empty or None the
            engine factory is called without a model name
        debug: Forwarded unchanged to the QA engine
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        Whatever the engine's ``ask_pdf_region`` returns. Per the engine
        contract this is a dictionary with answer details: {
            "answer": extracted text,
            "confidence": confidence score,
            "found": whether an answer was found,
            "page_num": page number,
            "region": reference to this region,
            "source_elements": list of elements that contain the answer (if found)
        }
    """
    from natural_pdf.qa.document_qa import get_qa_engine

    # Pass model_name only when a model was requested, so the factory's own
    # default applies otherwise.
    factory_kwargs = {"model_name": model} if model else {}
    engine = get_qa_engine(**factory_kwargs)

    return engine.ask_pdf_region(
        self,
        question,
        min_confidence=min_confidence,
        debug=debug,
        **kwargs,
    )
1692
+
1693
def add_child(self, child):
    """
    Attach ``child`` beneath this region in the region hierarchy.

    Intended for hierarchical document structure produced by models such as
    Docling. The child is appended to ``self.child_regions`` and its
    ``parent_region`` attribute is pointed back at this region.

    Args:
        child: Region object to add as a child

    Returns:
        Self for method chaining
    """
    siblings = self.child_regions
    siblings.append(child)
    # Back-reference so the child can reach its parent.
    child.parent_region = self
    return self
1709
+
1710
def get_children(self, selector=None):
    """
    Return this region's immediate children, optionally narrowed by a selector.

    Args:
        selector: Optional selector to filter children

    Returns:
        The stored child list itself (not a copy) when ``selector`` is None;
        otherwise the result of matching the children against the selector.
    """
    import logging

    log = logging.getLogger("natural_pdf.elements.region")

    if selector is not None:
        # Reuse the existing selector parser for filtering.
        from natural_pdf.selectors.parser import match_elements_with_selector

        matched = match_elements_with_selector(self.child_regions, selector)
        log.debug(f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'")
        return matched

    return self.child_regions
1731
+
1732
def get_descendants(self, selector=None):
    """
    Collect every region below this one (children, grandchildren, etc.).

    The result lists this region's direct children first, followed by the
    descendants of each child in child order. Each child's descendants are
    obtained by calling that child's own ``get_descendants()``.

    Args:
        selector: Optional selector to filter descendants

    Returns:
        List of descendant regions, filtered by the selector when one is given
    """
    import logging

    log = logging.getLogger("natural_pdf.elements.region")

    # Direct children lead the list; deeper levels are appended after them.
    collected = list(self.child_regions)
    for node in self.child_regions:
        collected += node.get_descendants()

    log.debug(f"get_descendants: found {len(collected)} total descendants")

    if selector is None:
        return collected

    # The selector is applied once, to the complete flattened list.
    from natural_pdf.selectors.parser import match_elements_with_selector

    matched = match_elements_with_selector(collected, selector)
    log.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
    return matched
1764
+
1765
def find_all(self, selector, recursive=True, **kwargs):
    """
    Find all elements in this region that match ``selector``.

    Matches found directly in this region come first. When ``recursive`` is
    true and the region has child regions, each child is searched as well and
    any match not already present is appended.

    Args:
        selector: The selector to find elements with
        recursive: Whether to search recursively through child regions
        **kwargs: Additional parameters to pass to the selector parser

    Returns:
        The direct result of ``self._find_all`` when no recursion takes place;
        otherwise an ElementCollection of the merged, de-duplicated matches.
    """
    direct_matches = self._find_all(selector, region=self, **kwargs)

    # Nothing to merge unless recursion is requested and children exist.
    if not (recursive and self.child_regions):
        return direct_matches

    from natural_pdf.elements.collections import ElementCollection

    merged = list(direct_matches)
    for child in self.child_regions:
        for candidate in child.find_all(selector, recursive=True, **kwargs):
            # Membership test (equality-based) keeps each element only once.
            if candidate in merged:
                continue
            merged.append(candidate)

    return ElementCollection(merged)