natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,1366 @@
1
+ from typing import Optional, Union, List, Dict, Tuple, Any, Callable, TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from natural_pdf.core.page import Page
5
+ from natural_pdf.elements.base import Element
6
+ from natural_pdf.elements.text import TextElement
7
+
8
+ # Import OCRManager conditionally to avoid circular imports
9
+ try:
10
+ from natural_pdf.utils.ocr import OCRManager
11
+ except ImportError:
12
+ # OCRManager will be imported directly in methods that use it
13
+ pass
14
+
15
+
16
+ class Region:
17
+ """
18
+ Represents a rectangular region on a page.
19
+ """
20
+
21
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None):
    """
    Create a region bound to a page.

    Args:
        page: Page that owns this region
        bbox: Rectangle as (x0, top, x1, bottom)
        polygon: Optional outline [(x, y), ...] for non-rectangular regions
    """
    # Owning page and geometry
    self._page, self._bbox, self._polygon = page, bbox, polygon

    # Multi-page bookkeeping; all "off" until something else fills it in
    self._spans_pages = False
    self._multi_page_elements = None
    self._page_range = None

    # Boundary elements are unset for a freshly created region
    self.start_element = self.end_element = None

    # Lets selectors treat this object as a 'region'
    self.object_type = 'region'

    # Layout-detection metadata, empty by default
    self.region_type = self.normalized_type = None
    self.confidence = self.model = None

    # Name and provenance; the source is expected to be set by creation methods
    self.name = self.source = None
51
+
52
@property
def page(self) -> 'Page':
    """Parent page this region was created on."""
    owner = self._page
    return owner
56
+
57
@property
def bbox(self) -> Tuple[float, float, float, float]:
    """Bounding box exactly as stored: (x0, top, x1, bottom)."""
    box = self._bbox
    return box
61
+
62
@property
def x0(self) -> float:
    """Left edge (first bbox entry)."""
    box = self._bbox
    return box[0]
66
+
67
@property
def top(self) -> float:
    """Top edge (second bbox entry)."""
    box = self._bbox
    return box[1]
71
+
72
@property
def x1(self) -> float:
    """Right edge (third bbox entry)."""
    box = self._bbox
    return box[2]
76
+
77
@property
def bottom(self) -> float:
    """Bottom edge (fourth bbox entry)."""
    box = self._bbox
    return box[3]
81
+
82
@property
def width(self) -> float:
    """Horizontal extent: right edge minus left edge."""
    right = self.x1
    left = self.x0
    return right - left
86
+
87
@property
def height(self) -> float:
    """Vertical extent: bottom edge minus top edge."""
    lower = self.bottom
    upper = self.top
    return lower - upper
91
+
92
@property
def has_polygon(self) -> bool:
    """True when a polygon with at least three points is stored."""
    points = self._polygon
    if points is None:
        return False
    return len(points) >= 3
96
+
97
@property
def polygon(self) -> List[Tuple[float, float]]:
    """Stored polygon points, or the four bbox corners when none are stored."""
    points = self._polygon
    if points:
        return points

    # Fallback: rectangle corners, clockwise starting at the top-left
    left, upper = self.x0, self.top
    right, lower = self.x1, self.bottom
    return [(left, upper), (right, upper), (right, lower), (left, lower)]
110
+
111
def _is_point_in_polygon(self, x: float, y: float) -> bool:
    """
    Test whether a point lies inside this region (ray casting for polygons).

    Args:
        x: X-coordinate of the point
        y: Y-coordinate of the point

    Returns:
        True if the point is inside the polygon, or inside the bbox
        (edges included) when the region has no polygon
    """
    # Without a polygon this reduces to an inclusive rectangle test
    if not self.has_polygon:
        within_x = self.x0 <= x <= self.x1
        return within_x and (self.top <= y <= self.bottom)

    vertices = self.polygon
    count = len(vertices)
    inside = False

    # Walk every edge (last vertex wraps to the first) and flip `inside`
    # each time a ray cast from the point toward +x crosses an edge
    for idx in range(count):
        ax, ay = vertices[idx]
        bx, by = vertices[(idx + 1) % count]

        # The edge must straddle the point's y (half-open, so a shared
        # vertex is counted once); this also rules out horizontal edges
        if not (min(ay, by) < y <= max(ay, by)):
            continue
        # The edge must not lie entirely to the left of the point
        if x > max(ax, bx):
            continue

        if ax == bx:
            # Vertical edge at or to the right of the point: always a crossing
            inside = not inside
        else:
            crossing_x = (y - ay) * (bx - ax) / (by - ay) + ax
            if x <= crossing_x:
                inside = not inside

    return inside
142
+
143
def _is_element_in_region(self, element: 'Element', use_boundary_tolerance=True) -> bool:
    """
    Check if an element is within this region.

    Membership is decided as follows:
      * multi-page regions with a cached element list: list membership;
      * elements on a different page: never inside;
      * regions with a start or end element set: the element centre must
        lie at least 2.0 units inside the bbox;
      * elements that carry their own polygon: inside if any polygon point,
        or the element centre, is inside this region;
      * polygon regions: the element centre must be inside the polygon;
      * plain rectangular regions: the element centre must lie at least
        1.0 unit inside the bbox.

    Args:
        element: Element to check
        use_boundary_tolerance: When False, both tolerances above become 0

    Returns:
        True if the element is in the region, False otherwise
    """
    # Cached multi-page content is authoritative
    if self._spans_pages and self._multi_page_elements is not None:
        return element in self._multi_page_elements

    if element.page != self._page:
        return False

    center_x = (element.x0 + element.x1) / 2
    center_y = (element.top + element.bottom) / 2

    def center_inside_bbox(margin: float) -> bool:
        return (self.x0 + margin <= center_x <= self.x1 - margin and
                self.top + margin <= center_y <= self.bottom - margin)

    # Both attributes always exist (they are initialised to None), so an
    # existence check would send every region down the strict branch and
    # make the rest of this method unreachable. Only regions that actually
    # have a boundary element get strict checking.
    is_boundary_region = (getattr(self, 'start_element', None) is not None or
                          getattr(self, 'end_element', None) is not None)
    if is_boundary_region:
        return center_inside_bbox(2.0 if use_boundary_tolerance else 0.0)

    # Element with its own polygon: any corner inside counts, else the centre
    if getattr(element, 'has_polygon', False):
        if any(self._is_point_in_polygon(point[0], point[1]) for point in element.polygon):
            return True
        return self._is_point_in_polygon(center_x, center_y)

    # Polygon regions are tested without tolerance
    if self.has_polygon:
        return self._is_point_in_polygon(center_x, center_y)

    # Rectangular region: a small margin keeps elements sitting exactly on
    # the border out of the region
    return center_inside_bbox(1.0 if use_boundary_tolerance else 0.0)
201
+
202
def highlight(self,
              label: Optional[str] = None,
              color: Optional[Tuple[int, int, int, int]] = None,
              use_color_cycling: bool = False,
              include_attrs: Optional[List[str]] = None) -> 'Region':
    """
    Register a highlight for this region with the page's highlight manager.

    Args:
        label: Optional label for the highlight
        color: RGBA color tuple, or None (passed through unchanged)
        use_color_cycling: Passed through to the highlight manager (default: False)
        include_attrs: Attribute names to display on the highlight (e.g., ['confidence', 'type'])

    Returns:
        Self for method chaining
    """
    manager = self._page._highlight_mgr

    # Polygon regions are drawn by outline, everything else by bounding box
    if self.has_polygon:
        register, shape = manager.add_polygon_highlight, self.polygon
    else:
        register, shape = manager.add_highlight, self.bbox

    # The region itself is handed over so its attributes can be read
    register(shape, color, label, use_color_cycling,
             element=self, include_attrs=include_attrs)
    return self
239
+
240
def to_image(self,
             scale: float = 2.0,
             resolution: float = 150,
             crop_only: bool = False,
             include_highlights: bool = True,
             **kwargs) -> 'Image.Image':
    """
    Render the page and cut out just this region.

    Args:
        scale: Passed to page.to_image() and also used as the multiplier
               that turns region coordinates into crop pixels (default: 2.0)
        resolution: Passed to page.to_image() (default: 150); not used for
                    the crop arithmetic
        crop_only: If True, return the bare crop without a border
        include_highlights: Whether to include existing highlights (default: True)
        **kwargs: Additional parameters for page.to_image()

    Returns:
        PIL Image of just this region
    """
    rendered = self._page.to_image(scale=scale,
                                   resolution=resolution,
                                   include_highlights=include_highlights,
                                   **kwargs)

    # Region coordinates -> pixel box, truncated toward zero
    pixel_box = tuple(int(edge * scale)
                      for edge in (self.x0, self.top, self.x1, self.bottom))
    clipped = rendered.crop(pixel_box)

    if crop_only:
        return clipped

    # Outline the crop with a 1px red frame so the region boundary is visible
    from PIL import ImageDraw
    frame = (0, 0, clipped.width - 1, clipped.height - 1)
    ImageDraw.Draw(clipped).rectangle(frame, outline=(255, 0, 0), width=1)
    return clipped
284
+
285
def show(self,
         scale: float = 2.0,
         labels: bool = True,
         legend_position: str = 'right') -> 'Image.Image':
    """
    Highlight this region, then return the page's rendered view.

    Args:
        scale: Scale factor for rendering
        labels: Whether to include a legend for labels
        legend_position: Position of the legend

    Returns:
        Whatever page.show() returns for the highlighted page
    """
    # A highlight is added on every call; there is no "already highlighted" check here
    self.highlight()

    legend_options = {'labels': labels, 'legend_position': legend_position}
    return self._page.show(scale, **legend_options)
305
+
306
def save(self,
         filename: str,
         scale: float = 2.0,
         labels: bool = True,
         legend_position: str = 'right') -> 'Region':
    """
    Highlight this region and write the page image to a file.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        labels: Whether to include a legend for labels
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    # A highlight is added on every call; there is no "already highlighted" check here
    self.highlight()

    render_options = {
        'scale': scale,
        'labels': labels,
        'legend_position': legend_position,
    }
    self._page.save_image(filename, **render_options)
    return self
329
+
330
def save_image(self,
               filename: str,
               resolution: float = 150,
               crop_only: bool = False,
               include_highlights: bool = True,
               **kwargs) -> 'Region':
    """
    Render just this region via to_image() and write it to a file.

    Args:
        filename: Path to save the image to
        resolution: Resolution value forwarded to to_image() (default: 150)
        crop_only: If True, only crop the region without highlighting its boundaries
        include_highlights: Whether to include existing highlights (default: True)
        **kwargs: Additional parameters forwarded to to_image()

    Returns:
        Self for method chaining
    """
    self.to_image(
        resolution=resolution,
        crop_only=crop_only,
        include_highlights=include_highlights,
        **kwargs
    ).save(filename)
    return self
360
+
361
def get_elements(self, selector: Optional[str] = None, apply_exclusions=True, **kwargs) -> List['Element']:
    """
    Collect the elements that fall inside this region.

    Args:
        selector: Optional selector; when given, candidates come from page.find_all()
        apply_exclusions: Forwarded to the page lookup
        **kwargs: Extra filtering parameters, forwarded only to page.find_all()

    Returns:
        List of elements in the region. For a multi-page region with cached
        elements the cached list itself is returned, and selector, apply_exclusions
        and kwargs are not consulted.
    """
    if self._spans_pages:
        cached = self._multi_page_elements
        if cached is not None:
            return cached

    source_page = self.page
    if selector:
        candidates = source_page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
    else:
        candidates = source_page.get_elements(apply_exclusions=apply_exclusions)

    # Keep only the candidates this region considers its own
    return list(filter(self._is_element_in_region, candidates))
385
+
386
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, ocr=None, preserve_whitespace=None, debug=False, **kwargs) -> str:
    """
    Extract text from this region.

    Strategy, in order:
      1. Multi-page regions with cached elements join the text of those
         elements (sorted by page index, top, x0) and return immediately.
      2. Rectangular regions with no applicable exclusions, or whose
         exclusions are all full-width bands (header/footer style), crop
         the underlying page object and use its extract_text().
      3. Everything else (polygon regions, partial-width exclusions) joins
         the text of the region's elements whose centre lies outside every
         exclusion bbox, sorted by (top, x0).
    For single-page regions an OCR pass then runs when ``ocr`` requests it,
    or when ``ocr`` is None, the PDF-level OCR config is 'auto' and no text
    was found. Non-empty OCR text replaces the result.

    Args:
        keep_blank_chars: Whether to keep blank characters (legacy parameter)
        apply_exclusions: Whether to apply exclusion regions
        ocr: True, or a dict with a truthy 'enabled' key, to force OCR.
             None defers to the PDF-level setting
        preserve_whitespace: Synonym for keep_blank_chars; wins when not None
        debug: Enable verbose debug logging of the exclusion handling
        **kwargs: Additional parameters for the underlying text extraction

    Returns:
        Extracted text as string ("" when the region is entirely covered by
        header/footer exclusions or nothing is found)
    """
    import logging
    logger = logging.getLogger("natural_pdf.elements.region")

    if preserve_whitespace is not None:
        keep_blank_chars = preserve_whitespace

    # --- 1. Multi-page regions: read straight from the cached elements ---
    if self._spans_pages and self._multi_page_elements is not None:
        text_elements = [e for e in self._multi_page_elements if hasattr(e, 'text')]
        text_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
        return " ".join(e.text for e in text_elements)

    # --- Work out which exclusions (if any) matter for this region ---
    exclusion_regions = []
    if apply_exclusions and self._page._exclusions:
        exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
        if debug:
            logger.debug("Region %s with %d exclusion regions", self.bbox, len(exclusion_regions))

    if exclusion_regions:
        # A plain bbox overlap test is enough; if nothing touches the
        # region, the exclusions are dropped altogether
        touching = next(
            (i for i, exclusion in enumerate(exclusion_regions)
             if (self.x0 < exclusion.x1 and self.x1 > exclusion.x0 and
                 self.top < exclusion.bottom and self.bottom > exclusion.top)),
            None,
        )
        if touching is None:
            if debug:
                logger.debug("  No intersection with any exclusion, ignoring exclusions")
            exclusion_regions = []
        elif debug:
            logger.debug("  Region intersects with exclusion %d: %s",
                         touching, exclusion_regions[touching].bbox)

    def crop_text(box):
        """Extract text from a rectangular crop of the underlying page object."""
        cropped = self.page._page.crop(box)
        # `or ""` guards against an extractor that returns None for an empty crop
        return cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs) or ""

    result = ""
    extracted = False

    # --- 2. Rectangular regions can often be handled by cropping ---
    if not self.has_polygon:
        if not exclusion_regions:
            # No applicable exclusions at all. This deliberately includes the
            # default call on a page that has no exclusions configured.
            result = crop_text(self.bbox)
            extracted = True
        else:
            page_width = self.page.width
            band_flags = []
            for i, exclusion in enumerate(exclusion_regions):
                # "Full width" = both edges within 5 units of the page edges
                full_width = (abs(exclusion.x0) < 5 and
                              abs(exclusion.x1 - page_width) < 5)
                band_flags.append(full_width)
                if debug:
                    logger.debug("  Exclusion %d: %s, full width: %s", i, exclusion.bbox, full_width)

            if all(band_flags):
                top_bound, bottom_bound = self.top, self.bottom
                if debug:
                    logger.debug("  Using cropping approach, initial bounds: (%s, %s, %s, %s)",
                                 self.x0, top_bound, self.x1, bottom_bound)

                for exclusion in exclusion_regions:
                    # Band covering the region's top edge: push the top down
                    if exclusion.bottom > self.top and exclusion.top <= self.top:
                        top_bound = max(top_bound, exclusion.bottom)
                        if debug:
                            logger.debug("    Adjusted top bound to %s due to header exclusion", top_bound)
                    # Band covering the region's bottom edge: pull the bottom up
                    if exclusion.top < self.bottom and exclusion.bottom >= self.bottom:
                        bottom_bound = min(bottom_bound, exclusion.top)
                        if debug:
                            logger.debug("    Adjusted bottom bound to %s due to footer exclusion", bottom_bound)

                if debug:
                    logger.debug("  Final bounds after exclusion adjustment: (%s, %s, %s, %s)",
                                 self.x0, top_bound, self.x1, bottom_bound)

                if top_bound >= bottom_bound:
                    # Nothing of the region is left once the bands are removed
                    logger.debug("Region %s completely covered by exclusions, returning empty string", self.bbox)
                    return ""

                result = crop_text((self.x0, top_bound, self.x1, bottom_bound))
                extracted = True
                if debug:
                    logger.debug("  Successfully extracted text using crop, got %d characters", len(result))
            elif debug:
                logger.debug("  Mixed exclusion types or non-rectangular region, switching to filtering")

    # --- 3. Fallback: join the text of elements outside every exclusion ---
    if not extracted:
        if debug:
            logger.debug("Using element filtering approach for region %s", self.bbox)

        candidates = self.get_elements(apply_exclusions=False)

        if exclusion_regions:
            if debug:
                logger.debug("Filtering with %d exclusion zones", len(exclusion_regions))

            def centre_is_excluded(elem):
                cx = (elem.x0 + elem.x1) / 2
                cy = (elem.top + elem.bottom) / 2
                return any(ex.x0 <= cx <= ex.x1 and ex.top <= cy <= ex.bottom
                           for ex in exclusion_regions)

            kept = [elem for elem in candidates if not centre_is_excluded(elem)]
        else:
            kept = candidates

        if kept:
            ordered = sorted(kept, key=lambda e: (e.top, e.x0))
            result = " ".join(e.text for e in ordered if hasattr(e, 'text'))
            if debug:
                logger.debug("Got %d characters from element-based extraction", len(result))
        elif debug:
            logger.debug("No elements found after filtering")

    # --- OCR stage ---
    # The extraction paths above fall through to here instead of returning,
    # so that an OCR request is actually honoured.
    use_ocr = ocr is True or (isinstance(ocr, dict) and ocr.get('enabled', False))
    auto_ocr = False
    if ocr is None:
        # NOTE(review): assumes the owning PDF exposes `_ocr_config` as a
        # dict; a missing or non-dict value is treated as "OCR not in auto mode"
        pdf_ocr_config = getattr(getattr(self.page, '_parent', None), '_ocr_config', None)
        auto_ocr = isinstance(pdf_ocr_config, dict) and pdf_ocr_config.get('enabled') == 'auto'

    if use_ocr or (auto_ocr and not result.strip()):
        if use_ocr:
            ocr_config = self.page._get_ocr_config(ocr or {})
        else:
            ocr_config = self.page._get_ocr_config({'enabled': 'auto'})
        ocr_elements = self.apply_ocr(**ocr_config)

        if ocr_elements:
            if exclusion_regions:
                ocr_elements = [
                    element for element in ocr_elements
                    if not any(region._is_element_in_region(element) for region in exclusion_regions)
                ]

            from natural_pdf.elements.collections import ElementCollection
            ocr_text = ElementCollection(ocr_elements).extract_text(
                preserve_whitespace=keep_blank_chars, **kwargs)

            # OCR output replaces the result only when it found something
            if ocr_text.strip():
                return ocr_text

    return result
656
+
657
def extract_table(self, method: str = None, table_settings: dict = None,
                  use_ocr: bool = False, ocr_config: dict = None) -> List[List[str]]:
    """
    Extract a table from this region, dispatching on the chosen method.

    Args:
        method: 'tatr' selects the TATR path; any other non-None value selects
                the pdfplumber path; None auto-detects from this region's
                model and region_type
        table_settings: Settings for the pdfplumber path ({} when None)
        use_ocr: Forwarded to the TATR path only
        ocr_config: Forwarded to the TATR path only

    Returns:
        Table data as a list of rows, where each row is a list of cell values
    """
    settings = {} if table_settings is None else table_settings

    chosen = method
    if chosen is None:
        # A table region detected by the TATR model gets the TATR extractor
        detected_by_tatr = (hasattr(self, 'model') and self.model == 'tatr'
                            and self.region_type == 'table')
        chosen = 'tatr' if detected_by_tatr else 'plumber'

    if chosen == 'tatr':
        return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
    # Everything else falls back to pdfplumber
    return self._extract_table_plumber(settings)
688
+
689
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
    """
    Extract a table by cropping the underlying page object to this region.

    Args:
        table_settings: Settings passed to the crop's extract_tables()

    Returns:
        The first table found, as a list of rows, or [] when none is found
    """
    found = self.page._page.crop(self.bbox).extract_tables(table_settings)
    return found[0] if found else []
709
+
710
+ def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
711
+ """
712
+ Extract table using TATR structure detection.
713
+
714
+ Args:
715
+ use_ocr: Whether to apply OCR to each cell for better text extraction
716
+ ocr_config: Optional OCR configuration parameters
717
+
718
+ Returns:
719
+ Table data as a list of rows, where each row is a list of cell values
720
+ """
721
+ # Find all rows and headers in this table
722
+ rows = self.page.find_all(f'region[type=table-row][model=tatr]')
723
+ headers = self.page.find_all(f'region[type=table-column-header][model=tatr]')
724
+ columns = self.page.find_all(f'region[type=table-column][model=tatr]')
725
+
726
+ # Filter to only include rows/headers/columns that overlap with this table region
727
+ def is_in_table(region):
728
+ # Check for overlap - simplifying to center point for now
729
+ region_center_x = (region.x0 + region.x1) / 2
730
+ region_center_y = (region.top + region.bottom) / 2
731
+ return (self.x0 <= region_center_x <= self.x1 and
732
+ self.top <= region_center_y <= self.bottom)
733
+
734
+ rows = [row for row in rows if is_in_table(row)]
735
+ headers = [header for header in headers if is_in_table(header)]
736
+ columns = [column for column in columns if is_in_table(column)]
737
+
738
+ # Sort rows by vertical position (top to bottom)
739
+ rows.sort(key=lambda r: r.top)
740
+
741
+ # Sort columns by horizontal position (left to right)
742
+ columns.sort(key=lambda c: c.x0)
743
+
744
+ # Create table data structure
745
+ table_data = []
746
+
747
+ # Prepare OCR config if needed
748
+ if use_ocr:
749
+ # Default OCR config focuses on small text with low confidence
750
+ default_ocr_config = {
751
+ "enabled": True,
752
+ "min_confidence": 0.1, # Lower than default to catch more text
753
+ "detection_params": {
754
+ "text_threshold": 0.1, # Lower threshold for low-contrast text
755
+ "link_threshold": 0.1 # Lower threshold for connecting text components
756
+ }
757
+ }
758
+
759
+ # Merge with provided config if any
760
+ if ocr_config:
761
+ if isinstance(ocr_config, dict):
762
+ # Update default config with provided values
763
+ for key, value in ocr_config.items():
764
+ if isinstance(value, dict) and key in default_ocr_config and isinstance(default_ocr_config[key], dict):
765
+ # Merge nested dicts
766
+ default_ocr_config[key].update(value)
767
+ else:
768
+ # Replace value
769
+ default_ocr_config[key] = value
770
+ else:
771
+ # Not a dict, use as is
772
+ default_ocr_config = ocr_config
773
+
774
+ # Use the merged config
775
+ ocr_config = default_ocr_config
776
+
777
+ # Add header row if headers were detected
778
+ if headers:
779
+ header_texts = []
780
+ for header in headers:
781
+ if use_ocr:
782
+ # Try OCR for better text extraction
783
+ ocr_elements = header.apply_ocr(**ocr_config)
784
+ if ocr_elements:
785
+ ocr_text = " ".join(e.text for e in ocr_elements).strip()
786
+ if ocr_text:
787
+ header_texts.append(ocr_text)
788
+ continue
789
+
790
+ # Fallback to normal extraction
791
+ header_texts.append(header.extract_text().strip())
792
+ table_data.append(header_texts)
793
+
794
+ # Process rows
795
+ for row in rows:
796
+ row_cells = []
797
+
798
+ # If we have columns, use them to extract cells
799
+ if columns:
800
+ for column in columns:
801
+ # Create a cell region at the intersection of row and column
802
+ cell_bbox = (
803
+ column.x0,
804
+ row.top,
805
+ column.x1,
806
+ row.bottom
807
+ )
808
+
809
+ # Create a region for this cell
810
+ from natural_pdf.elements.region import Region # Import here to avoid circular imports
811
+ cell_region = Region(self.page, cell_bbox)
812
+
813
+ # Extract text from the cell
814
+ if use_ocr:
815
+ # Apply OCR to the cell
816
+ ocr_elements = cell_region.apply_ocr(**ocr_config)
817
+ if ocr_elements:
818
+ # Get text from OCR elements
819
+ ocr_text = " ".join(e.text for e in ocr_elements).strip()
820
+ if ocr_text:
821
+ row_cells.append(ocr_text)
822
+ continue
823
+
824
+ # Fallback to normal extraction
825
+ cell_text = cell_region.extract_text().strip()
826
+ row_cells.append(cell_text)
827
+ else:
828
+ # No column information, just extract the whole row text
829
+ if use_ocr:
830
+ # Try OCR on the whole row
831
+ ocr_elements = row.apply_ocr(**ocr_config)
832
+ if ocr_elements:
833
+ ocr_text = " ".join(e.text for e in ocr_elements).strip()
834
+ if ocr_text:
835
+ row_cells.append(ocr_text)
836
+ continue
837
+
838
+ # Fallback to normal extraction
839
+ row_cells.append(row.extract_text().strip())
840
+
841
+ table_data.append(row_cells)
842
+
843
+ return table_data
844
+
845
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
    """
    Return the first element in this region that matches the selector.

    This is a thin wrapper over ``find_all``: all arguments are forwarded
    unchanged and only the first result is kept.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for element filtering

    Returns:
        First matching element, or None when nothing matches
    """
    matches = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
    if not matches:
        return None
    return matches[0]
859
+
860
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
    """
    Collect every element in this region that matches the selector.

    Matching itself is always delegated to the owning page's ``find_all``;
    this method only narrows the page-level results down to the region.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to apply exclusion regions
        **kwargs: Additional parameters for element filtering

    Returns:
        ElementCollection with matching elements
    """
    from natural_pdf.elements.collections import ElementCollection

    is_multi_page = self._spans_pages and self._multi_page_elements is not None
    if not is_multi_page:
        # Single-page region: query the page, keep what lies inside the region
        candidates = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        return ElementCollection([c for c in candidates if self._is_element_in_region(c)])

    # Multi-page region: the selector is parsed up front. The parsed object
    # is not used further below - presumably this only surfaces malformed
    # selectors early; the per-page find_all does the real matching.
    from natural_pdf.selectors.parser import parse_selector
    parse_selector(selector)

    # Bucket the region's own elements by the page they live on
    members_by_page = {}
    for member in self._multi_page_elements:
        members_by_page.setdefault(member.page, []).append(member)

    # Ask each page for its matches and keep those that belong to the region
    matched = []
    for page, members in members_by_page.items():
        page_hits = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        matched.extend(hit for hit in page_hits if hit in members)

    return ElementCollection(matched)
907
+
908
def apply_ocr(self, **ocr_params) -> List['TextElement']:
    """
    Run OCR on this region and return the text elements that were created.

    The page is rendered, cropped to the region's (x0, top, x1, bottom) box
    and handed to the shared OCR manager. Results are shifted by the region
    origin, filtered to those lying fully inside the region, wrapped in
    ``TextElement`` objects and also appended to the page's ``words`` list
    when the page exposes an ``_elements`` store.

    Args:
        **ocr_params: OCR parameters to override defaults

    Returns:
        List of created text elements (empty when OCR is disabled)
    """
    from natural_pdf.utils.ocr import OCRManager

    # **ocr_params is always a dict, so verbosity can be switched off directly
    ocr_params["verbose"] = False
    ocr_config = self.page._get_ocr_config(ocr_params)

    # Skip if OCR is disabled
    if not ocr_config.get('enabled'):
        return []

    # Render the page and cut out this region
    region_image = self.page.to_image().crop((self.x0, self.top, self.x1, self.bottom))
    results = OCRManager.get_instance().recognize_region(region_image, ocr_config)

    # Shift every result box by the region origin (done in place on the
    # result records, before any element is built)
    for result in results:
        raw_box = result['bbox']
        result['bbox'] = (
            raw_box[0] + self.x0,
            raw_box[1] + self.top,
            raw_box[2] + self.x0,
            raw_box[3] + self.top,
        )

    from natural_pdf.elements.text import TextElement

    elements = []
    for result in results:
        left, upper, right, lower = result['bbox']

        # Only results fully contained in the region are kept
        fully_inside = (left >= self.x0 and upper >= self.top and
                        right <= self.x1 and lower <= self.bottom)
        if not fully_inside:
            continue

        elem = TextElement({
            'text': result['text'],
            'x0': left,
            'top': upper,
            'x1': right,
            'bottom': lower,
            'width': right - left,
            'height': lower - upper,
            'object_type': 'text',
            'source': 'ocr',
            'confidence': result['confidence'],
            # Placeholder font information so OCR text looks like regular text
            'fontname': 'OCR-detected',
            'size': 10.0,
            'page_number': self.page.number
        }, self.page)
        elements.append(elem)

        # Register on the page so the element is reachable via the words list
        page_store = getattr(self.page, '_elements', None)
        if page_store is not None:
            if 'words' in page_store:
                page_store['words'].append(elem)
            else:
                page_store['words'] = [elem]

    return elements
991
+
992
def expand(self,
           left: float = 0,
           right: float = 0,
           top_expand: float = 0,  # Renamed to avoid conflict
           bottom_expand: float = 0,  # Renamed to avoid conflict
           width_factor: float = 1.0,
           height_factor: float = 1.0,
           # Keep original parameter names for backward compatibility
           top: float = None,
           bottom: float = None) -> 'Region':
    """
    Build a new region by growing (or shrinking) this one.

    Absolute offsets are applied first; the width/height factors are then
    applied to the already-offset box, keeping its center fixed.

    Args:
        left: Amount to expand left edge
        right: Amount to expand right edge
        top_expand: Amount to expand top edge (upward)
        bottom_expand: Amount to expand bottom edge (downward)
        width_factor: Factor to multiply width by
        height_factor: Factor to multiply height by
        top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
        bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)

    Returns:
        New expanded Region
    """
    # A supplied deprecated name takes precedence over its replacement
    grow_up = top_expand if top is None else top
    grow_down = bottom_expand if bottom is None else bottom

    # Absolute offsets: moving "up" means a smaller top coordinate
    edge_left = self.x0 - left
    edge_right = self.x1 + right
    edge_top = self.top - grow_up
    edge_bottom = self.bottom + grow_down

    # Relative scaling around the center, skipped when both factors are 1.0
    if not (width_factor == 1.0 and height_factor == 1.0):
        width_now = edge_right - edge_left
        height_now = edge_bottom - edge_top

        width_diff = width_now * width_factor - width_now
        height_diff = height_now * height_factor - height_now

        edge_left -= width_diff / 2
        edge_right += width_diff / 2
        edge_top -= height_diff / 2
        edge_bottom += height_diff / 2

    expanded = Region(self.page, (edge_left, edge_top, edge_right, edge_bottom))

    # Carry multi-page bookkeeping over to the new region
    if self._spans_pages:
        expanded._spans_pages = True
        expanded._multi_page_elements = self._multi_page_elements
        expanded._page_range = self._page_range
        expanded.start_element = self.start_element
        expanded.end_element = self.end_element

    return expanded
1068
+
1069
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
    """
    Get a section between two elements within this region.

    The region's elements are ordered by (top, x0); the section is the
    bounding box of the slice running from the start element to the end
    element, narrowed according to ``boundary_inclusion``. A boundary that is
    missing or not part of this region falls back to the first / last element.

    Args:
        start_element: Element marking the start of the section
        end_element: Element marking the end of the section
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        Region representing the section. This region itself is returned when
        it holds no elements, and a zero-size region at (0, 0, 0, 0) when the
        requested range is empty.
    """
    elements = self.get_elements()

    # If no elements, return self
    if not elements:
        return self

    # Reading order. sorted() builds a new list, so the sequence handed back
    # by get_elements() is left in its original order.
    ordered = sorted(elements, key=lambda e: (e.top, e.x0))
    last_idx = len(ordered) - 1

    start_idx = 0
    if start_element:
        try:
            start_idx = ordered.index(start_element)
        except ValueError:
            # Start element not in region, use first element
            pass

    end_idx = last_idx
    if end_element:
        try:
            end_idx = ordered.index(end_element)
        except ValueError:
            # End element not in region, use last element
            pass

    keep_start = boundary_inclusion in ('start', 'both')
    keep_end = boundary_inclusion in ('end', 'both')

    # Step past a boundary that must not be part of the section
    if boundary_inclusion in ('none', 'end'):
        start_idx += 1
    if boundary_inclusion in ('none', 'start'):
        end_idx -= 1

    # Ensure valid indexes
    start_idx = max(0, start_idx)
    end_idx = min(last_idx, end_idx)

    # If no valid elements in range, return empty region
    if start_idx > end_idx:
        return Region(self.page, (0, 0, 0, 0))

    # Non-empty by construction: 0 <= start_idx <= end_idx <= last_idx
    section_elements = ordered[start_idx:end_idx + 1]

    x0 = min(e.x0 for e in section_elements)
    x1 = max(e.x1 for e in section_elements)
    tight_top = min(e.top for e in section_elements)
    tight_bottom = max(e.bottom for e in section_elements)
    top, bottom = tight_top, tight_bottom

    # Amount to pull an edge away from an excluded boundary element
    pixel_adjustment = 2.0

    # Excluded start element sitting within 10 points above: move the top down
    if start_element and not keep_start and start_idx > 0:
        if abs(top - start_element.bottom) < 10:
            top += pixel_adjustment

    # Excluded end element sitting within 10 points below: move the bottom up
    if end_element and not keep_end and end_idx < last_idx:
        if abs(bottom - end_element.top) < 10:
            bottom -= pixel_adjustment

    # Undo the adjustments if they would produce an invalid (inverted) region
    if top >= bottom:
        top, bottom = tight_top, tight_bottom

    section = Region(self.page, (x0, top, x1, bottom))
    section.start_element = start_element if keep_start else None
    section.end_element = end_element if keep_end else None

    return section
1165
+
1166
def get_sections(self, start_elements=None, end_elements=None, boundary_inclusion='both') -> List['Region']:
    """
    Get sections within this region based on start/end elements.

    Boundaries are located in the region's elements ordered by (top, x0) and
    walked in that order. With end elements, each start is paired with the
    next end. Without end elements, each start closes the previous section at
    the element just before it. A start left open at the end is closed at the
    region's last element. Every section is built by ``get_section_between``.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        List of Region objects representing the extracted sections (empty
        when there are no start elements)
    """
    def resolve(markers):
        # Selector strings are looked up inside this region; collection
        # objects are unwrapped to their underlying element list
        if isinstance(markers, str):
            markers = self.find_all(markers)
        if hasattr(markers, 'elements'):
            markers = markers.elements
        return markers

    start_elements = resolve(start_elements)
    end_elements = resolve(end_elements)

    # If no start elements, return empty list
    if not start_elements:
        return []

    # Reading order. sorted() builds a new list, so the sequence handed back
    # by get_elements() is left in its original order.
    all_elements = sorted(self.get_elements(), key=lambda e: (e.top, e.x0))

    def locate(markers, kind):
        # Boundary records for the markers that are part of this region
        located = []
        for element in markers:
            try:
                idx = all_elements.index(element)
            except ValueError:
                # Element not in this region, skip
                continue
            located.append({'index': idx, 'element': element, 'type': kind})
        return located

    section_boundaries = locate(start_elements, 'start')
    if end_elements:
        section_boundaries.extend(locate(end_elements, 'end'))

    # Document order; the sort is stable, so a start precedes an end that
    # shares its index
    section_boundaries.sort(key=lambda b: b['index'])

    sections = []
    current_start = None

    for boundary in section_boundaries:
        kind = boundary['type']

        if kind == 'start' and current_start is None:
            current_start = boundary

        elif kind == 'end' and current_start is not None:
            # Close the open section at this end boundary
            sections.append(self.get_section_between(
                current_start['element'],
                boundary['element'],
                boundary_inclusion
            ))
            current_start = None

        elif kind == 'start' and current_start is not None and not end_elements:
            # Splitting by starts only: close just before the new start
            idx = boundary['index']
            end_element = all_elements[idx - 1] if idx > 0 else None
            sections.append(self.get_section_between(
                current_start['element'],
                end_element,
                boundary_inclusion
            ))
            current_start = boundary

    # A start left open runs to the last element in the region
    if current_start is not None:
        end_element = all_elements[-1] if all_elements else None
        sections.append(self.get_section_between(
            current_start['element'],
            end_element,
            boundary_inclusion
        ))

    return sections
1280
+
1281
def create_cells(self):
    """
    Create cell regions for a TATR-detected table.

    Row and column regions detected by the ``tatr`` model are looked up on
    the page, kept when their center point lies inside this table, and
    crossed: one cell is created per (row, column) pair via
    ``page.create_region`` and tagged as a derived ``table-cell``.

    Returns:
        List of cell regions in row-major order (rows top to bottom, columns
        left to right within each row)

    Raises:
        ValueError: If this region is not a table region detected by TATR
    """
    # getattr: a region that never had `model` / `region_type` assigned must
    # fail with the ValueError below rather than an AttributeError
    is_tatr_table = (getattr(self, 'region_type', None) == 'table'
                     and getattr(self, 'model', None) == 'tatr')
    if not is_tatr_table:
        raise ValueError("Only works for TATR-detected table regions")

    # Candidate rows and columns from the whole page
    rows = self.page.find_all('region[type=table-row][model=tatr]')
    columns = self.page.find_all('region[type=table-column][model=tatr]')

    def is_in_table(element):
        # Membership test: center point inside this table's bounding box
        center_x = (element.x0 + element.x1) / 2
        center_y = (element.top + element.bottom) / 2
        return (self.x0 <= center_x <= self.x1 and
                self.top <= center_y <= self.bottom)

    table_rows = sorted((r for r in rows if is_in_table(r)), key=lambda r: r.top)
    table_columns = sorted((c for c in columns if is_in_table(c)), key=lambda c: c.x0)

    cells = []
    for row in table_rows:
        for column in table_columns:
            # Cell = horizontal extent of the column x vertical extent of the row
            cell = self.page.create_region(
                column.x0, row.top, column.x1, row.bottom
            )
            # Set minimal metadata
            cell.source = 'derived'
            cell.region_type = 'table-cell'
            cell.model = 'tatr'
            cells.append(cell)

    return cells
1325
+
1326
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Answer a question about this region's content with document QA.

    The question is handed to the QA engine's ``ask_pdf_region`` together
    with this region. If the QA module (or anything it needs) cannot be
    imported, a warning is logged and a "not found" result is returned
    instead of raising.

    Args:
        question: The question to ask about the region content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA (if None, uses default model)
        debug: Forwarded unchanged to the QA engine's ``ask_pdf_region``
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        Dictionary with answer details: {
            "answer": extracted text,
            "confidence": confidence score,
            "found": whether an answer was found,
            "page_num": page number,
            "region": reference to this region,
            "source_elements": list of elements that contain the answer (if found)
        }
        When QA is unavailable the dictionary holds only "answer" (empty),
        "confidence" (0.0), "error" and "found" (False).
    """
    try:
        from natural_pdf.qa.document_qa import get_qa_engine

        # Only name a model when one was requested; otherwise use the default
        engine_options = {"model_name": model} if model else {}
        qa_engine = get_qa_engine(**engine_options)

        return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
    except ImportError as e:
        import logging
        logging.getLogger("natural_pdf.elements.region").warning(f"QA functionality not available: {e}")
        return dict(
            answer="",
            confidence=0.0,
            error="QA functionality not available",
            found=False,
        )