natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -88,20 +88,6 @@ class RectangleElement(Element):
88
88
  """Get the stroke width of the rectangle."""
89
89
  return self._obj.get("linewidth", 0)
90
90
 
91
- def text_inside(self, **kwargs) -> Any:
92
- """
93
- Get text elements inside this rectangle.
94
-
95
- Args:
96
- **kwargs: Additional filter parameters
97
-
98
- Returns:
99
- ElementCollection of text elements inside this rectangle
100
- """
101
- from natural_pdf.elements.collections import ElementCollection
102
-
103
- # TODO: Implement proper filtering of elements inside this rectangle
104
- return ElementCollection([]) # Placeholder
105
91
 
106
92
  def extract_text(self, **kwargs) -> str:
107
93
  """
@@ -21,15 +21,15 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
21
21
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
22
22
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
23
23
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
- from natural_pdf.utils.locks import pdf_render_lock # Import the lock
25
-
26
- # Import new utils
27
- from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
28
24
 
29
25
  # ------------------------------------------------------------------
30
26
  # Table utilities
31
27
  # ------------------------------------------------------------------
32
28
  from natural_pdf.tables import TableResult
29
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
30
+
31
+ # Import new utils
32
+ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
33
33
 
34
34
  # --- End Classification Imports --- #
35
35
 
@@ -55,9 +55,70 @@ except ImportError:
55
55
  logger = logging.getLogger(__name__)
56
56
 
57
57
 
58
- class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
59
- """
60
- Represents a rectangular region on a page.
58
+ class Region(
59
+ DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
60
+ ):
61
+ """Represents a rectangular region on a page.
62
+
63
+ Regions are fundamental building blocks in natural-pdf that define rectangular
64
+ areas of a page for analysis, extraction, and navigation. They can be created
65
+ manually or automatically through spatial navigation methods like .below(), .above(),
66
+ .left(), and .right() from elements or other regions.
67
+
68
+ Regions integrate multiple analysis capabilities through mixins and provide:
69
+ - Element filtering and collection within the region boundary
70
+ - OCR processing for the region area
71
+ - Table detection and extraction
72
+ - AI-powered classification and structured data extraction
73
+ - Visual rendering and debugging capabilities
74
+ - Text extraction with spatial awareness
75
+
76
+ The Region class supports both rectangular and polygonal boundaries, making it
77
+ suitable for complex document layouts and irregular shapes detected by layout
78
+ analysis algorithms.
79
+
80
+ Attributes:
81
+ page: Reference to the parent Page object.
82
+ bbox: Bounding box tuple (x0, top, x1, bottom) in PDF coordinates.
83
+ x0: Left x-coordinate.
84
+ top: Top y-coordinate (minimum y).
85
+ x1: Right x-coordinate.
86
+ bottom: Bottom y-coordinate (maximum y).
87
+ width: Region width (x1 - x0).
88
+ height: Region height (bottom - top).
89
+ polygon: List of coordinate points for non-rectangular regions.
90
+ label: Optional descriptive label for the region.
91
+ metadata: Dictionary for storing analysis results and custom data.
92
+
93
+ Example:
94
+ Creating regions:
95
+ ```python
96
+ pdf = npdf.PDF("document.pdf")
97
+ page = pdf.pages[0]
98
+
99
+ # Manual region creation
100
+ header_region = page.region(0, 0, page.width, 100)
101
+
102
+ # Spatial navigation from elements
103
+ summary_text = page.find('text:contains("Summary")')
104
+ content_region = summary_text.below(until='text[size>12]:bold')
105
+
106
+ # Extract content from region
107
+ tables = content_region.extract_table()
108
+ text = content_region.get_text()
109
+ ```
110
+
111
+ Advanced usage:
112
+ ```python
113
+ # OCR processing
114
+ region.apply_ocr(engine='easyocr', resolution=300)
115
+
116
+ # AI-powered extraction
117
+ data = region.extract_structured_data(MySchema)
118
+
119
+ # Visual debugging
120
+ region.show(highlights=['tables', 'text'])
121
+ ```
61
122
  """
62
123
 
63
124
  def __init__(
@@ -68,23 +129,46 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
68
129
  parent=None,
69
130
  label: Optional[str] = None,
70
131
  ):
71
- """
72
- Initialize a region.
132
+ """Initialize a region.
133
+
134
+ Creates a Region object that represents a rectangular or polygonal area on a page.
135
+ Regions are used for spatial navigation, content extraction, and analysis operations.
73
136
 
74
137
  Args:
75
- page: Parent page
76
- bbox: Bounding box as (x0, top, x1, bottom)
77
- polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
78
- parent: Optional parent region (for hierarchical document structure)
138
+ page: Parent Page object that contains this region and provides access
139
+ to document elements and analysis capabilities.
140
+ bbox: Bounding box coordinates as (x0, top, x1, bottom) tuple in PDF
141
+ coordinate system (points, with origin at top-left; y increases downward).
142
+ polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for
143
+ non-rectangular regions. If provided, the region will use polygon-based
144
+ intersection calculations instead of simple rectangle overlap.
145
+ parent: Optional parent region for hierarchical document structure.
146
+ Useful for maintaining tree-like relationships between regions.
147
+ label: Optional descriptive label for the region, useful for debugging
148
+ and identification in complex workflows.
149
+
150
+ Example:
151
+ ```python
152
+ pdf = npdf.PDF("document.pdf")
153
+ page = pdf.pages[0]
154
+
155
+ # Rectangular region
156
+ header = Region(page, (0, 0, page.width, 100), label="header")
157
+
158
+ # Polygonal region (from layout detection)
159
+ table_polygon = [(50, 100), (300, 100), (300, 400), (50, 400)]
160
+ table_region = Region(page, (50, 100, 300, 400),
161
+ polygon=table_polygon, label="table")
162
+ ```
163
+
164
+ Note:
165
+ Regions are typically created through page methods like page.region() or
166
+ spatial navigation methods like element.below(). Direct instantiation is
167
+ used mainly for advanced workflows or layout analysis integration.
79
168
  """
80
169
  self._page = page
81
170
  self._bbox = bbox
82
171
  self._polygon = polygon
83
- self._multi_page_elements = None
84
- self._spans_pages = False
85
- self._page_range = None
86
- self.start_element = None
87
- self.end_element = None
88
172
 
89
173
  self.metadata: Dict[str, Any] = {}
90
174
  # Analysis results live under self.metadata['analysis'] via property
@@ -444,10 +528,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
444
528
  Returns:
445
529
  True if the element is in the region, False otherwise
446
530
  """
447
- # If we have multi-page elements cached, check if the element is in the list
448
- if self._spans_pages and self._multi_page_elements is not None:
449
- return element in self._multi_page_elements
450
-
451
531
  # Check if element is on the same page
452
532
  if not hasattr(element, "page") or element.page != self._page:
453
533
  return False
@@ -614,12 +694,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
614
694
  """
615
695
  # Apply global options as defaults
616
696
  import natural_pdf
697
+
617
698
  if resolution is None:
618
699
  if natural_pdf.options.image.resolution is not None:
619
700
  resolution = natural_pdf.options.image.resolution
620
701
  else:
621
702
  resolution = 144 # Default resolution when none specified
622
-
703
+
623
704
  # Handle the case where user wants the cropped region to have a specific width
624
705
  page_kwargs = kwargs.copy()
625
706
  effective_resolution = resolution # Start with the provided resolution
@@ -722,12 +803,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
722
803
  """
723
804
  # Apply global options as defaults
724
805
  import natural_pdf
806
+
725
807
  if resolution is None:
726
808
  if natural_pdf.options.image.resolution is not None:
727
809
  resolution = natural_pdf.options.image.resolution
728
810
  else:
729
811
  resolution = 144 # Default resolution when none specified
730
-
812
+
731
813
  if not self._page:
732
814
  raise ValueError("Region must be associated with a page to show.")
733
815
 
@@ -764,7 +846,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
764
846
  )
765
847
 
766
848
  def save(
767
- self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
849
+ self,
850
+ filename: str,
851
+ resolution: Optional[float] = None,
852
+ labels: bool = True,
853
+ legend_position: str = "right",
768
854
  ) -> "Region":
769
855
  """
770
856
  Save the page with this region highlighted to an image file.
@@ -780,17 +866,20 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
780
866
  """
781
867
  # Apply global options as defaults
782
868
  import natural_pdf
869
+
783
870
  if resolution is None:
784
871
  if natural_pdf.options.image.resolution is not None:
785
872
  resolution = natural_pdf.options.image.resolution
786
873
  else:
787
874
  resolution = 144 # Default resolution when none specified
788
-
875
+
789
876
  # Highlight this region if not already highlighted
790
877
  self.highlight()
791
878
 
792
879
  # Save the highlighted image
793
- self._page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
880
+ self._page.save_image(
881
+ filename, resolution=resolution, labels=labels, legend_position=legend_position
882
+ )
794
883
  return self
795
884
 
796
885
  def save_image(
@@ -816,12 +905,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
816
905
  """
817
906
  # Apply global options as defaults
818
907
  import natural_pdf
908
+
819
909
  if resolution is None:
820
910
  if natural_pdf.options.image.resolution is not None:
821
911
  resolution = natural_pdf.options.image.resolution
822
912
  else:
823
913
  resolution = 144 # Default resolution when none specified
824
-
914
+
825
915
  # Get the region image
826
916
  image = self.to_image(
827
917
  resolution=resolution,
@@ -856,27 +946,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
856
946
  pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
857
947
  This helps avoid detecting box borders/slivers as content.
858
948
 
859
- Returns:
860
- New Region with visual whitespace trimmed from all edges
949
+ Returns
950
+ -------
861
951
 
862
- Example:
863
- # Basic trimming with 1 pixel padding and 0.5px pre-shrink
864
- trimmed = region.trim()
952
+ New Region with visual whitespace trimmed from all edges
865
953
 
866
- # More aggressive trimming with no padding and no pre-shrink
867
- tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
954
+ Examples
955
+ --------
868
956
 
869
- # Conservative trimming with more padding
870
- loose = region.trim(padding=3, threshold=0.98)
957
+ ```python
958
+ # Basic trimming with 1 pixel padding and 0.5px pre-shrink
959
+ trimmed = region.trim()
960
+
961
+ # More aggressive trimming with no padding and no pre-shrink
962
+ tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
963
+
964
+ # Conservative trimming with more padding
965
+ loose = region.trim(padding=3, threshold=0.98)
966
+ ```
871
967
  """
872
968
  # Apply global options as defaults
873
969
  import natural_pdf
970
+
874
971
  if resolution is None:
875
972
  if natural_pdf.options.image.resolution is not None:
876
973
  resolution = natural_pdf.options.image.resolution
877
974
  else:
878
975
  resolution = 144 # Default resolution when none specified
879
-
976
+
880
977
  # Pre-shrink the region to avoid box slivers
881
978
  work_region = (
882
979
  self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
@@ -885,9 +982,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
885
982
  )
886
983
 
887
984
  # Get the region image
888
- image = work_region.to_image(
889
- resolution=resolution, crop=True, include_highlights=False
890
- )
985
+ image = work_region.to_image(resolution=resolution, crop=True, include_highlights=False)
891
986
 
892
987
  if image is None:
893
988
  logger.warning(
@@ -1113,12 +1208,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1113
1208
  Returns:
1114
1209
  List of elements in the region
1115
1210
  """
1116
- # If we have multi-page elements, return those
1117
- if self._spans_pages and self._multi_page_elements is not None:
1118
- # TODO: Apply selector to multi-page elements if needed
1119
- return self._multi_page_elements
1120
-
1121
- # Otherwise, get elements from the page
1122
1211
  if selector:
1123
1212
  # Find elements on the page matching the selector
1124
1213
  page_elements = self.page.find_all(
@@ -1257,7 +1346,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1257
1346
  try:
1258
1347
  cell_regions_in_table = [
1259
1348
  c
1260
- for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
1349
+ for c in self.page.find_all(
1350
+ "region[type=table_cell]", apply_exclusions=False
1351
+ )
1261
1352
  if self.intersects(c)
1262
1353
  ]
1263
1354
  except Exception as _cells_err:
@@ -1324,7 +1415,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1324
1415
  # This must happen AFTER alias handling (so strategies are final)
1325
1416
  # and BEFORE we delegate to _extract_table_* helpers.
1326
1417
  # -------------------------------------------------------------
1327
- if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
1418
+ if "text" in (
1419
+ table_settings.get("vertical_strategy"),
1420
+ table_settings.get("horizontal_strategy"),
1421
+ ):
1328
1422
  page_cfg = getattr(self.page, "_config", {})
1329
1423
  # Ensure text_* tolerances passed to pdfplumber
1330
1424
  if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
@@ -1466,19 +1560,35 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1466
1560
  table_settings.get("vertical_strategy"),
1467
1561
  table_settings.get("horizontal_strategy"),
1468
1562
  )
1469
- if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1563
+ if (
1564
+ _uses_text
1565
+ and "text_x_tolerance" not in table_settings
1566
+ and "x_tolerance" not in table_settings
1567
+ ):
1470
1568
  x_tol = pdf_cfg.get("x_tolerance")
1471
1569
  if x_tol is not None:
1472
1570
  table_settings.setdefault("text_x_tolerance", x_tol)
1473
- if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1571
+ if (
1572
+ _uses_text
1573
+ and "text_y_tolerance" not in table_settings
1574
+ and "y_tolerance" not in table_settings
1575
+ ):
1474
1576
  y_tol = pdf_cfg.get("y_tolerance")
1475
1577
  if y_tol is not None:
1476
1578
  table_settings.setdefault("text_y_tolerance", y_tol)
1477
1579
 
1478
- if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1580
+ if (
1581
+ _uses_text
1582
+ and "snap_tolerance" not in table_settings
1583
+ and "snap_x_tolerance" not in table_settings
1584
+ ):
1479
1585
  snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1480
1586
  table_settings.setdefault("snap_tolerance", snap)
1481
- if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1587
+ if (
1588
+ _uses_text
1589
+ and "join_tolerance" not in table_settings
1590
+ and "join_x_tolerance" not in table_settings
1591
+ ):
1482
1592
  join = table_settings.get("snap_tolerance", 1)
1483
1593
  table_settings.setdefault("join_tolerance", join)
1484
1594
  table_settings.setdefault("join_x_tolerance", join)
@@ -1510,11 +1620,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1510
1620
  table_settings.get("vertical_strategy"),
1511
1621
  table_settings.get("horizontal_strategy"),
1512
1622
  )
1513
- if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1623
+ if (
1624
+ _uses_text
1625
+ and "text_x_tolerance" not in table_settings
1626
+ and "x_tolerance" not in table_settings
1627
+ ):
1514
1628
  x_tol = pdf_cfg.get("x_tolerance")
1515
1629
  if x_tol is not None:
1516
1630
  table_settings.setdefault("text_x_tolerance", x_tol)
1517
- if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1631
+ if (
1632
+ _uses_text
1633
+ and "text_y_tolerance" not in table_settings
1634
+ and "y_tolerance" not in table_settings
1635
+ ):
1518
1636
  y_tol = pdf_cfg.get("y_tolerance")
1519
1637
  if y_tol is not None:
1520
1638
  table_settings.setdefault("text_y_tolerance", y_tol)
@@ -1942,23 +2060,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1942
2060
  else:
1943
2061
  raise ValueError("Internal error: No selector or text provided.")
1944
2062
 
1945
- # If we span multiple pages, filter our elements
1946
- # TODO: Revisit multi-page region logic
1947
- if self._spans_pages and self._multi_page_elements is not None:
1948
- logger.warning("find_all on multi-page regions is not fully implemented.")
1949
- # Temporary: Apply filter directly to cached elements
1950
- try:
1951
- selector_obj = parse_selector(effective_selector)
1952
- # Pass regex/case flags down
1953
- kwargs["regex"] = regex
1954
- kwargs["case"] = case
1955
- filter_func = selector_to_filter_func(selector_obj, **kwargs)
1956
- matching = [el for el in self._multi_page_elements if filter_func(el)]
1957
- return ElementCollection(matching)
1958
- except Exception as e:
1959
- logger.error(f"Error applying selector to multi-page region elements: {e}")
1960
- return ElementCollection([])
1961
-
1962
2063
  # Normal case: Region is on a single page
1963
2064
  try:
1964
2065
  # Parse the final selector string
@@ -2016,10 +2117,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2016
2117
 
2017
2118
  Examples
2018
2119
  ---------
2019
- >>> def llm_ocr(region):
2020
- ... image = region.to_image(resolution=300, crop=True)
2021
- ... return my_llm_client.ocr(image)
2022
- >>> region.apply_ocr(function=llm_ocr)
2120
+ ```python
2121
+ def llm_ocr(region):
2122
+ image = region.to_image(resolution=300, crop=True)
2123
+ return my_llm_client.ocr(image)
2124
+ region.apply_ocr(function=llm_ocr)
2125
+ ```
2023
2126
 
2024
2127
  Args:
2025
2128
  replace: Whether to remove existing OCR elements first (default ``True``).
@@ -2088,15 +2191,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2088
2191
  # Remove OCR CHAR dicts overlapping region
2089
2192
  for char in list(self.page._element_mgr.chars):
2090
2193
  # char can be dict or TextElement; normalise
2091
- char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
2194
+ char_src = (
2195
+ char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
2196
+ )
2092
2197
  if char_src == "ocr":
2093
2198
  # Rough bbox for dicts
2094
2199
  if isinstance(char, dict):
2095
- cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
2200
+ cx0, ctop, cx1, cbottom = (
2201
+ char.get("x0", 0),
2202
+ char.get("top", 0),
2203
+ char.get("x1", 0),
2204
+ char.get("bottom", 0),
2205
+ )
2096
2206
  else:
2097
2207
  cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
2098
2208
  # Quick overlap check
2099
- if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
2209
+ if not (
2210
+ cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom
2211
+ ):
2100
2212
  _safe_remove(char)
2101
2213
 
2102
2214
  logger.info(
@@ -2219,7 +2331,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2219
2331
  """
2220
2332
  Apply a custom OCR function to this region and create text elements from the results.
2221
2333
 
2222
- This is useful when you want to use a custom OCR method (e.g., an LLM API,
2334
+ This is useful when you want to use a custom OCR method (e.g., an LLM API,
2223
2335
  specialized OCR service, or any custom logic) instead of the built-in OCR engines.
2224
2336
 
2225
2337
  Args:
@@ -2244,15 +2356,15 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2244
2356
  image = region.to_image(resolution=300, crop=True)
2245
2357
  # Call your LLM API here
2246
2358
  return llm_client.ocr(image)
2247
-
2359
+
2248
2360
  region.apply_custom_ocr(ocr_with_llm)
2249
-
2361
+
2250
2362
  # Using with a custom OCR service
2251
2363
  def ocr_with_service(region):
2252
2364
  img_bytes = region.to_image(crop=True).tobytes()
2253
2365
  response = ocr_service.process(img_bytes)
2254
2366
  return response.text
2255
-
2367
+
2256
2368
  region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
2257
2369
  """
2258
2370
  # If replace is True, remove existing OCR elements in this region
@@ -2260,9 +2372,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2260
2372
  logger.info(
2261
2373
  f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
2262
2374
  )
2263
-
2375
+
2264
2376
  removed_count = 0
2265
-
2377
+
2266
2378
  # Helper to remove a single element safely
2267
2379
  def _safe_remove(elem):
2268
2380
  nonlocal removed_count
@@ -2281,41 +2393,60 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2281
2393
  success = False
2282
2394
  if success:
2283
2395
  removed_count += 1
2284
-
2285
- # Remove OCR elements overlapping this region
2286
- for word in list(self.page._element_mgr.words):
2287
- if getattr(word, "source", "").startswith("ocr") and self.intersects(word):
2288
- _safe_remove(word)
2289
-
2290
- # Also check custom-ocr sources
2396
+
2397
+ # Remove ALL OCR elements overlapping this region
2398
+ # Remove elements with source=="ocr" (built-in OCR) or matching the source_label (previous custom OCR)
2291
2399
  for word in list(self.page._element_mgr.words):
2292
- if getattr(word, "source", "") == source_label and self.intersects(word):
2400
+ word_source = getattr(word, "source", "")
2401
+ # Match built-in OCR behavior: remove elements with source "ocr" exactly
2402
+ # Also remove elements with the same source_label to avoid duplicates
2403
+ if (word_source == "ocr" or word_source == source_label) and self.intersects(word):
2293
2404
  _safe_remove(word)
2294
-
2295
- if removed_count > 0:
2296
- logger.info(
2297
- f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
2405
+
2406
+ # Also remove char dicts if needed (matching built-in OCR)
2407
+ for char in list(self.page._element_mgr.chars):
2408
+ # char can be dict or TextElement; normalize
2409
+ char_src = (
2410
+ char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
2298
2411
  )
2299
-
2412
+ if char_src == "ocr" or char_src == source_label:
2413
+ # Rough bbox for dicts
2414
+ if isinstance(char, dict):
2415
+ cx0, ctop, cx1, cbottom = (
2416
+ char.get("x0", 0),
2417
+ char.get("top", 0),
2418
+ char.get("x1", 0),
2419
+ char.get("bottom", 0),
2420
+ )
2421
+ else:
2422
+ cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
2423
+ # Quick overlap check
2424
+ if not (
2425
+ cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom
2426
+ ):
2427
+ _safe_remove(char)
2428
+
2429
+ if removed_count > 0:
2430
+ logger.info(f"Region {self.bbox}: Removed {removed_count} existing OCR elements.")
2431
+
2300
2432
  # Call the custom OCR function
2301
2433
  try:
2302
2434
  logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
2303
2435
  ocr_text = ocr_function(self)
2304
-
2436
+
2305
2437
  if ocr_text is not None and not isinstance(ocr_text, str):
2306
2438
  logger.warning(
2307
2439
  f"Custom OCR function returned non-string type ({type(ocr_text)}). "
2308
2440
  f"Converting to string."
2309
2441
  )
2310
2442
  ocr_text = str(ocr_text)
2311
-
2443
+
2312
2444
  except Exception as e:
2313
2445
  logger.error(
2314
- f"Error calling custom OCR function for region {self.bbox}: {e}",
2315
- exc_info=True
2446
+ f"Error calling custom OCR function for region {self.bbox}: {e}", exc_info=True
2316
2447
  )
2317
2448
  return self
2318
-
2449
+
2319
2450
  # Create text element if we got text
2320
2451
  if ocr_text is not None:
2321
2452
  # Use the to_text_element method to create the element
@@ -2323,16 +2454,16 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2323
2454
  text_content=ocr_text,
2324
2455
  source_label=source_label,
2325
2456
  confidence=confidence,
2326
- add_to_page=add_to_page
2457
+ add_to_page=add_to_page,
2327
2458
  )
2328
-
2459
+
2329
2460
  logger.info(
2330
2461
  f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
2331
2462
  f"{' and added to page' if add_to_page else ''}"
2332
2463
  )
2333
2464
  else:
2334
2465
  logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
2335
-
2466
+
2336
2467
  return self
2337
2468
 
2338
2469
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
@@ -3280,9 +3411,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
3280
3411
  return []
3281
3412
 
3282
3413
  # Build arrays of centers
3283
- centers = np.array([
3284
- [(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions
3285
- ])
3414
+ centers = np.array([[(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions])
3286
3415
  xs = centers[:, 0]
3287
3416
  ys = centers[:, 1]
3288
3417
 
@@ -3314,5 +3443,3 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
3314
3443
  table_grid[row_idx][col_idx] = text_val if text_val else None
3315
3444
 
3316
3445
  return table_grid
3317
-
3318
-