natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +113 -22
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/elements/rect.py
CHANGED
@@ -88,20 +88,6 @@ class RectangleElement(Element):
|
|
88
88
|
"""Get the stroke width of the rectangle."""
|
89
89
|
return self._obj.get("linewidth", 0)
|
90
90
|
|
91
|
-
def text_inside(self, **kwargs) -> Any:
|
92
|
-
"""
|
93
|
-
Get text elements inside this rectangle.
|
94
|
-
|
95
|
-
Args:
|
96
|
-
**kwargs: Additional filter parameters
|
97
|
-
|
98
|
-
Returns:
|
99
|
-
ElementCollection of text elements inside this rectangle
|
100
|
-
"""
|
101
|
-
from natural_pdf.elements.collections import ElementCollection
|
102
|
-
|
103
|
-
# TODO: Implement proper filtering of elements inside this rectangle
|
104
|
-
return ElementCollection([]) # Placeholder
|
105
91
|
|
106
92
|
def extract_text(self, **kwargs) -> str:
|
107
93
|
"""
|
natural_pdf/elements/region.py
CHANGED
@@ -21,15 +21,15 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
|
21
21
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
22
22
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
23
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
-
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
25
|
-
|
26
|
-
# Import new utils
|
27
|
-
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
28
24
|
|
29
25
|
# ------------------------------------------------------------------
|
30
26
|
# Table utilities
|
31
27
|
# ------------------------------------------------------------------
|
32
28
|
from natural_pdf.tables import TableResult
|
29
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
30
|
+
|
31
|
+
# Import new utils
|
32
|
+
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
33
33
|
|
34
34
|
# --- End Classification Imports --- #
|
35
35
|
|
@@ -55,9 +55,70 @@ except ImportError:
|
|
55
55
|
logger = logging.getLogger(__name__)
|
56
56
|
|
57
57
|
|
58
|
-
class Region(
|
59
|
-
|
60
|
-
|
58
|
+
class Region(
|
59
|
+
DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
|
60
|
+
):
|
61
|
+
"""Represents a rectangular region on a page.
|
62
|
+
|
63
|
+
Regions are fundamental building blocks in natural-pdf that define rectangular
|
64
|
+
areas of a page for analysis, extraction, and navigation. They can be created
|
65
|
+
manually or automatically through spatial navigation methods like .below(), .above(),
|
66
|
+
.left(), and .right() from elements or other regions.
|
67
|
+
|
68
|
+
Regions integrate multiple analysis capabilities through mixins and provide:
|
69
|
+
- Element filtering and collection within the region boundary
|
70
|
+
- OCR processing for the region area
|
71
|
+
- Table detection and extraction
|
72
|
+
- AI-powered classification and structured data extraction
|
73
|
+
- Visual rendering and debugging capabilities
|
74
|
+
- Text extraction with spatial awareness
|
75
|
+
|
76
|
+
The Region class supports both rectangular and polygonal boundaries, making it
|
77
|
+
suitable for complex document layouts and irregular shapes detected by layout
|
78
|
+
analysis algorithms.
|
79
|
+
|
80
|
+
Attributes:
|
81
|
+
page: Reference to the parent Page object.
|
82
|
+
bbox: Bounding box tuple (x0, top, x1, bottom) in PDF coordinates.
|
83
|
+
x0: Left x-coordinate.
|
84
|
+
top: Top y-coordinate (minimum y).
|
85
|
+
x1: Right x-coordinate.
|
86
|
+
bottom: Bottom y-coordinate (maximum y).
|
87
|
+
width: Region width (x1 - x0).
|
88
|
+
height: Region height (bottom - top).
|
89
|
+
polygon: List of coordinate points for non-rectangular regions.
|
90
|
+
label: Optional descriptive label for the region.
|
91
|
+
metadata: Dictionary for storing analysis results and custom data.
|
92
|
+
|
93
|
+
Example:
|
94
|
+
Creating regions:
|
95
|
+
```python
|
96
|
+
pdf = npdf.PDF("document.pdf")
|
97
|
+
page = pdf.pages[0]
|
98
|
+
|
99
|
+
# Manual region creation
|
100
|
+
header_region = page.region(0, 0, page.width, 100)
|
101
|
+
|
102
|
+
# Spatial navigation from elements
|
103
|
+
summary_text = page.find('text:contains("Summary")')
|
104
|
+
content_region = summary_text.below(until='text[size>12]:bold')
|
105
|
+
|
106
|
+
# Extract content from region
|
107
|
+
tables = content_region.extract_table()
|
108
|
+
text = content_region.extract_text()
|
109
|
+
```
|
110
|
+
|
111
|
+
Advanced usage:
|
112
|
+
```python
|
113
|
+
# OCR processing
|
114
|
+
region.apply_ocr(engine='easyocr', resolution=300)
|
115
|
+
|
116
|
+
# AI-powered extraction
|
117
|
+
data = region.extract_structured_data(MySchema)
|
118
|
+
|
119
|
+
# Visual debugging
|
120
|
+
region.show(highlights=['tables', 'text'])
|
121
|
+
```
|
61
122
|
"""
|
62
123
|
|
63
124
|
def __init__(
|
@@ -68,23 +129,46 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
68
129
|
parent=None,
|
69
130
|
label: Optional[str] = None,
|
70
131
|
):
|
71
|
-
"""
|
72
|
-
|
132
|
+
"""Initialize a region.
|
133
|
+
|
134
|
+
Creates a Region object that represents a rectangular or polygonal area on a page.
|
135
|
+
Regions are used for spatial navigation, content extraction, and analysis operations.
|
73
136
|
|
74
137
|
Args:
|
75
|
-
page: Parent
|
76
|
-
|
77
|
-
|
78
|
-
|
138
|
+
page: Parent Page object that contains this region and provides access
|
139
|
+
to document elements and analysis capabilities.
|
140
|
+
bbox: Bounding box coordinates as (x0, top, x1, bottom) tuple in PDF
|
141
|
+
page coordinates (points; origin at top-left, so top < bottom).
|
142
|
+
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for
|
143
|
+
non-rectangular regions. If provided, the region will use polygon-based
|
144
|
+
intersection calculations instead of simple rectangle overlap.
|
145
|
+
parent: Optional parent region for hierarchical document structure.
|
146
|
+
Useful for maintaining tree-like relationships between regions.
|
147
|
+
label: Optional descriptive label for the region, useful for debugging
|
148
|
+
and identification in complex workflows.
|
149
|
+
|
150
|
+
Example:
|
151
|
+
```python
|
152
|
+
pdf = npdf.PDF("document.pdf")
|
153
|
+
page = pdf.pages[0]
|
154
|
+
|
155
|
+
# Rectangular region
|
156
|
+
header = Region(page, (0, 0, page.width, 100), label="header")
|
157
|
+
|
158
|
+
# Polygonal region (from layout detection)
|
159
|
+
table_polygon = [(50, 100), (300, 100), (300, 400), (50, 400)]
|
160
|
+
table_region = Region(page, (50, 100, 300, 400),
|
161
|
+
polygon=table_polygon, label="table")
|
162
|
+
```
|
163
|
+
|
164
|
+
Note:
|
165
|
+
Regions are typically created through page methods like page.region() or
|
166
|
+
spatial navigation methods like element.below(). Direct instantiation is
|
167
|
+
used mainly for advanced workflows or layout analysis integration.
|
79
168
|
"""
|
80
169
|
self._page = page
|
81
170
|
self._bbox = bbox
|
82
171
|
self._polygon = polygon
|
83
|
-
self._multi_page_elements = None
|
84
|
-
self._spans_pages = False
|
85
|
-
self._page_range = None
|
86
|
-
self.start_element = None
|
87
|
-
self.end_element = None
|
88
172
|
|
89
173
|
self.metadata: Dict[str, Any] = {}
|
90
174
|
# Analysis results live under self.metadata['analysis'] via property
|
@@ -444,10 +528,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
444
528
|
Returns:
|
445
529
|
True if the element is in the region, False otherwise
|
446
530
|
"""
|
447
|
-
# If we have multi-page elements cached, check if the element is in the list
|
448
|
-
if self._spans_pages and self._multi_page_elements is not None:
|
449
|
-
return element in self._multi_page_elements
|
450
|
-
|
451
531
|
# Check if element is on the same page
|
452
532
|
if not hasattr(element, "page") or element.page != self._page:
|
453
533
|
return False
|
@@ -614,12 +694,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
614
694
|
"""
|
615
695
|
# Apply global options as defaults
|
616
696
|
import natural_pdf
|
697
|
+
|
617
698
|
if resolution is None:
|
618
699
|
if natural_pdf.options.image.resolution is not None:
|
619
700
|
resolution = natural_pdf.options.image.resolution
|
620
701
|
else:
|
621
702
|
resolution = 144 # Default resolution when none specified
|
622
|
-
|
703
|
+
|
623
704
|
# Handle the case where user wants the cropped region to have a specific width
|
624
705
|
page_kwargs = kwargs.copy()
|
625
706
|
effective_resolution = resolution # Start with the provided resolution
|
@@ -722,12 +803,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
722
803
|
"""
|
723
804
|
# Apply global options as defaults
|
724
805
|
import natural_pdf
|
806
|
+
|
725
807
|
if resolution is None:
|
726
808
|
if natural_pdf.options.image.resolution is not None:
|
727
809
|
resolution = natural_pdf.options.image.resolution
|
728
810
|
else:
|
729
811
|
resolution = 144 # Default resolution when none specified
|
730
|
-
|
812
|
+
|
731
813
|
if not self._page:
|
732
814
|
raise ValueError("Region must be associated with a page to show.")
|
733
815
|
|
@@ -764,7 +846,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
764
846
|
)
|
765
847
|
|
766
848
|
def save(
|
767
|
-
self,
|
849
|
+
self,
|
850
|
+
filename: str,
|
851
|
+
resolution: Optional[float] = None,
|
852
|
+
labels: bool = True,
|
853
|
+
legend_position: str = "right",
|
768
854
|
) -> "Region":
|
769
855
|
"""
|
770
856
|
Save the page with this region highlighted to an image file.
|
@@ -780,17 +866,20 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
780
866
|
"""
|
781
867
|
# Apply global options as defaults
|
782
868
|
import natural_pdf
|
869
|
+
|
783
870
|
if resolution is None:
|
784
871
|
if natural_pdf.options.image.resolution is not None:
|
785
872
|
resolution = natural_pdf.options.image.resolution
|
786
873
|
else:
|
787
874
|
resolution = 144 # Default resolution when none specified
|
788
|
-
|
875
|
+
|
789
876
|
# Highlight this region if not already highlighted
|
790
877
|
self.highlight()
|
791
878
|
|
792
879
|
# Save the highlighted image
|
793
|
-
self._page.save_image(
|
880
|
+
self._page.save_image(
|
881
|
+
filename, resolution=resolution, labels=labels, legend_position=legend_position
|
882
|
+
)
|
794
883
|
return self
|
795
884
|
|
796
885
|
def save_image(
|
@@ -816,12 +905,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
816
905
|
"""
|
817
906
|
# Apply global options as defaults
|
818
907
|
import natural_pdf
|
908
|
+
|
819
909
|
if resolution is None:
|
820
910
|
if natural_pdf.options.image.resolution is not None:
|
821
911
|
resolution = natural_pdf.options.image.resolution
|
822
912
|
else:
|
823
913
|
resolution = 144 # Default resolution when none specified
|
824
|
-
|
914
|
+
|
825
915
|
# Get the region image
|
826
916
|
image = self.to_image(
|
827
917
|
resolution=resolution,
|
@@ -856,27 +946,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
856
946
|
pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
|
857
947
|
This helps avoid detecting box borders/slivers as content.
|
858
948
|
|
859
|
-
Returns
|
860
|
-
|
949
|
+
Returns
|
950
|
+
-------
|
861
951
|
|
862
|
-
|
863
|
-
|
864
|
-
|
952
|
+
New Region with visual whitespace trimmed from all edges
|
953
|
+
|
954
|
+
Examples
|
955
|
+
--------
|
956
|
+
|
957
|
+
```python
|
958
|
+
# Basic trimming with 1 pixel padding and 0.5px pre-shrink
|
959
|
+
trimmed = region.trim()
|
865
960
|
|
866
|
-
|
867
|
-
|
961
|
+
# More aggressive trimming with no padding and no pre-shrink
|
962
|
+
tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
|
868
963
|
|
869
|
-
|
870
|
-
|
964
|
+
# Conservative trimming with more padding
|
965
|
+
loose = region.trim(padding=3, threshold=0.98)
|
966
|
+
```
|
871
967
|
"""
|
872
968
|
# Apply global options as defaults
|
873
969
|
import natural_pdf
|
970
|
+
|
874
971
|
if resolution is None:
|
875
972
|
if natural_pdf.options.image.resolution is not None:
|
876
973
|
resolution = natural_pdf.options.image.resolution
|
877
974
|
else:
|
878
975
|
resolution = 144 # Default resolution when none specified
|
879
|
-
|
976
|
+
|
880
977
|
# Pre-shrink the region to avoid box slivers
|
881
978
|
work_region = (
|
882
979
|
self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
|
@@ -885,9 +982,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
885
982
|
)
|
886
983
|
|
887
984
|
# Get the region image
|
888
|
-
image = work_region.to_image(
|
889
|
-
resolution=resolution, crop=True, include_highlights=False
|
890
|
-
)
|
985
|
+
image = work_region.to_image(resolution=resolution, crop=True, include_highlights=False)
|
891
986
|
|
892
987
|
if image is None:
|
893
988
|
logger.warning(
|
@@ -1113,12 +1208,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1113
1208
|
Returns:
|
1114
1209
|
List of elements in the region
|
1115
1210
|
"""
|
1116
|
-
# If we have multi-page elements, return those
|
1117
|
-
if self._spans_pages and self._multi_page_elements is not None:
|
1118
|
-
# TODO: Apply selector to multi-page elements if needed
|
1119
|
-
return self._multi_page_elements
|
1120
|
-
|
1121
|
-
# Otherwise, get elements from the page
|
1122
1211
|
if selector:
|
1123
1212
|
# Find elements on the page matching the selector
|
1124
1213
|
page_elements = self.page.find_all(
|
@@ -1257,7 +1346,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1257
1346
|
try:
|
1258
1347
|
cell_regions_in_table = [
|
1259
1348
|
c
|
1260
|
-
for c in self.page.find_all(
|
1349
|
+
for c in self.page.find_all(
|
1350
|
+
"region[type=table_cell]", apply_exclusions=False
|
1351
|
+
)
|
1261
1352
|
if self.intersects(c)
|
1262
1353
|
]
|
1263
1354
|
except Exception as _cells_err:
|
@@ -1324,7 +1415,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1324
1415
|
# This must happen AFTER alias handling (so strategies are final)
|
1325
1416
|
# and BEFORE we delegate to _extract_table_* helpers.
|
1326
1417
|
# -------------------------------------------------------------
|
1327
|
-
if "text" in (
|
1418
|
+
if "text" in (
|
1419
|
+
table_settings.get("vertical_strategy"),
|
1420
|
+
table_settings.get("horizontal_strategy"),
|
1421
|
+
):
|
1328
1422
|
page_cfg = getattr(self.page, "_config", {})
|
1329
1423
|
# Ensure text_* tolerances passed to pdfplumber
|
1330
1424
|
if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
@@ -1466,19 +1560,35 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1466
1560
|
table_settings.get("vertical_strategy"),
|
1467
1561
|
table_settings.get("horizontal_strategy"),
|
1468
1562
|
)
|
1469
|
-
if
|
1563
|
+
if (
|
1564
|
+
_uses_text
|
1565
|
+
and "text_x_tolerance" not in table_settings
|
1566
|
+
and "x_tolerance" not in table_settings
|
1567
|
+
):
|
1470
1568
|
x_tol = pdf_cfg.get("x_tolerance")
|
1471
1569
|
if x_tol is not None:
|
1472
1570
|
table_settings.setdefault("text_x_tolerance", x_tol)
|
1473
|
-
if
|
1571
|
+
if (
|
1572
|
+
_uses_text
|
1573
|
+
and "text_y_tolerance" not in table_settings
|
1574
|
+
and "y_tolerance" not in table_settings
|
1575
|
+
):
|
1474
1576
|
y_tol = pdf_cfg.get("y_tolerance")
|
1475
1577
|
if y_tol is not None:
|
1476
1578
|
table_settings.setdefault("text_y_tolerance", y_tol)
|
1477
1579
|
|
1478
|
-
if
|
1580
|
+
if (
|
1581
|
+
_uses_text
|
1582
|
+
and "snap_tolerance" not in table_settings
|
1583
|
+
and "snap_x_tolerance" not in table_settings
|
1584
|
+
):
|
1479
1585
|
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1480
1586
|
table_settings.setdefault("snap_tolerance", snap)
|
1481
|
-
if
|
1587
|
+
if (
|
1588
|
+
_uses_text
|
1589
|
+
and "join_tolerance" not in table_settings
|
1590
|
+
and "join_x_tolerance" not in table_settings
|
1591
|
+
):
|
1482
1592
|
join = table_settings.get("snap_tolerance", 1)
|
1483
1593
|
table_settings.setdefault("join_tolerance", join)
|
1484
1594
|
table_settings.setdefault("join_x_tolerance", join)
|
@@ -1510,11 +1620,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1510
1620
|
table_settings.get("vertical_strategy"),
|
1511
1621
|
table_settings.get("horizontal_strategy"),
|
1512
1622
|
)
|
1513
|
-
if
|
1623
|
+
if (
|
1624
|
+
_uses_text
|
1625
|
+
and "text_x_tolerance" not in table_settings
|
1626
|
+
and "x_tolerance" not in table_settings
|
1627
|
+
):
|
1514
1628
|
x_tol = pdf_cfg.get("x_tolerance")
|
1515
1629
|
if x_tol is not None:
|
1516
1630
|
table_settings.setdefault("text_x_tolerance", x_tol)
|
1517
|
-
if
|
1631
|
+
if (
|
1632
|
+
_uses_text
|
1633
|
+
and "text_y_tolerance" not in table_settings
|
1634
|
+
and "y_tolerance" not in table_settings
|
1635
|
+
):
|
1518
1636
|
y_tol = pdf_cfg.get("y_tolerance")
|
1519
1637
|
if y_tol is not None:
|
1520
1638
|
table_settings.setdefault("text_y_tolerance", y_tol)
|
@@ -1942,23 +2060,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1942
2060
|
else:
|
1943
2061
|
raise ValueError("Internal error: No selector or text provided.")
|
1944
2062
|
|
1945
|
-
# If we span multiple pages, filter our elements
|
1946
|
-
# TODO: Revisit multi-page region logic
|
1947
|
-
if self._spans_pages and self._multi_page_elements is not None:
|
1948
|
-
logger.warning("find_all on multi-page regions is not fully implemented.")
|
1949
|
-
# Temporary: Apply filter directly to cached elements
|
1950
|
-
try:
|
1951
|
-
selector_obj = parse_selector(effective_selector)
|
1952
|
-
# Pass regex/case flags down
|
1953
|
-
kwargs["regex"] = regex
|
1954
|
-
kwargs["case"] = case
|
1955
|
-
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1956
|
-
matching = [el for el in self._multi_page_elements if filter_func(el)]
|
1957
|
-
return ElementCollection(matching)
|
1958
|
-
except Exception as e:
|
1959
|
-
logger.error(f"Error applying selector to multi-page region elements: {e}")
|
1960
|
-
return ElementCollection([])
|
1961
|
-
|
1962
2063
|
# Normal case: Region is on a single page
|
1963
2064
|
try:
|
1964
2065
|
# Parse the final selector string
|
@@ -2016,10 +2117,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2016
2117
|
|
2017
2118
|
Examples
|
2018
2119
|
---------
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2022
|
-
|
2120
|
+
```python
|
2121
|
+
def llm_ocr(region):
|
2122
|
+
image = region.to_image(resolution=300, crop=True)
|
2123
|
+
return my_llm_client.ocr(image)
|
2124
|
+
region.apply_ocr(function=llm_ocr)
|
2125
|
+
```
|
2023
2126
|
|
2024
2127
|
Args:
|
2025
2128
|
replace: Whether to remove existing OCR elements first (default ``True``).
|
@@ -2088,15 +2191,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2088
2191
|
# Remove OCR CHAR dicts overlapping region
|
2089
2192
|
for char in list(self.page._element_mgr.chars):
|
2090
2193
|
# char can be dict or TextElement; normalise
|
2091
|
-
char_src =
|
2194
|
+
char_src = (
|
2195
|
+
char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
|
2196
|
+
)
|
2092
2197
|
if char_src == "ocr":
|
2093
2198
|
# Rough bbox for dicts
|
2094
2199
|
if isinstance(char, dict):
|
2095
|
-
cx0, ctop, cx1, cbottom =
|
2200
|
+
cx0, ctop, cx1, cbottom = (
|
2201
|
+
char.get("x0", 0),
|
2202
|
+
char.get("top", 0),
|
2203
|
+
char.get("x1", 0),
|
2204
|
+
char.get("bottom", 0),
|
2205
|
+
)
|
2096
2206
|
else:
|
2097
2207
|
cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
|
2098
2208
|
# Quick overlap check
|
2099
|
-
if not (
|
2209
|
+
if not (
|
2210
|
+
cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom
|
2211
|
+
):
|
2100
2212
|
_safe_remove(char)
|
2101
2213
|
|
2102
2214
|
logger.info(
|
@@ -2219,7 +2331,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2219
2331
|
"""
|
2220
2332
|
Apply a custom OCR function to this region and create text elements from the results.
|
2221
2333
|
|
2222
|
-
This is useful when you want to use a custom OCR method (e.g., an LLM API,
|
2334
|
+
This is useful when you want to use a custom OCR method (e.g., an LLM API,
|
2223
2335
|
specialized OCR service, or any custom logic) instead of the built-in OCR engines.
|
2224
2336
|
|
2225
2337
|
Args:
|
@@ -2244,15 +2356,15 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2244
2356
|
image = region.to_image(resolution=300, crop=True)
|
2245
2357
|
# Call your LLM API here
|
2246
2358
|
return llm_client.ocr(image)
|
2247
|
-
|
2359
|
+
|
2248
2360
|
region.apply_custom_ocr(ocr_with_llm)
|
2249
|
-
|
2361
|
+
|
2250
2362
|
# Using with a custom OCR service
|
2251
2363
|
def ocr_with_service(region):
|
2252
2364
|
img_bytes = region.to_image(crop=True).tobytes()
|
2253
2365
|
response = ocr_service.process(img_bytes)
|
2254
2366
|
return response.text
|
2255
|
-
|
2367
|
+
|
2256
2368
|
region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
|
2257
2369
|
"""
|
2258
2370
|
# If replace is True, remove existing OCR elements in this region
|
@@ -2260,9 +2372,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2260
2372
|
logger.info(
|
2261
2373
|
f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
|
2262
2374
|
)
|
2263
|
-
|
2375
|
+
|
2264
2376
|
removed_count = 0
|
2265
|
-
|
2377
|
+
|
2266
2378
|
# Helper to remove a single element safely
|
2267
2379
|
def _safe_remove(elem):
|
2268
2380
|
nonlocal removed_count
|
@@ -2281,7 +2393,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2281
2393
|
success = False
|
2282
2394
|
if success:
|
2283
2395
|
removed_count += 1
|
2284
|
-
|
2396
|
+
|
2285
2397
|
# Remove ALL OCR elements overlapping this region
|
2286
2398
|
# Remove elements with source=="ocr" (built-in OCR) or matching the source_label (previous custom OCR)
|
2287
2399
|
for word in list(self.page._element_mgr.words):
|
@@ -2290,45 +2402,51 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2290
2402
|
# Also remove elements with the same source_label to avoid duplicates
|
2291
2403
|
if (word_source == "ocr" or word_source == source_label) and self.intersects(word):
|
2292
2404
|
_safe_remove(word)
|
2293
|
-
|
2405
|
+
|
2294
2406
|
# Also remove char dicts if needed (matching built-in OCR)
|
2295
2407
|
for char in list(self.page._element_mgr.chars):
|
2296
2408
|
# char can be dict or TextElement; normalize
|
2297
|
-
char_src =
|
2409
|
+
char_src = (
|
2410
|
+
char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
|
2411
|
+
)
|
2298
2412
|
if char_src == "ocr" or char_src == source_label:
|
2299
2413
|
# Rough bbox for dicts
|
2300
2414
|
if isinstance(char, dict):
|
2301
|
-
cx0, ctop, cx1, cbottom =
|
2415
|
+
cx0, ctop, cx1, cbottom = (
|
2416
|
+
char.get("x0", 0),
|
2417
|
+
char.get("top", 0),
|
2418
|
+
char.get("x1", 0),
|
2419
|
+
char.get("bottom", 0),
|
2420
|
+
)
|
2302
2421
|
else:
|
2303
2422
|
cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
|
2304
2423
|
# Quick overlap check
|
2305
|
-
if not (
|
2424
|
+
if not (
|
2425
|
+
cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom
|
2426
|
+
):
|
2306
2427
|
_safe_remove(char)
|
2307
|
-
|
2428
|
+
|
2308
2429
|
if removed_count > 0:
|
2309
|
-
logger.info(
|
2310
|
-
|
2311
|
-
)
|
2312
|
-
|
2430
|
+
logger.info(f"Region {self.bbox}: Removed {removed_count} existing OCR elements.")
|
2431
|
+
|
2313
2432
|
# Call the custom OCR function
|
2314
2433
|
try:
|
2315
2434
|
logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
|
2316
2435
|
ocr_text = ocr_function(self)
|
2317
|
-
|
2436
|
+
|
2318
2437
|
if ocr_text is not None and not isinstance(ocr_text, str):
|
2319
2438
|
logger.warning(
|
2320
2439
|
f"Custom OCR function returned non-string type ({type(ocr_text)}). "
|
2321
2440
|
f"Converting to string."
|
2322
2441
|
)
|
2323
2442
|
ocr_text = str(ocr_text)
|
2324
|
-
|
2443
|
+
|
2325
2444
|
except Exception as e:
|
2326
2445
|
logger.error(
|
2327
|
-
f"Error calling custom OCR function for region {self.bbox}: {e}",
|
2328
|
-
exc_info=True
|
2446
|
+
f"Error calling custom OCR function for region {self.bbox}: {e}", exc_info=True
|
2329
2447
|
)
|
2330
2448
|
return self
|
2331
|
-
|
2449
|
+
|
2332
2450
|
# Create text element if we got text
|
2333
2451
|
if ocr_text is not None:
|
2334
2452
|
# Use the to_text_element method to create the element
|
@@ -2336,16 +2454,16 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2336
2454
|
text_content=ocr_text,
|
2337
2455
|
source_label=source_label,
|
2338
2456
|
confidence=confidence,
|
2339
|
-
add_to_page=add_to_page
|
2457
|
+
add_to_page=add_to_page,
|
2340
2458
|
)
|
2341
|
-
|
2459
|
+
|
2342
2460
|
logger.info(
|
2343
2461
|
f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
|
2344
2462
|
f"{' and added to page' if add_to_page else ''}"
|
2345
2463
|
)
|
2346
2464
|
else:
|
2347
2465
|
logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
|
2348
|
-
|
2466
|
+
|
2349
2467
|
return self
|
2350
2468
|
|
2351
2469
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
@@ -3293,9 +3411,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
3293
3411
|
return []
|
3294
3412
|
|
3295
3413
|
# Build arrays of centers
|
3296
|
-
centers = np.array([
|
3297
|
-
[(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions
|
3298
|
-
])
|
3414
|
+
centers = np.array([[(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions])
|
3299
3415
|
xs = centers[:, 0]
|
3300
3416
|
ys = centers[:, 1]
|
3301
3417
|
|
@@ -3327,5 +3443,3 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
3327
3443
|
table_grid[row_idx][col_idx] = text_val if text_val else None
|
3328
3444
|
|
3329
3445
|
return table_grid
|
3330
|
-
|
3331
|
-
|