natural-pdf 0.2.12__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/elements/base.py +18 -1
- natural_pdf/elements/element_collection.py +153 -15
- natural_pdf/elements/rect.py +34 -0
- natural_pdf/elements/region.py +55 -3
- natural_pdf/elements/text.py +20 -2
- natural_pdf/selectors/parser.py +28 -1
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/RECORD +24 -23
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_inline_js.py +22 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- temp/debug_cell_extraction.py +0 -42
- temp/debug_exclusion_overlap.py +0 -43
- temp/debug_exclusions_guides.py +0 -67
- temp/debug_extra_guide.py +0 -41
- temp/debug_outer_boundaries.py +0 -46
- temp/debug_st_search.py +0 -33
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/top_level.txt +0 -0
@@ -92,6 +92,16 @@ class HighlightRenderer:
|
|
92
92
|
|
93
93
|
def _draw_highlights(self):
|
94
94
|
"""Draws all highlight shapes, borders, vertices, and attributes."""
|
95
|
+
# Get the pdfplumber page offset for coordinate translation
|
96
|
+
page_offset_x = 0
|
97
|
+
page_offset_y = 0
|
98
|
+
|
99
|
+
if hasattr(self.page, "_page") and hasattr(self.page._page, "bbox"):
|
100
|
+
# PDFPlumber page bbox might have negative offsets
|
101
|
+
page_offset_x = -self.page._page.bbox[0]
|
102
|
+
page_offset_y = -self.page._page.bbox[1]
|
103
|
+
logger.debug(f"Applying highlight offset: x={page_offset_x}, y={page_offset_y}")
|
104
|
+
|
95
105
|
for highlight in self.highlights:
|
96
106
|
# Create a transparent overlay for this single highlight
|
97
107
|
overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
|
@@ -101,7 +111,11 @@ class HighlightRenderer:
|
|
101
111
|
|
102
112
|
if highlight.is_polygon:
|
103
113
|
scaled_polygon = [
|
104
|
-
(
|
114
|
+
(
|
115
|
+
(p[0] + page_offset_x) * self.scale_factor,
|
116
|
+
(p[1] + page_offset_y) * self.scale_factor,
|
117
|
+
)
|
118
|
+
for p in highlight.polygon
|
105
119
|
]
|
106
120
|
# Draw polygon fill and border
|
107
121
|
draw.polygon(
|
@@ -117,11 +131,16 @@ class HighlightRenderer:
|
|
117
131
|
else: # Rectangle
|
118
132
|
x0, top, x1, bottom = highlight.bbox
|
119
133
|
x0_s, top_s, x1_s, bottom_s = (
|
120
|
-
x0 * self.scale_factor,
|
121
|
-
top * self.scale_factor,
|
122
|
-
x1 * self.scale_factor,
|
123
|
-
bottom * self.scale_factor,
|
134
|
+
(x0 + page_offset_x) * self.scale_factor,
|
135
|
+
(top + page_offset_y) * self.scale_factor,
|
136
|
+
(x1 + page_offset_x) * self.scale_factor,
|
137
|
+
(bottom + page_offset_y) * self.scale_factor,
|
124
138
|
)
|
139
|
+
logger.debug(f"Original bbox: ({x0}, {top}, {x1}, {bottom})")
|
140
|
+
logger.debug(
|
141
|
+
f"Offset bbox: ({x0 + page_offset_x}, {top + page_offset_y}, {x1 + page_offset_x}, {bottom + page_offset_y})"
|
142
|
+
)
|
143
|
+
logger.debug(f"Scaled bbox: ({x0_s}, {top_s}, {x1_s}, {bottom_s})")
|
125
144
|
scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
|
126
145
|
# Draw rectangle fill and border
|
127
146
|
draw.rectangle(
|
@@ -1482,11 +1501,22 @@ class HighlightingService:
|
|
1482
1501
|
offset_x = crop_offset[0] * scale_factor
|
1483
1502
|
offset_y = crop_offset[1] * scale_factor
|
1484
1503
|
|
1504
|
+
# Add pdfplumber page offset for coordinate translation
|
1505
|
+
page_offset_x = 0
|
1506
|
+
page_offset_y = 0
|
1507
|
+
if hasattr(page, "_page") and hasattr(page._page, "bbox"):
|
1508
|
+
# PDFPlumber page bbox might have negative offsets
|
1509
|
+
page_offset_x = -page._page.bbox[0]
|
1510
|
+
page_offset_y = -page._page.bbox[1]
|
1511
|
+
|
1485
1512
|
# Draw the highlight
|
1486
1513
|
if polygon:
|
1487
1514
|
# Scale polygon points and apply offset
|
1488
1515
|
scaled_polygon = [
|
1489
|
-
(
|
1516
|
+
(
|
1517
|
+
(p[0] + page_offset_x) * scale_factor - offset_x,
|
1518
|
+
(p[1] + page_offset_y) * scale_factor - offset_y,
|
1519
|
+
)
|
1490
1520
|
for p in polygon
|
1491
1521
|
]
|
1492
1522
|
draw.polygon(
|
@@ -1496,10 +1526,10 @@ class HighlightingService:
|
|
1496
1526
|
# Scale bbox and apply offset
|
1497
1527
|
x0, y0, x1, y1 = bbox
|
1498
1528
|
scaled_bbox = [
|
1499
|
-
x0 * scale_factor - offset_x,
|
1500
|
-
y0 * scale_factor - offset_y,
|
1501
|
-
x1 * scale_factor - offset_x,
|
1502
|
-
y1 * scale_factor - offset_y,
|
1529
|
+
(x0 + page_offset_x) * scale_factor - offset_x,
|
1530
|
+
(y0 + page_offset_y) * scale_factor - offset_y,
|
1531
|
+
(x1 + page_offset_x) * scale_factor - offset_x,
|
1532
|
+
(y1 + page_offset_y) * scale_factor - offset_y,
|
1503
1533
|
]
|
1504
1534
|
draw.rectangle(
|
1505
1535
|
scaled_bbox, fill=color, outline=(color[0], color[1], color[2], BORDER_ALPHA)
|
natural_pdf/elements/base.py
CHANGED
@@ -106,6 +106,7 @@ class DirectionalMixin:
|
|
106
106
|
include_source: bool = False,
|
107
107
|
until: Optional[str] = None,
|
108
108
|
include_endpoint: bool = True,
|
109
|
+
offset: float = 0.0,
|
109
110
|
**kwargs,
|
110
111
|
) -> "Region":
|
111
112
|
"""
|
@@ -118,6 +119,7 @@ class DirectionalMixin:
|
|
118
119
|
include_source: Whether to include this element/region's area in the result
|
119
120
|
until: Optional selector string to specify a boundary element
|
120
121
|
include_endpoint: Whether to include the boundary element found by 'until'
|
122
|
+
offset: Pixel offset when excluding source/endpoint (default: 0.0)
|
121
123
|
**kwargs: Additional parameters for the 'until' selector search
|
122
124
|
|
123
125
|
Returns:
|
@@ -127,7 +129,7 @@ class DirectionalMixin:
|
|
127
129
|
|
128
130
|
is_horizontal = direction in ("left", "right")
|
129
131
|
is_positive = direction in ("right", "below") # right/below are positive directions
|
130
|
-
pixel_offset =
|
132
|
+
pixel_offset = offset # Use provided offset for excluding elements/endpoints
|
131
133
|
|
132
134
|
# 1. Determine initial boundaries based on direction and include_source
|
133
135
|
if is_horizontal:
|
@@ -260,6 +262,7 @@ class DirectionalMixin:
|
|
260
262
|
include_source: bool = False,
|
261
263
|
until: Optional[str] = None,
|
262
264
|
include_endpoint: bool = True,
|
265
|
+
offset: float = 0.1,
|
263
266
|
**kwargs,
|
264
267
|
) -> "Region":
|
265
268
|
"""
|
@@ -271,6 +274,7 @@ class DirectionalMixin:
|
|
271
274
|
include_source: Whether to include this element/region in the result (default: False)
|
272
275
|
until: Optional selector string to specify an upper boundary element
|
273
276
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
277
|
+
offset: Pixel offset when excluding source/endpoint (default: 0.1)
|
274
278
|
**kwargs: Additional parameters
|
275
279
|
|
276
280
|
Returns:
|
@@ -295,6 +299,7 @@ class DirectionalMixin:
|
|
295
299
|
include_source=include_source,
|
296
300
|
until=until,
|
297
301
|
include_endpoint=include_endpoint,
|
302
|
+
offset=offset,
|
298
303
|
**kwargs,
|
299
304
|
)
|
300
305
|
|
@@ -305,6 +310,7 @@ class DirectionalMixin:
|
|
305
310
|
include_source: bool = False,
|
306
311
|
until: Optional[str] = None,
|
307
312
|
include_endpoint: bool = True,
|
313
|
+
offset: float = 0.1,
|
308
314
|
**kwargs,
|
309
315
|
) -> "Region":
|
310
316
|
"""
|
@@ -316,6 +322,7 @@ class DirectionalMixin:
|
|
316
322
|
include_source: Whether to include this element/region in the result (default: False)
|
317
323
|
until: Optional selector string to specify a lower boundary element
|
318
324
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
325
|
+
offset: Pixel offset when excluding source/endpoint (default: 0.1)
|
319
326
|
**kwargs: Additional parameters
|
320
327
|
|
321
328
|
Returns:
|
@@ -340,6 +347,7 @@ class DirectionalMixin:
|
|
340
347
|
include_source=include_source,
|
341
348
|
until=until,
|
342
349
|
include_endpoint=include_endpoint,
|
350
|
+
offset=offset,
|
343
351
|
**kwargs,
|
344
352
|
)
|
345
353
|
|
@@ -350,6 +358,7 @@ class DirectionalMixin:
|
|
350
358
|
include_source: bool = False,
|
351
359
|
until: Optional[str] = None,
|
352
360
|
include_endpoint: bool = True,
|
361
|
+
offset: float = 0.1,
|
353
362
|
**kwargs,
|
354
363
|
) -> "Region":
|
355
364
|
"""
|
@@ -361,6 +370,7 @@ class DirectionalMixin:
|
|
361
370
|
include_source: Whether to include this element/region in the result (default: False)
|
362
371
|
until: Optional selector string to specify a left boundary element
|
363
372
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
373
|
+
offset: Pixel offset when excluding source/endpoint (default: 0.1)
|
364
374
|
**kwargs: Additional parameters
|
365
375
|
|
366
376
|
Returns:
|
@@ -385,6 +395,7 @@ class DirectionalMixin:
|
|
385
395
|
include_source=include_source,
|
386
396
|
until=until,
|
387
397
|
include_endpoint=include_endpoint,
|
398
|
+
offset=offset,
|
388
399
|
**kwargs,
|
389
400
|
)
|
390
401
|
|
@@ -395,6 +406,7 @@ class DirectionalMixin:
|
|
395
406
|
include_source: bool = False,
|
396
407
|
until: Optional[str] = None,
|
397
408
|
include_endpoint: bool = True,
|
409
|
+
offset: float = 0.1,
|
398
410
|
**kwargs,
|
399
411
|
) -> "Region":
|
400
412
|
"""
|
@@ -406,6 +418,7 @@ class DirectionalMixin:
|
|
406
418
|
include_source: Whether to include this element/region in the result (default: False)
|
407
419
|
until: Optional selector string to specify a right boundary element
|
408
420
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
421
|
+
offset: Pixel offset when excluding source/endpoint (default: 0.1)
|
409
422
|
**kwargs: Additional parameters
|
410
423
|
|
411
424
|
Returns:
|
@@ -430,6 +443,7 @@ class DirectionalMixin:
|
|
430
443
|
include_source=include_source,
|
431
444
|
until=until,
|
432
445
|
include_endpoint=include_endpoint,
|
446
|
+
offset=offset,
|
433
447
|
**kwargs,
|
434
448
|
)
|
435
449
|
|
@@ -1195,6 +1209,9 @@ class Element(
|
|
1195
1209
|
|
1196
1210
|
return self
|
1197
1211
|
|
1212
|
+
def exclude(self):
|
1213
|
+
self.page.add_exclusion(self)
|
1214
|
+
|
1198
1215
|
def _get_render_specs(
|
1199
1216
|
self,
|
1200
1217
|
mode: Literal["show", "render"] = "show",
|
@@ -888,6 +888,9 @@ class ElementCollection(
|
|
888
888
|
self._elements.sort(key=key, reverse=reverse)
|
889
889
|
return self
|
890
890
|
|
891
|
+
def exclude(self):
|
892
|
+
self.page.add_exclusion(self)
|
893
|
+
|
891
894
|
def highlight(
|
892
895
|
self,
|
893
896
|
label: Optional[str] = None,
|
@@ -1902,13 +1905,87 @@ class ElementCollection(
|
|
1902
1905
|
|
1903
1906
|
return ElementCollection(all_found_elements)
|
1904
1907
|
|
1905
|
-
def extract_each_text(
|
1906
|
-
|
1907
|
-
|
1908
|
+
def extract_each_text(
|
1909
|
+
self,
|
1910
|
+
order: Optional[Union[str, Callable[[T], Any]]] = None,
|
1911
|
+
*,
|
1912
|
+
newlines: bool = True,
|
1913
|
+
**kwargs,
|
1914
|
+
) -> List[str]:
|
1915
|
+
"""Return a list with the extracted text for every element.
|
1916
|
+
|
1917
|
+
Parameters
|
1918
|
+
----------
|
1919
|
+
order
|
1920
|
+
Controls the ordering of elements **before** extraction:
|
1921
|
+
|
1922
|
+
* ``None`` (default) – keep the collection's current order.
|
1923
|
+
* ``callable`` – a function that will be used as ``key`` for :py:meth:`list.sort`.
|
1924
|
+
* ``"ltr"`` – left-to-right ordering (x0, then y-top).
|
1925
|
+
* ``"rtl"`` – right-to-left ordering (−x0, then y-top).
|
1926
|
+
* ``"natural"`` – natural reading order (y-top, then x0).
|
1927
|
+
|
1928
|
+
Remaining keyword arguments are forwarded to each element's
|
1929
|
+
:py:meth:`extract_text` method.
|
1908
1930
|
"""
|
1909
|
-
|
1910
|
-
|
1911
|
-
)
|
1931
|
+
|
1932
|
+
# -- Determine ordering --------------------------------------------------
|
1933
|
+
elements: List[T] = list(self._elements) # make a shallow copy we can sort
|
1934
|
+
|
1935
|
+
if order is not None and len(elements) > 1:
|
1936
|
+
try:
|
1937
|
+
if callable(order):
|
1938
|
+
elements.sort(key=order)
|
1939
|
+
elif isinstance(order, str):
|
1940
|
+
preset = order.lower()
|
1941
|
+
if preset in {"ltr", "left-to-right"}:
|
1942
|
+
elements.sort(
|
1943
|
+
key=lambda el: (
|
1944
|
+
(
|
1945
|
+
getattr(el, "page", None).index
|
1946
|
+
if hasattr(el, "page") and el.page
|
1947
|
+
else 0
|
1948
|
+
),
|
1949
|
+
getattr(el, "x0", 0),
|
1950
|
+
getattr(el, "top", 0),
|
1951
|
+
)
|
1952
|
+
)
|
1953
|
+
elif preset in {"rtl", "right-to-left"}:
|
1954
|
+
elements.sort(
|
1955
|
+
key=lambda el: (
|
1956
|
+
(
|
1957
|
+
getattr(el, "page", None).index
|
1958
|
+
if hasattr(el, "page") and el.page
|
1959
|
+
else 0
|
1960
|
+
),
|
1961
|
+
-getattr(el, "x0", 0),
|
1962
|
+
getattr(el, "top", 0),
|
1963
|
+
)
|
1964
|
+
)
|
1965
|
+
elif preset in {"natural", "tdlr", "top-down"}:
|
1966
|
+
elements.sort(
|
1967
|
+
key=lambda el: (
|
1968
|
+
(
|
1969
|
+
getattr(el, "page", None).index
|
1970
|
+
if hasattr(el, "page") and el.page
|
1971
|
+
else 0
|
1972
|
+
),
|
1973
|
+
getattr(el, "top", 0),
|
1974
|
+
getattr(el, "x0", 0),
|
1975
|
+
)
|
1976
|
+
)
|
1977
|
+
else:
|
1978
|
+
# Unknown preset – silently ignore to keep original order
|
1979
|
+
pass
|
1980
|
+
except Exception:
|
1981
|
+
# If anything goes wrong, fall back to original order
|
1982
|
+
pass
|
1983
|
+
|
1984
|
+
# -- Extract ----------------------------------------------------------------
|
1985
|
+
return [
|
1986
|
+
el.extract_text(newlines=newlines, **kwargs) if el is not None else None # type: ignore[arg-type]
|
1987
|
+
for el in elements
|
1988
|
+
]
|
1912
1989
|
|
1913
1990
|
def correct_ocr(
|
1914
1991
|
self,
|
@@ -2673,10 +2750,17 @@ class ElementCollection(
|
|
2673
2750
|
else:
|
2674
2751
|
v_dist = 0 # Vertically overlapping
|
2675
2752
|
|
2676
|
-
#
|
2677
|
-
#
|
2678
|
-
|
2753
|
+
# ------------------------------------------------------------------
|
2754
|
+
# Decide connection logic based on vertical_gap parameter
|
2755
|
+
# ------------------------------------------------------------------
|
2756
|
+
if vertical_gap is not None:
|
2757
|
+
# Consider elements connected when they vertically stack within
|
2758
|
+
# the allowed gap **and** have some horizontal overlap
|
2759
|
+
horizontal_overlap = not (h_dist > 0)
|
2760
|
+
return horizontal_overlap and v_dist <= vertical_gap
|
2679
2761
|
|
2762
|
+
# Fallback to legacy Chebyshev distance using ``threshold``
|
2763
|
+
distance = max(h_dist, v_dist)
|
2680
2764
|
return distance <= threshold
|
2681
2765
|
|
2682
2766
|
def _merge_region_group(
|
@@ -2752,6 +2836,9 @@ class ElementCollection(
|
|
2752
2836
|
def dissolve(
|
2753
2837
|
self,
|
2754
2838
|
padding: float = 2.0,
|
2839
|
+
*,
|
2840
|
+
vertical_gap: Optional[float] = None,
|
2841
|
+
vertical: Optional[bool] = False,
|
2755
2842
|
geometry: Literal["rect", "polygon"] = "rect",
|
2756
2843
|
group_by: List[str] = None,
|
2757
2844
|
) -> "ElementCollection":
|
@@ -2764,8 +2851,19 @@ class ElementCollection(
|
|
2764
2851
|
bounding boxes.
|
2765
2852
|
|
2766
2853
|
Args:
|
2767
|
-
padding: Maximum distance in
|
2768
|
-
them connected
|
2854
|
+
padding: Maximum Chebyshev distance (in any direction) between
|
2855
|
+
elements to consider them connected **when ``vertical_gap`` is
|
2856
|
+
not provided**. Default 2.0 pt.
|
2857
|
+
|
2858
|
+
vertical_gap: If given, switches to *stack-aware* dissolve:
|
2859
|
+
two elements are connected when their horizontal projections
|
2860
|
+
overlap (any amount) **and** the vertical distance between them
|
2861
|
+
is ≤ ``vertical_gap``. This lets you combine multi-line labels
|
2862
|
+
that share the same column but have blank space between lines.
|
2863
|
+
|
2864
|
+
vertical: If True, sets ``vertical_gap`` to infinity so that elements whose
|
2865
|
+
horizontal ranges overlap are connected regardless of vertical distance.
|
2866
|
+
|
2769
2867
|
geometry: Type of geometry to use for merged regions. Currently only
|
2770
2868
|
"rect" (bounding box) is supported. "polygon" will raise
|
2771
2869
|
NotImplementedError.
|
@@ -2807,6 +2905,9 @@ class ElementCollection(
|
|
2807
2905
|
if geometry not in ["rect", "polygon"]:
|
2808
2906
|
raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")
|
2809
2907
|
|
2908
|
+
if vertical:
|
2909
|
+
vertical_gap = float("inf")
|
2910
|
+
|
2810
2911
|
from natural_pdf.elements.region import Region
|
2811
2912
|
|
2812
2913
|
# Filter to elements with bbox (all elements that can be dissolved)
|
@@ -2835,7 +2936,9 @@ class ElementCollection(
|
|
2835
2936
|
logger.debug(f"Processing group {group_key} with {len(group_elements)} elements")
|
2836
2937
|
|
2837
2938
|
# Find connected components within this group
|
2838
|
-
components = self._find_connected_components_elements(
|
2939
|
+
components = self._find_connected_components_elements(
|
2940
|
+
group_elements, padding, vertical_gap
|
2941
|
+
)
|
2839
2942
|
|
2840
2943
|
# Merge each component
|
2841
2944
|
for component_elements in components:
|
@@ -2894,7 +2997,7 @@ class ElementCollection(
|
|
2894
2997
|
return groups
|
2895
2998
|
|
2896
2999
|
def _find_connected_components_elements(
|
2897
|
-
self, elements: List["Element"], padding: float
|
3000
|
+
self, elements: List["Element"], padding: float, vertical_gap: Optional[float] = None
|
2898
3001
|
) -> List[List["Element"]]:
|
2899
3002
|
"""Find connected components among elements using union-find."""
|
2900
3003
|
if not elements:
|
@@ -2919,7 +3022,7 @@ class ElementCollection(
|
|
2919
3022
|
# Check all pairs of elements for connectivity
|
2920
3023
|
for i in range(len(elements)):
|
2921
3024
|
for j in range(i + 1, len(elements)):
|
2922
|
-
if self._are_elements_connected(elements[i], elements[j], padding):
|
3025
|
+
if self._are_elements_connected(elements[i], elements[j], padding, vertical_gap):
|
2923
3026
|
union(i, j)
|
2924
3027
|
|
2925
3028
|
# Group elements by their connected component
|
@@ -3004,7 +3107,9 @@ class ElementCollection(
|
|
3004
3107
|
|
3005
3108
|
return merged_region
|
3006
3109
|
|
3007
|
-
def _are_elements_connected(
|
3110
|
+
def _are_elements_connected(
|
3111
|
+
self, elem1: "Element", elem2: "Element", threshold: float, vertical_gap: float | None
|
3112
|
+
) -> bool:
|
3008
3113
|
"""Check if two elements are connected (adjacent or overlapping)."""
|
3009
3114
|
# Check if elements are on the same page
|
3010
3115
|
# Handle edge cases where elements might not have a page attribute
|
@@ -3057,6 +3162,12 @@ class ElementCollection(
|
|
3057
3162
|
# This creates a square proximity zone
|
3058
3163
|
distance = max(h_dist, v_dist)
|
3059
3164
|
|
3165
|
+
if vertical_gap is not None:
|
3166
|
+
# 1. vertical distance ≤ vertical_gap
|
3167
|
+
# 2. horizontal ranges overlap OR touch
|
3168
|
+
h_overlap = (min(x1_1, x1_2) - max(x0_1, x0_2)) >= 0
|
3169
|
+
return h_overlap and v_dist <= vertical_gap
|
3170
|
+
|
3060
3171
|
return distance <= threshold
|
3061
3172
|
|
3062
3173
|
def _copy_element_attributes_to_region(
|
@@ -3163,3 +3274,30 @@ class ElementCollection(
|
|
3163
3274
|
return self
|
3164
3275
|
|
3165
3276
|
# ------------------------------------------------------------------
|
3277
|
+
|
3278
|
+
# ------------------------------------------------------------------
|
3279
|
+
# Public alias: combine
|
3280
|
+
# ------------------------------------------------------------------
|
3281
|
+
def combine(
|
3282
|
+
self,
|
3283
|
+
padding: float = 2.0,
|
3284
|
+
*,
|
3285
|
+
vertical_gap: Optional[float] = None,
|
3286
|
+
vertical: Optional[bool] = False,
|
3287
|
+
geometry: Literal["rect", "polygon"] = "rect",
|
3288
|
+
group_by: List[str] = None,
|
3289
|
+
) -> "ElementCollection":
|
3290
|
+
"""Alias for :py:meth:`dissolve` – retained for discoverability.
|
3291
|
+
|
3292
|
+
Many users find the verb *combine* more intuitive than *dissolve* when
|
3293
|
+
merging nearby or stacked elements into unified Regions. The parameters
|
3294
|
+
are identical; see :py:meth:`dissolve` for full documentation.
|
3295
|
+
"""
|
3296
|
+
|
3297
|
+
return self.dissolve(
|
3298
|
+
padding=padding,
|
3299
|
+
vertical_gap=vertical_gap,
|
3300
|
+
vertical=vertical,
|
3301
|
+
geometry=geometry,
|
3302
|
+
group_by=group_by,
|
3303
|
+
)
|
natural_pdf/elements/rect.py
CHANGED
@@ -88,6 +88,40 @@ class RectangleElement(Element):
|
|
88
88
|
"""Get the stroke width of the rectangle."""
|
89
89
|
return self._obj.get("linewidth", 0)
|
90
90
|
|
91
|
+
@property
|
92
|
+
def is_horizontal(self) -> bool:
|
93
|
+
"""Check if this rectangle is line-like and horizontal (height within tolerance, width above it)."""
|
94
|
+
# Calculate absolute difference in coordinates
|
95
|
+
dx = abs(self.x1 - self.x0)
|
96
|
+
dy = abs(self.top - self.bottom)
|
97
|
+
|
98
|
+
# Define a tolerance for near-horizontal lines (e.g., 1 point)
|
99
|
+
tolerance = 1.0
|
100
|
+
|
101
|
+
# Horizontal if y-change is within tolerance and x-change is significant
|
102
|
+
return dy <= tolerance and dx > tolerance
|
103
|
+
|
104
|
+
@property
|
105
|
+
def is_vertical(self) -> bool:
|
106
|
+
"""Check if this rectangle is line-like and vertical (width within tolerance, height above it)."""
|
107
|
+
# Calculate absolute difference in coordinates
|
108
|
+
dx = abs(self.x1 - self.x0)
|
109
|
+
dy = abs(self.top - self.bottom)
|
110
|
+
|
111
|
+
# Define a tolerance for near-vertical lines (e.g., 1 point)
|
112
|
+
tolerance = 1.0
|
113
|
+
|
114
|
+
# Vertical if x-change is within tolerance and y-change is significant
|
115
|
+
return dx <= tolerance and dy > tolerance
|
116
|
+
|
117
|
+
@property
|
118
|
+
def orientation(self) -> str:
|
119
|
+
"""Get the orientation: 'horizontal' or 'vertical'; falls through to None otherwise (no 'diagonal' branch is implemented)."""
|
120
|
+
if self.is_horizontal:
|
121
|
+
return "horizontal"
|
122
|
+
elif self.is_vertical:
|
123
|
+
return "vertical"
|
124
|
+
|
91
125
|
def extract_text(self, **kwargs) -> str:
|
92
126
|
"""
|
93
127
|
Extract text from inside this rectangle.
|
natural_pdf/elements/region.py
CHANGED
@@ -45,6 +45,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
45
45
|
|
46
46
|
# Import new utils
|
47
47
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
48
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
48
49
|
|
49
50
|
# Import viewer widget support
|
50
51
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
@@ -80,6 +81,7 @@ class Region(
|
|
80
81
|
ExtractionMixin,
|
81
82
|
ShapeDetectionMixin,
|
82
83
|
DescribeMixin,
|
84
|
+
VisualSearchMixin,
|
83
85
|
Visualizable,
|
84
86
|
):
|
85
87
|
"""Represents a rectangular region on a page.
|
@@ -736,6 +738,9 @@ class Region(
|
|
736
738
|
and self.bottom > element.top
|
737
739
|
)
|
738
740
|
|
741
|
+
def exclude(self):
|
742
|
+
self.page.add_exclusion(self)
|
743
|
+
|
739
744
|
def highlight(
|
740
745
|
self,
|
741
746
|
label: Optional[str] = None,
|
@@ -1227,7 +1232,13 @@ class Region(
|
|
1227
1232
|
return [e for e in page_elements if self._is_element_in_region(e)]
|
1228
1233
|
|
1229
1234
|
def extract_text(
|
1230
|
-
self,
|
1235
|
+
self,
|
1236
|
+
apply_exclusions: bool = True,
|
1237
|
+
debug: bool = False,
|
1238
|
+
*,
|
1239
|
+
newlines: Union[bool, str] = True,
|
1240
|
+
content_filter=None,
|
1241
|
+
**kwargs,
|
1231
1242
|
) -> str:
|
1232
1243
|
"""
|
1233
1244
|
Extract text from this region, respecting page exclusions and using pdfplumber's
|
@@ -1236,6 +1247,7 @@ class Region(
|
|
1236
1247
|
Args:
|
1237
1248
|
apply_exclusions: Whether to apply exclusion regions defined on the parent page.
|
1238
1249
|
debug: Enable verbose debugging output for filtering steps.
|
1250
|
+
newlines: Newline handling. True (default) keeps newlines, False replaces each with a single space, and a string replaces each newline with that string.
|
1239
1251
|
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
1240
1252
|
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1241
1253
|
- A callable that takes text and returns True to KEEP the character
|
@@ -1309,6 +1321,18 @@ class Region(
|
|
1309
1321
|
user_kwargs=final_kwargs, # Pass kwargs including content_filter
|
1310
1322
|
)
|
1311
1323
|
|
1324
|
+
# Flexible newline handling (same logic as TextElement)
|
1325
|
+
if isinstance(newlines, bool):
|
1326
|
+
if newlines is False:
|
1327
|
+
replacement = " "
|
1328
|
+
else:
|
1329
|
+
replacement = None
|
1330
|
+
else:
|
1331
|
+
replacement = str(newlines)
|
1332
|
+
|
1333
|
+
if replacement is not None:
|
1334
|
+
result = result.replace("\n", replacement).replace("\r", replacement)
|
1335
|
+
|
1312
1336
|
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
1313
1337
|
return result
|
1314
1338
|
|
@@ -1692,7 +1716,21 @@ class Region(
|
|
1692
1716
|
else:
|
1693
1717
|
filtered_page = base_plumber_page
|
1694
1718
|
|
1695
|
-
|
1719
|
+
# Ensure bbox is within pdfplumber page bounds
|
1720
|
+
page_bbox = filtered_page.bbox
|
1721
|
+
clipped_bbox = (
|
1722
|
+
max(self.bbox[0], page_bbox[0]), # x0
|
1723
|
+
max(self.bbox[1], page_bbox[1]), # y0
|
1724
|
+
min(self.bbox[2], page_bbox[2]), # x1
|
1725
|
+
min(self.bbox[3], page_bbox[3]), # y1
|
1726
|
+
)
|
1727
|
+
|
1728
|
+
# Only crop if the clipped bbox is valid (has positive width and height)
|
1729
|
+
if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
|
1730
|
+
cropped = filtered_page.crop(clipped_bbox)
|
1731
|
+
else:
|
1732
|
+
# If the region is completely outside the page bounds, return empty list
|
1733
|
+
return []
|
1696
1734
|
|
1697
1735
|
# Extract all tables from the cropped area
|
1698
1736
|
tables = cropped.extract_tables(table_settings)
|
@@ -1786,7 +1824,21 @@ class Region(
|
|
1786
1824
|
filtered_page = base_plumber_page
|
1787
1825
|
|
1788
1826
|
# Now crop the (possibly filtered) page to the region bbox
|
1789
|
-
|
1827
|
+
# Ensure bbox is within pdfplumber page bounds
|
1828
|
+
page_bbox = filtered_page.bbox
|
1829
|
+
clipped_bbox = (
|
1830
|
+
max(self.bbox[0], page_bbox[0]), # x0
|
1831
|
+
max(self.bbox[1], page_bbox[1]), # y0
|
1832
|
+
min(self.bbox[2], page_bbox[2]), # x1
|
1833
|
+
min(self.bbox[3], page_bbox[3]), # y1
|
1834
|
+
)
|
1835
|
+
|
1836
|
+
# Only crop if the clipped bbox is valid (has positive width and height)
|
1837
|
+
if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
|
1838
|
+
cropped = filtered_page.crop(clipped_bbox)
|
1839
|
+
else:
|
1840
|
+
# If the region is completely outside the page bounds, return empty table
|
1841
|
+
return []
|
1790
1842
|
|
1791
1843
|
# Extract the single largest table from the cropped area
|
1792
1844
|
table = cropped.extract_table(table_settings)
|
natural_pdf/elements/text.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
Text element classes for natural-pdf.
|
3
3
|
"""
|
4
4
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Dict, Optional
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
|
6
6
|
|
7
7
|
from natural_pdf.elements.base import Element
|
8
8
|
|
@@ -236,7 +236,13 @@ class TextElement(Element):
|
|
236
236
|
return (0, 0, 0)
|
237
237
|
|
238
238
|
def extract_text(
|
239
|
-
self,
|
239
|
+
self,
|
240
|
+
keep_blank_chars: bool = True,
|
241
|
+
strip: Optional[bool] = True,
|
242
|
+
*,
|
243
|
+
newlines: Union[bool, str] = True,
|
244
|
+
content_filter=None,
|
245
|
+
**kwargs,
|
240
246
|
) -> str:
|
241
247
|
"""
|
242
248
|
Extract text from this element.
|
@@ -292,6 +298,18 @@ class TextElement(Element):
|
|
292
298
|
if strip:
|
293
299
|
result = result.strip()
|
294
300
|
|
301
|
+
# Flexible newline handling
|
302
|
+
if isinstance(newlines, bool):
|
303
|
+
if newlines is False:
|
304
|
+
replacement = " " # single space when False
|
305
|
+
else:
|
306
|
+
replacement = None # keep as-is when True
|
307
|
+
else:
|
308
|
+
replacement = str(newlines)
|
309
|
+
|
310
|
+
if replacement is not None:
|
311
|
+
result = result.replace("\n", replacement).replace("\r", replacement)
|
312
|
+
|
295
313
|
return result
|
296
314
|
|
297
315
|
def contains(self, substring: str, case_sensitive: bool = True) -> bool:
|