natural-pdf 0.2.17__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/page.py +42 -9
- natural_pdf/utils/spatial.py +42 -39
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +8 -8
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import base64
|
2
2
|
import concurrent.futures # Added import
|
3
|
+
import contextlib
|
3
4
|
import hashlib
|
4
5
|
import io
|
5
6
|
import json
|
@@ -275,6 +276,9 @@ class Page(
|
|
275
276
|
self._load_elements()
|
276
277
|
self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
|
277
278
|
|
279
|
+
# Flag to prevent infinite recursion when computing exclusions
|
280
|
+
self._computing_exclusions = False
|
281
|
+
|
278
282
|
def _get_render_specs(
|
279
283
|
self,
|
280
284
|
mode: Literal["show", "render"] = "show",
|
@@ -412,6 +416,35 @@ class Page(
|
|
412
416
|
self._exclusions = []
|
413
417
|
return self
|
414
418
|
|
419
|
+
@contextlib.contextmanager
|
420
|
+
def without_exclusions(self):
|
421
|
+
"""
|
422
|
+
Context manager that temporarily disables exclusion processing.
|
423
|
+
|
424
|
+
This prevents infinite recursion when exclusion callables themselves
|
425
|
+
use find() operations. While in this context, all find operations
|
426
|
+
will skip exclusion filtering.
|
427
|
+
|
428
|
+
Example:
|
429
|
+
```python
|
430
|
+
# This exclusion would normally cause infinite recursion:
|
431
|
+
page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())
|
432
|
+
|
433
|
+
# But internally, it's safe because we use:
|
434
|
+
with page.without_exclusions():
|
435
|
+
region = exclusion_callable(page)
|
436
|
+
```
|
437
|
+
|
438
|
+
Yields:
|
439
|
+
The page object with exclusions temporarily disabled.
|
440
|
+
"""
|
441
|
+
old_value = self._computing_exclusions
|
442
|
+
self._computing_exclusions = True
|
443
|
+
try:
|
444
|
+
yield self
|
445
|
+
finally:
|
446
|
+
self._computing_exclusions = old_value
|
447
|
+
|
415
448
|
def add_exclusion(
|
416
449
|
self,
|
417
450
|
exclusion_func_or_region: Union[
|
@@ -759,15 +792,10 @@ class Page(
|
|
759
792
|
if debug:
|
760
793
|
print(f" - Evaluating callable '{exclusion_label}'...")
|
761
794
|
|
762
|
-
#
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
# Call the function - Expects it to return a Region or None
|
767
|
-
region_result = exclusion_item(self)
|
768
|
-
|
769
|
-
# Restore exclusions
|
770
|
-
self._exclusions = temp_original_exclusions
|
795
|
+
# Use context manager to prevent infinite recursion
|
796
|
+
with self.without_exclusions():
|
797
|
+
# Call the function - Expects it to return a Region or None
|
798
|
+
region_result = exclusion_item(self)
|
771
799
|
|
772
800
|
if isinstance(region_result, Region):
|
773
801
|
# Assign the label to the returned region
|
@@ -947,6 +975,11 @@ class Page(
|
|
947
975
|
Returns:
|
948
976
|
A new list containing only the elements not excluded.
|
949
977
|
"""
|
978
|
+
# Skip exclusion filtering if we're currently computing exclusions
|
979
|
+
# This prevents infinite recursion when exclusion callables use find operations
|
980
|
+
if self._computing_exclusions:
|
981
|
+
return elements
|
982
|
+
|
950
983
|
# Check both page-level and PDF-level exclusions
|
951
984
|
has_page_exclusions = bool(self._exclusions)
|
952
985
|
has_pdf_exclusions = (
|
natural_pdf/utils/spatial.py
CHANGED
@@ -10,11 +10,11 @@ with include_boundaries='none'.
|
|
10
10
|
|
11
11
|
Example:
|
12
12
|
from natural_pdf.utils.spatial import is_element_in_region
|
13
|
-
|
13
|
+
|
14
14
|
# Check if element is in region using center-based logic (default)
|
15
15
|
if is_element_in_region(element, region):
|
16
16
|
print("Element is in region")
|
17
|
-
|
17
|
+
|
18
18
|
# Use different strategies
|
19
19
|
if is_element_in_region(element, region, strategy="intersects"):
|
20
20
|
print("Element overlaps with region")
|
@@ -35,16 +35,16 @@ InclusionStrategy = Literal["center", "intersects", "contains"]
|
|
35
35
|
|
36
36
|
def is_element_in_region(
|
37
37
|
element: "Element",
|
38
|
-
region: "Region",
|
38
|
+
region: "Region",
|
39
39
|
strategy: InclusionStrategy = "center",
|
40
|
-
check_page: bool = True
|
40
|
+
check_page: bool = True,
|
41
41
|
) -> bool:
|
42
42
|
"""
|
43
43
|
Unified function to check if an element is inside a region.
|
44
|
-
|
44
|
+
|
45
45
|
This centralizes the logic used across Region, Page, and Flow to ensure
|
46
46
|
consistent behavior throughout the library.
|
47
|
-
|
47
|
+
|
48
48
|
Args:
|
49
49
|
element: The element to check
|
50
50
|
region: The region to check against
|
@@ -53,7 +53,7 @@ def is_element_in_region(
|
|
53
53
|
- "intersects": Element belongs if any part overlaps
|
54
54
|
- "contains": Element belongs only if fully contained
|
55
55
|
check_page: Whether to verify element and region are on the same page
|
56
|
-
|
56
|
+
|
57
57
|
Returns:
|
58
58
|
bool: True if element is in region according to the strategy
|
59
59
|
"""
|
@@ -61,18 +61,18 @@ def is_element_in_region(
|
|
61
61
|
if not hasattr(element, "bbox") or not element.bbox:
|
62
62
|
logger.debug(f"Element lacks bbox attributes: {element}")
|
63
63
|
return False
|
64
|
-
|
64
|
+
|
65
65
|
if not hasattr(region, "bbox") or not region.bbox:
|
66
66
|
logger.debug(f"Region lacks bbox attributes: {region}")
|
67
67
|
return False
|
68
|
-
|
68
|
+
|
69
69
|
# Check page membership if requested
|
70
70
|
if check_page:
|
71
71
|
if not hasattr(element, "page") or not hasattr(region, "page"):
|
72
72
|
return False
|
73
73
|
if element.page != region.page:
|
74
74
|
return False
|
75
|
-
|
75
|
+
|
76
76
|
# Apply the appropriate strategy
|
77
77
|
if strategy == "center":
|
78
78
|
# Use existing region method if available
|
@@ -82,37 +82,43 @@ def is_element_in_region(
|
|
82
82
|
# Fallback calculation
|
83
83
|
elem_center_x = (element.x0 + element.x1) / 2
|
84
84
|
elem_center_y = (element.top + element.bottom) / 2
|
85
|
-
|
85
|
+
|
86
86
|
# Use region's is_point_inside if available
|
87
87
|
if hasattr(region, "is_point_inside"):
|
88
88
|
return region.is_point_inside(elem_center_x, elem_center_y)
|
89
89
|
else:
|
90
90
|
# Simple bounds check
|
91
|
-
return (
|
92
|
-
|
93
|
-
|
91
|
+
return (
|
92
|
+
region.x0 <= elem_center_x <= region.x1
|
93
|
+
and region.top <= elem_center_y <= region.bottom
|
94
|
+
)
|
95
|
+
|
94
96
|
elif strategy == "intersects":
|
95
97
|
# Use existing region method if available
|
96
98
|
if hasattr(region, "intersects"):
|
97
99
|
return region.intersects(element)
|
98
100
|
else:
|
99
101
|
# Simple bbox overlap check
|
100
|
-
return not (
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
return not (
|
103
|
+
element.x1 < region.x0
|
104
|
+
or element.x0 > region.x1
|
105
|
+
or element.bottom < region.top
|
106
|
+
or element.top > region.bottom
|
107
|
+
)
|
108
|
+
|
105
109
|
elif strategy == "contains":
|
106
110
|
# Use existing region method if available
|
107
111
|
if hasattr(region, "contains"):
|
108
112
|
return region.contains(element)
|
109
113
|
else:
|
110
114
|
# Simple full containment check
|
111
|
-
return (
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
115
|
+
return (
|
116
|
+
region.x0 <= element.x0
|
117
|
+
and element.x1 <= region.x1
|
118
|
+
and region.top <= element.top
|
119
|
+
and element.bottom <= region.bottom
|
120
|
+
)
|
121
|
+
|
116
122
|
else:
|
117
123
|
raise ValueError(f"Unknown inclusion strategy: {strategy}")
|
118
124
|
|
@@ -120,10 +126,10 @@ def is_element_in_region(
|
|
120
126
|
def get_inclusion_strategy() -> InclusionStrategy:
|
121
127
|
"""
|
122
128
|
Get the current global inclusion strategy.
|
123
|
-
|
129
|
+
|
124
130
|
This could be made configurable via environment variable or settings.
|
125
131
|
For now, returns the default strategy.
|
126
|
-
|
132
|
+
|
127
133
|
Returns:
|
128
134
|
The current inclusion strategy (default: "center")
|
129
135
|
"""
|
@@ -132,38 +138,35 @@ def get_inclusion_strategy() -> InclusionStrategy:
|
|
132
138
|
return "center"
|
133
139
|
|
134
140
|
|
135
|
-
def calculate_element_overlap_percentage(
|
136
|
-
element: "Element",
|
137
|
-
region: "Region"
|
138
|
-
) -> float:
|
141
|
+
def calculate_element_overlap_percentage(element: "Element", region: "Region") -> float:
|
139
142
|
"""
|
140
143
|
Calculate what percentage of an element overlaps with a region.
|
141
|
-
|
144
|
+
|
142
145
|
Args:
|
143
146
|
element: The element to check
|
144
147
|
region: The region to check against
|
145
|
-
|
148
|
+
|
146
149
|
Returns:
|
147
150
|
float: Percentage of element area that overlaps with region (0.0 to 1.0)
|
148
151
|
"""
|
149
152
|
if not hasattr(element, "bbox") or not hasattr(region, "bbox"):
|
150
153
|
return 0.0
|
151
|
-
|
154
|
+
|
152
155
|
# Calculate intersection bounds
|
153
156
|
intersect_x0 = max(element.x0, region.x0)
|
154
|
-
intersect_y0 = max(element.top, region.top)
|
157
|
+
intersect_y0 = max(element.top, region.top)
|
155
158
|
intersect_x1 = min(element.x1, region.x1)
|
156
159
|
intersect_y1 = min(element.bottom, region.bottom)
|
157
|
-
|
160
|
+
|
158
161
|
# Check if there's an intersection
|
159
162
|
if intersect_x1 <= intersect_x0 or intersect_y1 <= intersect_y0:
|
160
163
|
return 0.0
|
161
|
-
|
164
|
+
|
162
165
|
# Calculate areas
|
163
166
|
element_area = (element.x1 - element.x0) * (element.bottom - element.top)
|
164
167
|
if element_area == 0:
|
165
168
|
return 0.0
|
166
|
-
|
169
|
+
|
167
170
|
intersect_area = (intersect_x1 - intersect_x0) * (intersect_y1 - intersect_y0)
|
168
|
-
|
169
|
-
return intersect_area / element_area
|
171
|
+
|
172
|
+
return intersect_area / element_area
|
@@ -27,7 +27,7 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=619R97OtMd7uhaax7fZNJmhy9GxSs9HCNP4OzGgP828,55882
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
|
30
|
-
natural_pdf/core/page.py,sha256=-
|
30
|
+
natural_pdf/core/page.py,sha256=-78LuCbU9AEd4MGMm7_yoBl9rMAvOvrbPWcVsrMoe0s,159986
|
31
31
|
natural_pdf/core/page_collection.py,sha256=bLZ3TqTQbmP3oYrbfEi7HUoPMbcGplEtUMZ3Z1y7fuw,66728
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
33
33
|
natural_pdf/core/pdf.py,sha256=i8dYCimL_k5FV6BmPI1a2Dk7XZfwLP8TziXr2n3O_fI,105639
|
@@ -101,7 +101,7 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
|
|
101
101
|
natural_pdf/utils/pdfminer_patches.py,sha256=Ob81OMoNUGMUIy9nMw3deSQ_Z6cQmhbRlHUC3EHw2jk,4201
|
102
102
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
103
103
|
natural_pdf/utils/sections.py,sha256=HZX7829-fquKgIF7vUN2tL10-aXckEaM25g_2VcgWU4,12941
|
104
|
-
natural_pdf/utils/spatial.py,sha256=
|
104
|
+
natural_pdf/utils/spatial.py,sha256=YjzGO4A013ZDYGYbs7hl4RbJOaKrg8-x__Dl_BamwUA,5908
|
105
105
|
natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
|
106
106
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
107
107
|
natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
|
@@ -111,7 +111,7 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
|
|
111
111
|
natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
|
112
112
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
113
113
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
114
|
-
natural_pdf-0.2.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
114
|
+
natural_pdf-0.2.18.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
115
115
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
116
116
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
117
117
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -148,8 +148,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
148
148
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
149
149
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
150
150
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
151
|
-
natural_pdf-0.2.
|
152
|
-
natural_pdf-0.2.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
153
|
-
natural_pdf-0.2.17.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
154
|
-
natural_pdf-0.2.17.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
155
|
-
natural_pdf-0.2.17.dist-info/RECORD,,
|
151
|
+
natural_pdf-0.2.18.dist-info/METADATA,sha256=_hwRZyYPDD_bl-dRHE2KLo8oeo2TPxVxGi66grA-ZIs,6960
|
152
|
+
natural_pdf-0.2.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
153
|
+
natural_pdf-0.2.18.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
154
|
+
natural_pdf-0.2.18.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
155
|
+
natural_pdf-0.2.18.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|