natural-pdf 0.2.17__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import base64
2
2
  import concurrent.futures # Added import
3
+ import contextlib
3
4
  import hashlib
4
5
  import io
5
6
  import json
@@ -275,6 +276,9 @@ class Page(
275
276
  self._load_elements()
276
277
  self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
277
278
 
279
+ # Flag to prevent infinite recursion when computing exclusions
280
+ self._computing_exclusions = False
281
+
278
282
  def _get_render_specs(
279
283
  self,
280
284
  mode: Literal["show", "render"] = "show",
@@ -412,6 +416,35 @@ class Page(
412
416
  self._exclusions = []
413
417
  return self
414
418
 
419
+ @contextlib.contextmanager
420
+ def without_exclusions(self):
421
+ """
422
+ Context manager that temporarily disables exclusion processing.
423
+
424
+ This prevents infinite recursion when exclusion callables themselves
425
+ use find() operations. While in this context, all find operations
426
+ will skip exclusion filtering.
427
+
428
+ Example:
429
+ ```python
430
+ # This exclusion would normally cause infinite recursion:
431
+ page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())
432
+
433
+ # But internally, it's safe because we use:
434
+ with page.without_exclusions():
435
+ region = exclusion_callable(page)
436
+ ```
437
+
438
+ Yields:
439
+ The page object with exclusions temporarily disabled.
440
+ """
441
+ old_value = self._computing_exclusions
442
+ self._computing_exclusions = True
443
+ try:
444
+ yield self
445
+ finally:
446
+ self._computing_exclusions = old_value
447
+
415
448
  def add_exclusion(
416
449
  self,
417
450
  exclusion_func_or_region: Union[
@@ -759,15 +792,10 @@ class Page(
759
792
  if debug:
760
793
  print(f" - Evaluating callable '{exclusion_label}'...")
761
794
 
762
- # Temporarily clear exclusions (consider if really needed)
763
- temp_original_exclusions = self._exclusions
764
- self._exclusions = []
765
-
766
- # Call the function - Expects it to return a Region or None
767
- region_result = exclusion_item(self)
768
-
769
- # Restore exclusions
770
- self._exclusions = temp_original_exclusions
795
+ # Use context manager to prevent infinite recursion
796
+ with self.without_exclusions():
797
+ # Call the function - Expects it to return a Region or None
798
+ region_result = exclusion_item(self)
771
799
 
772
800
  if isinstance(region_result, Region):
773
801
  # Assign the label to the returned region
@@ -947,6 +975,11 @@ class Page(
947
975
  Returns:
948
976
  A new list containing only the elements not excluded.
949
977
  """
978
+ # Skip exclusion filtering if we're currently computing exclusions
979
+ # This prevents infinite recursion when exclusion callables use find operations
980
+ if self._computing_exclusions:
981
+ return elements
982
+
950
983
  # Check both page-level and PDF-level exclusions
951
984
  has_page_exclusions = bool(self._exclusions)
952
985
  has_pdf_exclusions = (
@@ -10,11 +10,11 @@ with include_boundaries='none'.
10
10
 
11
11
  Example:
12
12
  from natural_pdf.utils.spatial import is_element_in_region
13
-
13
+
14
14
  # Check if element is in region using center-based logic (default)
15
15
  if is_element_in_region(element, region):
16
16
  print("Element is in region")
17
-
17
+
18
18
  # Use different strategies
19
19
  if is_element_in_region(element, region, strategy="intersects"):
20
20
  print("Element overlaps with region")
@@ -35,16 +35,16 @@ InclusionStrategy = Literal["center", "intersects", "contains"]
35
35
 
36
36
  def is_element_in_region(
37
37
  element: "Element",
38
- region: "Region",
38
+ region: "Region",
39
39
  strategy: InclusionStrategy = "center",
40
- check_page: bool = True
40
+ check_page: bool = True,
41
41
  ) -> bool:
42
42
  """
43
43
  Unified function to check if an element is inside a region.
44
-
44
+
45
45
  This centralizes the logic used across Region, Page, and Flow to ensure
46
46
  consistent behavior throughout the library.
47
-
47
+
48
48
  Args:
49
49
  element: The element to check
50
50
  region: The region to check against
@@ -53,7 +53,7 @@ def is_element_in_region(
53
53
  - "intersects": Element belongs if any part overlaps
54
54
  - "contains": Element belongs only if fully contained
55
55
  check_page: Whether to verify element and region are on the same page
56
-
56
+
57
57
  Returns:
58
58
  bool: True if element is in region according to the strategy
59
59
  """
@@ -61,18 +61,18 @@ def is_element_in_region(
61
61
  if not hasattr(element, "bbox") or not element.bbox:
62
62
  logger.debug(f"Element lacks bbox attributes: {element}")
63
63
  return False
64
-
64
+
65
65
  if not hasattr(region, "bbox") or not region.bbox:
66
66
  logger.debug(f"Region lacks bbox attributes: {region}")
67
67
  return False
68
-
68
+
69
69
  # Check page membership if requested
70
70
  if check_page:
71
71
  if not hasattr(element, "page") or not hasattr(region, "page"):
72
72
  return False
73
73
  if element.page != region.page:
74
74
  return False
75
-
75
+
76
76
  # Apply the appropriate strategy
77
77
  if strategy == "center":
78
78
  # Use existing region method if available
@@ -82,37 +82,43 @@ def is_element_in_region(
82
82
  # Fallback calculation
83
83
  elem_center_x = (element.x0 + element.x1) / 2
84
84
  elem_center_y = (element.top + element.bottom) / 2
85
-
85
+
86
86
  # Use region's is_point_inside if available
87
87
  if hasattr(region, "is_point_inside"):
88
88
  return region.is_point_inside(elem_center_x, elem_center_y)
89
89
  else:
90
90
  # Simple bounds check
91
- return (region.x0 <= elem_center_x <= region.x1 and
92
- region.top <= elem_center_y <= region.bottom)
93
-
91
+ return (
92
+ region.x0 <= elem_center_x <= region.x1
93
+ and region.top <= elem_center_y <= region.bottom
94
+ )
95
+
94
96
  elif strategy == "intersects":
95
97
  # Use existing region method if available
96
98
  if hasattr(region, "intersects"):
97
99
  return region.intersects(element)
98
100
  else:
99
101
  # Simple bbox overlap check
100
- return not (element.x1 < region.x0 or
101
- element.x0 > region.x1 or
102
- element.bottom < region.top or
103
- element.top > region.bottom)
104
-
102
+ return not (
103
+ element.x1 < region.x0
104
+ or element.x0 > region.x1
105
+ or element.bottom < region.top
106
+ or element.top > region.bottom
107
+ )
108
+
105
109
  elif strategy == "contains":
106
110
  # Use existing region method if available
107
111
  if hasattr(region, "contains"):
108
112
  return region.contains(element)
109
113
  else:
110
114
  # Simple full containment check
111
- return (region.x0 <= element.x0 and
112
- element.x1 <= region.x1 and
113
- region.top <= element.top and
114
- element.bottom <= region.bottom)
115
-
115
+ return (
116
+ region.x0 <= element.x0
117
+ and element.x1 <= region.x1
118
+ and region.top <= element.top
119
+ and element.bottom <= region.bottom
120
+ )
121
+
116
122
  else:
117
123
  raise ValueError(f"Unknown inclusion strategy: {strategy}")
118
124
 
@@ -120,10 +126,10 @@ def is_element_in_region(
120
126
  def get_inclusion_strategy() -> InclusionStrategy:
121
127
  """
122
128
  Get the current global inclusion strategy.
123
-
129
+
124
130
  This could be made configurable via environment variable or settings.
125
131
  For now, returns the default strategy.
126
-
132
+
127
133
  Returns:
128
134
  The current inclusion strategy (default: "center")
129
135
  """
@@ -132,38 +138,35 @@ def get_inclusion_strategy() -> InclusionStrategy:
132
138
  return "center"
133
139
 
134
140
 
135
- def calculate_element_overlap_percentage(
136
- element: "Element",
137
- region: "Region"
138
- ) -> float:
141
+ def calculate_element_overlap_percentage(element: "Element", region: "Region") -> float:
139
142
  """
140
143
  Calculate what percentage of an element overlaps with a region.
141
-
144
+
142
145
  Args:
143
146
  element: The element to check
144
147
  region: The region to check against
145
-
148
+
146
149
  Returns:
147
150
  float: Percentage of element area that overlaps with region (0.0 to 1.0)
148
151
  """
149
152
  if not hasattr(element, "bbox") or not hasattr(region, "bbox"):
150
153
  return 0.0
151
-
154
+
152
155
  # Calculate intersection bounds
153
156
  intersect_x0 = max(element.x0, region.x0)
154
- intersect_y0 = max(element.top, region.top)
157
+ intersect_y0 = max(element.top, region.top)
155
158
  intersect_x1 = min(element.x1, region.x1)
156
159
  intersect_y1 = min(element.bottom, region.bottom)
157
-
160
+
158
161
  # Check if there's an intersection
159
162
  if intersect_x1 <= intersect_x0 or intersect_y1 <= intersect_y0:
160
163
  return 0.0
161
-
164
+
162
165
  # Calculate areas
163
166
  element_area = (element.x1 - element.x0) * (element.bottom - element.top)
164
167
  if element_area == 0:
165
168
  return 0.0
166
-
169
+
167
170
  intersect_area = (intersect_x1 - intersect_x0) * (intersect_y1 - intersect_y0)
168
-
169
- return intersect_area / element_area
171
+
172
+ return intersect_area / element_area
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.17
3
+ Version: 0.2.18
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=619R97OtMd7uhaax7fZNJmhy9GxSs9HCNP4OzGgP828,55882
29
29
  natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
30
- natural_pdf/core/page.py,sha256=-0OaIoXz0zjT_jnPjjI2jpb8vvNKh-1W56auA5UBhTA,158791
30
+ natural_pdf/core/page.py,sha256=-78LuCbU9AEd4MGMm7_yoBl9rMAvOvrbPWcVsrMoe0s,159986
31
31
  natural_pdf/core/page_collection.py,sha256=bLZ3TqTQbmP3oYrbfEi7HUoPMbcGplEtUMZ3Z1y7fuw,66728
32
32
  natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
33
33
  natural_pdf/core/pdf.py,sha256=i8dYCimL_k5FV6BmPI1a2Dk7XZfwLP8TziXr2n3O_fI,105639
@@ -101,7 +101,7 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
101
101
  natural_pdf/utils/pdfminer_patches.py,sha256=Ob81OMoNUGMUIy9nMw3deSQ_Z6cQmhbRlHUC3EHw2jk,4201
102
102
  natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
103
103
  natural_pdf/utils/sections.py,sha256=HZX7829-fquKgIF7vUN2tL10-aXckEaM25g_2VcgWU4,12941
104
- natural_pdf/utils/spatial.py,sha256=JOH2LHnF5WBDcjNQsHQdj458zwUgKtSWW7Tj0motn70,5968
104
+ natural_pdf/utils/spatial.py,sha256=YjzGO4A013ZDYGYbs7hl4RbJOaKrg8-x__Dl_BamwUA,5908
105
105
  natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
106
106
  natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
107
107
  natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
@@ -111,7 +111,7 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
111
111
  natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
112
112
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
113
113
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
114
- natural_pdf-0.2.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
114
+ natural_pdf-0.2.18.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
115
115
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
116
116
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
117
117
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -148,8 +148,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
148
148
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
149
149
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
150
150
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
151
- natural_pdf-0.2.17.dist-info/METADATA,sha256=8K5PCwh_OuI8vkWRLChHeT-LuEd0sRmigkRm55ZNeDo,6960
152
- natural_pdf-0.2.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
153
- natural_pdf-0.2.17.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
154
- natural_pdf-0.2.17.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
155
- natural_pdf-0.2.17.dist-info/RECORD,,
151
+ natural_pdf-0.2.18.dist-info/METADATA,sha256=_hwRZyYPDD_bl-dRHE2KLo8oeo2TPxVxGi66grA-ZIs,6960
152
+ natural_pdf-0.2.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
153
+ natural_pdf-0.2.18.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
154
+ natural_pdf-0.2.18.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
155
+ natural_pdf-0.2.18.dist-info/RECORD,,