natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py CHANGED
@@ -1,51 +1,63 @@
1
- import pdfplumber
2
- import os
3
- import logging
4
- import tempfile
5
- from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
6
- from PIL import Image
7
1
  import base64
2
+ import hashlib
8
3
  import io
9
4
  import json
5
+ import logging
6
+ import os
10
7
  import re
11
- import hashlib
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
11
+
12
+ import pdfplumber
13
+ from PIL import Image
12
14
 
13
15
  from natural_pdf.elements.collections import ElementCollection
14
16
  from natural_pdf.elements.region import Region
15
17
 
16
18
  if TYPE_CHECKING:
17
19
  import pdfplumber
18
- from natural_pdf.core.pdf import PDF
19
- from natural_pdf.elements.collections import ElementCollection
20
+
20
21
  from natural_pdf.core.highlighting_service import HighlightingService
22
+ from natural_pdf.core.pdf import PDF
21
23
  from natural_pdf.elements.base import Element
24
+ from natural_pdf.elements.collections import ElementCollection
22
25
 
23
- from natural_pdf.elements.text import TextElement
26
+ # New Imports
27
+ import itertools
28
+
29
+ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
30
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
31
+
32
+ from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
24
33
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
25
34
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
26
- from natural_pdf.ocr import OCROptions
27
- from natural_pdf.ocr import OCRManager
28
- from natural_pdf.core.element_manager import ElementManager
29
- from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
30
- from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
31
35
  from natural_pdf.analyzers.text_options import TextStyleOptions
36
+ from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
37
+ from natural_pdf.core.element_manager import ElementManager
38
+ from natural_pdf.elements.text import TextElement
39
+ from natural_pdf.ocr import OCRManager, OCROptions
40
+
41
+ # Import new utils
42
+ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
32
43
  from natural_pdf.widgets import InteractiveViewerWidget
33
- from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
44
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
34
45
 
35
46
  logger = logging.getLogger(__name__)
36
47
 
48
+
37
49
  class Page:
38
50
  """
39
51
  Enhanced Page wrapper built on top of pdfplumber.Page.
40
-
52
+
41
53
  This class provides a fluent interface for working with PDF pages,
42
54
  with improved selection, navigation, extraction, and question-answering capabilities.
43
55
  """
44
-
45
- def __init__(self, page: 'pdfplumber.page.Page', parent: 'PDF', index: int, font_attrs=None):
56
+
57
+ def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
46
58
  """
47
59
  Initialize a page wrapper.
48
-
60
+
49
61
  Args:
50
62
  page: pdfplumber page object
51
63
  parent: Parent PDF object
@@ -57,39 +69,51 @@ class Page:
57
69
  self._index = index
58
70
  self._text_styles = None # Lazy-loaded text style analyzer results
59
71
  self._exclusions = [] # List to store exclusion functions/regions
60
-
72
+
61
73
  # Region management
62
74
  self._regions = {
63
- 'detected': [], # Layout detection results
64
- 'named': {}, # Named regions (name -> region)
75
+ "detected": [], # Layout detection results
76
+ "named": {}, # Named regions (name -> region)
65
77
  }
66
-
78
+
67
79
  # Initialize ElementManager
68
80
  self._element_mgr = ElementManager(self, font_attrs)
69
81
 
70
82
  # --- Get OCR Manager Instance ---
71
- if OCRManager and hasattr(parent, '_ocr_manager') and isinstance(parent._ocr_manager, OCRManager):
83
+ if (
84
+ OCRManager
85
+ and hasattr(parent, "_ocr_manager")
86
+ and isinstance(parent._ocr_manager, OCRManager)
87
+ ):
72
88
  self._ocr_manager = parent._ocr_manager
73
89
  logger.debug(f"Page {self.number}: Using OCRManager instance from parent PDF.")
74
90
  else:
75
91
  self._ocr_manager = None
76
92
  if OCRManager:
77
- logger.warning(f"Page {self.number}: OCRManager instance not found on parent PDF object.")
93
+ logger.warning(
94
+ f"Page {self.number}: OCRManager instance not found on parent PDF object."
95
+ )
78
96
 
79
97
  # --- Get Layout Manager Instance ---
80
- if LayoutManager and hasattr(parent, '_layout_manager') and isinstance(parent._layout_manager, LayoutManager):
98
+ if (
99
+ LayoutManager
100
+ and hasattr(parent, "_layout_manager")
101
+ and isinstance(parent._layout_manager, LayoutManager)
102
+ ):
81
103
  self._layout_manager = parent._layout_manager
82
104
  logger.debug(f"Page {self.number}: Using LayoutManager instance from parent PDF.")
83
105
  else:
84
106
  self._layout_manager = None
85
107
  if LayoutManager:
86
- logger.warning(f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail.")
108
+ logger.warning(
109
+ f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail."
110
+ )
87
111
 
88
112
  # Initialize the internal variable with a single underscore
89
- self._layout_analyzer = None
113
+ self._layout_analyzer = None
90
114
 
91
115
  @property
92
- def pdf(self) -> 'PDF':
116
+ def pdf(self) -> "PDF":
93
117
  """Provides public access to the parent PDF object."""
94
118
  return self._parent
95
119
 
@@ -97,7 +121,7 @@ class Page:
97
121
  def number(self) -> int:
98
122
  """Get page number (1-based)."""
99
123
  return self._page.page_number
100
-
124
+
101
125
  @property
102
126
  def page_number(self) -> int:
103
127
  """Get page number (1-based)."""
@@ -107,12 +131,12 @@ class Page:
107
131
  def index(self) -> int:
108
132
  """Get page index (0-based)."""
109
133
  return self._index
110
-
134
+
111
135
  @property
112
136
  def width(self) -> float:
113
137
  """Get page width."""
114
138
  return self._page.width
115
-
139
+
116
140
  @property
117
141
  def height(self) -> float:
118
142
  """Get page height."""
@@ -120,107 +144,125 @@ class Page:
120
144
 
121
145
  # --- Highlighting Service Accessor ---
122
146
  @property
123
- def _highlighter(self) -> 'HighlightingService':
124
- """Provides access to the parent PDF's HighlightingService."""
125
- if not hasattr(self._parent, 'highlighter'):
126
- # This should ideally not happen if PDF.__init__ works correctly
127
- raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
128
- return self._parent.highlighter
147
+ def _highlighter(self) -> "HighlightingService":
148
+ """Provides access to the parent PDF's HighlightingService."""
149
+ if not hasattr(self._parent, "highlighter"):
150
+ # This should ideally not happen if PDF.__init__ works correctly
151
+ raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
152
+ return self._parent.highlighter
129
153
 
130
- def clear_exclusions(self) -> 'Page':
154
+ def clear_exclusions(self) -> "Page":
131
155
  """
132
156
  Clear all exclusions from the page.
133
157
  """
134
158
  self._exclusions = []
135
159
  return self
136
160
 
137
- def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
161
+ def add_exclusion(
162
+ self,
163
+ exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
164
+ label: Optional[str] = None,
165
+ ) -> "Page":
138
166
  """
139
167
  Add an exclusion to the page. Text from these regions will be excluded from extraction.
140
168
  Ensures non-callable items are stored as Region objects if possible.
141
-
169
+
142
170
  Args:
143
171
  exclusion_func_or_region: Either a callable function returning a Region,
144
172
  a Region object, or another object with a valid .bbox attribute.
145
173
  label: Optional label for this exclusion (e.g., 'header', 'footer').
146
-
174
+
147
175
  Returns:
148
176
  Self for method chaining
149
-
177
+
150
178
  Raises:
151
179
  TypeError: If a non-callable, non-Region object without a valid bbox is provided.
152
180
  """
153
- exclusion_data = None # Initialize exclusion data
181
+ exclusion_data = None # Initialize exclusion data
154
182
 
155
183
  if callable(exclusion_func_or_region):
156
184
  # Store callable functions along with their label
157
185
  exclusion_data = (exclusion_func_or_region, label)
158
- logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
186
+ logger.debug(
187
+ f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}"
188
+ )
159
189
  elif isinstance(exclusion_func_or_region, Region):
160
190
  # Store Region objects directly, assigning the label
161
- exclusion_func_or_region.label = label # Assign label
162
- exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
163
- logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
164
- elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
191
+ exclusion_func_or_region.label = label # Assign label
192
+ exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
193
+ logger.debug(
194
+ f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}"
195
+ )
196
+ elif (
197
+ hasattr(exclusion_func_or_region, "bbox")
198
+ and isinstance(getattr(exclusion_func_or_region, "bbox", None), (tuple, list))
199
+ and len(exclusion_func_or_region.bbox) == 4
200
+ ):
165
201
  # Convert objects with a valid bbox to a Region before storing
166
202
  try:
167
203
  bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
168
204
  # Pass the label to the Region constructor
169
205
  region_to_add = Region(self, bbox_coords, label=label)
170
- exclusion_data = (region_to_add, label) # Store as tuple
171
- logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
206
+ exclusion_data = (region_to_add, label) # Store as tuple
207
+ logger.debug(
208
+ f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
209
+ )
172
210
  except (ValueError, TypeError, Exception) as e:
173
211
  # Raise an error if conversion fails
174
- raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
212
+ raise TypeError(
213
+ f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
214
+ ) from e
175
215
  else:
176
216
  # Reject invalid types
177
- raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
217
+ raise TypeError(
218
+ f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
219
+ )
178
220
 
179
221
  # Append the stored data (tuple of object/callable and label)
180
222
  if exclusion_data:
181
223
  self._exclusions.append(exclusion_data)
182
224
 
183
225
  return self
184
-
185
- def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
226
+
227
+ def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
186
228
  """
187
229
  Add a region to the page.
188
-
230
+
189
231
  Args:
190
232
  region: Region object to add
191
233
  name: Optional name for the region
192
-
234
+
193
235
  Returns:
194
236
  Self for method chaining
195
237
  """
196
238
  # Check if it's actually a Region object
197
239
  if not isinstance(region, Region):
198
240
  raise TypeError("region must be a Region object")
199
-
241
+
200
242
  # Set the source and name
201
- region.source = 'named'
202
-
243
+ region.source = "named"
244
+
203
245
  if name:
204
246
  region.name = name
205
247
  # Add to named regions dictionary (overwriting if name already exists)
206
- self._regions['named'][name] = region
248
+ self._regions["named"][name] = region
207
249
  else:
208
250
  # Add to detected regions list (unnamed but registered)
209
- self._regions['detected'].append(region)
210
-
251
+ self._regions["detected"].append(region)
252
+
211
253
  # Add to element manager for selector queries
212
254
  self._element_mgr.add_region(region)
213
-
255
+
214
256
  return self
215
-
216
- def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> 'Page':
257
+
258
+ def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
217
259
  """
218
260
  Add multiple regions to the page.
219
-
261
+
220
262
  Args:
221
263
  regions: List of Region objects to add
222
264
  prefix: Optional prefix for automatic naming (regions will be named prefix_1, prefix_2, etc.)
223
-
265
+
224
266
  Returns:
225
267
  Self for method chaining
226
268
  """
@@ -232,23 +274,23 @@ class Page:
232
274
  # Add without names
233
275
  for region in regions:
234
276
  self.add_region(region)
235
-
277
+
236
278
  return self
237
-
279
+
238
280
  def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
239
281
  """
240
282
  Get all exclusion regions for this page.
241
283
  Assumes self._exclusions contains tuples of (callable/Region, label).
242
-
284
+
243
285
  Args:
244
286
  include_callable: Whether to evaluate callable exclusion functions
245
287
  debug: Enable verbose debug logging for exclusion evaluation
246
-
288
+
247
289
  Returns:
248
290
  List of Region objects to exclude, with labels assigned.
249
291
  """
250
292
  regions = []
251
-
293
+
252
294
  if debug:
253
295
  print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
254
296
 
@@ -280,32 +322,39 @@ class Page:
280
322
  if debug:
281
323
  print(f" ✓ Added region from callable '{label}': {region_result}")
282
324
  elif region_result:
283
- logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
284
- if debug:
285
- print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
325
+ logger.warning(
326
+ f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
327
+ )
328
+ if debug:
329
+ print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
286
330
  else:
287
331
  if debug:
288
- print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
332
+ print(
333
+ f" ✗ Callable '{exclusion_label}' returned None, no region added"
334
+ )
289
335
 
290
336
  except Exception as e:
291
337
  error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
292
338
  print(error_msg)
293
339
  import traceback
340
+
294
341
  print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
295
342
 
296
343
  # Process direct Region objects (label was assigned in add_exclusion)
297
344
  elif isinstance(exclusion_item, Region):
298
- regions.append(exclusion_item) # Label is already on the Region object
345
+ regions.append(exclusion_item) # Label is already on the Region object
299
346
  if debug:
300
347
  print(f" - Added direct region '{label}': {exclusion_item}")
301
348
  # No else needed, add_exclusion should prevent invalid types
302
-
349
+
303
350
  if debug:
304
351
  print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
305
-
352
+
306
353
  return regions
307
354
 
308
- def _filter_elements_by_exclusions(self, elements: List['Element'], debug_exclusions: bool = False) -> List['Element']:
355
+ def _filter_elements_by_exclusions(
356
+ self, elements: List["Element"], debug_exclusions: bool = False
357
+ ) -> List["Element"]:
309
358
  """
310
359
  Filters a list of elements, removing those within the page's exclusion regions.
311
360
 
@@ -318,19 +367,27 @@ class Page:
318
367
  """
319
368
  if not self._exclusions:
320
369
  if debug_exclusions:
321
- print(f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements.")
370
+ print(
371
+ f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
372
+ )
322
373
  return elements
323
374
 
324
375
  # Get all exclusion regions, including evaluating callable functions
325
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug_exclusions)
376
+ exclusion_regions = self._get_exclusion_regions(
377
+ include_callable=True, debug=debug_exclusions
378
+ )
326
379
 
327
380
  if not exclusion_regions:
328
381
  if debug_exclusions:
329
- print(f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements.")
382
+ print(
383
+ f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements."
384
+ )
330
385
  return elements
331
386
 
332
387
  if debug_exclusions:
333
- print(f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements.")
388
+ print(
389
+ f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements."
390
+ )
334
391
 
335
392
  filtered_elements = []
336
393
  excluded_count = 0
@@ -346,7 +403,9 @@ class Page:
346
403
  filtered_elements.append(element)
347
404
 
348
405
  if debug_exclusions:
349
- print(f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}.")
406
+ print(
407
+ f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}."
408
+ )
350
409
 
351
410
  return filtered_elements
352
411
 
@@ -365,15 +424,18 @@ class Page:
365
424
  Element object or None if not found
366
425
  """
367
426
  from natural_pdf.selectors.parser import parse_selector
427
+
368
428
  selector_obj = parse_selector(selector)
369
-
429
+
370
430
  # Pass regex and case flags to selector function
371
- kwargs['regex'] = regex
372
- kwargs['case'] = case
373
-
431
+ kwargs["regex"] = regex
432
+ kwargs["case"] = case
433
+
374
434
  # First get all matching elements without applying exclusions initially within _apply_selector
375
- results_collection = self._apply_selector(selector_obj, **kwargs) # _apply_selector doesn't filter
376
-
435
+ results_collection = self._apply_selector(
436
+ selector_obj, **kwargs
437
+ ) # _apply_selector doesn't filter
438
+
377
439
  # Filter the results based on exclusions if requested
378
440
  if apply_exclusions and self._exclusions and results_collection:
379
441
  filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
@@ -385,7 +447,9 @@ class Page:
385
447
  else:
386
448
  return None
387
449
 
388
- def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
450
+ def find_all(
451
+ self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
452
+ ) -> "ElementCollection":
389
453
  """
390
454
  Find all elements on this page matching selector.
391
455
 
@@ -395,20 +459,23 @@ class Page:
395
459
  regex: Whether to use regex for text search in :contains (default: False)
396
460
  case: Whether to do case-sensitive text search (default: True)
397
461
  **kwargs: Additional filter parameters
398
-
462
+
399
463
  Returns:
400
464
  ElementCollection with matching elements
401
465
  """
402
466
  from natural_pdf.selectors.parser import parse_selector
467
+
403
468
  selector_obj = parse_selector(selector)
404
-
469
+
405
470
  # Pass regex and case flags to selector function
406
- kwargs['regex'] = regex
407
- kwargs['case'] = case
408
-
471
+ kwargs["regex"] = regex
472
+ kwargs["case"] = case
473
+
409
474
  # First get all matching elements without applying exclusions initially within _apply_selector
410
- results_collection = self._apply_selector(selector_obj, **kwargs) # _apply_selector doesn't filter
411
-
475
+ results_collection = self._apply_selector(
476
+ selector_obj, **kwargs
477
+ ) # _apply_selector doesn't filter
478
+
412
479
  # Filter the results based on exclusions if requested
413
480
  if apply_exclusions and self._exclusions and results_collection:
414
481
  filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
@@ -416,208 +483,348 @@ class Page:
416
483
  else:
417
484
  # Return the unfiltered collection
418
485
  return results_collection
419
-
420
- def _apply_selector(self, selector_obj: Dict, **kwargs) -> 'ElementCollection': # Removed apply_exclusions arg
486
+
487
+ def _apply_selector(
488
+ self, selector_obj: Dict, **kwargs
489
+ ) -> "ElementCollection": # Removed apply_exclusions arg
421
490
  """
422
491
  Apply selector to page elements.
423
492
  Exclusions are now handled by the calling methods (find, find_all) if requested.
424
-
493
+
425
494
  Args:
426
495
  selector_obj: Parsed selector dictionary
427
496
  **kwargs: Additional filter parameters including 'regex' and 'case'
428
-
497
+
429
498
  Returns:
430
499
  ElementCollection of matching elements (unfiltered by exclusions)
431
500
  """
432
501
  from natural_pdf.selectors.parser import selector_to_filter_func
433
-
502
+
434
503
  # Get element type to filter
435
- element_type = selector_obj.get('type', 'any').lower()
436
-
504
+ element_type = selector_obj.get("type", "any").lower()
505
+
437
506
  # Determine which elements to search based on element type
438
507
  elements_to_search = []
439
- if element_type == 'any':
508
+ if element_type == "any":
440
509
  elements_to_search = self._element_mgr.get_all_elements()
441
- elif element_type == 'text':
510
+ elif element_type == "text":
442
511
  elements_to_search = self._element_mgr.words
443
- elif element_type == 'char':
512
+ elif element_type == "char":
444
513
  elements_to_search = self._element_mgr.chars
445
- elif element_type == 'word':
514
+ elif element_type == "word":
446
515
  elements_to_search = self._element_mgr.words
447
- elif element_type == 'rect' or element_type == 'rectangle':
516
+ elif element_type == "rect" or element_type == "rectangle":
448
517
  elements_to_search = self._element_mgr.rects
449
- elif element_type == 'line':
518
+ elif element_type == "line":
450
519
  elements_to_search = self._element_mgr.lines
451
- elif element_type == 'region':
520
+ elif element_type == "region":
452
521
  elements_to_search = self._element_mgr.regions
453
522
  else:
454
523
  elements_to_search = self._element_mgr.get_all_elements()
455
-
524
+
456
525
  # Create filter function from selector, passing any additional parameters
457
526
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
458
-
527
+
459
528
  # Apply the filter to matching elements
460
529
  matching_elements = [element for element in elements_to_search if filter_func(element)]
461
-
530
+
462
531
  # Handle spatial pseudo-classes that require relationship checking
463
- for pseudo in selector_obj.get('pseudo_classes', []):
464
- name = pseudo.get('name')
465
- args = pseudo.get('args', '')
466
-
467
- if name in ('above', 'below', 'near', 'left-of', 'right-of'):
532
+ for pseudo in selector_obj.get("pseudo_classes", []):
533
+ name = pseudo.get("name")
534
+ args = pseudo.get("args", "")
535
+
536
+ if name in ("above", "below", "near", "left-of", "right-of"):
468
537
  # Find the reference element first
469
538
  from natural_pdf.selectors.parser import parse_selector
539
+
470
540
  ref_selector = parse_selector(args) if isinstance(args, str) else args
471
541
  # Recursively call _apply_selector for reference element (exclusions handled later)
472
- ref_elements = self._apply_selector(ref_selector, **kwargs)
473
-
542
+ ref_elements = self._apply_selector(ref_selector, **kwargs)
543
+
474
544
  if not ref_elements:
475
545
  return ElementCollection([])
476
-
546
+
477
547
  ref_element = ref_elements.first
478
- if not ref_element: continue
479
-
548
+ if not ref_element:
549
+ continue
550
+
480
551
  # Filter elements based on spatial relationship
481
- if name == 'above':
482
- matching_elements = [el for el in matching_elements if hasattr(el, 'bottom') and hasattr(ref_element, 'top') and el.bottom <= ref_element.top]
483
- elif name == 'below':
484
- matching_elements = [el for el in matching_elements if hasattr(el, 'top') and hasattr(ref_element, 'bottom') and el.top >= ref_element.bottom]
485
- elif name == 'left-of':
486
- matching_elements = [el for el in matching_elements if hasattr(el, 'x1') and hasattr(ref_element, 'x0') and el.x1 <= ref_element.x0]
487
- elif name == 'right-of':
488
- matching_elements = [el for el in matching_elements if hasattr(el, 'x0') and hasattr(ref_element, 'x1') and el.x0 >= ref_element.x1]
489
- elif name == 'near':
552
+ if name == "above":
553
+ matching_elements = [
554
+ el
555
+ for el in matching_elements
556
+ if hasattr(el, "bottom")
557
+ and hasattr(ref_element, "top")
558
+ and el.bottom <= ref_element.top
559
+ ]
560
+ elif name == "below":
561
+ matching_elements = [
562
+ el
563
+ for el in matching_elements
564
+ if hasattr(el, "top")
565
+ and hasattr(ref_element, "bottom")
566
+ and el.top >= ref_element.bottom
567
+ ]
568
+ elif name == "left-of":
569
+ matching_elements = [
570
+ el
571
+ for el in matching_elements
572
+ if hasattr(el, "x1")
573
+ and hasattr(ref_element, "x0")
574
+ and el.x1 <= ref_element.x0
575
+ ]
576
+ elif name == "right-of":
577
+ matching_elements = [
578
+ el
579
+ for el in matching_elements
580
+ if hasattr(el, "x0")
581
+ and hasattr(ref_element, "x1")
582
+ and el.x0 >= ref_element.x1
583
+ ]
584
+ elif name == "near":
585
+
490
586
  def distance(el1, el2):
491
- if not (hasattr(el1, 'x0') and hasattr(el1, 'x1') and hasattr(el1, 'top') and hasattr(el1, 'bottom') and
492
- hasattr(el2, 'x0') and hasattr(el2, 'x1') and hasattr(el2, 'top') and hasattr(el2, 'bottom')):
493
- return float('inf') # Cannot calculate distance
494
- el1_center_x = (el1.x0 + el1.x1) / 2
495
- el1_center_y = (el1.top + el1.bottom) / 2
496
- el2_center_x = (el2.x0 + el2.x1) / 2
497
- el2_center_y = (el2.top + el2.bottom) / 2
498
- return ((el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2) ** 0.5
499
-
500
- threshold = kwargs.get('near_threshold', 50)
501
- matching_elements = [el for el in matching_elements if distance(el, ref_element) <= threshold]
502
-
587
+ if not (
588
+ hasattr(el1, "x0")
589
+ and hasattr(el1, "x1")
590
+ and hasattr(el1, "top")
591
+ and hasattr(el1, "bottom")
592
+ and hasattr(el2, "x0")
593
+ and hasattr(el2, "x1")
594
+ and hasattr(el2, "top")
595
+ and hasattr(el2, "bottom")
596
+ ):
597
+ return float("inf") # Cannot calculate distance
598
+ el1_center_x = (el1.x0 + el1.x1) / 2
599
+ el1_center_y = (el1.top + el1.bottom) / 2
600
+ el2_center_x = (el2.x0 + el2.x1) / 2
601
+ el2_center_y = (el2.top + el2.bottom) / 2
602
+ return (
603
+ (el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2
604
+ ) ** 0.5
605
+
606
+ threshold = kwargs.get("near_threshold", 50)
607
+ matching_elements = [
608
+ el for el in matching_elements if distance(el, ref_element) <= threshold
609
+ ]
610
+
503
611
  # Sort elements in reading order if requested
504
- if kwargs.get('reading_order', True):
505
- if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in matching_elements):
506
- matching_elements.sort(key=lambda el: (el.top, el.x0))
612
+ if kwargs.get("reading_order", True):
613
+ if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
614
+ matching_elements.sort(key=lambda el: (el.top, el.x0))
507
615
  else:
508
- logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")
509
-
616
+ logger.warning(
617
+ "Cannot sort elements in reading order: Missing required attributes (top, x0)."
618
+ )
619
+
510
620
  # Create result collection - exclusions are handled by the calling methods (find, find_all)
511
621
  result = ElementCollection(matching_elements)
512
-
622
+
513
623
  return result
514
624
 
515
625
  def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
516
626
  """
517
627
  Create a region on this page with the specified coordinates.
518
-
628
+
519
629
  Args:
520
630
  x0: Left x-coordinate
521
631
  top: Top y-coordinate
522
632
  x1: Right x-coordinate
523
633
  bottom: Bottom y-coordinate
524
-
634
+
525
635
  Returns:
526
636
  Region object for the specified coordinates
527
637
  """
528
638
  from natural_pdf.elements.region import Region
639
+
529
640
  return Region(self, (x0, top, x1, bottom))
530
-
531
- def region(self, left: float = None, top: float = None, right: float = None, bottom: float = None,
532
- width: str = "full") -> Any:
641
+
642
+ def region(
643
+ self,
644
+ left: float = None,
645
+ top: float = None,
646
+ right: float = None,
647
+ bottom: float = None,
648
+ width: Union[str, float, None] = None,
649
+ height: Optional[float] = None,
650
+ ) -> Any:
533
651
  """
534
- Create a region on this page with more intuitive named parameters.
535
-
652
+ Create a region on this page with more intuitive named parameters,
653
+ allowing definition by coordinates or by coordinate + dimension.
654
+
536
655
  Args:
537
- left: Left x-coordinate (default: 0)
538
- top: Top y-coordinate (default: 0)
539
- right: Right x-coordinate (default: page width)
540
- bottom: Bottom y-coordinate (default: page height)
541
- width: Width mode - "full" for full page width or "element" for element width
542
-
656
+ left: Left x-coordinate (default: 0 if width not used).
657
+ top: Top y-coordinate (default: 0 if height not used).
658
+ right: Right x-coordinate (default: page width if width not used).
659
+ bottom: Bottom y-coordinate (default: page height if height not used).
660
+ width: Width definition. Can be:
661
+ - Numeric: The width of the region in points. Cannot be used with both left and right.
662
+ - String 'full': Sets region width to full page width (overrides left/right).
663
+ - String 'element' or None (default): Uses provided/calculated left/right,
664
+ defaulting to page width if neither are specified.
665
+ height: Numeric height of the region. Cannot be used with both top and bottom.
666
+
543
667
  Returns:
544
668
  Region object for the specified coordinates
545
-
669
+
670
+ Raises:
671
+ ValueError: If conflicting arguments are provided (e.g., top, bottom, and height)
672
+ or if width is an invalid string.
673
+
546
674
  Examples:
547
- >>> page.region(top=100, bottom=200) # Full width from y=100 to y=200
548
- >>> page.region(left=50, right=150, top=100, bottom=200) # Specific rectangle
549
- """
550
- # Handle defaults
551
- left = 0 if left is None else left
552
- top = 0 if top is None else top
553
- right = self.width if right is None else right
554
- bottom = self.height if bottom is None else bottom
555
-
556
- # Handle width parameter
557
- if width == "full":
558
- left = 0
559
- right = self.width
560
- elif width != "element":
561
- raise ValueError("Width must be 'full' or 'element'")
562
-
675
+ >>> page.region(top=100, height=50) # Region from y=100 to y=150, default width
676
+ >>> page.region(left=50, width=100) # Region from x=50 to x=150, default height
677
+ >>> page.region(bottom=500, height=50) # Region from y=450 to y=500
678
+ >>> page.region(right=200, width=50) # Region from x=150 to x=200
679
+ >>> page.region(top=100, bottom=200, width="full") # Explicit full width
680
+ """
681
+ # --- Type checking and basic validation ---
682
+ is_width_numeric = isinstance(width, (int, float))
683
+ is_width_string = isinstance(width, str)
684
+ width_mode = "element" # Default mode
685
+
686
+ if height is not None and top is not None and bottom is not None:
687
+ raise ValueError("Cannot specify top, bottom, and height simultaneously.")
688
+ if is_width_numeric and left is not None and right is not None:
689
+ raise ValueError("Cannot specify left, right, and a numeric width simultaneously.")
690
+ if is_width_string:
691
+ width_lower = width.lower()
692
+ if width_lower not in ["full", "element"]:
693
+ raise ValueError("String width argument must be 'full' or 'element'.")
694
+ width_mode = width_lower
695
+
696
+ # --- Calculate Coordinates ---
697
+ final_top = top
698
+ final_bottom = bottom
699
+ final_left = left
700
+ final_right = right
701
+
702
+ # Height calculations
703
+ if height is not None:
704
+ if top is not None:
705
+ final_bottom = top + height
706
+ elif bottom is not None:
707
+ final_top = bottom - height
708
+ else: # Neither top nor bottom provided, default top to 0
709
+ final_top = 0
710
+ final_bottom = height
711
+
712
+ # Width calculations (numeric only)
713
+ if is_width_numeric:
714
+ if left is not None:
715
+ final_right = left + width
716
+ elif right is not None:
717
+ final_left = right - width
718
+ else: # Neither left nor right provided, default left to 0
719
+ final_left = 0
720
+ final_right = width
721
+
722
+ # --- Apply Defaults for Unset Coordinates ---
723
+ # Only default coordinates if they weren't set by dimension calculation
724
+ if final_top is None:
725
+ final_top = 0
726
+ if final_bottom is None:
727
+ # Check if bottom should have been set by height calc
728
+ if height is None or top is None:
729
+ final_bottom = self.height
730
+
731
+ if final_left is None:
732
+ final_left = 0
733
+ if final_right is None:
734
+ # Check if right should have been set by width calc
735
+ if not is_width_numeric or left is None:
736
+ final_right = self.width
737
+
738
+ # --- Handle width_mode == 'full' ---
739
+ if width_mode == "full":
740
+ # Override left/right if mode is full
741
+ final_left = 0
742
+ final_right = self.width
743
+
744
+ # --- Final Validation & Creation ---
745
+ # Ensure coordinates are within page bounds (clamp)
746
+ final_left = max(0, final_left)
747
+ final_top = max(0, final_top)
748
+ final_right = min(self.width, final_right)
749
+ final_bottom = min(self.height, final_bottom)
750
+
751
+ # Ensure valid box (x0<=x1, top<=bottom)
752
+ if final_left > final_right:
753
+ logger.warning(f"Calculated left ({final_left}) > right ({final_right}). Swapping.")
754
+ final_left, final_right = final_right, final_left
755
+ if final_top > final_bottom:
756
+ logger.warning(f"Calculated top ({final_top}) > bottom ({final_bottom}). Swapping.")
757
+ final_top, final_bottom = final_bottom, final_top
758
+
563
759
  from natural_pdf.elements.region import Region
564
- region = Region(self, (left, top, right, bottom))
760
+
761
+ region = Region(self, (final_left, final_top, final_right, final_bottom))
565
762
  return region
566
-
567
- def get_elements(self, apply_exclusions=True, debug_exclusions: bool = False) -> List['Element']:
763
+
764
+ def get_elements(
765
+ self, apply_exclusions=True, debug_exclusions: bool = False
766
+ ) -> List["Element"]:
568
767
  """
569
768
  Get all elements on this page.
570
-
769
+
571
770
  Args:
572
771
  apply_exclusions: Whether to apply exclusion regions (default: True).
573
772
  debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
574
-
773
+
575
774
  Returns:
576
775
  List of all elements on the page, potentially filtered by exclusions.
577
776
  """
578
777
  # Get all elements from the element manager
579
778
  all_elements = self._element_mgr.get_all_elements()
580
-
779
+
581
780
  # Apply exclusions if requested
582
781
  if apply_exclusions and self._exclusions:
583
- return self._filter_elements_by_exclusions(all_elements, debug_exclusions=debug_exclusions)
782
+ return self._filter_elements_by_exclusions(
783
+ all_elements, debug_exclusions=debug_exclusions
784
+ )
584
785
  else:
585
786
  if debug_exclusions:
586
- print(f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied).")
787
+ print(
788
+ f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied)."
789
+ )
587
790
  return all_elements
588
-
589
- def filter_elements(self, elements: List['Element'], selector: str, **kwargs) -> List['Element']:
791
+
792
+ def filter_elements(
793
+ self, elements: List["Element"], selector: str, **kwargs
794
+ ) -> List["Element"]:
590
795
  """
591
796
  Filter a list of elements based on a selector.
592
-
797
+
593
798
  Args:
594
799
  elements: List of elements to filter
595
800
  selector: CSS-like selector string
596
801
  **kwargs: Additional filter parameters
597
-
802
+
598
803
  Returns:
599
804
  List of elements that match the selector
600
805
  """
601
806
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
602
-
807
+
603
808
  # Parse the selector
604
809
  selector_obj = parse_selector(selector)
605
-
810
+
606
811
  # Create filter function from selector
607
812
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
608
-
813
+
609
814
  # Apply the filter to the elements
610
815
  matching_elements = [element for element in elements if filter_func(element)]
611
-
816
+
612
817
  # Sort elements in reading order if requested
613
- if kwargs.get('reading_order', True):
614
- if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in matching_elements):
615
- matching_elements.sort(key=lambda el: (el.top, el.x0))
818
+ if kwargs.get("reading_order", True):
819
+ if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
820
+ matching_elements.sort(key=lambda el: (el.top, el.x0))
616
821
  else:
617
- logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")
618
-
822
+ logger.warning(
823
+ "Cannot sort elements in reading order: Missing required attributes (top, x0)."
824
+ )
825
+
619
826
  return matching_elements
620
-
827
+
621
828
  def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
622
829
  """
623
830
  Select content from the top of the page until matching selector.
@@ -626,26 +833,28 @@ class Page:
626
833
  selector: CSS-like selector string
627
834
  include_endpoint: Whether to include the endpoint element in the region
628
835
  **kwargs: Additional selection parameters
629
-
836
+
630
837
  Returns:
631
838
  Region object representing the selected content
632
-
839
+
633
840
  Examples:
634
841
  >>> page.until('text:contains("Conclusion")') # Select from top to conclusion
635
842
  >>> page.until('line[width>=2]', include_endpoint=False) # Select up to thick line
636
843
  """
637
- # Find the target element
844
+ # Find the target element
638
845
  target = self.find(selector, **kwargs)
639
846
  if not target:
640
847
  # If target not found, return a default region (full page)
641
848
  from natural_pdf.elements.region import Region
849
+
642
850
  return Region(self, (0, 0, self.width, self.height))
643
-
851
+
644
852
  # Create a region from the top of the page to the target
645
853
  from natural_pdf.elements.region import Region
854
+
646
855
  # Ensure target has positional attributes before using them
647
- target_top = getattr(target, 'top', 0)
648
- target_bottom = getattr(target, 'bottom', self.height)
856
+ target_top = getattr(target, "top", 0)
857
+ target_bottom = getattr(target, "bottom", self.height)
649
858
 
650
859
  if include_endpoint:
651
860
  # Include the target element
@@ -653,17 +862,16 @@ class Page:
653
862
  else:
654
863
  # Up to the target element
655
864
  region = Region(self, (0, 0, self.width, target_top))
656
-
865
+
657
866
  region.end_element = target
658
867
  return region
659
868
 
660
-
661
869
  def crop(self, bbox=None, **kwargs) -> Any:
662
870
  """
663
871
  Crop the page to the specified bounding box.
664
872
 
665
873
  This is a direct wrapper around pdfplumber's crop method.
666
-
874
+
667
875
  Args:
668
876
  bbox: Bounding box (x0, top, x1, bottom) or None
669
877
  **kwargs: Additional parameters (top, bottom, left, right)
@@ -674,59 +882,82 @@ class Page:
674
882
  # Returns the pdfplumber page object, not a natural-pdf Page
675
883
  return self._page.crop(bbox, **kwargs)
676
884
 
677
- def extract_text(self,
678
- preserve_whitespace=True,
679
- use_exclusions=True,
680
- debug_exclusions=False, **kwargs) -> str:
885
+ def extract_text(
886
+ self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, **kwargs
887
+ ) -> str:
681
888
  """
682
- Extract text from this page, respecting any exclusion regions.
683
-
889
+ Extract text from this page, respecting exclusions and using pdfplumber's
890
+ layout engine (chars_to_textmap) if layout arguments are provided or default.
891
+
684
892
  Args:
685
- preserve_whitespace: Whether to keep blank characters (default: True)
686
- use_exclusions: Whether to apply exclusion regions (default: True)
687
- debug_exclusions: Whether to output detailed exclusion debugging info (default: False)
688
- **kwargs: Additional extraction parameters passed to pdfplumber
689
-
893
+ use_exclusions: Whether to apply exclusion regions (default: True).
894
+ Note: Filtering logic is now always applied if exclusions exist.
895
+ debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
896
+ **kwargs: Additional layout parameters passed directly to pdfplumber's
897
+ `chars_to_textmap` function. Common parameters include:
898
+ - layout (bool): If True (default), inserts spaces/newlines.
899
+ - x_density (float): Pixels per character horizontally.
900
+ - y_density (float): Pixels per line vertically.
901
+ - x_tolerance (float): Tolerance for horizontal character grouping.
902
+ - y_tolerance (float): Tolerance for vertical character grouping.
903
+ - line_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
904
+ - char_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
905
+ See pdfplumber documentation for more.
906
+
690
907
  Returns:
691
- Extracted text as string
692
- """
693
- if not use_exclusions or not self._exclusions:
694
- # If no exclusions or exclusions disabled, use regular extraction
695
- if debug_exclusions:
696
- print(f"Page {self.index}: Extracting text via pdfplumber (exclusions not applied).")
697
- # Note: pdfplumber still uses keep_blank_chars parameter
698
- return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)
699
-
700
- # --- Exclusion Logic ---
701
- # 1. Get all potentially relevant text elements (words)
702
- all_text_elements = self.words # Use the words property
703
- if debug_exclusions:
704
- print(f"Page {self.index}: Starting text extraction with {len(all_text_elements)} words before exclusion.")
705
-
706
- # 2. Filter elements using the centralized method
707
- filtered_elements = self._filter_elements_by_exclusions(all_text_elements, debug_exclusions=debug_exclusions)
708
-
709
- # 3. Extract text from the filtered elements
710
- collection = ElementCollection(filtered_elements)
711
- # Ensure elements are sorted for logical text flow (might be redundant if self.words is sorted)
712
- if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in collection.elements):
713
- collection.sort(key=lambda el: (el.top, el.x0))
714
-
715
- # Join text, handling potential missing text attributes gracefully
716
- result = " ".join(getattr(el, 'text', '') for el in collection.elements)
717
-
718
- if debug_exclusions:
719
- print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied.")
720
-
908
+ Extracted text as string, potentially with layout-based spacing.
909
+ """
910
+ logger.debug(f"Page {self.number}: extract_text called with kwargs: {kwargs}")
911
+ debug = kwargs.get("debug", debug_exclusions) # Allow 'debug' kwarg
912
+
913
+ # 1. Get Word Elements (triggers load_elements if needed)
914
+ word_elements = self.words
915
+ if not word_elements:
916
+ logger.debug(f"Page {self.number}: No word elements found.")
917
+ return ""
918
+
919
+ # 2. Get Exclusions
920
+ apply_exclusions_flag = kwargs.get("use_exclusions", True)
921
+ exclusion_regions = []
922
+ if apply_exclusions_flag and self._exclusions:
923
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
924
+ if debug:
925
+ logger.debug(f"Page {self.number}: Applying {len(exclusion_regions)} exclusions.")
926
+ elif debug:
927
+ logger.debug(f"Page {self.number}: Not applying exclusions.")
928
+
929
+ # 3. Collect All Character Dictionaries from Word Elements
930
+ all_char_dicts = []
931
+ for word in word_elements:
932
+ all_char_dicts.extend(getattr(word, "_char_dicts", []))
933
+
934
+ # 4. Spatially Filter Characters
935
+ filtered_chars = filter_chars_spatially(
936
+ char_dicts=all_char_dicts,
937
+ exclusion_regions=exclusion_regions,
938
+ target_region=None, # No target region for full page extraction
939
+ debug=debug,
940
+ )
941
+
942
+ # 5. Generate Text Layout using Utility
943
+ # Pass page bbox as layout context
944
+ page_bbox = (0, 0, self.width, self.height)
945
+ result = generate_text_layout(
946
+ char_dicts=filtered_chars,
947
+ layout_context_bbox=page_bbox,
948
+ user_kwargs=kwargs, # Pass original user kwargs
949
+ )
950
+
951
+ logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
721
952
  return result
722
953
 
723
954
  def extract_table(self, table_settings={}) -> List[Any]:
724
955
  """
725
956
  Extract the largest table from this page.
726
-
957
+
727
958
  Args:
728
959
  table_settings: Additional extraction parameters
729
-
960
+
730
961
  Returns:
731
962
  List of extracted tables (or None if no table found)
732
963
  """
@@ -736,10 +967,10 @@ class Page:
736
967
  def extract_tables(self, table_settings={}) -> List[Any]:
737
968
  """
738
969
  Extract tables from this page.
739
-
970
+
740
971
  Args:
741
972
  table_settings: Additional extraction parameters
742
-
973
+
743
974
  Returns:
744
975
  List of extracted tables
745
976
  """
@@ -749,33 +980,33 @@ class Page:
749
980
  def _load_elements(self):
750
981
  """Load all elements from the page via ElementManager."""
751
982
  self._element_mgr.load_elements()
752
-
983
+
753
984
  def _create_char_elements(self):
754
985
  """DEPRECATED: Use self._element_mgr.chars"""
755
986
  logger.warning("_create_char_elements is deprecated. Access via self._element_mgr.chars.")
756
- return self._element_mgr.chars # Delegate
987
+ return self._element_mgr.chars # Delegate
757
988
 
758
989
  def _process_font_information(self, char_dict):
759
- """DEPRECATED: Handled by ElementManager"""
760
- logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
761
- # ElementManager handles this internally
762
- pass
990
+ """DEPRECATED: Handled by ElementManager"""
991
+ logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
992
+ # ElementManager handles this internally
993
+ pass
763
994
 
764
995
  def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
765
996
  """DEPRECATED: Use self._element_mgr.words"""
766
997
  logger.warning("_group_chars_into_words is deprecated. Access via self._element_mgr.words.")
767
- return self._element_mgr.words # Delegate
998
+ return self._element_mgr.words # Delegate
768
999
 
769
1000
  def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
770
1001
  """DEPRECATED: Handled by ElementManager"""
771
1002
  logger.warning("_process_line_into_words is deprecated. Handled by ElementManager.")
772
1003
  pass
773
-
1004
+
774
1005
  def _check_font_attributes_match(self, char, prev_char, font_attrs):
775
1006
  """DEPRECATED: Handled by ElementManager"""
776
1007
  logger.warning("_check_font_attributes_match is deprecated. Handled by ElementManager.")
777
1008
  pass
778
-
1009
+
779
1010
  def _create_word_element(self, chars, font_attrs):
780
1011
  """DEPRECATED: Handled by ElementManager"""
781
1012
  logger.warning("_create_word_element is deprecated. Handled by ElementManager.")
@@ -785,34 +1016,36 @@ class Page:
785
1016
  def chars(self) -> List[Any]:
786
1017
  """Get all character elements on this page."""
787
1018
  return self._element_mgr.chars
788
-
1019
+
789
1020
  @property
790
1021
  def words(self) -> List[Any]:
791
1022
  """Get all word elements on this page."""
792
1023
  return self._element_mgr.words
793
-
1024
+
794
1025
  @property
795
1026
  def rects(self) -> List[Any]:
796
1027
  """Get all rectangle elements on this page."""
797
1028
  return self._element_mgr.rects
798
-
1029
+
799
1030
  @property
800
1031
  def lines(self) -> List[Any]:
801
1032
  """Get all line elements on this page."""
802
1033
  return self._element_mgr.lines
803
-
804
- def highlight(self,
805
- bbox: Optional[Tuple[float, float, float, float]] = None,
806
- color: Optional[Union[Tuple, str]] = None,
807
- label: Optional[str] = None,
808
- use_color_cycling: bool = False,
809
- element: Optional[Any] = None,
810
- include_attrs: Optional[List[str]] = None,
811
- existing: str = 'append') -> 'Page':
1034
+
1035
+ def highlight(
1036
+ self,
1037
+ bbox: Optional[Tuple[float, float, float, float]] = None,
1038
+ color: Optional[Union[Tuple, str]] = None,
1039
+ label: Optional[str] = None,
1040
+ use_color_cycling: bool = False,
1041
+ element: Optional[Any] = None,
1042
+ include_attrs: Optional[List[str]] = None,
1043
+ existing: str = "append",
1044
+ ) -> "Page":
812
1045
  """
813
1046
  Highlight a bounding box or the entire page.
814
1047
  Delegates to the central HighlightingService.
815
-
1048
+
816
1049
  Args:
817
1050
  bbox: Bounding box (x0, top, x1, bottom). If None, highlight entire page.
818
1051
  color: RGBA color tuple/string for the highlight.
@@ -834,23 +1067,24 @@ class Page:
834
1067
  use_color_cycling=use_color_cycling,
835
1068
  element=element,
836
1069
  include_attrs=include_attrs,
837
- existing=existing
1070
+ existing=existing,
838
1071
  )
839
1072
  return self
840
1073
 
841
1074
  def highlight_polygon(
842
- self,
1075
+ self,
843
1076
  polygon: List[Tuple[float, float]],
844
- color: Optional[Union[Tuple, str]] = None,
1077
+ color: Optional[Union[Tuple, str]] = None,
845
1078
  label: Optional[str] = None,
846
1079
  use_color_cycling: bool = False,
847
1080
  element: Optional[Any] = None,
848
1081
  include_attrs: Optional[List[str]] = None,
849
- existing: str = 'append') -> 'Page':
1082
+ existing: str = "append",
1083
+ ) -> "Page":
850
1084
  """
851
1085
  Highlight a polygon shape on the page.
852
1086
  Delegates to the central HighlightingService.
853
-
1087
+
854
1088
  Args:
855
1089
  polygon: List of (x, y) points defining the polygon.
856
1090
  color: RGBA color tuple/string for the highlight.
@@ -871,51 +1105,55 @@ class Page:
871
1105
  use_color_cycling=use_color_cycling,
872
1106
  element=element,
873
1107
  include_attrs=include_attrs,
874
- existing=existing
1108
+ existing=existing,
875
1109
  )
876
1110
  return self
877
-
878
- def show(self,
879
- scale: float = 2.0,
880
- width: Optional[int] = None,
881
- labels: bool = True,
882
- legend_position: str = 'right',
883
- render_ocr: bool = False) -> Optional[Image.Image]:
1111
+
1112
+ def show(
1113
+ self,
1114
+ scale: float = 2.0,
1115
+ width: Optional[int] = None,
1116
+ labels: bool = True,
1117
+ legend_position: str = "right",
1118
+ render_ocr: bool = False,
1119
+ ) -> Optional[Image.Image]:
884
1120
  """
885
1121
  Generates and returns an image of the page with persistent highlights rendered.
886
-
1122
+
887
1123
  Args:
888
1124
  scale: Scale factor for rendering.
889
1125
  width: Optional width for the output image.
890
1126
  labels: Whether to include a legend for labels.
891
1127
  legend_position: Position of the legend.
892
1128
  render_ocr: Whether to render OCR text.
893
-
1129
+
894
1130
  Returns:
895
1131
  PIL Image object of the page with highlights, or None if rendering fails.
896
1132
  """
897
1133
  return self.to_image(
898
1134
  scale=scale,
899
1135
  width=width,
900
- labels=labels,
901
- legend_position=legend_position,
1136
+ labels=labels,
1137
+ legend_position=legend_position,
902
1138
  render_ocr=render_ocr,
903
- include_highlights=True # Ensure highlights are requested
1139
+ include_highlights=True, # Ensure highlights are requested
904
1140
  )
905
-
906
- def save_image(self,
907
- filename: str,
908
- scale: float = 2.0,
909
- width: Optional[int] = None,
910
- labels: bool = True,
911
- legend_position: str = 'right',
912
- render_ocr: bool = False,
913
- include_highlights: bool = True, # Allow saving without highlights
914
- resolution: Optional[float] = None,
915
- **kwargs) -> 'Page':
1141
+
1142
+ def save_image(
1143
+ self,
1144
+ filename: str,
1145
+ scale: float = 2.0,
1146
+ width: Optional[int] = None,
1147
+ labels: bool = True,
1148
+ legend_position: str = "right",
1149
+ render_ocr: bool = False,
1150
+ include_highlights: bool = True, # Allow saving without highlights
1151
+ resolution: Optional[float] = None,
1152
+ **kwargs,
1153
+ ) -> "Page":
916
1154
  """
917
1155
  Save the page image to a file, rendering highlights via HighlightingService.
918
-
1156
+
919
1157
  Args:
920
1158
  filename: Path to save the image to.
921
1159
  scale: Scale factor for rendering highlights.
@@ -926,7 +1164,7 @@ class Page:
926
1164
  include_highlights: Whether to render highlights.
927
1165
  resolution: Resolution for base image rendering.
928
1166
  **kwargs: Additional args for pdfplumber's to_image.
929
-
1167
+
930
1168
  Returns:
931
1169
  Self for method chaining.
932
1170
  """
@@ -935,25 +1173,25 @@ class Page:
935
1173
  path=filename,
936
1174
  scale=scale,
937
1175
  width=width,
938
- labels=labels,
1176
+ labels=labels,
939
1177
  legend_position=legend_position,
940
1178
  render_ocr=render_ocr,
941
1179
  include_highlights=include_highlights,
942
1180
  resolution=resolution,
943
- **kwargs
1181
+ **kwargs,
944
1182
  )
945
1183
  return self
946
-
947
- def clear_highlights(self) -> 'Page':
1184
+
1185
+ def clear_highlights(self) -> "Page":
948
1186
  """
949
1187
  Clear all highlights *from this specific page* via HighlightingService.
950
-
1188
+
951
1189
  Returns:
952
1190
  Self for method chaining
953
1191
  """
954
1192
  self._highlighter.clear_page(self.index)
955
1193
  return self
956
-
1194
+
957
1195
  def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
958
1196
  """
959
1197
  Analyze text elements by style, adding attributes directly to elements.
@@ -982,19 +1220,21 @@ class Page:
982
1220
  # Return the collection of elements which now have style attributes
983
1221
  return processed_elements_collection
984
1222
 
985
- def to_image(self,
986
- path: Optional[str] = None,
987
- scale: float = 2.0,
988
- width: Optional[int] = None,
989
- labels: bool = True,
990
- legend_position: str = 'right',
991
- render_ocr: bool = False,
992
- resolution: Optional[float] = None,
993
- include_highlights: bool = True,
994
- **kwargs) -> Optional[Image.Image]:
1223
+ def to_image(
1224
+ self,
1225
+ path: Optional[str] = None,
1226
+ scale: float = 2.0,
1227
+ width: Optional[int] = None,
1228
+ labels: bool = True,
1229
+ legend_position: str = "right",
1230
+ render_ocr: bool = False,
1231
+ resolution: Optional[float] = None,
1232
+ include_highlights: bool = True,
1233
+ **kwargs,
1234
+ ) -> Optional[Image.Image]:
995
1235
  """
996
1236
  Generate a PIL image of the page, using HighlightingService if needed.
997
-
1237
+
998
1238
  Args:
999
1239
  path: Optional path to save the image to.
1000
1240
  scale: Scale factor for rendering highlights.
@@ -1005,7 +1245,7 @@ class Page:
1005
1245
  resolution: Resolution in DPI for base page image (default: scale * 72).
1006
1246
  include_highlights: Whether to render highlights.
1007
1247
  **kwargs: Additional parameters for pdfplumber.to_image.
1008
-
1248
+
1009
1249
  Returns:
1010
1250
  PIL Image of the page, or None if rendering fails.
1011
1251
  """
@@ -1020,7 +1260,7 @@ class Page:
1020
1260
  legend_position=legend_position,
1021
1261
  render_ocr=render_ocr,
1022
1262
  resolution=resolution,
1023
- **kwargs
1263
+ **kwargs,
1024
1264
  )
1025
1265
  else:
1026
1266
  # Get the base page image directly from pdfplumber if no highlights needed
@@ -1028,26 +1268,36 @@ class Page:
1028
1268
  # Use the underlying pdfplumber page object
1029
1269
  img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1030
1270
  # Access the PIL image directly (assuming pdfplumber structure)
1031
- image = img_object.annotated if hasattr(img_object, 'annotated') else img_object._repr_png_()
1032
- if isinstance(image, bytes): # Handle cases where it returns bytes
1033
- from io import BytesIO
1034
- image = Image.open(BytesIO(image)).convert('RGB') # Convert to RGB for consistency
1035
-
1271
+ image = (
1272
+ img_object.annotated
1273
+ if hasattr(img_object, "annotated")
1274
+ else img_object._repr_png_()
1275
+ )
1276
+ if isinstance(image, bytes): # Handle cases where it returns bytes
1277
+ from io import BytesIO
1278
+
1279
+ image = Image.open(BytesIO(image)).convert(
1280
+ "RGB"
1281
+ ) # Convert to RGB for consistency
1282
+
1036
1283
  except Exception as e:
1037
1284
  logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1038
- return None # Return None on error
1285
+ return None # Return None on error
1039
1286
 
1040
- if image is None: return None
1287
+ if image is None:
1288
+ return None
1041
1289
 
1042
1290
  # Resize the final image if width is provided
1043
1291
  if width is not None and width > 0 and image.width > 0:
1044
1292
  aspect_ratio = image.height / image.width
1045
1293
  height = int(width * aspect_ratio)
1046
1294
  try:
1047
- image = image.resize((width, height), Image.Resampling.LANCZOS) # Use modern resampling
1295
+ image = image.resize(
1296
+ (width, height), Image.Resampling.LANCZOS
1297
+ ) # Use modern resampling
1048
1298
  except Exception as resize_error:
1049
- logger.warning(f"Could not resize image: {resize_error}")
1050
-
1299
+ logger.warning(f"Could not resize image: {resize_error}")
1300
+
1051
1301
  # Save the image if path is provided
1052
1302
  if path:
1053
1303
  try:
@@ -1056,15 +1306,21 @@ class Page:
1056
1306
  image.save(path)
1057
1307
  logger.debug(f"Saved page image to: {path}")
1058
1308
  except Exception as save_error:
1059
- logger.error(f"Failed to save image to {path}: {save_error}")
1060
-
1309
+ logger.error(f"Failed to save image to {path}: {save_error}")
1310
+
1061
1311
  return image
1062
-
1063
- def _create_text_elements_from_ocr(self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None) -> List[TextElement]:
1312
+
1313
+ def _create_text_elements_from_ocr(
1314
+ self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
1315
+ ) -> List[TextElement]:
1064
1316
  """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
1065
- logger.warning("_create_text_elements_from_ocr is deprecated. Use self._element_mgr version.")
1066
- return self._element_mgr.create_text_elements_from_ocr(ocr_results, image_width, image_height)
1067
-
1317
+ logger.warning(
1318
+ "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
1319
+ )
1320
+ return self._element_mgr.create_text_elements_from_ocr(
1321
+ ocr_results, image_width, image_height
1322
+ )
1323
+
1068
1324
  def apply_ocr(
1069
1325
  self,
1070
1326
  engine: Optional[str] = None,
@@ -1072,35 +1328,40 @@ class Page:
1072
1328
  languages: Optional[List[str]] = None,
1073
1329
  min_confidence: Optional[float] = None,
1074
1330
  device: Optional[str] = None,
1075
- ) -> List[TextElement]:
1331
+ ) -> "Page":
1076
1332
  """
1077
- Apply OCR to THIS page and add results to page elements via PDF.apply_ocr_to_pages.
1078
-
1333
+ Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
1334
+
1079
1335
  Returns:
1080
1336
  List of created TextElements derived from OCR results for this page.
1081
1337
  """
1082
- if not hasattr(self._parent, 'apply_ocr_to_pages'):
1083
- logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr_to_pages'. Cannot apply OCR.")
1084
- return []
1338
+ if not hasattr(self._parent, "apply_ocr"):
1339
+ logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1340
+ return []
1085
1341
 
1086
- logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr_to_pages.")
1342
+ logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1087
1343
  try:
1088
1344
  # Delegate to parent PDF, targeting only this page's index
1089
- self._parent.apply_ocr_to_pages(
1345
+ self._parent.apply_ocr(
1090
1346
  pages=[self.index],
1091
- engine=engine, options=options, languages=languages,
1092
- min_confidence=min_confidence, device=device
1347
+ engine=engine,
1348
+ options=options,
1349
+ languages=languages,
1350
+ min_confidence=min_confidence,
1351
+ device=device,
1093
1352
  )
1094
1353
  except Exception as e:
1095
- logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1096
- return []
1354
+ logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1355
+ return []
1097
1356
 
1098
1357
  # Return the OCR elements specifically added to this page
1099
1358
  # Use element manager to retrieve them
1100
- ocr_elements = [el for el in self.words if getattr(el, 'source', None) == 'ocr']
1101
- logger.debug(f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements.")
1102
- return ocr_elements
1103
-
1359
+ ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1360
+ logger.debug(
1361
+ f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1362
+ )
1363
+ return self
1364
+
1104
1365
  def extract_ocr_elements(
1105
1366
  self,
1106
1367
  engine: Optional[str] = None,
@@ -1114,42 +1375,55 @@ class Page:
1114
1375
  Uses the shared OCRManager instance.
1115
1376
  """
1116
1377
  if not self._ocr_manager:
1117
- logger.error(f"Page {self.number}: OCRManager not available. Cannot extract OCR elements.")
1118
- return []
1119
-
1378
+ logger.error(
1379
+ f"Page {self.number}: OCRManager not available. Cannot extract OCR elements."
1380
+ )
1381
+ return []
1382
+
1120
1383
  logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
1121
1384
  try:
1122
- ocr_scale = getattr(self._parent, '_config', {}).get('ocr_image_scale', 2.0)
1385
+ ocr_scale = getattr(self._parent, "_config", {}).get("ocr_image_scale", 2.0)
1123
1386
  # Get base image without highlights
1124
1387
  image = self.to_image(scale=ocr_scale, include_highlights=False)
1125
1388
  if not image:
1126
- logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1127
- return []
1389
+ logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1390
+ return []
1128
1391
  logger.debug(f" Rendered image size: {image.width}x{image.height}")
1129
1392
  except Exception as e:
1130
1393
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1131
1394
  return []
1132
-
1133
- manager_args = {'images': image, 'options': options, 'engine': engine}
1134
- if languages is not None: manager_args['languages'] = languages
1135
- if min_confidence is not None: manager_args['min_confidence'] = min_confidence
1136
- if device is not None: manager_args['device'] = device
1137
-
1138
- logger.debug(f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }")
1395
+
1396
+ manager_args = {"images": image, "options": options, "engine": engine}
1397
+ if languages is not None:
1398
+ manager_args["languages"] = languages
1399
+ if min_confidence is not None:
1400
+ manager_args["min_confidence"] = min_confidence
1401
+ if device is not None:
1402
+ manager_args["device"] = device
1403
+
1404
+ logger.debug(
1405
+ f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
1406
+ )
1139
1407
  try:
1140
1408
  # apply_ocr now returns List[List[Dict]] or List[Dict]
1141
1409
  results_list = self._ocr_manager.apply_ocr(**manager_args)
1142
1410
  # If it returned a list of lists (batch mode), take the first list
1143
- results = results_list[0] if isinstance(results_list, list) and results_list and isinstance(results_list[0], list) else results_list
1144
-
1411
+ results = (
1412
+ results_list[0]
1413
+ if isinstance(results_list, list)
1414
+ and results_list
1415
+ and isinstance(results_list[0], list)
1416
+ else results_list
1417
+ )
1418
+
1145
1419
  if not isinstance(results, list):
1146
- logger.error(f" OCR Manager returned unexpected type: {type(results)}")
1147
- results = []
1420
+ logger.error(f" OCR Manager returned unexpected type: {type(results)}")
1421
+ results = []
1148
1422
  logger.info(f" OCR Manager returned {len(results)} results for extraction.")
1149
1423
  except Exception as e:
1150
- logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
1151
- return []
1152
-
1424
+ logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
1425
+ return []
1426
+
1153
1427
  # Convert results but DO NOT add to ElementManager
1154
1428
  logger.debug(f" Converting OCR results to TextElements (extract only)...")
1155
1429
  # Use a temporary method to create elements without adding them globally
@@ -1157,29 +1431,36 @@ class Page:
1157
1431
  scale_x = self.width / image.width if image.width else 1
1158
1432
  scale_y = self.height / image.height if image.height else 1
1159
1433
  for result in results:
1160
- x0, top, x1, bottom = [float(c) for c in result['bbox']]
1434
+ x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1161
1435
  elem_data = {
1162
- 'text': result['text'], 'confidence': result['confidence'],
1163
- 'x0': x0 * scale_x, 'top': top * scale_y,
1164
- 'x1': x1 * scale_x, 'bottom': bottom * scale_y,
1165
- 'width': (x1 - x0) * scale_x, 'height': (bottom - top) * scale_y,
1166
- 'object_type': 'text', 'source': 'ocr',
1167
- 'fontname': 'OCR-temp', 'size': 10.0, 'page_number': self.number
1436
+ "text": result["text"],
1437
+ "confidence": result["confidence"],
1438
+ "x0": x0 * scale_x,
1439
+ "top": top * scale_y,
1440
+ "x1": x1 * scale_x,
1441
+ "bottom": bottom * scale_y,
1442
+ "width": (x1 - x0) * scale_x,
1443
+ "height": (bottom - top) * scale_y,
1444
+ "object_type": "text",
1445
+ "source": "ocr",
1446
+ "fontname": "OCR-temp",
1447
+ "size": 10.0,
1448
+ "page_number": self.number,
1168
1449
  }
1169
1450
  temp_elements.append(TextElement(elem_data, self))
1170
1451
 
1171
1452
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1172
1453
  return temp_elements
1173
-
1454
+
1174
1455
  @property
1175
1456
  def layout_analyzer(self) -> LayoutAnalyzer:
1176
1457
  """Get or create the layout analyzer for this page."""
1177
- if self._layout_analyzer is None:
1178
- if not self._layout_manager:
1179
- logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
1180
- return None
1181
- self._layout_analyzer = LayoutAnalyzer(self)
1182
- return self._layout_analyzer
1458
+ if self._layout_analyzer is None:
1459
+ if not self._layout_manager:
1460
+ logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
1461
+ return None
1462
+ self._layout_analyzer = LayoutAnalyzer(self)
1463
+ return self._layout_analyzer
1183
1464
 
1184
1465
  def analyze_layout(
1185
1466
  self,
@@ -1189,7 +1470,7 @@ class Page:
1189
1470
  classes: Optional[List[str]] = None,
1190
1471
  exclude_classes: Optional[List[str]] = None,
1191
1472
  device: Optional[str] = None,
1192
- existing: str = "replace"
1473
+ existing: str = "replace",
1193
1474
  ) -> ElementCollection[Region]:
1194
1475
  """
1195
1476
  Analyze the page layout using the configured LayoutManager.
@@ -1200,8 +1481,10 @@ class Page:
1200
1481
  """
1201
1482
  analyzer = self.layout_analyzer
1202
1483
  if not analyzer:
1203
- logger.error("Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?).")
1204
- return ElementCollection([]) # Return empty collection
1484
+ logger.error(
1485
+ "Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?)."
1486
+ )
1487
+ return ElementCollection([]) # Return empty collection
1205
1488
 
1206
1489
  # The analyzer's analyze_layout method already adds regions to the page
1207
1490
  # and its element manager. We just need to retrieve them.
@@ -1212,17 +1495,20 @@ class Page:
1212
1495
  classes=classes,
1213
1496
  exclude_classes=exclude_classes,
1214
1497
  device=device,
1215
- existing=existing
1498
+ existing=existing,
1216
1499
  )
1217
1500
 
1218
1501
  # Retrieve the detected regions from the element manager
1219
1502
  # Filter regions based on source='detected' and potentially the model used if available
1220
- detected_regions = [r for r in self._element_mgr.regions
1221
- if r.source == 'detected' and (not engine or getattr(r, 'model', None) == engine)]
1503
+ detected_regions = [
1504
+ r
1505
+ for r in self._element_mgr.regions
1506
+ if r.source == "detected" and (not engine or getattr(r, "model", None) == engine)
1507
+ ]
1222
1508
 
1223
1509
  return ElementCollection(detected_regions)
1224
1510
 
1225
- def clear_detected_layout_regions(self) -> 'Page':
1511
+ def clear_detected_layout_regions(self) -> "Page":
1226
1512
  """
1227
1513
  Removes all regions from this page that were added by layout analysis
1228
1514
  (i.e., regions where `source` attribute is 'detected').
@@ -1233,47 +1519,61 @@ class Page:
1233
1519
  Returns:
1234
1520
  Self for method chaining.
1235
1521
  """
1236
- if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
1237
- logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
1238
- self._regions['detected'] = [] # Ensure page's list is also clear
1239
- return self
1522
+ if (
1523
+ not hasattr(self._element_mgr, "regions")
1524
+ or not hasattr(self._element_mgr, "_elements")
1525
+ or "regions" not in self._element_mgr._elements
1526
+ ):
1527
+ logger.debug(
1528
+ f"Page {self.index}: No regions found in ElementManager, nothing to clear."
1529
+ )
1530
+ self._regions["detected"] = [] # Ensure page's list is also clear
1531
+ return self
1240
1532
 
1241
1533
  # Filter ElementManager's list to keep only non-detected regions
1242
1534
  original_count = len(self._element_mgr.regions)
1243
- self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
1535
+ self._element_mgr._elements["regions"] = [
1536
+ r for r in self._element_mgr.regions if getattr(r, "source", None) != "detected"
1537
+ ]
1244
1538
  new_count = len(self._element_mgr.regions)
1245
1539
  removed_count = original_count - new_count
1246
1540
 
1247
1541
  # Clear the page's specific list of detected regions
1248
- self._regions['detected'] = []
1542
+ self._regions["detected"] = []
1249
1543
 
1250
1544
  logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
1251
1545
  return self
1252
1546
 
1253
- def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
1547
+ def get_section_between(
1548
+ self, start_element=None, end_element=None, boundary_inclusion="both"
1549
+ ) -> Optional[Region]: # Return Optional
1254
1550
  """
1255
1551
  Get a section between two elements on this page.
1256
1552
  """
1257
1553
  # Create a full-page region to operate within
1258
1554
  page_region = self.create_region(0, 0, self.width, self.height)
1259
-
1555
+
1260
1556
  # Delegate to the region's method
1261
1557
  try:
1262
1558
  return page_region.get_section_between(
1263
1559
  start_element=start_element,
1264
1560
  end_element=end_element,
1265
- boundary_inclusion=boundary_inclusion
1561
+ boundary_inclusion=boundary_inclusion,
1266
1562
  )
1267
1563
  except Exception as e:
1268
- logger.error(f"Error getting section between elements on page {self.index}: {e}", exc_info=True)
1269
- return None
1270
-
1271
- def get_sections(self,
1272
- start_elements=None,
1273
- end_elements=None,
1274
- boundary_inclusion='both',
1275
- y_threshold=5.0,
1276
- bounding_box=None) -> 'ElementCollection[Region]': # Updated type hint
1564
+ logger.error(
1565
+ f"Error getting section between elements on page {self.index}: {e}", exc_info=True
1566
+ )
1567
+ return None
1568
+
1569
+ def get_sections(
1570
+ self,
1571
+ start_elements=None,
1572
+ end_elements=None,
1573
+ boundary_inclusion="both",
1574
+ y_threshold=5.0,
1575
+ bounding_box=None,
1576
+ ) -> "ElementCollection[Region]": # Updated type hint
1277
1577
  """
1278
1578
  Get sections of a page defined by start/end elements.
1279
1579
  Uses the page-level implementation.
@@ -1281,6 +1581,7 @@ class Page:
1281
1581
  Returns:
1282
1582
  An ElementCollection containing the found Region objects.
1283
1583
  """
1584
+
1284
1585
  # Helper function to get bounds from bounding_box parameter
1285
1586
  def get_bounds():
1286
1587
  if bounding_box:
@@ -1289,130 +1590,180 @@ class Page:
1289
1590
  return max(0, x0), max(0, top), min(self.width, x1), min(self.height, bottom)
1290
1591
  else:
1291
1592
  return 0, 0, self.width, self.height
1292
-
1593
+
1293
1594
  regions = []
1294
-
1595
+
1295
1596
  # Handle cases where elements are provided as strings (selectors)
1296
1597
  if isinstance(start_elements, str):
1297
- start_elements = self.find_all(start_elements).elements # Get list of elements
1298
- elif hasattr(start_elements, 'elements'): # Handle ElementCollection input
1299
- start_elements = start_elements.elements
1300
-
1598
+ start_elements = self.find_all(start_elements).elements # Get list of elements
1599
+ elif hasattr(start_elements, "elements"): # Handle ElementCollection input
1600
+ start_elements = start_elements.elements
1601
+
1301
1602
  if isinstance(end_elements, str):
1302
1603
  end_elements = self.find_all(end_elements).elements
1303
- elif hasattr(end_elements, 'elements'):
1304
- end_elements = end_elements.elements
1604
+ elif hasattr(end_elements, "elements"):
1605
+ end_elements = end_elements.elements
1305
1606
 
1306
1607
  # Ensure start_elements is a list
1307
- if start_elements is None: start_elements = []
1308
- if end_elements is None: end_elements = []
1608
+ if start_elements is None:
1609
+ start_elements = []
1610
+ if end_elements is None:
1611
+ end_elements = []
1309
1612
 
1310
- valid_inclusions = ['start', 'end', 'both', 'none']
1613
+ valid_inclusions = ["start", "end", "both", "none"]
1311
1614
  if boundary_inclusion not in valid_inclusions:
1312
1615
  raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")
1313
-
1616
+
1314
1617
  if not start_elements:
1315
1618
  # Return an empty ElementCollection if no start elements
1316
1619
  return ElementCollection([])
1317
-
1620
+
1318
1621
  # Combine start and end elements with their type
1319
1622
  all_boundaries = []
1320
- for el in start_elements: all_boundaries.append((el, 'start'))
1321
- for el in end_elements: all_boundaries.append((el, 'end'))
1322
-
1623
+ for el in start_elements:
1624
+ all_boundaries.append((el, "start"))
1625
+ for el in end_elements:
1626
+ all_boundaries.append((el, "end"))
1627
+
1323
1628
  # Sort all boundary elements primarily by top, then x0
1324
1629
  try:
1325
- all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
1630
+ all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
1326
1631
  except AttributeError as e:
1327
- logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
1328
- return ElementCollection([]) # Cannot proceed if elements lack position
1632
+ logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
1633
+ return ElementCollection([]) # Cannot proceed if elements lack position
1329
1634
 
1330
1635
  # Process sorted boundaries to find sections
1331
1636
  current_start_element = None
1332
1637
  active_section_started = False
1333
1638
 
1334
1639
  for element, element_type in all_boundaries:
1335
- if element_type == 'start':
1640
+ if element_type == "start":
1336
1641
  # If we have an active section, this start implicitly ends it
1337
1642
  if active_section_started:
1338
- end_boundary_el = element # Use this start as the end boundary
1643
+ end_boundary_el = element # Use this start as the end boundary
1339
1644
  # Determine region boundaries
1340
- sec_top = current_start_element.top if boundary_inclusion in ['start', 'both'] else current_start_element.bottom
1341
- sec_bottom = end_boundary_el.top if boundary_inclusion not in ['end', 'both'] else end_boundary_el.bottom
1342
-
1343
- if sec_top < sec_bottom: # Ensure valid region
1645
+ sec_top = (
1646
+ current_start_element.top
1647
+ if boundary_inclusion in ["start", "both"]
1648
+ else current_start_element.bottom
1649
+ )
1650
+ sec_bottom = (
1651
+ end_boundary_el.top
1652
+ if boundary_inclusion not in ["end", "both"]
1653
+ else end_boundary_el.bottom
1654
+ )
1655
+
1656
+ if sec_top < sec_bottom: # Ensure valid region
1344
1657
  x0, _, x1, _ = get_bounds()
1345
1658
  region = self.create_region(x0, sec_top, x1, sec_bottom)
1346
1659
  region.start_element = current_start_element
1347
- region.end_element = end_boundary_el # Mark the element that ended it
1348
- region.is_end_next_start = True # Mark how it ended
1660
+ region.end_element = end_boundary_el # Mark the element that ended it
1661
+ region.is_end_next_start = True # Mark how it ended
1349
1662
  regions.append(region)
1350
- active_section_started = False # Reset for the new start
1351
-
1663
+ active_section_started = False # Reset for the new start
1664
+
1352
1665
  # Set this as the potential start of the next section
1353
1666
  current_start_element = element
1354
1667
  active_section_started = True
1355
1668
 
1356
- elif element_type == 'end' and active_section_started:
1669
+ elif element_type == "end" and active_section_started:
1357
1670
  # We found an explicit end for the current section
1358
1671
  end_boundary_el = element
1359
- sec_top = current_start_element.top if boundary_inclusion in ['start', 'both'] else current_start_element.bottom
1360
- sec_bottom = end_boundary_el.bottom if boundary_inclusion in ['end', 'both'] else end_boundary_el.top
1361
-
1362
- if sec_top < sec_bottom: # Ensure valid region
1672
+ sec_top = (
1673
+ current_start_element.top
1674
+ if boundary_inclusion in ["start", "both"]
1675
+ else current_start_element.bottom
1676
+ )
1677
+ sec_bottom = (
1678
+ end_boundary_el.bottom
1679
+ if boundary_inclusion in ["end", "both"]
1680
+ else end_boundary_el.top
1681
+ )
1682
+
1683
+ if sec_top < sec_bottom: # Ensure valid region
1363
1684
  x0, _, x1, _ = get_bounds()
1364
1685
  region = self.create_region(x0, sec_top, x1, sec_bottom)
1365
1686
  region.start_element = current_start_element
1366
1687
  region.end_element = end_boundary_el
1367
1688
  region.is_end_next_start = False
1368
1689
  regions.append(region)
1369
-
1690
+
1370
1691
  # Reset: section ended explicitly
1371
1692
  current_start_element = None
1372
1693
  active_section_started = False
1373
-
1694
+
1374
1695
  # Handle the last section if it was started but never explicitly ended
1375
1696
  if active_section_started:
1376
- sec_top = current_start_element.top if boundary_inclusion in ['start', 'both'] else current_start_element.bottom
1697
+ sec_top = (
1698
+ current_start_element.top
1699
+ if boundary_inclusion in ["start", "both"]
1700
+ else current_start_element.bottom
1701
+ )
1377
1702
  x0, _, x1, page_bottom = get_bounds()
1378
1703
  if sec_top < page_bottom:
1379
- region = self.create_region(x0, sec_top, x1, page_bottom)
1380
- region.start_element = current_start_element
1381
- region.end_element = None # Ended by page end
1382
- region.is_end_next_start = False
1383
- regions.append(region)
1384
-
1704
+ region = self.create_region(x0, sec_top, x1, page_bottom)
1705
+ region.start_element = current_start_element
1706
+ region.end_element = None # Ended by page end
1707
+ region.is_end_next_start = False
1708
+ regions.append(region)
1709
+
1385
1710
  # Return the list wrapped in an ElementCollection
1386
1711
  return ElementCollection(regions)
1387
-
1712
+
1388
1713
  def __repr__(self) -> str:
1389
1714
  """String representation of the page."""
1390
1715
  return f"<Page number={self.number} index={self.index}>"
1391
-
1392
- def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1716
+
1717
+ def ask(
1718
+ self,
1719
+ question: str,
1720
+ min_confidence: float = 0.1,
1721
+ model: str = None,
1722
+ debug: bool = False,
1723
+ **kwargs,
1724
+ ) -> Dict[str, Any]:
1393
1725
  """
1394
1726
  Ask a question about the page content using document QA.
1395
1727
  """
1396
1728
  try:
1397
- from natural_pdf.qa.document_qa import get_qa_engine
1398
- # Get or initialize QA engine with specified model
1399
- qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1400
- # Ask the question using the QA engine
1401
- return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
1729
+ from natural_pdf.qa.document_qa import get_qa_engine
1730
+
1731
+ # Get or initialize QA engine with specified model
1732
+ qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1733
+ # Ask the question using the QA engine
1734
+ return qa_engine.ask_pdf_page(
1735
+ self, question, min_confidence=min_confidence, debug=debug, **kwargs
1736
+ )
1402
1737
  except ImportError:
1403
- logger.error("Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies.")
1404
- return {"answer": None, "confidence": 0.0, "found": False, "page_num": self.number, "source_elements": []}
1738
+ logger.error(
1739
+ "Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies."
1740
+ )
1741
+ return {
1742
+ "answer": None,
1743
+ "confidence": 0.0,
1744
+ "found": False,
1745
+ "page_num": self.number,
1746
+ "source_elements": [],
1747
+ }
1405
1748
  except Exception as e:
1406
- logger.error(f"Error during page.ask: {e}", exc_info=True)
1407
- return {"answer": None, "confidence": 0.0, "found": False, "page_num": self.number, "source_elements": []}
1749
+ logger.error(f"Error during page.ask: {e}", exc_info=True)
1750
+ return {
1751
+ "answer": None,
1752
+ "confidence": 0.0,
1753
+ "found": False,
1754
+ "page_num": self.number,
1755
+ "source_elements": [],
1756
+ }
1408
1757
 
1409
- def show_preview(self,
1410
- temporary_highlights: List[Dict],
1411
- scale: float = 2.0,
1412
- width: Optional[int] = None,
1413
- labels: bool = True,
1414
- legend_position: str = 'right',
1415
- render_ocr: bool = False) -> Optional[Image.Image]:
1758
+ def show_preview(
1759
+ self,
1760
+ temporary_highlights: List[Dict],
1761
+ scale: float = 2.0,
1762
+ width: Optional[int] = None,
1763
+ labels: bool = True,
1764
+ legend_position: str = "right",
1765
+ render_ocr: bool = False,
1766
+ ) -> Optional[Image.Image]:
1416
1767
  """
1417
1768
  Generates and returns a non-stateful preview image containing only
1418
1769
  the provided temporary highlights.
@@ -1437,13 +1788,16 @@ class Page:
1437
1788
  scale=scale,
1438
1789
  labels=labels,
1439
1790
  legend_position=legend_position,
1440
- render_ocr=render_ocr
1791
+ render_ocr=render_ocr,
1441
1792
  )
1442
1793
  except AttributeError:
1443
1794
  logger.error(f"HighlightingService does not have the required 'render_preview' method.")
1444
1795
  return None
1445
1796
  except Exception as e:
1446
- logger.error(f"Error calling highlighter.render_preview for page {self.index}: {e}", exc_info=True)
1797
+ logger.error(
1798
+ f"Error calling highlighter.render_preview for page {self.index}: {e}",
1799
+ exc_info=True,
1800
+ )
1447
1801
  return None
1448
1802
 
1449
1803
  # Return the rendered image directly
@@ -1451,7 +1805,7 @@ class Page:
1451
1805
 
1452
1806
  @property
1453
1807
  def text_style_labels(self) -> List[str]:
1454
- """
1808
+ """
1455
1809
  Get a sorted list of unique text style labels found on the page.
1456
1810
 
1457
1811
  Runs text style analysis with default options if it hasn't been run yet.
@@ -1461,52 +1815,66 @@ class Page:
1461
1815
  A sorted list of unique style label strings.
1462
1816
  """
1463
1817
  # Check if the summary attribute exists from a previous run
1464
- if not hasattr(self, '_text_styles_summary') or not self._text_styles_summary:
1818
+ if not hasattr(self, "_text_styles_summary") or not self._text_styles_summary:
1465
1819
  # If not, run the analysis with default options
1466
1820
  logger.debug(f"Page {self.number}: Running default text style analysis to get labels.")
1467
- self.analyze_text_styles() # Use default options
1821
+ self.analyze_text_styles() # Use default options
1468
1822
 
1469
1823
  # Extract labels from the summary dictionary
1470
- if hasattr(self, '_text_styles_summary') and self._text_styles_summary:
1824
+ if hasattr(self, "_text_styles_summary") and self._text_styles_summary:
1471
1825
  # The summary maps style_key -> {'label': ..., 'properties': ...}
1472
- labels = {style_info['label'] for style_info in self._text_styles_summary.values()}
1826
+ labels = {style_info["label"] for style_info in self._text_styles_summary.values()}
1473
1827
  return sorted(list(labels))
1474
1828
  else:
1475
1829
  # Fallback if summary wasn't created for some reason (e.g., no text elements)
1476
- logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
1477
- return []
1830
+ logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
1831
+ return []
1478
1832
 
1479
- def viewer(self,
1480
- # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
1481
- # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
1482
- ) -> 'SimpleInteractiveViewerWidget': # Return type hint updated
1833
+ def viewer(
1834
+ self,
1835
+ # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
1836
+ # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
1837
+ ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
1483
1838
  """
1484
1839
  Creates and returns an interactive ipywidget for exploring elements on this page.
1485
1840
 
1486
1841
  Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.
1487
1842
 
1488
1843
  Returns:
1489
- A SimpleInteractiveViewerWidget instance ready for display in Jupyter.
1844
+ A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
1845
+ or None if ipywidgets is not installed or widget creation fails.
1490
1846
 
1491
1847
  Raises:
1492
- RuntimeError: If required dependencies (ipywidgets) are missing.
1848
+ # Optional: Could raise ImportError instead of returning None
1849
+ # ImportError: If required dependencies (ipywidgets) are missing.
1493
1850
  ValueError: If image rendering or data preparation fails within from_page.
1494
1851
  """
1495
- # Dynamically import here if needed, or ensure it's globally available
1852
+ # Check for availability using the imported flag and class variable
1853
+ if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
1854
+ logger.error(
1855
+ "Interactive viewer requires optional dependencies ('ipywidgets'). "
1856
+ "Install with `pip install natural-pdf[interactive]`"
1857
+ )
1858
+ # raise ImportError("ipywidgets not found.") # Option 1: Raise error
1859
+ return None # Option 2: Return None gracefully
1860
+
1861
+ # If we reach here, SimpleInteractiveViewerWidget should be the actual class
1496
1862
  try:
1497
- from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
1498
- except ImportError:
1499
- logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
1500
- raise
1501
-
1502
- # Pass self (the Page object) to the factory method
1503
- return SimpleInteractiveViewerWidget.from_page(self)
1863
+ # Pass self (the Page object) to the factory method
1864
+ return SimpleInteractiveViewerWidget.from_page(self)
1865
+ except Exception as e:
1866
+ # Catch potential errors during widget creation (e.g., image rendering)
1867
+ logger.error(
1868
+ f"Error creating viewer widget from page {self.number}: {e}", exc_info=True
1869
+ )
1870
+ # raise # Option 1: Re-raise error (might include ValueError from from_page)
1871
+ return None # Option 2: Return None on creation error
1504
1872
 
1505
1873
  # --- Indexable Protocol Methods ---
1506
1874
  def get_id(self) -> str:
1507
1875
  """Returns a unique identifier for the page (required by Indexable protocol)."""
1508
1876
  # Ensure path is safe for use in IDs (replace problematic chars)
1509
- safe_path = re.sub(r'[^a-zA-Z0-9_-]', '_', str(self.pdf.path))
1877
+ safe_path = re.sub(r"[^a-zA-Z0-9_-]", "_", str(self.pdf.path))
1510
1878
  return f"pdf_{safe_path}_page_{self.page_number}"
1511
1879
 
1512
1880
  def get_metadata(self) -> Dict[str, Any]:
@@ -1517,21 +1885,47 @@ class Page:
1517
1885
  "page_number": self.page_number,
1518
1886
  "width": self.width,
1519
1887
  "height": self.height,
1520
- "content_hash": self.get_content_hash() # Include the hash
1888
+ "content_hash": self.get_content_hash(), # Include the hash
1521
1889
  }
1522
1890
  return metadata
1523
1891
 
1524
- def get_content(self) -> 'Page':
1892
+ def get_content(self) -> "Page":
1525
1893
  """
1526
1894
  Returns the primary content object (self) for indexing (required by Indexable protocol).
1527
1895
  SearchService implementations decide how to process this (e.g., call extract_text).
1528
1896
  """
1529
- return self # Return the Page object itself
1897
+ return self # Return the Page object itself
1530
1898
 
1531
1899
  def get_content_hash(self) -> str:
1532
1900
  """Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
1533
1901
  # Hash the extracted text (without exclusions for consistency)
1534
1902
  # Consider if exclusions should be part of the hash? For now, hash raw text.
1535
1903
  # Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
1536
- text_content = self.extract_text(use_exclusions=False, preserve_whitespace=False) # Normalize whitespace?
1537
- return hashlib.sha256(text_content.encode('utf-8')).hexdigest()
1904
+ text_content = self.extract_text(
1905
+ use_exclusions=False, preserve_whitespace=False
1906
+ ) # Normalize whitespace?
1907
+ return hashlib.sha256(text_content.encode("utf-8")).hexdigest()
1908
+
1909
+ # --- New Method: save_searchable ---
1910
+ def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1911
+ """
1912
+ Saves the PDF page with an OCR text layer, making content searchable.
1913
+
1914
+ Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
1915
+
1916
+ Note: OCR must have been applied to the pages beforehand
1917
+ (e.g., using pdf.apply_ocr()).
1918
+
1919
+ Args:
1920
+ output_path: Path to save the searchable PDF.
1921
+ dpi: Resolution for rendering and OCR overlay (default 300).
1922
+ **kwargs: Additional keyword arguments passed to the exporter.
1923
+ """
1924
+ # Import moved here, assuming it's always available now
1925
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
1926
+
1927
+ # Convert pathlib.Path to string if necessary
1928
+ output_path_str = str(output_path)
1929
+
1930
+ create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
1931
+ logger.info(f"Searchable PDF saved to: {output_path_str}")