natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py CHANGED
@@ -1,51 +1,66 @@
1
- import pdfplumber
2
- import os
3
- import logging
4
- import tempfile
5
- from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
6
- from PIL import Image
7
1
  import base64
2
+ import hashlib
8
3
  import io
9
4
  import json
5
+ import logging
6
+ import os
10
7
  import re
11
- import hashlib
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
11
+
12
+ import pdfplumber
13
+ from PIL import Image, ImageDraw
12
14
 
13
15
  from natural_pdf.elements.collections import ElementCollection
14
16
  from natural_pdf.elements.region import Region
15
17
 
16
18
  if TYPE_CHECKING:
17
19
  import pdfplumber
18
- from natural_pdf.core.pdf import PDF
19
- from natural_pdf.elements.collections import ElementCollection
20
+
20
21
  from natural_pdf.core.highlighting_service import HighlightingService
22
+ from natural_pdf.core.pdf import PDF
21
23
  from natural_pdf.elements.base import Element
24
+ from natural_pdf.elements.collections import ElementCollection
22
25
 
23
- from natural_pdf.elements.text import TextElement
26
+ # New Imports
27
+ import itertools
28
+
29
+ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
30
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
31
+
32
+ from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
24
33
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
25
34
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
26
- from natural_pdf.ocr import OCROptions
27
- from natural_pdf.ocr import OCRManager
28
- from natural_pdf.core.element_manager import ElementManager
29
- from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
30
- from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
31
35
  from natural_pdf.analyzers.text_options import TextStyleOptions
36
+ from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
37
+ from natural_pdf.core.element_manager import ElementManager
38
+ from natural_pdf.elements.text import TextElement
39
+ from natural_pdf.ocr import OCRManager, OCROptions
40
+
41
+ # Import new utils
42
+ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
32
43
  from natural_pdf.widgets import InteractiveViewerWidget
33
- from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
44
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
45
+
46
+ from natural_pdf.qa import DocumentQA, get_qa_engine
47
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
34
48
 
35
49
  logger = logging.getLogger(__name__)
36
50
 
51
+
37
52
  class Page:
38
53
  """
39
54
  Enhanced Page wrapper built on top of pdfplumber.Page.
40
-
55
+
41
56
  This class provides a fluent interface for working with PDF pages,
42
57
  with improved selection, navigation, extraction, and question-answering capabilities.
43
58
  """
44
-
45
- def __init__(self, page: 'pdfplumber.page.Page', parent: 'PDF', index: int, font_attrs=None):
59
+
60
+ def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
46
61
  """
47
62
  Initialize a page wrapper.
48
-
63
+
49
64
  Args:
50
65
  page: pdfplumber page object
51
66
  parent: Parent PDF object
@@ -57,39 +72,51 @@ class Page:
57
72
  self._index = index
58
73
  self._text_styles = None # Lazy-loaded text style analyzer results
59
74
  self._exclusions = [] # List to store exclusion functions/regions
60
-
75
+
61
76
  # Region management
62
77
  self._regions = {
63
- 'detected': [], # Layout detection results
64
- 'named': {}, # Named regions (name -> region)
78
+ "detected": [], # Layout detection results
79
+ "named": {}, # Named regions (name -> region)
65
80
  }
66
-
81
+
67
82
  # Initialize ElementManager
68
83
  self._element_mgr = ElementManager(self, font_attrs)
69
84
 
70
85
  # --- Get OCR Manager Instance ---
71
- if OCRManager and hasattr(parent, '_ocr_manager') and isinstance(parent._ocr_manager, OCRManager):
86
+ if (
87
+ OCRManager
88
+ and hasattr(parent, "_ocr_manager")
89
+ and isinstance(parent._ocr_manager, OCRManager)
90
+ ):
72
91
  self._ocr_manager = parent._ocr_manager
73
92
  logger.debug(f"Page {self.number}: Using OCRManager instance from parent PDF.")
74
93
  else:
75
94
  self._ocr_manager = None
76
95
  if OCRManager:
77
- logger.warning(f"Page {self.number}: OCRManager instance not found on parent PDF object.")
96
+ logger.warning(
97
+ f"Page {self.number}: OCRManager instance not found on parent PDF object."
98
+ )
78
99
 
79
100
  # --- Get Layout Manager Instance ---
80
- if LayoutManager and hasattr(parent, '_layout_manager') and isinstance(parent._layout_manager, LayoutManager):
101
+ if (
102
+ LayoutManager
103
+ and hasattr(parent, "_layout_manager")
104
+ and isinstance(parent._layout_manager, LayoutManager)
105
+ ):
81
106
  self._layout_manager = parent._layout_manager
82
107
  logger.debug(f"Page {self.number}: Using LayoutManager instance from parent PDF.")
83
108
  else:
84
109
  self._layout_manager = None
85
110
  if LayoutManager:
86
- logger.warning(f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail.")
111
+ logger.warning(
112
+ f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail."
113
+ )
87
114
 
88
115
  # Initialize the internal variable with a single underscore
89
- self._layout_analyzer = None
116
+ self._layout_analyzer = None
90
117
 
91
118
  @property
92
- def pdf(self) -> 'PDF':
119
+ def pdf(self) -> "PDF":
93
120
  """Provides public access to the parent PDF object."""
94
121
  return self._parent
95
122
 
@@ -97,7 +124,7 @@ class Page:
97
124
  def number(self) -> int:
98
125
  """Get page number (1-based)."""
99
126
  return self._page.page_number
100
-
127
+
101
128
  @property
102
129
  def page_number(self) -> int:
103
130
  """Get page number (1-based)."""
@@ -107,12 +134,12 @@ class Page:
107
134
  def index(self) -> int:
108
135
  """Get page index (0-based)."""
109
136
  return self._index
110
-
137
+
111
138
  @property
112
139
  def width(self) -> float:
113
140
  """Get page width."""
114
141
  return self._page.width
115
-
142
+
116
143
  @property
117
144
  def height(self) -> float:
118
145
  """Get page height."""
@@ -120,107 +147,125 @@ class Page:
120
147
 
121
148
  # --- Highlighting Service Accessor ---
122
149
  @property
123
- def _highlighter(self) -> 'HighlightingService':
124
- """Provides access to the parent PDF's HighlightingService."""
125
- if not hasattr(self._parent, 'highlighter'):
126
- # This should ideally not happen if PDF.__init__ works correctly
127
- raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
128
- return self._parent.highlighter
150
+ def _highlighter(self) -> "HighlightingService":
151
+ """Provides access to the parent PDF's HighlightingService."""
152
+ if not hasattr(self._parent, "highlighter"):
153
+ # This should ideally not happen if PDF.__init__ works correctly
154
+ raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
155
+ return self._parent.highlighter
129
156
 
130
- def clear_exclusions(self) -> 'Page':
157
+ def clear_exclusions(self) -> "Page":
131
158
  """
132
159
  Clear all exclusions from the page.
133
160
  """
134
161
  self._exclusions = []
135
162
  return self
136
163
 
137
- def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
164
+ def add_exclusion(
165
+ self,
166
+ exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
167
+ label: Optional[str] = None,
168
+ ) -> "Page":
138
169
  """
139
170
  Add an exclusion to the page. Text from these regions will be excluded from extraction.
140
171
  Ensures non-callable items are stored as Region objects if possible.
141
-
172
+
142
173
  Args:
143
174
  exclusion_func_or_region: Either a callable function returning a Region,
144
175
  a Region object, or another object with a valid .bbox attribute.
145
176
  label: Optional label for this exclusion (e.g., 'header', 'footer').
146
-
177
+
147
178
  Returns:
148
179
  Self for method chaining
149
-
180
+
150
181
  Raises:
151
182
  TypeError: If a non-callable, non-Region object without a valid bbox is provided.
152
183
  """
153
- exclusion_data = None # Initialize exclusion data
184
+ exclusion_data = None # Initialize exclusion data
154
185
 
155
186
  if callable(exclusion_func_or_region):
156
187
  # Store callable functions along with their label
157
188
  exclusion_data = (exclusion_func_or_region, label)
158
- logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
189
+ logger.debug(
190
+ f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}"
191
+ )
159
192
  elif isinstance(exclusion_func_or_region, Region):
160
193
  # Store Region objects directly, assigning the label
161
- exclusion_func_or_region.label = label # Assign label
162
- exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
163
- logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
164
- elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
194
+ exclusion_func_or_region.label = label # Assign label
195
+ exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
196
+ logger.debug(
197
+ f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}"
198
+ )
199
+ elif (
200
+ hasattr(exclusion_func_or_region, "bbox")
201
+ and isinstance(getattr(exclusion_func_or_region, "bbox", None), (tuple, list))
202
+ and len(exclusion_func_or_region.bbox) == 4
203
+ ):
165
204
  # Convert objects with a valid bbox to a Region before storing
166
205
  try:
167
206
  bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
168
207
  # Pass the label to the Region constructor
169
208
  region_to_add = Region(self, bbox_coords, label=label)
170
- exclusion_data = (region_to_add, label) # Store as tuple
171
- logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
209
+ exclusion_data = (region_to_add, label) # Store as tuple
210
+ logger.debug(
211
+ f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
212
+ )
172
213
  except (ValueError, TypeError, Exception) as e:
173
214
  # Raise an error if conversion fails
174
- raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
215
+ raise TypeError(
216
+ f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
217
+ ) from e
175
218
  else:
176
219
  # Reject invalid types
177
- raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
220
+ raise TypeError(
221
+ f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
222
+ )
178
223
 
179
224
  # Append the stored data (tuple of object/callable and label)
180
225
  if exclusion_data:
181
226
  self._exclusions.append(exclusion_data)
182
227
 
183
228
  return self
184
-
185
- def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
229
+
230
+ def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
186
231
  """
187
232
  Add a region to the page.
188
-
233
+
189
234
  Args:
190
235
  region: Region object to add
191
236
  name: Optional name for the region
192
-
237
+
193
238
  Returns:
194
239
  Self for method chaining
195
240
  """
196
241
  # Check if it's actually a Region object
197
242
  if not isinstance(region, Region):
198
243
  raise TypeError("region must be a Region object")
199
-
244
+
200
245
  # Set the source and name
201
- region.source = 'named'
202
-
246
+ region.source = "named"
247
+
203
248
  if name:
204
249
  region.name = name
205
250
  # Add to named regions dictionary (overwriting if name already exists)
206
- self._regions['named'][name] = region
251
+ self._regions["named"][name] = region
207
252
  else:
208
253
  # Add to detected regions list (unnamed but registered)
209
- self._regions['detected'].append(region)
210
-
254
+ self._regions["detected"].append(region)
255
+
211
256
  # Add to element manager for selector queries
212
257
  self._element_mgr.add_region(region)
213
-
258
+
214
259
  return self
215
-
216
- def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> 'Page':
260
+
261
+ def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
217
262
  """
218
263
  Add multiple regions to the page.
219
-
264
+
220
265
  Args:
221
266
  regions: List of Region objects to add
222
267
  prefix: Optional prefix for automatic naming (regions will be named prefix_1, prefix_2, etc.)
223
-
268
+
224
269
  Returns:
225
270
  Self for method chaining
226
271
  """
@@ -232,23 +277,23 @@ class Page:
232
277
  # Add without names
233
278
  for region in regions:
234
279
  self.add_region(region)
235
-
280
+
236
281
  return self
237
-
282
+
238
283
  def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
239
284
  """
240
285
  Get all exclusion regions for this page.
241
286
  Assumes self._exclusions contains tuples of (callable/Region, label).
242
-
287
+
243
288
  Args:
244
289
  include_callable: Whether to evaluate callable exclusion functions
245
290
  debug: Enable verbose debug logging for exclusion evaluation
246
-
291
+
247
292
  Returns:
248
293
  List of Region objects to exclude, with labels assigned.
249
294
  """
250
295
  regions = []
251
-
296
+
252
297
  if debug:
253
298
  print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
254
299
 
@@ -280,32 +325,39 @@ class Page:
280
325
  if debug:
281
326
  print(f" ✓ Added region from callable '{label}': {region_result}")
282
327
  elif region_result:
283
- logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
284
- if debug:
285
- print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
328
+ logger.warning(
329
+ f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
330
+ )
331
+ if debug:
332
+ print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
286
333
  else:
287
334
  if debug:
288
- print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
335
+ print(
336
+ f" ✗ Callable '{exclusion_label}' returned None, no region added"
337
+ )
289
338
 
290
339
  except Exception as e:
291
340
  error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
292
341
  print(error_msg)
293
342
  import traceback
343
+
294
344
  print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
295
345
 
296
346
  # Process direct Region objects (label was assigned in add_exclusion)
297
347
  elif isinstance(exclusion_item, Region):
298
- regions.append(exclusion_item) # Label is already on the Region object
348
+ regions.append(exclusion_item) # Label is already on the Region object
299
349
  if debug:
300
350
  print(f" - Added direct region '{label}': {exclusion_item}")
301
351
  # No else needed, add_exclusion should prevent invalid types
302
-
352
+
303
353
  if debug:
304
354
  print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
305
-
355
+
306
356
  return regions
307
357
 
308
- def _filter_elements_by_exclusions(self, elements: List['Element'], debug_exclusions: bool = False) -> List['Element']:
358
+ def _filter_elements_by_exclusions(
359
+ self, elements: List["Element"], debug_exclusions: bool = False
360
+ ) -> List["Element"]:
309
361
  """
310
362
  Filters a list of elements, removing those within the page's exclusion regions.
311
363
 
@@ -318,19 +370,27 @@ class Page:
318
370
  """
319
371
  if not self._exclusions:
320
372
  if debug_exclusions:
321
- print(f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements.")
373
+ print(
374
+ f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
375
+ )
322
376
  return elements
323
377
 
324
378
  # Get all exclusion regions, including evaluating callable functions
325
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug_exclusions)
379
+ exclusion_regions = self._get_exclusion_regions(
380
+ include_callable=True, debug=debug_exclusions
381
+ )
326
382
 
327
383
  if not exclusion_regions:
328
384
  if debug_exclusions:
329
- print(f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements.")
385
+ print(
386
+ f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements."
387
+ )
330
388
  return elements
331
389
 
332
390
  if debug_exclusions:
333
- print(f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements.")
391
+ print(
392
+ f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements."
393
+ )
334
394
 
335
395
  filtered_elements = []
336
396
  excluded_count = 0
@@ -346,7 +406,9 @@ class Page:
346
406
  filtered_elements.append(element)
347
407
 
348
408
  if debug_exclusions:
349
- print(f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}.")
409
+ print(
410
+ f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}."
411
+ )
350
412
 
351
413
  return filtered_elements
352
414
 
@@ -365,15 +427,18 @@ class Page:
365
427
  Element object or None if not found
366
428
  """
367
429
  from natural_pdf.selectors.parser import parse_selector
430
+
368
431
  selector_obj = parse_selector(selector)
369
-
432
+
370
433
  # Pass regex and case flags to selector function
371
- kwargs['regex'] = regex
372
- kwargs['case'] = case
373
-
434
+ kwargs["regex"] = regex
435
+ kwargs["case"] = case
436
+
374
437
  # First get all matching elements without applying exclusions initially within _apply_selector
375
- results_collection = self._apply_selector(selector_obj, **kwargs) # _apply_selector doesn't filter
376
-
438
+ results_collection = self._apply_selector(
439
+ selector_obj, **kwargs
440
+ ) # _apply_selector doesn't filter
441
+
377
442
  # Filter the results based on exclusions if requested
378
443
  if apply_exclusions and self._exclusions and results_collection:
379
444
  filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
@@ -385,7 +450,9 @@ class Page:
385
450
  else:
386
451
  return None
387
452
 
388
- def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
453
+ def find_all(
454
+ self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
455
+ ) -> "ElementCollection":
389
456
  """
390
457
  Find all elements on this page matching selector.
391
458
 
@@ -395,20 +462,23 @@ class Page:
395
462
  regex: Whether to use regex for text search in :contains (default: False)
396
463
  case: Whether to do case-sensitive text search (default: True)
397
464
  **kwargs: Additional filter parameters
398
-
465
+
399
466
  Returns:
400
467
  ElementCollection with matching elements
401
468
  """
402
469
  from natural_pdf.selectors.parser import parse_selector
470
+
403
471
  selector_obj = parse_selector(selector)
404
-
472
+
405
473
  # Pass regex and case flags to selector function
406
- kwargs['regex'] = regex
407
- kwargs['case'] = case
408
-
474
+ kwargs["regex"] = regex
475
+ kwargs["case"] = case
476
+
409
477
  # First get all matching elements without applying exclusions initially within _apply_selector
410
- results_collection = self._apply_selector(selector_obj, **kwargs) # _apply_selector doesn't filter
411
-
478
+ results_collection = self._apply_selector(
479
+ selector_obj, **kwargs
480
+ ) # _apply_selector doesn't filter
481
+
412
482
  # Filter the results based on exclusions if requested
413
483
  if apply_exclusions and self._exclusions and results_collection:
414
484
  filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
@@ -416,208 +486,348 @@ class Page:
416
486
  else:
417
487
  # Return the unfiltered collection
418
488
  return results_collection
419
-
420
- def _apply_selector(self, selector_obj: Dict, **kwargs) -> 'ElementCollection': # Removed apply_exclusions arg
489
+
490
+ def _apply_selector(
491
+ self, selector_obj: Dict, **kwargs
492
+ ) -> "ElementCollection": # Removed apply_exclusions arg
421
493
  """
422
494
  Apply selector to page elements.
423
495
  Exclusions are now handled by the calling methods (find, find_all) if requested.
424
-
496
+
425
497
  Args:
426
498
  selector_obj: Parsed selector dictionary
427
499
  **kwargs: Additional filter parameters including 'regex' and 'case'
428
-
500
+
429
501
  Returns:
430
502
  ElementCollection of matching elements (unfiltered by exclusions)
431
503
  """
432
504
  from natural_pdf.selectors.parser import selector_to_filter_func
433
-
505
+
434
506
  # Get element type to filter
435
- element_type = selector_obj.get('type', 'any').lower()
436
-
507
+ element_type = selector_obj.get("type", "any").lower()
508
+
437
509
  # Determine which elements to search based on element type
438
510
  elements_to_search = []
439
- if element_type == 'any':
511
+ if element_type == "any":
440
512
  elements_to_search = self._element_mgr.get_all_elements()
441
- elif element_type == 'text':
513
+ elif element_type == "text":
442
514
  elements_to_search = self._element_mgr.words
443
- elif element_type == 'char':
515
+ elif element_type == "char":
444
516
  elements_to_search = self._element_mgr.chars
445
- elif element_type == 'word':
517
+ elif element_type == "word":
446
518
  elements_to_search = self._element_mgr.words
447
- elif element_type == 'rect' or element_type == 'rectangle':
519
+ elif element_type == "rect" or element_type == "rectangle":
448
520
  elements_to_search = self._element_mgr.rects
449
- elif element_type == 'line':
521
+ elif element_type == "line":
450
522
  elements_to_search = self._element_mgr.lines
451
- elif element_type == 'region':
523
+ elif element_type == "region":
452
524
  elements_to_search = self._element_mgr.regions
453
525
  else:
454
526
  elements_to_search = self._element_mgr.get_all_elements()
455
-
527
+
456
528
  # Create filter function from selector, passing any additional parameters
457
529
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
458
-
530
+
459
531
  # Apply the filter to matching elements
460
532
  matching_elements = [element for element in elements_to_search if filter_func(element)]
461
-
533
+
462
534
  # Handle spatial pseudo-classes that require relationship checking
463
- for pseudo in selector_obj.get('pseudo_classes', []):
464
- name = pseudo.get('name')
465
- args = pseudo.get('args', '')
466
-
467
- if name in ('above', 'below', 'near', 'left-of', 'right-of'):
535
+ for pseudo in selector_obj.get("pseudo_classes", []):
536
+ name = pseudo.get("name")
537
+ args = pseudo.get("args", "")
538
+
539
+ if name in ("above", "below", "near", "left-of", "right-of"):
468
540
  # Find the reference element first
469
541
  from natural_pdf.selectors.parser import parse_selector
542
+
470
543
  ref_selector = parse_selector(args) if isinstance(args, str) else args
471
544
  # Recursively call _apply_selector for reference element (exclusions handled later)
472
- ref_elements = self._apply_selector(ref_selector, **kwargs)
473
-
545
+ ref_elements = self._apply_selector(ref_selector, **kwargs)
546
+
474
547
  if not ref_elements:
475
548
  return ElementCollection([])
476
-
549
+
477
550
  ref_element = ref_elements.first
478
- if not ref_element: continue
479
-
551
+ if not ref_element:
552
+ continue
553
+
480
554
  # Filter elements based on spatial relationship
481
- if name == 'above':
482
- matching_elements = [el for el in matching_elements if hasattr(el, 'bottom') and hasattr(ref_element, 'top') and el.bottom <= ref_element.top]
483
- elif name == 'below':
484
- matching_elements = [el for el in matching_elements if hasattr(el, 'top') and hasattr(ref_element, 'bottom') and el.top >= ref_element.bottom]
485
- elif name == 'left-of':
486
- matching_elements = [el for el in matching_elements if hasattr(el, 'x1') and hasattr(ref_element, 'x0') and el.x1 <= ref_element.x0]
487
- elif name == 'right-of':
488
- matching_elements = [el for el in matching_elements if hasattr(el, 'x0') and hasattr(ref_element, 'x1') and el.x0 >= ref_element.x1]
489
- elif name == 'near':
555
+ if name == "above":
556
+ matching_elements = [
557
+ el
558
+ for el in matching_elements
559
+ if hasattr(el, "bottom")
560
+ and hasattr(ref_element, "top")
561
+ and el.bottom <= ref_element.top
562
+ ]
563
+ elif name == "below":
564
+ matching_elements = [
565
+ el
566
+ for el in matching_elements
567
+ if hasattr(el, "top")
568
+ and hasattr(ref_element, "bottom")
569
+ and el.top >= ref_element.bottom
570
+ ]
571
+ elif name == "left-of":
572
+ matching_elements = [
573
+ el
574
+ for el in matching_elements
575
+ if hasattr(el, "x1")
576
+ and hasattr(ref_element, "x0")
577
+ and el.x1 <= ref_element.x0
578
+ ]
579
+ elif name == "right-of":
580
+ matching_elements = [
581
+ el
582
+ for el in matching_elements
583
+ if hasattr(el, "x0")
584
+ and hasattr(ref_element, "x1")
585
+ and el.x0 >= ref_element.x1
586
+ ]
587
+ elif name == "near":
588
+
490
589
  def distance(el1, el2):
491
- if not (hasattr(el1, 'x0') and hasattr(el1, 'x1') and hasattr(el1, 'top') and hasattr(el1, 'bottom') and
492
- hasattr(el2, 'x0') and hasattr(el2, 'x1') and hasattr(el2, 'top') and hasattr(el2, 'bottom')):
493
- return float('inf') # Cannot calculate distance
494
- el1_center_x = (el1.x0 + el1.x1) / 2
495
- el1_center_y = (el1.top + el1.bottom) / 2
496
- el2_center_x = (el2.x0 + el2.x1) / 2
497
- el2_center_y = (el2.top + el2.bottom) / 2
498
- return ((el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2) ** 0.5
499
-
500
- threshold = kwargs.get('near_threshold', 50)
501
- matching_elements = [el for el in matching_elements if distance(el, ref_element) <= threshold]
502
-
590
+ if not (
591
+ hasattr(el1, "x0")
592
+ and hasattr(el1, "x1")
593
+ and hasattr(el1, "top")
594
+ and hasattr(el1, "bottom")
595
+ and hasattr(el2, "x0")
596
+ and hasattr(el2, "x1")
597
+ and hasattr(el2, "top")
598
+ and hasattr(el2, "bottom")
599
+ ):
600
+ return float("inf") # Cannot calculate distance
601
+ el1_center_x = (el1.x0 + el1.x1) / 2
602
+ el1_center_y = (el1.top + el1.bottom) / 2
603
+ el2_center_x = (el2.x0 + el2.x1) / 2
604
+ el2_center_y = (el2.top + el2.bottom) / 2
605
+ return (
606
+ (el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2
607
+ ) ** 0.5
608
+
609
+ threshold = kwargs.get("near_threshold", 50)
610
+ matching_elements = [
611
+ el for el in matching_elements if distance(el, ref_element) <= threshold
612
+ ]
613
+
503
614
  # Sort elements in reading order if requested
504
- if kwargs.get('reading_order', True):
505
- if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in matching_elements):
506
- matching_elements.sort(key=lambda el: (el.top, el.x0))
615
+ if kwargs.get("reading_order", True):
616
+ if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
617
+ matching_elements.sort(key=lambda el: (el.top, el.x0))
507
618
  else:
508
- logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")
509
-
619
+ logger.warning(
620
+ "Cannot sort elements in reading order: Missing required attributes (top, x0)."
621
+ )
622
+
510
623
  # Create result collection - exclusions are handled by the calling methods (find, find_all)
511
624
  result = ElementCollection(matching_elements)
512
-
625
+
513
626
  return result
514
627
 
515
628
  def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
516
629
  """
517
630
  Create a region on this page with the specified coordinates.
518
-
631
+
519
632
  Args:
520
633
  x0: Left x-coordinate
521
634
  top: Top y-coordinate
522
635
  x1: Right x-coordinate
523
636
  bottom: Bottom y-coordinate
524
-
637
+
525
638
  Returns:
526
639
  Region object for the specified coordinates
527
640
  """
528
641
  from natural_pdf.elements.region import Region
642
+
529
643
  return Region(self, (x0, top, x1, bottom))
530
-
531
- def region(self, left: float = None, top: float = None, right: float = None, bottom: float = None,
532
- width: str = "full") -> Any:
644
+
645
+ def region(
646
+ self,
647
+ left: float = None,
648
+ top: float = None,
649
+ right: float = None,
650
+ bottom: float = None,
651
+ width: Union[str, float, None] = None,
652
+ height: Optional[float] = None,
653
+ ) -> Any:
533
654
  """
534
- Create a region on this page with more intuitive named parameters.
535
-
655
+ Create a region on this page with more intuitive named parameters,
656
+ allowing definition by coordinates or by coordinate + dimension.
657
+
536
658
  Args:
537
- left: Left x-coordinate (default: 0)
538
- top: Top y-coordinate (default: 0)
539
- right: Right x-coordinate (default: page width)
540
- bottom: Bottom y-coordinate (default: page height)
541
- width: Width mode - "full" for full page width or "element" for element width
542
-
659
+ left: Left x-coordinate (default: 0 if width not used).
660
+ top: Top y-coordinate (default: 0 if height not used).
661
+ right: Right x-coordinate (default: page width if width not used).
662
+ bottom: Bottom y-coordinate (default: page height if height not used).
663
+ width: Width definition. Can be:
664
+ - Numeric: The width of the region in points. Cannot be used with both left and right.
665
+ - String 'full': Sets region width to full page width (overrides left/right).
666
+ - String 'element' or None (default): Uses provided/calculated left/right,
667
+ defaulting to page width if neither are specified.
668
+ height: Numeric height of the region. Cannot be used with both top and bottom.
669
+
543
670
  Returns:
544
671
  Region object for the specified coordinates
545
-
672
+
673
+ Raises:
674
+ ValueError: If conflicting arguments are provided (e.g., top, bottom, and height)
675
+ or if width is an invalid string.
676
+
546
677
  Examples:
547
- >>> page.region(top=100, bottom=200) # Full width from y=100 to y=200
548
- >>> page.region(left=50, right=150, top=100, bottom=200) # Specific rectangle
549
- """
550
- # Handle defaults
551
- left = 0 if left is None else left
552
- top = 0 if top is None else top
553
- right = self.width if right is None else right
554
- bottom = self.height if bottom is None else bottom
555
-
556
- # Handle width parameter
557
- if width == "full":
558
- left = 0
559
- right = self.width
560
- elif width != "element":
561
- raise ValueError("Width must be 'full' or 'element'")
562
-
678
+ >>> page.region(top=100, height=50) # Region from y=100 to y=150, default width
679
+ >>> page.region(left=50, width=100) # Region from x=50 to x=150, default height
680
+ >>> page.region(bottom=500, height=50) # Region from y=450 to y=500
681
+ >>> page.region(right=200, width=50) # Region from x=150 to x=200
682
+ >>> page.region(top=100, bottom=200, width="full") # Explicit full width
683
+ """
684
+ # --- Type checking and basic validation ---
685
+ is_width_numeric = isinstance(width, (int, float))
686
+ is_width_string = isinstance(width, str)
687
+ width_mode = "element" # Default mode
688
+
689
+ if height is not None and top is not None and bottom is not None:
690
+ raise ValueError("Cannot specify top, bottom, and height simultaneously.")
691
+ if is_width_numeric and left is not None and right is not None:
692
+ raise ValueError("Cannot specify left, right, and a numeric width simultaneously.")
693
+ if is_width_string:
694
+ width_lower = width.lower()
695
+ if width_lower not in ["full", "element"]:
696
+ raise ValueError("String width argument must be 'full' or 'element'.")
697
+ width_mode = width_lower
698
+
699
+ # --- Calculate Coordinates ---
700
+ final_top = top
701
+ final_bottom = bottom
702
+ final_left = left
703
+ final_right = right
704
+
705
+ # Height calculations
706
+ if height is not None:
707
+ if top is not None:
708
+ final_bottom = top + height
709
+ elif bottom is not None:
710
+ final_top = bottom - height
711
+ else: # Neither top nor bottom provided, default top to 0
712
+ final_top = 0
713
+ final_bottom = height
714
+
715
+ # Width calculations (numeric only)
716
+ if is_width_numeric:
717
+ if left is not None:
718
+ final_right = left + width
719
+ elif right is not None:
720
+ final_left = right - width
721
+ else: # Neither left nor right provided, default left to 0
722
+ final_left = 0
723
+ final_right = width
724
+
725
+ # --- Apply Defaults for Unset Coordinates ---
726
+ # Only default coordinates if they weren't set by dimension calculation
727
+ if final_top is None:
728
+ final_top = 0
729
+ if final_bottom is None:
730
+ # Check if bottom should have been set by height calc
731
+ if height is None or top is None:
732
+ final_bottom = self.height
733
+
734
+ if final_left is None:
735
+ final_left = 0
736
+ if final_right is None:
737
+ # Check if right should have been set by width calc
738
+ if not is_width_numeric or left is None:
739
+ final_right = self.width
740
+
741
+ # --- Handle width_mode == 'full' ---
742
+ if width_mode == "full":
743
+ # Override left/right if mode is full
744
+ final_left = 0
745
+ final_right = self.width
746
+
747
+ # --- Final Validation & Creation ---
748
+ # Ensure coordinates are within page bounds (clamp)
749
+ final_left = max(0, final_left)
750
+ final_top = max(0, final_top)
751
+ final_right = min(self.width, final_right)
752
+ final_bottom = min(self.height, final_bottom)
753
+
754
+ # Ensure valid box (x0<=x1, top<=bottom)
755
+ if final_left > final_right:
756
+ logger.warning(f"Calculated left ({final_left}) > right ({final_right}). Swapping.")
757
+ final_left, final_right = final_right, final_left
758
+ if final_top > final_bottom:
759
+ logger.warning(f"Calculated top ({final_top}) > bottom ({final_bottom}). Swapping.")
760
+ final_top, final_bottom = final_bottom, final_top
761
+
563
762
  from natural_pdf.elements.region import Region
564
- region = Region(self, (left, top, right, bottom))
763
+
764
+ region = Region(self, (final_left, final_top, final_right, final_bottom))
565
765
  return region
566
-
567
- def get_elements(self, apply_exclusions=True, debug_exclusions: bool = False) -> List['Element']:
766
+
767
+ def get_elements(
768
+ self, apply_exclusions=True, debug_exclusions: bool = False
769
+ ) -> List["Element"]:
568
770
  """
569
771
  Get all elements on this page.
570
-
772
+
571
773
  Args:
572
774
  apply_exclusions: Whether to apply exclusion regions (default: True).
573
775
  debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
574
-
776
+
575
777
  Returns:
576
778
  List of all elements on the page, potentially filtered by exclusions.
577
779
  """
578
780
  # Get all elements from the element manager
579
781
  all_elements = self._element_mgr.get_all_elements()
580
-
782
+
581
783
  # Apply exclusions if requested
582
784
  if apply_exclusions and self._exclusions:
583
- return self._filter_elements_by_exclusions(all_elements, debug_exclusions=debug_exclusions)
785
+ return self._filter_elements_by_exclusions(
786
+ all_elements, debug_exclusions=debug_exclusions
787
+ )
584
788
  else:
585
789
  if debug_exclusions:
586
- print(f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied).")
790
+ print(
791
+ f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied)."
792
+ )
587
793
  return all_elements
588
-
589
- def filter_elements(self, elements: List['Element'], selector: str, **kwargs) -> List['Element']:
794
+
795
+ def filter_elements(
796
+ self, elements: List["Element"], selector: str, **kwargs
797
+ ) -> List["Element"]:
590
798
  """
591
799
  Filter a list of elements based on a selector.
592
-
800
+
593
801
  Args:
594
802
  elements: List of elements to filter
595
803
  selector: CSS-like selector string
596
804
  **kwargs: Additional filter parameters
597
-
805
+
598
806
  Returns:
599
807
  List of elements that match the selector
600
808
  """
601
809
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
602
-
810
+
603
811
  # Parse the selector
604
812
  selector_obj = parse_selector(selector)
605
-
813
+
606
814
  # Create filter function from selector
607
815
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
608
-
816
+
609
817
  # Apply the filter to the elements
610
818
  matching_elements = [element for element in elements if filter_func(element)]
611
-
819
+
612
820
  # Sort elements in reading order if requested
613
- if kwargs.get('reading_order', True):
614
- if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in matching_elements):
615
- matching_elements.sort(key=lambda el: (el.top, el.x0))
821
+ if kwargs.get("reading_order", True):
822
+ if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
823
+ matching_elements.sort(key=lambda el: (el.top, el.x0))
616
824
  else:
617
- logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")
618
-
825
+ logger.warning(
826
+ "Cannot sort elements in reading order: Missing required attributes (top, x0)."
827
+ )
828
+
619
829
  return matching_elements
620
-
830
+
621
831
  def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
622
832
  """
623
833
  Select content from the top of the page until matching selector.
@@ -626,26 +836,28 @@ class Page:
626
836
  selector: CSS-like selector string
627
837
  include_endpoint: Whether to include the endpoint element in the region
628
838
  **kwargs: Additional selection parameters
629
-
839
+
630
840
  Returns:
631
841
  Region object representing the selected content
632
-
842
+
633
843
  Examples:
634
844
  >>> page.until('text:contains("Conclusion")') # Select from top to conclusion
635
845
  >>> page.until('line[width>=2]', include_endpoint=False) # Select up to thick line
636
846
  """
637
- # Find the target element
847
+ # Find the target element
638
848
  target = self.find(selector, **kwargs)
639
849
  if not target:
640
850
  # If target not found, return a default region (full page)
641
851
  from natural_pdf.elements.region import Region
852
+
642
853
  return Region(self, (0, 0, self.width, self.height))
643
-
854
+
644
855
  # Create a region from the top of the page to the target
645
856
  from natural_pdf.elements.region import Region
857
+
646
858
  # Ensure target has positional attributes before using them
647
- target_top = getattr(target, 'top', 0)
648
- target_bottom = getattr(target, 'bottom', self.height)
859
+ target_top = getattr(target, "top", 0)
860
+ target_bottom = getattr(target, "bottom", self.height)
649
861
 
650
862
  if include_endpoint:
651
863
  # Include the target element
@@ -653,17 +865,16 @@ class Page:
653
865
  else:
654
866
  # Up to the target element
655
867
  region = Region(self, (0, 0, self.width, target_top))
656
-
868
+
657
869
  region.end_element = target
658
870
  return region
659
871
 
660
-
661
872
  def crop(self, bbox=None, **kwargs) -> Any:
662
873
  """
663
874
  Crop the page to the specified bounding box.
664
875
 
665
876
  This is a direct wrapper around pdfplumber's crop method.
666
-
877
+
667
878
  Args:
668
879
  bbox: Bounding box (x0, top, x1, bottom) or None
669
880
  **kwargs: Additional parameters (top, bottom, left, right)
@@ -674,59 +885,82 @@ class Page:
674
885
  # Returns the pdfplumber page object, not a natural-pdf Page
675
886
  return self._page.crop(bbox, **kwargs)
676
887
 
677
- def extract_text(self,
678
- preserve_whitespace=True,
679
- use_exclusions=True,
680
- debug_exclusions=False, **kwargs) -> str:
888
+ def extract_text(
889
+ self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, **kwargs
890
+ ) -> str:
681
891
  """
682
- Extract text from this page, respecting any exclusion regions.
683
-
892
+ Extract text from this page, respecting exclusions and using pdfplumber's
893
+ layout engine (chars_to_textmap) if layout arguments are provided or default.
894
+
684
895
  Args:
685
- preserve_whitespace: Whether to keep blank characters (default: True)
686
- use_exclusions: Whether to apply exclusion regions (default: True)
687
- debug_exclusions: Whether to output detailed exclusion debugging info (default: False)
688
- **kwargs: Additional extraction parameters passed to pdfplumber
689
-
896
+ use_exclusions: Whether to apply exclusion regions (default: True).
897
+ Note: Filtering logic is now always applied if exclusions exist.
898
+ debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
899
+ **kwargs: Additional layout parameters passed directly to pdfplumber's
900
+ `chars_to_textmap` function. Common parameters include:
901
+ - layout (bool): If True (default), inserts spaces/newlines.
902
+ - x_density (float): Pixels per character horizontally.
903
+ - y_density (float): Pixels per line vertically.
904
+ - x_tolerance (float): Tolerance for horizontal character grouping.
905
+ - y_tolerance (float): Tolerance for vertical character grouping.
906
+ - line_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
907
+ - char_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
908
+ See pdfplumber documentation for more.
909
+
690
910
  Returns:
691
- Extracted text as string
692
- """
693
- if not use_exclusions or not self._exclusions:
694
- # If no exclusions or exclusions disabled, use regular extraction
695
- if debug_exclusions:
696
- print(f"Page {self.index}: Extracting text via pdfplumber (exclusions not applied).")
697
- # Note: pdfplumber still uses keep_blank_chars parameter
698
- return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)
699
-
700
- # --- Exclusion Logic ---
701
- # 1. Get all potentially relevant text elements (words)
702
- all_text_elements = self.words # Use the words property
703
- if debug_exclusions:
704
- print(f"Page {self.index}: Starting text extraction with {len(all_text_elements)} words before exclusion.")
911
+ Extracted text as string, potentially with layout-based spacing.
912
+ """
913
+ logger.debug(f"Page {self.number}: extract_text called with kwargs: {kwargs}")
914
+ debug = kwargs.get("debug", debug_exclusions) # Allow 'debug' kwarg
915
+
916
+ # 1. Get Word Elements (triggers load_elements if needed)
917
+ word_elements = self.words
918
+ if not word_elements:
919
+ logger.debug(f"Page {self.number}: No word elements found.")
920
+ return ""
921
+
922
+ # 2. Get Exclusions
923
+ apply_exclusions_flag = kwargs.get("use_exclusions", True)
924
+ exclusion_regions = []
925
+ if apply_exclusions_flag and self._exclusions:
926
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
927
+ if debug:
928
+ logger.debug(f"Page {self.number}: Applying {len(exclusion_regions)} exclusions.")
929
+ elif debug:
930
+ logger.debug(f"Page {self.number}: Not applying exclusions.")
931
+
932
+ # 3. Collect All Character Dictionaries from Word Elements
933
+ all_char_dicts = []
934
+ for word in word_elements:
935
+ all_char_dicts.extend(getattr(word, "_char_dicts", []))
936
+
937
+ # 4. Spatially Filter Characters
938
+ filtered_chars = filter_chars_spatially(
939
+ char_dicts=all_char_dicts,
940
+ exclusion_regions=exclusion_regions,
941
+ target_region=None, # No target region for full page extraction
942
+ debug=debug,
943
+ )
705
944
 
706
- # 2. Filter elements using the centralized method
707
- filtered_elements = self._filter_elements_by_exclusions(all_text_elements, debug_exclusions=debug_exclusions)
945
+ # 5. Generate Text Layout using Utility
946
+ # Pass page bbox as layout context
947
+ page_bbox = (0, 0, self.width, self.height)
948
+ result = generate_text_layout(
949
+ char_dicts=filtered_chars,
950
+ layout_context_bbox=page_bbox,
951
+ user_kwargs=kwargs, # Pass original user kwargs
952
+ )
708
953
 
709
- # 3. Extract text from the filtered elements
710
- collection = ElementCollection(filtered_elements)
711
- # Ensure elements are sorted for logical text flow (might be redundant if self.words is sorted)
712
- if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in collection.elements):
713
- collection.sort(key=lambda el: (el.top, el.x0))
714
-
715
- # Join text, handling potential missing text attributes gracefully
716
- result = " ".join(getattr(el, 'text', '') for el in collection.elements)
717
-
718
- if debug_exclusions:
719
- print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied.")
720
-
954
+ logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
721
955
  return result
722
956
 
723
957
  def extract_table(self, table_settings={}) -> List[Any]:
724
958
  """
725
959
  Extract the largest table from this page.
726
-
960
+
727
961
  Args:
728
962
  table_settings: Additional extraction parameters
729
-
963
+
730
964
  Returns:
731
965
  List of extracted tables (or None if no table found)
732
966
  """
@@ -736,10 +970,10 @@ class Page:
736
970
  def extract_tables(self, table_settings={}) -> List[Any]:
737
971
  """
738
972
  Extract tables from this page.
739
-
973
+
740
974
  Args:
741
975
  table_settings: Additional extraction parameters
742
-
976
+
743
977
  Returns:
744
978
  List of extracted tables
745
979
  """
@@ -749,33 +983,33 @@ class Page:
749
983
  def _load_elements(self):
750
984
  """Load all elements from the page via ElementManager."""
751
985
  self._element_mgr.load_elements()
752
-
986
+
753
987
  def _create_char_elements(self):
754
988
  """DEPRECATED: Use self._element_mgr.chars"""
755
989
  logger.warning("_create_char_elements is deprecated. Access via self._element_mgr.chars.")
756
- return self._element_mgr.chars # Delegate
990
+ return self._element_mgr.chars # Delegate
757
991
 
758
992
  def _process_font_information(self, char_dict):
759
- """DEPRECATED: Handled by ElementManager"""
760
- logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
761
- # ElementManager handles this internally
762
- pass
993
+ """DEPRECATED: Handled by ElementManager"""
994
+ logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
995
+ # ElementManager handles this internally
996
+ pass
763
997
 
764
998
  def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
765
999
  """DEPRECATED: Use self._element_mgr.words"""
766
1000
  logger.warning("_group_chars_into_words is deprecated. Access via self._element_mgr.words.")
767
- return self._element_mgr.words # Delegate
1001
+ return self._element_mgr.words # Delegate
768
1002
 
769
1003
  def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
770
1004
  """DEPRECATED: Handled by ElementManager"""
771
1005
  logger.warning("_process_line_into_words is deprecated. Handled by ElementManager.")
772
1006
  pass
773
-
1007
+
774
1008
  def _check_font_attributes_match(self, char, prev_char, font_attrs):
775
1009
  """DEPRECATED: Handled by ElementManager"""
776
1010
  logger.warning("_check_font_attributes_match is deprecated. Handled by ElementManager.")
777
1011
  pass
778
-
1012
+
779
1013
  def _create_word_element(self, chars, font_attrs):
780
1014
  """DEPRECATED: Handled by ElementManager"""
781
1015
  logger.warning("_create_word_element is deprecated. Handled by ElementManager.")
@@ -785,34 +1019,36 @@ class Page:
785
1019
  def chars(self) -> List[Any]:
786
1020
  """Get all character elements on this page."""
787
1021
  return self._element_mgr.chars
788
-
1022
+
789
1023
  @property
790
1024
  def words(self) -> List[Any]:
791
1025
  """Get all word elements on this page."""
792
1026
  return self._element_mgr.words
793
-
1027
+
794
1028
  @property
795
1029
  def rects(self) -> List[Any]:
796
1030
  """Get all rectangle elements on this page."""
797
1031
  return self._element_mgr.rects
798
-
1032
+
799
1033
  @property
800
1034
  def lines(self) -> List[Any]:
801
1035
  """Get all line elements on this page."""
802
1036
  return self._element_mgr.lines
803
-
804
- def highlight(self,
805
- bbox: Optional[Tuple[float, float, float, float]] = None,
806
- color: Optional[Union[Tuple, str]] = None,
807
- label: Optional[str] = None,
808
- use_color_cycling: bool = False,
809
- element: Optional[Any] = None,
810
- include_attrs: Optional[List[str]] = None,
811
- existing: str = 'append') -> 'Page':
1037
+
1038
+ def highlight(
1039
+ self,
1040
+ bbox: Optional[Tuple[float, float, float, float]] = None,
1041
+ color: Optional[Union[Tuple, str]] = None,
1042
+ label: Optional[str] = None,
1043
+ use_color_cycling: bool = False,
1044
+ element: Optional[Any] = None,
1045
+ include_attrs: Optional[List[str]] = None,
1046
+ existing: str = "append",
1047
+ ) -> "Page":
812
1048
  """
813
1049
  Highlight a bounding box or the entire page.
814
1050
  Delegates to the central HighlightingService.
815
-
1051
+
816
1052
  Args:
817
1053
  bbox: Bounding box (x0, top, x1, bottom). If None, highlight entire page.
818
1054
  color: RGBA color tuple/string for the highlight.
@@ -834,23 +1070,24 @@ class Page:
834
1070
  use_color_cycling=use_color_cycling,
835
1071
  element=element,
836
1072
  include_attrs=include_attrs,
837
- existing=existing
1073
+ existing=existing,
838
1074
  )
839
1075
  return self
840
1076
 
841
1077
  def highlight_polygon(
842
- self,
1078
+ self,
843
1079
  polygon: List[Tuple[float, float]],
844
- color: Optional[Union[Tuple, str]] = None,
1080
+ color: Optional[Union[Tuple, str]] = None,
845
1081
  label: Optional[str] = None,
846
1082
  use_color_cycling: bool = False,
847
1083
  element: Optional[Any] = None,
848
1084
  include_attrs: Optional[List[str]] = None,
849
- existing: str = 'append') -> 'Page':
1085
+ existing: str = "append",
1086
+ ) -> "Page":
850
1087
  """
851
1088
  Highlight a polygon shape on the page.
852
1089
  Delegates to the central HighlightingService.
853
-
1090
+
854
1091
  Args:
855
1092
  polygon: List of (x, y) points defining the polygon.
856
1093
  color: RGBA color tuple/string for the highlight.
@@ -871,51 +1108,55 @@ class Page:
871
1108
  use_color_cycling=use_color_cycling,
872
1109
  element=element,
873
1110
  include_attrs=include_attrs,
874
- existing=existing
1111
+ existing=existing,
875
1112
  )
876
1113
  return self
877
-
878
- def show(self,
879
- scale: float = 2.0,
880
- width: Optional[int] = None,
881
- labels: bool = True,
882
- legend_position: str = 'right',
883
- render_ocr: bool = False) -> Optional[Image.Image]:
1114
+
1115
+ def show(
1116
+ self,
1117
+ scale: float = 2.0,
1118
+ width: Optional[int] = None,
1119
+ labels: bool = True,
1120
+ legend_position: str = "right",
1121
+ render_ocr: bool = False,
1122
+ ) -> Optional[Image.Image]:
884
1123
  """
885
1124
  Generates and returns an image of the page with persistent highlights rendered.
886
-
1125
+
887
1126
  Args:
888
1127
  scale: Scale factor for rendering.
889
1128
  width: Optional width for the output image.
890
1129
  labels: Whether to include a legend for labels.
891
1130
  legend_position: Position of the legend.
892
1131
  render_ocr: Whether to render OCR text.
893
-
1132
+
894
1133
  Returns:
895
1134
  PIL Image object of the page with highlights, or None if rendering fails.
896
1135
  """
897
1136
  return self.to_image(
898
1137
  scale=scale,
899
1138
  width=width,
900
- labels=labels,
901
- legend_position=legend_position,
1139
+ labels=labels,
1140
+ legend_position=legend_position,
902
1141
  render_ocr=render_ocr,
903
- include_highlights=True # Ensure highlights are requested
1142
+ include_highlights=True, # Ensure highlights are requested
904
1143
  )
905
-
906
- def save_image(self,
907
- filename: str,
908
- scale: float = 2.0,
909
- width: Optional[int] = None,
910
- labels: bool = True,
911
- legend_position: str = 'right',
912
- render_ocr: bool = False,
913
- include_highlights: bool = True, # Allow saving without highlights
914
- resolution: Optional[float] = None,
915
- **kwargs) -> 'Page':
1144
+
1145
+ def save_image(
1146
+ self,
1147
+ filename: str,
1148
+ scale: float = 2.0,
1149
+ width: Optional[int] = None,
1150
+ labels: bool = True,
1151
+ legend_position: str = "right",
1152
+ render_ocr: bool = False,
1153
+ include_highlights: bool = True, # Allow saving without highlights
1154
+ resolution: Optional[float] = None,
1155
+ **kwargs,
1156
+ ) -> "Page":
916
1157
  """
917
1158
  Save the page image to a file, rendering highlights via HighlightingService.
918
-
1159
+
919
1160
  Args:
920
1161
  filename: Path to save the image to.
921
1162
  scale: Scale factor for rendering highlights.
@@ -926,7 +1167,7 @@ class Page:
926
1167
  include_highlights: Whether to render highlights.
927
1168
  resolution: Resolution for base image rendering.
928
1169
  **kwargs: Additional args for pdfplumber's to_image.
929
-
1170
+
930
1171
  Returns:
931
1172
  Self for method chaining.
932
1173
  """
@@ -935,25 +1176,25 @@ class Page:
935
1176
  path=filename,
936
1177
  scale=scale,
937
1178
  width=width,
938
- labels=labels,
1179
+ labels=labels,
939
1180
  legend_position=legend_position,
940
1181
  render_ocr=render_ocr,
941
1182
  include_highlights=include_highlights,
942
1183
  resolution=resolution,
943
- **kwargs
1184
+ **kwargs,
944
1185
  )
945
1186
  return self
946
-
947
- def clear_highlights(self) -> 'Page':
1187
+
1188
+ def clear_highlights(self) -> "Page":
948
1189
  """
949
1190
  Clear all highlights *from this specific page* via HighlightingService.
950
-
1191
+
951
1192
  Returns:
952
1193
  Self for method chaining
953
1194
  """
954
1195
  self._highlighter.clear_page(self.index)
955
1196
  return self
956
-
1197
+
957
1198
  def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
958
1199
  """
959
1200
  Analyze text elements by style, adding attributes directly to elements.
@@ -982,19 +1223,22 @@ class Page:
982
1223
  # Return the collection of elements which now have style attributes
983
1224
  return processed_elements_collection
984
1225
 
985
- def to_image(self,
986
- path: Optional[str] = None,
987
- scale: float = 2.0,
988
- width: Optional[int] = None,
989
- labels: bool = True,
990
- legend_position: str = 'right',
991
- render_ocr: bool = False,
992
- resolution: Optional[float] = None,
993
- include_highlights: bool = True,
994
- **kwargs) -> Optional[Image.Image]:
1226
+ def to_image(
1227
+ self,
1228
+ path: Optional[str] = None,
1229
+ scale: float = 2.0,
1230
+ width: Optional[int] = None,
1231
+ labels: bool = True,
1232
+ legend_position: str = "right",
1233
+ render_ocr: bool = False,
1234
+ resolution: Optional[float] = None,
1235
+ include_highlights: bool = True,
1236
+ exclusions: Optional[str] = None, # New parameter
1237
+ **kwargs,
1238
+ ) -> Optional[Image.Image]:
995
1239
  """
996
1240
  Generate a PIL image of the page, using HighlightingService if needed.
997
-
1241
+
998
1242
  Args:
999
1243
  path: Optional path to save the image to.
1000
1244
  scale: Scale factor for rendering highlights.
@@ -1004,50 +1248,104 @@ class Page:
1004
1248
  render_ocr: Whether to render OCR text on highlights.
1005
1249
  resolution: Resolution in DPI for base page image (default: scale * 72).
1006
1250
  include_highlights: Whether to render highlights.
1251
+ exclusions: If 'mask', excluded regions will be whited out on the image.
1252
+ (default: None).
1007
1253
  **kwargs: Additional parameters for pdfplumber.to_image.
1008
-
1254
+
1009
1255
  Returns:
1010
1256
  PIL Image of the page, or None if rendering fails.
1011
1257
  """
1012
1258
  image = None
1259
+ render_resolution = resolution if resolution is not None else scale * 72
1013
1260
  try:
1014
1261
  if include_highlights:
1015
1262
  # Delegate rendering to the central service
1016
1263
  image = self._highlighter.render_page(
1017
1264
  page_index=self.index,
1018
- scale=scale,
1265
+ scale=scale, # Note: scale is used by highlighter internally for drawing
1019
1266
  labels=labels,
1020
1267
  legend_position=legend_position,
1021
1268
  render_ocr=render_ocr,
1022
- resolution=resolution,
1023
- **kwargs
1269
+ resolution=render_resolution, # Pass the calculated resolution
1270
+ **kwargs,
1024
1271
  )
1025
1272
  else:
1026
1273
  # Get the base page image directly from pdfplumber if no highlights needed
1027
- render_resolution = resolution if resolution is not None else scale * 72
1028
1274
  # Use the underlying pdfplumber page object
1029
1275
  img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1030
1276
  # Access the PIL image directly (assuming pdfplumber structure)
1031
- image = img_object.annotated if hasattr(img_object, 'annotated') else img_object._repr_png_()
1032
- if isinstance(image, bytes): # Handle cases where it returns bytes
1033
- from io import BytesIO
1034
- image = Image.open(BytesIO(image)).convert('RGB') # Convert to RGB for consistency
1035
-
1277
+ image = (
1278
+ img_object.annotated
1279
+ if hasattr(img_object, "annotated")
1280
+ else img_object._repr_png_()
1281
+ )
1282
+ if isinstance(image, bytes): # Handle cases where it returns bytes
1283
+ from io import BytesIO
1284
+
1285
+ image = Image.open(BytesIO(image)).convert(
1286
+ "RGB"
1287
+ ) # Convert to RGB for consistency
1288
+
1036
1289
  except Exception as e:
1037
1290
  logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1038
- return None # Return None on error
1291
+ return None # Return None on error
1039
1292
 
1040
- if image is None: return None
1293
+ if image is None:
1294
+ return None
1295
+
1296
+ # --- Apply exclusion masking if requested ---
1297
+ if exclusions == "mask" and self._exclusions:
1298
+ try:
1299
+ # Ensure image is mutable (RGB or RGBA)
1300
+ if image.mode not in ("RGB", "RGBA"):
1301
+ image = image.convert("RGB")
1302
+
1303
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1304
+ if exclusion_regions:
1305
+ draw = ImageDraw.Draw(image)
1306
+ # Calculate the scaling factor used for the image
1307
+ # Base image was rendered at render_resolution (DPI)
1308
+ # pdfplumber default is 72 DPI
1309
+ # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
1310
+ img_scale = render_resolution / 72.0
1311
+
1312
+ for region in exclusion_regions:
1313
+ # Convert PDF points (x0, top, x1, bottom) to image pixels
1314
+ img_x0 = region.x0 * img_scale
1315
+ img_top = region.top * img_scale
1316
+ img_x1 = region.x1 * img_scale
1317
+ img_bottom = region.bottom * img_scale
1318
+
1319
+ # Draw a white rectangle over the excluded area
1320
+ # Ensure coordinates are within image bounds (though region should be)
1321
+ img_coords = (
1322
+ max(0, img_x0),
1323
+ max(0, img_top),
1324
+ min(image.width, img_x1),
1325
+ min(image.height, img_bottom)
1326
+ )
1327
+ if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1328
+ draw.rectangle(img_coords, fill="white")
1329
+ else:
1330
+ logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
1331
+
1332
+ del draw # Release drawing context
1333
+ except Exception as mask_error:
1334
+ logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
1335
+ # Decide if you want to return None or continue without mask
1336
+ # For now, continue without mask
1041
1337
 
1042
1338
  # Resize the final image if width is provided
1043
1339
  if width is not None and width > 0 and image.width > 0:
1044
1340
  aspect_ratio = image.height / image.width
1045
1341
  height = int(width * aspect_ratio)
1046
1342
  try:
1047
- image = image.resize((width, height), Image.Resampling.LANCZOS) # Use modern resampling
1343
+ image = image.resize(
1344
+ (width, height), Image.Resampling.LANCZOS
1345
+ ) # Use modern resampling
1048
1346
  except Exception as resize_error:
1049
- logger.warning(f"Could not resize image: {resize_error}")
1050
-
1347
+ logger.warning(f"Could not resize image: {resize_error}")
1348
+
1051
1349
  # Save the image if path is provided
1052
1350
  if path:
1053
1351
  try:
@@ -1056,15 +1354,21 @@ class Page:
1056
1354
  image.save(path)
1057
1355
  logger.debug(f"Saved page image to: {path}")
1058
1356
  except Exception as save_error:
1059
- logger.error(f"Failed to save image to {path}: {save_error}")
1060
-
1357
+ logger.error(f"Failed to save image to {path}: {save_error}")
1358
+
1061
1359
  return image
1062
-
1063
- def _create_text_elements_from_ocr(self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None) -> List[TextElement]:
1360
+
1361
+ def _create_text_elements_from_ocr(
1362
+ self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
1363
+ ) -> List[TextElement]:
1064
1364
  """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
1065
- logger.warning("_create_text_elements_from_ocr is deprecated. Use self._element_mgr version.")
1066
- return self._element_mgr.create_text_elements_from_ocr(ocr_results, image_width, image_height)
1067
-
1365
+ logger.warning(
1366
+ "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
1367
+ )
1368
+ return self._element_mgr.create_text_elements_from_ocr(
1369
+ ocr_results, image_width, image_height
1370
+ )
1371
+
1068
1372
  def apply_ocr(
1069
1373
  self,
1070
1374
  engine: Optional[str] = None,
@@ -1072,35 +1376,58 @@ class Page:
1072
1376
  languages: Optional[List[str]] = None,
1073
1377
  min_confidence: Optional[float] = None,
1074
1378
  device: Optional[str] = None,
1075
- ) -> List[TextElement]:
1379
+ resolution: Optional[int] = None,
1380
+ detect_only: bool = False,
1381
+ apply_exclusions: bool = True,
1382
+ ) -> "Page":
1076
1383
  """
1077
1384
  Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
1078
-
1385
+
1386
+ Args:
1387
+ engine: Name of the OCR engine.
1388
+ options: Engine-specific options object or dict.
1389
+ languages: List of engine-specific language codes.
1390
+ min_confidence: Minimum confidence threshold.
1391
+ device: Device to run OCR on.
1392
+ resolution: DPI resolution for rendering page image before OCR.
1393
+ apply_exclusions: If True (default), render page image for OCR
1394
+ with excluded areas masked (whited out).
1395
+
1079
1396
  Returns:
1080
1397
  List of created TextElements derived from OCR results for this page.
1081
1398
  """
1082
- if not hasattr(self._parent, 'apply_ocr'):
1083
- logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1084
- return []
1399
+ if not hasattr(self._parent, "apply_ocr"):
1400
+ logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1401
+ return [] # Return empty list for consistency
1085
1402
 
1086
1403
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1087
1404
  try:
1088
1405
  # Delegate to parent PDF, targeting only this page's index
1406
+ # Pass all relevant parameters through, including apply_exclusions
1089
1407
  self._parent.apply_ocr(
1090
1408
  pages=[self.index],
1091
- engine=engine, options=options, languages=languages,
1092
- min_confidence=min_confidence, device=device
1409
+ engine=engine,
1410
+ options=options,
1411
+ languages=languages,
1412
+ min_confidence=min_confidence,
1413
+ device=device,
1414
+ resolution=resolution,
1415
+ detect_only=detect_only,
1416
+ apply_exclusions=apply_exclusions,
1093
1417
  )
1094
1418
  except Exception as e:
1095
- logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1096
- return []
1419
+ logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1420
+ return []
1097
1421
 
1098
1422
  # Return the OCR elements specifically added to this page
1099
- # Use element manager to retrieve them
1100
- ocr_elements = [el for el in self.words if getattr(el, 'source', None) == 'ocr']
1101
- logger.debug(f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements.")
1102
- return ocr_elements
1103
-
1423
+ ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1424
+ logger.debug(
1425
+ f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1426
+ )
1427
+ # Note: The method is typed to return Page for chaining, but the log indicates
1428
+ # finding elements. Let's stick to returning self for chaining consistency.
1429
+ return self
1430
+
1104
1431
  def extract_ocr_elements(
1105
1432
  self,
1106
1433
  engine: Optional[str] = None,
@@ -1108,78 +1435,118 @@ class Page:
1108
1435
  languages: Optional[List[str]] = None,
1109
1436
  min_confidence: Optional[float] = None,
1110
1437
  device: Optional[str] = None,
1438
+ resolution: Optional[int] = None,
1111
1439
  ) -> List[TextElement]:
1112
1440
  """
1113
1441
  Extract text elements using OCR *without* adding them to the page's elements.
1114
1442
  Uses the shared OCRManager instance.
1443
+
1444
+ Args:
1445
+ engine: Name of the OCR engine.
1446
+ options: Engine-specific options object or dict.
1447
+ languages: List of engine-specific language codes.
1448
+ min_confidence: Minimum confidence threshold.
1449
+ device: Device to run OCR on.
1450
+ resolution: DPI resolution for rendering page image before OCR.
1451
+
1452
+ Returns:
1453
+ List of created TextElement objects derived from OCR results for this page.
1115
1454
  """
1116
1455
  if not self._ocr_manager:
1117
- logger.error(f"Page {self.number}: OCRManager not available. Cannot extract OCR elements.")
1118
- return []
1119
-
1456
+ logger.error(
1457
+ f"Page {self.number}: OCRManager not available. Cannot extract OCR elements."
1458
+ )
1459
+ return []
1460
+
1120
1461
  logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
1462
+
1463
+ # Determine rendering resolution
1464
+ final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1465
+ logger.debug(f" Using rendering resolution: {final_resolution} DPI")
1466
+
1121
1467
  try:
1122
- ocr_scale = getattr(self._parent, '_config', {}).get('ocr_image_scale', 2.0)
1123
- # Get base image without highlights
1124
- image = self.to_image(scale=ocr_scale, include_highlights=False)
1468
+ # Get base image without highlights using the determined resolution
1469
+ image = self.to_image(resolution=final_resolution, include_highlights=False)
1125
1470
  if not image:
1126
- logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1127
- return []
1471
+ logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1472
+ return []
1128
1473
  logger.debug(f" Rendered image size: {image.width}x{image.height}")
1129
1474
  except Exception as e:
1130
1475
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1131
1476
  return []
1132
-
1133
- manager_args = {'images': image, 'options': options, 'engine': engine}
1134
- if languages is not None: manager_args['languages'] = languages
1135
- if min_confidence is not None: manager_args['min_confidence'] = min_confidence
1136
- if device is not None: manager_args['device'] = device
1137
-
1138
- logger.debug(f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }")
1477
+
1478
+ # Prepare arguments for the OCR Manager call
1479
+ manager_args = {
1480
+ "images": image,
1481
+ "engine": engine,
1482
+ "languages": languages,
1483
+ "min_confidence": min_confidence,
1484
+ "device": device,
1485
+ "options": options
1486
+ }
1487
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
1488
+
1489
+ logger.debug(
1490
+ f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
1491
+ )
1139
1492
  try:
1140
1493
  # apply_ocr now returns List[List[Dict]] or List[Dict]
1141
1494
  results_list = self._ocr_manager.apply_ocr(**manager_args)
1142
1495
  # If it returned a list of lists (batch mode), take the first list
1143
- results = results_list[0] if isinstance(results_list, list) and results_list and isinstance(results_list[0], list) else results_list
1144
-
1496
+ results = (
1497
+ results_list[0]
1498
+ if isinstance(results_list, list)
1499
+ and results_list
1500
+ and isinstance(results_list[0], list)
1501
+ else results_list
1502
+ )
1145
1503
  if not isinstance(results, list):
1146
- logger.error(f" OCR Manager returned unexpected type: {type(results)}")
1147
- results = []
1504
+ logger.error(f" OCR Manager returned unexpected type: {type(results)}")
1505
+ results = []
1148
1506
  logger.info(f" OCR Manager returned {len(results)} results for extraction.")
1149
1507
  except Exception as e:
1150
- logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
1151
- return []
1152
-
1508
+ logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
1509
+ return []
1510
+
1153
1511
  # Convert results but DO NOT add to ElementManager
1154
1512
  logger.debug(f" Converting OCR results to TextElements (extract only)...")
1155
- # Use a temporary method to create elements without adding them globally
1156
1513
  temp_elements = []
1157
1514
  scale_x = self.width / image.width if image.width else 1
1158
1515
  scale_y = self.height / image.height if image.height else 1
1159
1516
  for result in results:
1160
- x0, top, x1, bottom = [float(c) for c in result['bbox']]
1161
- elem_data = {
1162
- 'text': result['text'], 'confidence': result['confidence'],
1163
- 'x0': x0 * scale_x, 'top': top * scale_y,
1164
- 'x1': x1 * scale_x, 'bottom': bottom * scale_y,
1165
- 'width': (x1 - x0) * scale_x, 'height': (bottom - top) * scale_y,
1166
- 'object_type': 'text', 'source': 'ocr',
1167
- 'fontname': 'OCR-temp', 'size': 10.0, 'page_number': self.number
1168
- }
1169
- temp_elements.append(TextElement(elem_data, self))
1517
+ try: # Added try-except around result processing
1518
+ x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1519
+ elem_data = {
1520
+ "text": result["text"],
1521
+ "confidence": result["confidence"],
1522
+ "x0": x0 * scale_x,
1523
+ "top": top * scale_y,
1524
+ "x1": x1 * scale_x,
1525
+ "bottom": bottom * scale_y,
1526
+ "width": (x1 - x0) * scale_x,
1527
+ "height": (bottom - top) * scale_y,
1528
+ "object_type": "text", # Using text for temporary elements
1529
+ "source": "ocr",
1530
+ "fontname": "OCR-extract", # Different name for clarity
1531
+ "size": 10.0,
1532
+ "page_number": self.number,
1533
+ }
1534
+ temp_elements.append(TextElement(elem_data, self))
1535
+ except (KeyError, ValueError, TypeError) as convert_err:
1536
+ logger.warning(f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")
1170
1537
 
1171
1538
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1172
1539
  return temp_elements
1173
-
1540
+
1174
1541
  @property
1175
1542
  def layout_analyzer(self) -> LayoutAnalyzer:
1176
1543
  """Get or create the layout analyzer for this page."""
1177
- if self._layout_analyzer is None:
1178
- if not self._layout_manager:
1179
- logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
1180
- return None
1181
- self._layout_analyzer = LayoutAnalyzer(self)
1182
- return self._layout_analyzer
1544
+ if self._layout_analyzer is None:
1545
+ if not self._layout_manager:
1546
+ logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
1547
+ return None
1548
+ self._layout_analyzer = LayoutAnalyzer(self)
1549
+ return self._layout_analyzer
1183
1550
 
1184
1551
  def analyze_layout(
1185
1552
  self,
@@ -1189,7 +1556,7 @@ class Page:
1189
1556
  classes: Optional[List[str]] = None,
1190
1557
  exclude_classes: Optional[List[str]] = None,
1191
1558
  device: Optional[str] = None,
1192
- existing: str = "replace"
1559
+ existing: str = "replace",
1193
1560
  ) -> ElementCollection[Region]:
1194
1561
  """
1195
1562
  Analyze the page layout using the configured LayoutManager.
@@ -1200,8 +1567,10 @@ class Page:
1200
1567
  """
1201
1568
  analyzer = self.layout_analyzer
1202
1569
  if not analyzer:
1203
- logger.error("Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?).")
1204
- return ElementCollection([]) # Return empty collection
1570
+ logger.error(
1571
+ "Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?)."
1572
+ )
1573
+ return ElementCollection([]) # Return empty collection
1205
1574
 
1206
1575
  # The analyzer's analyze_layout method already adds regions to the page
1207
1576
  # and its element manager. We just need to retrieve them.
@@ -1212,17 +1581,20 @@ class Page:
1212
1581
  classes=classes,
1213
1582
  exclude_classes=exclude_classes,
1214
1583
  device=device,
1215
- existing=existing
1584
+ existing=existing,
1216
1585
  )
1217
1586
 
1218
1587
  # Retrieve the detected regions from the element manager
1219
1588
  # Filter regions based on source='detected' and potentially the model used if available
1220
- detected_regions = [r for r in self._element_mgr.regions
1221
- if r.source == 'detected' and (not engine or getattr(r, 'model', None) == engine)]
1589
+ detected_regions = [
1590
+ r
1591
+ for r in self._element_mgr.regions
1592
+ if r.source == "detected" and (not engine or getattr(r, "model", None) == engine)
1593
+ ]
1222
1594
 
1223
1595
  return ElementCollection(detected_regions)
1224
1596
 
1225
- def clear_detected_layout_regions(self) -> 'Page':
1597
+ def clear_detected_layout_regions(self) -> "Page":
1226
1598
  """
1227
1599
  Removes all regions from this page that were added by layout analysis
1228
1600
  (i.e., regions where `source` attribute is 'detected').
@@ -1233,47 +1605,61 @@ class Page:
1233
1605
  Returns:
1234
1606
  Self for method chaining.
1235
1607
  """
1236
- if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
1237
- logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
1238
- self._regions['detected'] = [] # Ensure page's list is also clear
1239
- return self
1608
+ if (
1609
+ not hasattr(self._element_mgr, "regions")
1610
+ or not hasattr(self._element_mgr, "_elements")
1611
+ or "regions" not in self._element_mgr._elements
1612
+ ):
1613
+ logger.debug(
1614
+ f"Page {self.index}: No regions found in ElementManager, nothing to clear."
1615
+ )
1616
+ self._regions["detected"] = [] # Ensure page's list is also clear
1617
+ return self
1240
1618
 
1241
1619
  # Filter ElementManager's list to keep only non-detected regions
1242
1620
  original_count = len(self._element_mgr.regions)
1243
- self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
1621
+ self._element_mgr._elements["regions"] = [
1622
+ r for r in self._element_mgr.regions if getattr(r, "source", None) != "detected"
1623
+ ]
1244
1624
  new_count = len(self._element_mgr.regions)
1245
1625
  removed_count = original_count - new_count
1246
1626
 
1247
1627
  # Clear the page's specific list of detected regions
1248
- self._regions['detected'] = []
1628
+ self._regions["detected"] = []
1249
1629
 
1250
1630
  logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
1251
1631
  return self
1252
1632
 
1253
- def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
1633
+ def get_section_between(
1634
+ self, start_element=None, end_element=None, boundary_inclusion="both"
1635
+ ) -> Optional[Region]: # Return Optional
1254
1636
  """
1255
1637
  Get a section between two elements on this page.
1256
1638
  """
1257
1639
  # Create a full-page region to operate within
1258
1640
  page_region = self.create_region(0, 0, self.width, self.height)
1259
-
1641
+
1260
1642
  # Delegate to the region's method
1261
1643
  try:
1262
1644
  return page_region.get_section_between(
1263
1645
  start_element=start_element,
1264
1646
  end_element=end_element,
1265
- boundary_inclusion=boundary_inclusion
1647
+ boundary_inclusion=boundary_inclusion,
1266
1648
  )
1267
1649
  except Exception as e:
1268
- logger.error(f"Error getting section between elements on page {self.index}: {e}", exc_info=True)
1269
- return None
1270
-
1271
- def get_sections(self,
1272
- start_elements=None,
1273
- end_elements=None,
1274
- boundary_inclusion='both',
1275
- y_threshold=5.0,
1276
- bounding_box=None) -> 'ElementCollection[Region]': # Updated type hint
1650
+ logger.error(
1651
+ f"Error getting section between elements on page {self.index}: {e}", exc_info=True
1652
+ )
1653
+ return None
1654
+
1655
+ def get_sections(
1656
+ self,
1657
+ start_elements=None,
1658
+ end_elements=None,
1659
+ boundary_inclusion="both",
1660
+ y_threshold=5.0,
1661
+ bounding_box=None,
1662
+ ) -> "ElementCollection[Region]": # Updated type hint
1277
1663
  """
1278
1664
  Get sections of a page defined by start/end elements.
1279
1665
  Uses the page-level implementation.
@@ -1281,6 +1667,7 @@ class Page:
1281
1667
  Returns:
1282
1668
  An ElementCollection containing the found Region objects.
1283
1669
  """
1670
+
1284
1671
  # Helper function to get bounds from bounding_box parameter
1285
1672
  def get_bounds():
1286
1673
  if bounding_box:
@@ -1289,130 +1676,180 @@ class Page:
1289
1676
  return max(0, x0), max(0, top), min(self.width, x1), min(self.height, bottom)
1290
1677
  else:
1291
1678
  return 0, 0, self.width, self.height
1292
-
1679
+
1293
1680
  regions = []
1294
-
1681
+
1295
1682
  # Handle cases where elements are provided as strings (selectors)
1296
1683
  if isinstance(start_elements, str):
1297
- start_elements = self.find_all(start_elements).elements # Get list of elements
1298
- elif hasattr(start_elements, 'elements'): # Handle ElementCollection input
1299
- start_elements = start_elements.elements
1300
-
1684
+ start_elements = self.find_all(start_elements).elements # Get list of elements
1685
+ elif hasattr(start_elements, "elements"): # Handle ElementCollection input
1686
+ start_elements = start_elements.elements
1687
+
1301
1688
  if isinstance(end_elements, str):
1302
1689
  end_elements = self.find_all(end_elements).elements
1303
- elif hasattr(end_elements, 'elements'):
1304
- end_elements = end_elements.elements
1690
+ elif hasattr(end_elements, "elements"):
1691
+ end_elements = end_elements.elements
1305
1692
 
1306
1693
  # Ensure start_elements is a list
1307
- if start_elements is None: start_elements = []
1308
- if end_elements is None: end_elements = []
1694
+ if start_elements is None:
1695
+ start_elements = []
1696
+ if end_elements is None:
1697
+ end_elements = []
1309
1698
 
1310
- valid_inclusions = ['start', 'end', 'both', 'none']
1699
+ valid_inclusions = ["start", "end", "both", "none"]
1311
1700
  if boundary_inclusion not in valid_inclusions:
1312
1701
  raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")
1313
-
1702
+
1314
1703
  if not start_elements:
1315
1704
  # Return an empty ElementCollection if no start elements
1316
1705
  return ElementCollection([])
1317
-
1706
+
1318
1707
  # Combine start and end elements with their type
1319
1708
  all_boundaries = []
1320
- for el in start_elements: all_boundaries.append((el, 'start'))
1321
- for el in end_elements: all_boundaries.append((el, 'end'))
1322
-
1709
+ for el in start_elements:
1710
+ all_boundaries.append((el, "start"))
1711
+ for el in end_elements:
1712
+ all_boundaries.append((el, "end"))
1713
+
1323
1714
  # Sort all boundary elements primarily by top, then x0
1324
1715
  try:
1325
- all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
1716
+ all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
1326
1717
  except AttributeError as e:
1327
- logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
1328
- return ElementCollection([]) # Cannot proceed if elements lack position
1718
+ logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
1719
+ return ElementCollection([]) # Cannot proceed if elements lack position
1329
1720
 
1330
1721
  # Process sorted boundaries to find sections
1331
1722
  current_start_element = None
1332
1723
  active_section_started = False
1333
1724
 
1334
1725
  for element, element_type in all_boundaries:
1335
- if element_type == 'start':
1726
+ if element_type == "start":
1336
1727
  # If we have an active section, this start implicitly ends it
1337
1728
  if active_section_started:
1338
- end_boundary_el = element # Use this start as the end boundary
1729
+ end_boundary_el = element # Use this start as the end boundary
1339
1730
  # Determine region boundaries
1340
- sec_top = current_start_element.top if boundary_inclusion in ['start', 'both'] else current_start_element.bottom
1341
- sec_bottom = end_boundary_el.top if boundary_inclusion not in ['end', 'both'] else end_boundary_el.bottom
1342
-
1343
- if sec_top < sec_bottom: # Ensure valid region
1731
+ sec_top = (
1732
+ current_start_element.top
1733
+ if boundary_inclusion in ["start", "both"]
1734
+ else current_start_element.bottom
1735
+ )
1736
+ sec_bottom = (
1737
+ end_boundary_el.top
1738
+ if boundary_inclusion not in ["end", "both"]
1739
+ else end_boundary_el.bottom
1740
+ )
1741
+
1742
+ if sec_top < sec_bottom: # Ensure valid region
1344
1743
  x0, _, x1, _ = get_bounds()
1345
1744
  region = self.create_region(x0, sec_top, x1, sec_bottom)
1346
1745
  region.start_element = current_start_element
1347
- region.end_element = end_boundary_el # Mark the element that ended it
1348
- region.is_end_next_start = True # Mark how it ended
1746
+ region.end_element = end_boundary_el # Mark the element that ended it
1747
+ region.is_end_next_start = True # Mark how it ended
1349
1748
  regions.append(region)
1350
- active_section_started = False # Reset for the new start
1351
-
1749
+ active_section_started = False # Reset for the new start
1750
+
1352
1751
  # Set this as the potential start of the next section
1353
1752
  current_start_element = element
1354
1753
  active_section_started = True
1355
1754
 
1356
- elif element_type == 'end' and active_section_started:
1755
+ elif element_type == "end" and active_section_started:
1357
1756
  # We found an explicit end for the current section
1358
1757
  end_boundary_el = element
1359
- sec_top = current_start_element.top if boundary_inclusion in ['start', 'both'] else current_start_element.bottom
1360
- sec_bottom = end_boundary_el.bottom if boundary_inclusion in ['end', 'both'] else end_boundary_el.top
1361
-
1362
- if sec_top < sec_bottom: # Ensure valid region
1758
+ sec_top = (
1759
+ current_start_element.top
1760
+ if boundary_inclusion in ["start", "both"]
1761
+ else current_start_element.bottom
1762
+ )
1763
+ sec_bottom = (
1764
+ end_boundary_el.bottom
1765
+ if boundary_inclusion in ["end", "both"]
1766
+ else end_boundary_el.top
1767
+ )
1768
+
1769
+ if sec_top < sec_bottom: # Ensure valid region
1363
1770
  x0, _, x1, _ = get_bounds()
1364
1771
  region = self.create_region(x0, sec_top, x1, sec_bottom)
1365
1772
  region.start_element = current_start_element
1366
1773
  region.end_element = end_boundary_el
1367
1774
  region.is_end_next_start = False
1368
1775
  regions.append(region)
1369
-
1776
+
1370
1777
  # Reset: section ended explicitly
1371
1778
  current_start_element = None
1372
1779
  active_section_started = False
1373
-
1780
+
1374
1781
  # Handle the last section if it was started but never explicitly ended
1375
1782
  if active_section_started:
1376
- sec_top = current_start_element.top if boundary_inclusion in ['start', 'both'] else current_start_element.bottom
1783
+ sec_top = (
1784
+ current_start_element.top
1785
+ if boundary_inclusion in ["start", "both"]
1786
+ else current_start_element.bottom
1787
+ )
1377
1788
  x0, _, x1, page_bottom = get_bounds()
1378
1789
  if sec_top < page_bottom:
1379
- region = self.create_region(x0, sec_top, x1, page_bottom)
1380
- region.start_element = current_start_element
1381
- region.end_element = None # Ended by page end
1382
- region.is_end_next_start = False
1383
- regions.append(region)
1384
-
1790
+ region = self.create_region(x0, sec_top, x1, page_bottom)
1791
+ region.start_element = current_start_element
1792
+ region.end_element = None # Ended by page end
1793
+ region.is_end_next_start = False
1794
+ regions.append(region)
1795
+
1385
1796
  # Return the list wrapped in an ElementCollection
1386
1797
  return ElementCollection(regions)
1387
-
1798
+
1388
1799
  def __repr__(self) -> str:
1389
1800
  """String representation of the page."""
1390
1801
  return f"<Page number={self.number} index={self.index}>"
1391
-
1392
- def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1802
+
1803
+ def ask(
1804
+ self,
1805
+ question: str,
1806
+ min_confidence: float = 0.1,
1807
+ model: str = None,
1808
+ debug: bool = False,
1809
+ **kwargs,
1810
+ ) -> Dict[str, Any]:
1393
1811
  """
1394
1812
  Ask a question about the page content using document QA.
1395
1813
  """
1396
1814
  try:
1397
- from natural_pdf.qa.document_qa import get_qa_engine
1398
- # Get or initialize QA engine with specified model
1399
- qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1400
- # Ask the question using the QA engine
1401
- return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
1815
+ from natural_pdf.qa.document_qa import get_qa_engine
1816
+
1817
+ # Get or initialize QA engine with specified model
1818
+ qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1819
+ # Ask the question using the QA engine
1820
+ return qa_engine.ask_pdf_page(
1821
+ self, question, min_confidence=min_confidence, debug=debug, **kwargs
1822
+ )
1402
1823
  except ImportError:
1403
- logger.error("Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies.")
1404
- return {"answer": None, "confidence": 0.0, "found": False, "page_num": self.number, "source_elements": []}
1824
+ logger.error(
1825
+ "Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies."
1826
+ )
1827
+ return {
1828
+ "answer": None,
1829
+ "confidence": 0.0,
1830
+ "found": False,
1831
+ "page_num": self.number,
1832
+ "source_elements": [],
1833
+ }
1405
1834
  except Exception as e:
1406
- logger.error(f"Error during page.ask: {e}", exc_info=True)
1407
- return {"answer": None, "confidence": 0.0, "found": False, "page_num": self.number, "source_elements": []}
1835
+ logger.error(f"Error during page.ask: {e}", exc_info=True)
1836
+ return {
1837
+ "answer": None,
1838
+ "confidence": 0.0,
1839
+ "found": False,
1840
+ "page_num": self.number,
1841
+ "source_elements": [],
1842
+ }
1408
1843
 
1409
- def show_preview(self,
1410
- temporary_highlights: List[Dict],
1411
- scale: float = 2.0,
1412
- width: Optional[int] = None,
1413
- labels: bool = True,
1414
- legend_position: str = 'right',
1415
- render_ocr: bool = False) -> Optional[Image.Image]:
1844
+ def show_preview(
1845
+ self,
1846
+ temporary_highlights: List[Dict],
1847
+ scale: float = 2.0,
1848
+ width: Optional[int] = None,
1849
+ labels: bool = True,
1850
+ legend_position: str = "right",
1851
+ render_ocr: bool = False,
1852
+ ) -> Optional[Image.Image]:
1416
1853
  """
1417
1854
  Generates and returns a non-stateful preview image containing only
1418
1855
  the provided temporary highlights.
@@ -1437,13 +1874,16 @@ class Page:
1437
1874
  scale=scale,
1438
1875
  labels=labels,
1439
1876
  legend_position=legend_position,
1440
- render_ocr=render_ocr
1877
+ render_ocr=render_ocr,
1441
1878
  )
1442
1879
  except AttributeError:
1443
1880
  logger.error(f"HighlightingService does not have the required 'render_preview' method.")
1444
1881
  return None
1445
1882
  except Exception as e:
1446
- logger.error(f"Error calling highlighter.render_preview for page {self.index}: {e}", exc_info=True)
1883
+ logger.error(
1884
+ f"Error calling highlighter.render_preview for page {self.index}: {e}",
1885
+ exc_info=True,
1886
+ )
1447
1887
  return None
1448
1888
 
1449
1889
  # Return the rendered image directly
@@ -1451,7 +1891,7 @@ class Page:
1451
1891
 
1452
1892
  @property
1453
1893
  def text_style_labels(self) -> List[str]:
1454
- """
1894
+ """
1455
1895
  Get a sorted list of unique text style labels found on the page.
1456
1896
 
1457
1897
  Runs text style analysis with default options if it hasn't been run yet.
@@ -1461,52 +1901,66 @@ class Page:
1461
1901
  A sorted list of unique style label strings.
1462
1902
  """
1463
1903
  # Check if the summary attribute exists from a previous run
1464
- if not hasattr(self, '_text_styles_summary') or not self._text_styles_summary:
1904
+ if not hasattr(self, "_text_styles_summary") or not self._text_styles_summary:
1465
1905
  # If not, run the analysis with default options
1466
1906
  logger.debug(f"Page {self.number}: Running default text style analysis to get labels.")
1467
- self.analyze_text_styles() # Use default options
1907
+ self.analyze_text_styles() # Use default options
1468
1908
 
1469
1909
  # Extract labels from the summary dictionary
1470
- if hasattr(self, '_text_styles_summary') and self._text_styles_summary:
1910
+ if hasattr(self, "_text_styles_summary") and self._text_styles_summary:
1471
1911
  # The summary maps style_key -> {'label': ..., 'properties': ...}
1472
- labels = {style_info['label'] for style_info in self._text_styles_summary.values()}
1912
+ labels = {style_info["label"] for style_info in self._text_styles_summary.values()}
1473
1913
  return sorted(list(labels))
1474
1914
  else:
1475
1915
  # Fallback if summary wasn't created for some reason (e.g., no text elements)
1476
- logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
1477
- return []
1916
+ logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
1917
+ return []
1478
1918
 
1479
- def viewer(self,
1480
- # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
1481
- # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
1482
- ) -> 'SimpleInteractiveViewerWidget': # Return type hint updated
1919
+ def viewer(
1920
+ self,
1921
+ # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
1922
+ # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
1923
+ ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
1483
1924
  """
1484
1925
  Creates and returns an interactive ipywidget for exploring elements on this page.
1485
1926
 
1486
1927
  Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.
1487
1928
 
1488
1929
  Returns:
1489
- A SimpleInteractiveViewerWidget instance ready for display in Jupyter.
1930
+ A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
1931
+ or None if ipywidgets is not installed or widget creation fails.
1490
1932
 
1491
1933
  Raises:
1492
- RuntimeError: If required dependencies (ipywidgets) are missing.
1934
+ # Optional: Could raise ImportError instead of returning None
1935
+ # ImportError: If required dependencies (ipywidgets) are missing.
1493
1936
  ValueError: If image rendering or data preparation fails within from_page.
1494
1937
  """
1495
- # Dynamically import here if needed, or ensure it's globally available
1938
+ # Check for availability using the imported flag and class variable
1939
+ if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
1940
+ logger.error(
1941
+ "Interactive viewer requires optional dependencies ('ipywidgets'). "
1942
+ "Install with `pip install natural-pdf[interactive]`"
1943
+ )
1944
+ # raise ImportError("ipywidgets not found.") # Option 1: Raise error
1945
+ return None # Option 2: Return None gracefully
1946
+
1947
+ # If we reach here, SimpleInteractiveViewerWidget should be the actual class
1496
1948
  try:
1497
- from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
1498
- except ImportError:
1499
- logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
1500
- raise
1501
-
1502
- # Pass self (the Page object) to the factory method
1503
- return SimpleInteractiveViewerWidget.from_page(self)
1949
+ # Pass self (the Page object) to the factory method
1950
+ return SimpleInteractiveViewerWidget.from_page(self)
1951
+ except Exception as e:
1952
+ # Catch potential errors during widget creation (e.g., image rendering)
1953
+ logger.error(
1954
+ f"Error creating viewer widget from page {self.number}: {e}", exc_info=True
1955
+ )
1956
+ # raise # Option 1: Re-raise error (might include ValueError from from_page)
1957
+ return None # Option 2: Return None on creation error
1504
1958
 
1505
1959
  # --- Indexable Protocol Methods ---
1506
1960
  def get_id(self) -> str:
1507
1961
  """Returns a unique identifier for the page (required by Indexable protocol)."""
1508
1962
  # Ensure path is safe for use in IDs (replace problematic chars)
1509
- safe_path = re.sub(r'[^a-zA-Z0-9_-]', '_', str(self.pdf.path))
1963
+ safe_path = re.sub(r"[^a-zA-Z0-9_-]", "_", str(self.pdf.path))
1510
1964
  return f"pdf_{safe_path}_page_{self.page_number}"
1511
1965
 
1512
1966
  def get_metadata(self) -> Dict[str, Any]:
@@ -1517,21 +1971,90 @@ class Page:
1517
1971
  "page_number": self.page_number,
1518
1972
  "width": self.width,
1519
1973
  "height": self.height,
1520
- "content_hash": self.get_content_hash() # Include the hash
1974
+ "content_hash": self.get_content_hash(), # Include the hash
1521
1975
  }
1522
1976
  return metadata
1523
1977
 
1524
- def get_content(self) -> 'Page':
1978
+ def get_content(self) -> "Page":
1525
1979
  """
1526
1980
  Returns the primary content object (self) for indexing (required by Indexable protocol).
1527
1981
  SearchService implementations decide how to process this (e.g., call extract_text).
1528
1982
  """
1529
- return self # Return the Page object itself
1983
+ return self # Return the Page object itself
1530
1984
 
1531
1985
  def get_content_hash(self) -> str:
1532
1986
  """Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
1533
1987
  # Hash the extracted text (without exclusions for consistency)
1534
1988
  # Consider if exclusions should be part of the hash? For now, hash raw text.
1535
1989
  # Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
1536
- text_content = self.extract_text(use_exclusions=False, preserve_whitespace=False) # Normalize whitespace?
1537
- return hashlib.sha256(text_content.encode('utf-8')).hexdigest()
1990
+ text_content = self.extract_text(
1991
+ use_exclusions=False, preserve_whitespace=False
1992
+ ) # Normalize whitespace?
1993
+ return hashlib.sha256(text_content.encode("utf-8")).hexdigest()
1994
+
1995
+ # --- New Method: save_searchable ---
1996
+ def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1997
+ """
1998
+ Saves the PDF page with an OCR text layer, making content searchable.
1999
+
2000
+ Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
2001
+
2002
+ Note: OCR must have been applied to the pages beforehand
2003
+ (e.g., pdf.apply_ocr()).
2004
+
2005
+ Args:
2006
+ output_path: Path to save the searchable PDF.
2007
+ dpi: Resolution for rendering and OCR overlay (default 300).
2008
+ **kwargs: Additional keyword arguments passed to the exporter.
2009
+ """
2010
+ # Import moved here, assuming it's always available now
2011
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
2012
+
2013
+ # Convert pathlib.Path to string if necessary
2014
+ output_path_str = str(output_path)
2015
+
2016
+ create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
2017
+ logger.info(f"Searchable PDF saved to: {output_path_str}")
2018
+
2019
+ # --- Added correct_ocr method ---
2020
+ def correct_ocr(
2021
+ self,
2022
+ correction_callback: Callable[[Any], Optional[str]],
2023
+ ) -> "Page": # Return self for chaining
2024
+ """
2025
+ Applies corrections to OCR-generated text elements on this page
2026
+ using a user-provided callback function.
2027
+
2028
+ Finds text elements on this page whose 'source' attribute starts
2029
+ with 'ocr' and calls the `correction_callback` for each, passing the
2030
+ element itself.
2031
+
2032
+ The `correction_callback` should contain the logic to:
2033
+ 1. Determine if the element needs correction.
2034
+ 2. Perform the correction (e.g., call an LLM).
2035
+ 3. Return the new text (`str`) or `None`.
2036
+
2037
+ If the callback returns a string, the element's `.text` is updated.
2038
+ Metadata updates (source, confidence, etc.) should happen within the callback.
2039
+
2040
+ Args:
2041
+ correction_callback: A function accepting an element and returning
2042
+ `Optional[str]` (new text or None).
2043
+
2044
+ Returns:
2045
+ Self for method chaining.
2046
+ """
2047
+ logger.info(f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'")
2048
+
2049
+ # Find OCR elements specifically on this page
2050
+ # Note: We typically want to correct even if the element falls in an excluded area
2051
+ target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
2052
+
2053
+ # Delegate to the utility function
2054
+ _apply_ocr_correction_to_elements(
2055
+ elements=target_elements, # Pass the ElementCollection directly
2056
+ correction_callback=correction_callback,
2057
+ caller_info=f"Page({self.number})", # Pass caller info
2058
+ )
2059
+
2060
+ return self # Return self for chaining