natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff covers the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,62 +1,88 @@
1
- import pdfplumber
1
+ import copy # Add import for deepcopy
2
2
  import logging
3
- import tempfile
4
3
  import os
5
4
  import re
5
+ import tempfile
6
6
  import urllib.request
7
- from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable, TYPE_CHECKING # Added Iterable and TYPE_CHECKING
8
- from pathlib import Path # Added Path
9
- import copy # Add import for deepcopy
7
+ from pathlib import Path # Added Path
8
+ from typing import ( # Added Iterable and TYPE_CHECKING
9
+ TYPE_CHECKING,
10
+ Any,
11
+ Callable,
12
+ Dict,
13
+ Iterable,
14
+ List,
15
+ Optional,
16
+ Tuple,
17
+ Type,
18
+ Union,
19
+ )
20
+ from pathlib import Path
21
+
22
+
23
+ import pdfplumber
10
24
  from PIL import Image
11
25
 
26
+ from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
27
+ LayoutManager,
28
+ )
29
+ from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
12
30
  from natural_pdf.core.page import Page
13
- from natural_pdf.selectors.parser import parse_selector
14
31
  from natural_pdf.elements.collections import ElementCollection
15
32
  from natural_pdf.elements.region import Region
16
33
  from natural_pdf.ocr import OCRManager, OCROptions
17
- from natural_pdf.analyzers.layout.layout_manager import LayoutManager # Import the new LayoutManager
18
- from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
34
+ from natural_pdf.selectors.parser import parse_selector
19
35
 
20
36
  # Import the flag directly - this should always work
21
37
 
22
38
  # --- Add Search Service Imports (needed for new methods) ---
23
39
  try:
40
+ from typing import Any as TypingAny # Import Any if not already
41
+
42
+ from natural_pdf.search import TextSearchOptions # Keep for ask default
24
43
  from natural_pdf.search import (
25
- get_search_service,
26
- SearchServiceProtocol,
44
+ BaseSearchOptions,
27
45
  SearchOptions,
28
- TextSearchOptions, # Keep for ask default
29
- BaseSearchOptions
46
+ SearchServiceProtocol,
47
+ get_search_service,
30
48
  )
31
- from typing import Any as TypingAny # Import Any if not already
32
49
  except ImportError:
33
50
  # Define dummies if needed for type hints within the class
34
51
  SearchServiceProtocol = object
35
52
  SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
36
53
  TypingAny = object
54
+
37
55
  # Dummy factory needed for default arg in methods
38
56
  def get_search_service(**kwargs) -> SearchServiceProtocol:
39
- raise ImportError("Search dependencies are not installed. Install with: pip install natural-pdf[search]")
57
+ raise ImportError(
58
+ "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
59
+ )
60
+
40
61
 
41
62
  # --- End Search Service Imports ---
42
63
 
43
64
  # Set up logger early
44
65
  logger = logging.getLogger("natural_pdf.core.pdf")
45
66
 
67
+
46
68
  class PDF:
47
69
  """
48
70
  Enhanced PDF wrapper built on top of pdfplumber.
49
-
71
+
50
72
  This class provides a fluent interface for working with PDF documents,
51
73
  with improved selection, navigation, and extraction capabilities.
52
74
  """
53
-
54
- def __init__(self, path_or_url: str, reading_order: bool = True,
55
- font_attrs: Optional[List[str]] = None,
56
- keep_spaces: bool = True):
75
+
76
+ def __init__(
77
+ self,
78
+ path_or_url: str,
79
+ reading_order: bool = True,
80
+ font_attrs: Optional[List[str]] = None,
81
+ keep_spaces: bool = True,
82
+ ):
57
83
  """
58
84
  Initialize the enhanced PDF object.
59
-
85
+
60
86
  Args:
61
87
  path_or_url: Path to the PDF file or a URL to a PDF
62
88
  reading_order: Whether to use natural reading order
@@ -69,30 +95,30 @@ class PDF:
69
95
  False: Break text at spaces, each word is separate (legacy behavior)
70
96
  """
71
97
  # Check if the input is a URL
72
- is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
73
-
98
+ is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
99
+
74
100
  # Initialize path-related attributes
75
101
  self._original_path = path_or_url
76
102
  self._temp_file = None
77
- self._resolved_path = None # Store the actual path used by pdfplumber
103
+ self._resolved_path = None # Store the actual path used by pdfplumber
78
104
 
79
105
  if is_url:
80
106
  logger.info(f"Downloading PDF from URL: {path_or_url}")
81
107
  try:
82
108
  # Create a temporary file to store the downloaded PDF
83
- self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
84
-
109
+ self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
110
+
85
111
  # Download the PDF
86
112
  with urllib.request.urlopen(path_or_url) as response:
87
113
  self._temp_file.write(response.read())
88
114
  self._temp_file.flush()
89
115
  self._temp_file.close()
90
-
116
+
91
117
  # Use the temporary file path
92
118
  self._resolved_path = self._temp_file.name
93
119
  logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
94
120
  except Exception as e:
95
- if self._temp_file and hasattr(self._temp_file, 'name'):
121
+ if self._temp_file and hasattr(self._temp_file, "name"):
96
122
  try:
97
123
  os.unlink(self._temp_file.name)
98
124
  except:
@@ -104,40 +130,46 @@ class PDF:
104
130
  self._resolved_path = path_or_url
105
131
 
106
132
  logger.info(f"Initializing PDF from {self._resolved_path}")
107
- logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
108
-
133
+ logger.debug(
134
+ f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
135
+ )
136
+
109
137
  try:
110
138
  self._pdf = pdfplumber.open(self._resolved_path)
111
139
  except Exception as e:
112
- logger.error(f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}", exc_info=True)
113
- # Clean up temp file if creation failed
114
- self.close()
115
- raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
140
+ logger.error(
141
+ f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
142
+ exc_info=True,
143
+ )
144
+ # Clean up temp file if creation failed
145
+ self.close()
146
+ raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
116
147
 
117
- self._path = self._resolved_path # Keep original path too?
118
- self.path = self._resolved_path # Public attribute for the resolved path
119
- self.source_path = self._original_path # Public attribute for the user-provided path/URL
148
+ self._path = self._resolved_path # Keep original path too?
149
+ self.path = self._resolved_path # Public attribute for the resolved path
150
+ self.source_path = self._original_path # Public attribute for the user-provided path/URL
120
151
 
121
152
  self._reading_order = reading_order
122
- self._config = {
123
- 'keep_spaces': keep_spaces
124
- }
153
+ self._config = {"keep_spaces": keep_spaces}
125
154
 
126
155
  self._font_attrs = font_attrs # Store the font attribute configuration
127
156
 
128
157
  # Initialize Managers and Services (conditionally available)
129
158
  self._ocr_manager = OCRManager() if OCRManager else None
130
159
  self._layout_manager = LayoutManager() if LayoutManager else None
131
- self.highlighter = HighlightingService(self)
160
+ self.highlighter = HighlightingService(self)
132
161
 
133
162
  # Initialize pages last, passing necessary refs
134
- self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
163
+ self._pages = [
164
+ Page(p, parent=self, index=i, font_attrs=font_attrs)
165
+ for i, p in enumerate(self._pdf.pages)
166
+ ]
135
167
 
136
168
  # Other state
137
169
  self._element_cache = {}
138
170
  self._exclusions = [] # List to store exclusion functions/regions
139
171
  self._regions = [] # List to store region functions/definitions
140
-
172
+
141
173
  logger.info("Initialized HighlightingService.")
142
174
  logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
143
175
 
@@ -147,45 +179,48 @@ class PDF:
147
179
  return self._pdf.metadata
148
180
 
149
181
  @property
150
- def pages(self) -> 'PageCollection':
182
+ def pages(self) -> "PageCollection":
151
183
  """Access pages as a PageCollection object."""
152
184
  from natural_pdf.elements.collections import PageCollection
185
+
153
186
  # Ensure _pages is initialized
154
- if not hasattr(self, '_pages'):
155
- raise AttributeError("PDF pages not yet initialized.")
187
+ if not hasattr(self, "_pages"):
188
+ raise AttributeError("PDF pages not yet initialized.")
156
189
  return PageCollection(self._pages)
157
-
158
- def clear_exclusions(self) -> 'PDF':
190
+
191
+ def clear_exclusions(self) -> "PDF":
159
192
  """
160
193
  Clear all exclusion functions from the PDF.
161
-
194
+
162
195
  Returns:
163
196
  Self for method chaining
164
197
  """
165
198
  # Ensure _pages is initialized
166
- if not hasattr(self, '_pages'):
167
- raise AttributeError("PDF pages not yet initialized.")
199
+ if not hasattr(self, "_pages"):
200
+ raise AttributeError("PDF pages not yet initialized.")
168
201
 
169
202
  self._exclusions = []
170
203
  # Also clear from pages
171
204
  for page in self._pages:
172
205
  page.clear_exclusions()
173
206
  return self
174
-
175
- def add_exclusion(self, exclusion_func: Callable[['Page'], Optional[Region]], label: str = None) -> 'PDF':
207
+
208
+ def add_exclusion(
209
+ self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
210
+ ) -> "PDF":
176
211
  """
177
212
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
178
-
213
+
179
214
  Args:
180
215
  exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
181
216
  label: Optional label for this exclusion
182
-
217
+
183
218
  Returns:
184
219
  Self for method chaining
185
220
  """
186
221
  # Ensure _pages is initialized
187
- if not hasattr(self, '_pages'):
188
- raise AttributeError("PDF pages not yet initialized.")
222
+ if not hasattr(self, "_pages"):
223
+ raise AttributeError("PDF pages not yet initialized.")
189
224
 
190
225
  # Store exclusion with its label at PDF level
191
226
  exclusion_data = (exclusion_func, label)
@@ -202,12 +237,17 @@ class PDF:
202
237
  self,
203
238
  pages: Optional[Union[Iterable[int], range, slice]] = None,
204
239
  engine: Optional[str] = None,
205
- options: Optional['OCROptions'] = None,
240
+ # --- Common OCR Parameters (Direct Arguments) ---
206
241
  languages: Optional[List[str]] = None,
207
- min_confidence: Optional[float] = None,
242
+ min_confidence: Optional[float] = None, # Min confidence threshold
208
243
  device: Optional[str] = None,
209
- # Add other simple mode args if needed
210
- ) -> 'PDF':
244
+ resolution: Optional[int] = None, # DPI for rendering before OCR
245
+ apply_exclusions: bool = True, # New parameter
246
+ detect_only: bool = False,
247
+ # --- Engine-Specific Options --- Use 'options=' for this
248
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
249
+ # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
250
+ ) -> "PDF":
211
251
  """
212
252
  Applies OCR to specified pages (or all pages) of the PDF using batch processing.
213
253
 
@@ -217,42 +257,54 @@ class PDF:
217
257
  Args:
218
258
  pages: An iterable of 0-based page indices (list, range, tuple),
219
259
  a slice object, or None to process all pages.
220
- engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
221
- Uses manager's default if None. Ignored if 'options' is provided.
222
- options: An specific Options object (e.g., EasyOCROptions) for
223
- advanced configuration. Overrides simple arguments.
224
- languages: List of language codes for simple mode.
225
- min_confidence: Minimum confidence threshold for simple mode.
226
- device: Device string ('cpu', 'cuda', etc.) for simple mode.
260
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
261
+ Uses manager's default ('easyocr') if None.
262
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
263
+ **Must be codes understood by the specific selected engine.**
264
+ No mapping is performed. Overrides manager/engine default.
265
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
266
+ Overrides manager/engine default.
267
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
268
+ Overrides manager/engine default.
269
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
270
+ Affects input quality for OCR. Defaults to 150 if not set.
271
+ apply_exclusions: If True (default), render page image for OCR with
272
+ excluded areas masked (whited out). If False, OCR
273
+ the raw page image without masking exclusions.
274
+ detect_only: If True, only detect text bounding boxes, don't perform OCR.
275
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict
276
+ containing parameters specific to the chosen engine.
227
277
 
228
278
  Returns:
229
279
  Self for method chaining.
230
280
 
231
281
  Raises:
232
- ValueError: If page indices are invalid or the engine name is invalid.
233
- TypeError: If unexpected keyword arguments are provided in simple mode.
282
+ ValueError: If page indices are invalid.
283
+ TypeError: If 'options' is not compatible with the engine.
234
284
  RuntimeError: If the OCRManager or selected engine is not available.
235
285
  """
236
286
  if not self._ocr_manager:
237
- logger.error("OCRManager not available. Cannot apply OCR.")
238
- # Or raise RuntimeError("OCRManager not initialized.")
239
- return self
287
+ logger.error("OCRManager not available. Cannot apply OCR.")
288
+ # Or raise RuntimeError("OCRManager not initialized.")
289
+ return self
240
290
 
241
- # --- Determine Target Pages ---
291
+ # --- Determine Target Pages (unchanged) ---
242
292
  target_pages: List[Page] = []
243
293
  if pages is None:
244
294
  target_pages = self._pages
245
295
  elif isinstance(pages, slice):
246
296
  target_pages = self._pages[pages]
247
- elif hasattr(pages, '__iter__'): # Check if it's iterable (list, range, tuple, etc.)
297
+ elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
248
298
  try:
249
299
  target_pages = [self._pages[i] for i in pages]
250
300
  except IndexError:
251
301
  raise ValueError("Invalid page index provided in 'pages' iterable.")
252
302
  except TypeError:
253
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
303
+ raise TypeError(
304
+ "'pages' must be None, a slice, or an iterable of page indices (int)."
305
+ )
254
306
  else:
255
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
307
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
256
308
 
257
309
  if not target_pages:
258
310
  logger.warning("No pages selected for OCR processing.")
@@ -260,101 +312,129 @@ class PDF:
260
312
 
261
313
  page_numbers = [p.number for p in target_pages]
262
314
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
315
+ # --- Determine Rendering Resolution ---
316
+ # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
317
+ final_resolution = resolution # Use direct arg if provided
318
+ if final_resolution is None:
319
+ final_resolution = getattr(self, "_config", {}).get("resolution", 150)
320
+
321
+ logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
263
322
 
264
323
  # --- Render Images for Batch ---
265
324
  images_pil: List[Image.Image] = []
266
- page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
267
- logger.info(f"Rendering {len(target_pages)} pages to images...")
268
- failed_page_num = 'unknown' # Keep track of potentially failing page
325
+ page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
326
+ logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
327
+ failed_page_num = "unknown" # Keep track of potentially failing page
269
328
  try:
270
- ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
271
329
  for i, page in enumerate(target_pages):
272
- failed_page_num = page.number # Update current page number in case of error
330
+ failed_page_num = page.number # Update current page number in case of error
273
331
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
274
- # Use page.to_image but ensure highlights are off for OCR base image
275
- img = page.to_image(scale=ocr_scale, include_highlights=False)
332
+ # Use the determined final_resolution and apply exclusions if requested
333
+ to_image_kwargs = {
334
+ "resolution": final_resolution,
335
+ "include_highlights": False,
336
+ "exclusions": "mask" if apply_exclusions else None,
337
+ }
338
+ img = page.to_image(**to_image_kwargs)
339
+ if img is None:
340
+ logger.error(f" Failed to render page {page.number} to image.")
341
+ # Decide how to handle: skip page, raise error? For now, skip.
342
+ continue # Skip this page if rendering failed
276
343
  images_pil.append(img)
277
- page_image_map.append((page, img)) # Store pair
344
+ page_image_map.append((page, img)) # Store pair
278
345
  except Exception as e:
279
346
  logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
280
347
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
281
348
 
282
- if not images_pil:
283
- logger.error("No images were successfully rendered for batch OCR.")
284
- return self
349
+ if not images_pil or not page_image_map:
350
+ logger.error("No images were successfully rendered for batch OCR.")
351
+ return self
285
352
 
286
353
  # --- Prepare Arguments for Manager ---
287
- manager_args = {'images': images_pil, 'options': options, 'engine': engine}
288
- simple_args = {}
289
- if languages is not None: simple_args['languages'] = languages
290
- if min_confidence is not None: simple_args['min_confidence'] = min_confidence
291
- if device is not None: simple_args['device'] = device
292
- manager_args.update(simple_args) # Add simple args if options not provided
354
+ # Pass common args directly, engine-specific via options
355
+ manager_args = {
356
+ "images": images_pil,
357
+ "engine": engine,
358
+ "languages": languages,
359
+ "min_confidence": min_confidence, # Use the renamed parameter
360
+ "device": device,
361
+ "options": options,
362
+ "detect_only": detect_only,
363
+ # Note: resolution is used for rendering, not passed to OCR manager directly
364
+ }
365
+ # Filter out None values so manager can use its defaults
366
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
293
367
 
294
368
  # --- Call OCR Manager for Batch Processing ---
295
- logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
369
+ logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
296
370
  try:
297
- # The manager's apply_ocr handles the batch input and returns List[List[Dict]]
371
+ # Manager's apply_ocr signature needs to accept common args directly
298
372
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
299
373
 
300
374
  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
301
- logger.error(f"OCR Manager returned unexpected result format or length for batch processing. "
302
- f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
303
- f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}.")
304
- # Handle error - maybe return early or try processing valid parts?
305
- return self # Return self without adding elements
375
+ logger.error(
376
+ f"OCR Manager returned unexpected result format or length for batch processing. "
377
+ f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
378
+ f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
379
+ )
380
+ return self
306
381
 
307
382
  logger.info("OCR Manager batch processing complete.")
308
383
 
309
384
  except Exception as e:
310
- logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
311
- return self # Return self without adding elements
385
+ logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
386
+ return self
312
387
 
313
- # --- Distribute Results and Add Elements to Pages ---
388
+ # --- Distribute Results and Add Elements to Pages (unchanged) ---
314
389
  logger.info("Adding OCR results to respective pages...")
315
390
  total_elements_added = 0
316
391
  for i, (page, img) in enumerate(page_image_map):
317
392
  results_for_page = batch_results[i]
318
393
  if not isinstance(results_for_page, list):
319
- logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
320
- continue
394
+ logger.warning(
395
+ f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
396
+ )
397
+ continue
321
398
 
322
399
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
323
- # Use the page's element manager to create elements from its results
324
- # Changed from page._create_text_elements_from_ocr to use element_mgr
325
400
  try:
326
- # Calculate scale factors based on rendered image vs page dims
327
- img_scale_x = page.width / img.width if img.width > 0 else 1
328
- img_scale_y = page.height / img.height if img.height > 0 else 1
329
- elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img_scale_x, img_scale_y)
330
-
331
- if elements:
332
- # Note: element_mgr.create_text_elements_from_ocr already adds them
333
- total_elements_added += len(elements)
334
- logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
335
- else:
336
- logger.debug(f" No valid TextElements created for page {page.number}.")
401
+ img_scale_x = page.width / img.width if img.width > 0 else 1
402
+ img_scale_y = page.height / img.height if img.height > 0 else 1
403
+ elements = page._element_mgr.create_text_elements_from_ocr(
404
+ results_for_page, img_scale_x, img_scale_y
405
+ )
406
+
407
+ if elements:
408
+ total_elements_added += len(elements)
409
+ logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
410
+ else:
411
+ logger.debug(f" No valid TextElements created for page {page.number}.")
337
412
  except Exception as e:
338
- logger.error(f" Error adding OCR elements to page {page.number}: {e}", exc_info=True)
339
- # Continue to next page
413
+ logger.error(
414
+ f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
415
+ )
340
416
 
341
- logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
417
+ logger.info(
418
+ f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
419
+ )
342
420
  return self
343
-
344
- def add_region(self, region_func: Callable[['Page'], Optional[Region]], name: str = None) -> 'PDF':
421
+
422
+ def add_region(
423
+ self, region_func: Callable[["Page"], Optional[Region]], name: str = None
424
+ ) -> "PDF":
345
425
  """
346
426
  Add a region function to the PDF. This creates regions on all pages using the provided function.
347
-
427
+
348
428
  Args:
349
429
  region_func: A function that takes a Page and returns a Region, or None.
350
430
  name: Optional name for the region
351
-
431
+
352
432
  Returns:
353
433
  Self for method chaining
354
434
  """
355
435
  # Ensure _pages is initialized
356
- if not hasattr(self, '_pages'):
357
- raise AttributeError("PDF pages not yet initialized.")
436
+ if not hasattr(self, "_pages"):
437
+ raise AttributeError("PDF pages not yet initialized.")
358
438
 
359
439
  # Store region with its name at PDF level
360
440
  region_data = (region_func, name)
@@ -367,93 +447,108 @@ class PDF:
367
447
  region_instance = region_func(page)
368
448
  if region_instance and isinstance(region_instance, Region):
369
449
  # If a valid region is returned, add it to the page
370
- page.add_region(region_instance, name=name, source='named')
450
+ page.add_region(region_instance, name=name, source="named")
371
451
  elif region_instance is not None:
372
- logger.warning(f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}")
452
+ logger.warning(
453
+ f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
454
+ )
373
455
  except Exception as e:
374
- logger.error(f"Error executing or adding region function for page {page.number}: {e}", exc_info=True)
456
+ logger.error(
457
+ f"Error executing or adding region function for page {page.number}: {e}",
458
+ exc_info=True,
459
+ )
375
460
 
376
461
  return self
377
-
378
- def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Optional[Any]:
462
+
463
+ def find(
464
+ self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
465
+ ) -> Optional[Any]:
379
466
  """
380
467
  Find the first element matching the selector.
381
-
468
+
382
469
  Args:
383
470
  selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
384
471
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
385
472
  regex: Whether to use regex for text search in :contains (default: False)
386
473
  case: Whether to do case-sensitive text search (default: True)
387
474
  **kwargs: Additional filter parameters
388
-
475
+
389
476
  Returns:
390
477
  Element object or None if not found
391
478
  """
392
479
  # Ensure _pages is initialized
393
- if not hasattr(self, '_pages'):
394
- raise AttributeError("PDF pages not yet initialized.")
480
+ if not hasattr(self, "_pages"):
481
+ raise AttributeError("PDF pages not yet initialized.")
395
482
 
396
483
  selector_obj = parse_selector(selector)
397
-
484
+
398
485
  # Pass regex and case flags to selector function
399
- kwargs['regex'] = regex
400
- kwargs['case'] = case
401
-
402
- results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs)
486
+ kwargs["regex"] = regex
487
+ kwargs["case"] = case
488
+
489
+ results = self._apply_selector(
490
+ selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
491
+ )
403
492
  return results.first if results else None
404
-
405
- def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
493
+
494
+ def find_all(
495
+ self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
496
+ ) -> ElementCollection:
406
497
  """
407
498
  Find all elements matching the selector.
408
-
499
+
409
500
  Args:
410
501
  selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
411
502
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
412
503
  regex: Whether to use regex for text search in :contains (default: False)
413
504
  case: Whether to do case-sensitive text search (default: True)
414
505
  **kwargs: Additional filter parameters
415
-
506
+
416
507
  Returns:
417
508
  ElementCollection with matching elements
418
509
  """
419
510
  # Ensure _pages is initialized
420
- if not hasattr(self, '_pages'):
421
- raise AttributeError("PDF pages not yet initialized.")
511
+ if not hasattr(self, "_pages"):
512
+ raise AttributeError("PDF pages not yet initialized.")
422
513
 
423
514
  selector_obj = parse_selector(selector)
424
-
515
+
425
516
  # Pass regex and case flags to selector function
426
- kwargs['regex'] = regex
427
- kwargs['case'] = case
428
-
429
- results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs)
517
+ kwargs["regex"] = regex
518
+ kwargs["case"] = case
519
+
520
+ results = self._apply_selector(
521
+ selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
522
+ )
430
523
  return results
431
-
432
- def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs) -> ElementCollection:
524
+
525
+ def _apply_selector(
526
+ self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
527
+ ) -> ElementCollection:
433
528
  """
434
529
  Apply selector to PDF elements across all pages.
435
-
530
+
436
531
  Args:
437
532
  selector_obj: Parsed selector dictionary
438
533
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
439
534
  first_only: If True, stop searching after the first match is found.
440
535
  **kwargs: Additional filter parameters
441
-
536
+
442
537
  Returns:
443
538
  ElementCollection of matching elements
444
539
  """
445
540
  from natural_pdf.elements.collections import ElementCollection
446
-
541
+
447
542
  # Determine page range to search
448
- page_indices = kwargs.get('pages', range(len(self._pages)))
543
+ page_indices = kwargs.get("pages", range(len(self._pages)))
449
544
  if isinstance(page_indices, int):
450
545
  page_indices = [page_indices]
451
546
  elif isinstance(page_indices, slice):
452
547
  page_indices = range(*page_indices.indices(len(self._pages)))
453
548
 
454
549
  # Check for cross-page pseudo-classes (currently not supported)
455
- for pseudo in selector_obj.get('pseudo_classes', []):
456
- if pseudo.get('name') in ('spans', 'continues'):
550
+ for pseudo in selector_obj.get("pseudo_classes", []):
551
+ if pseudo.get("name") in ("spans", "continues"):
457
552
  logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
458
553
  return ElementCollection([])
459
554
 
@@ -464,134 +559,148 @@ class PDF:
464
559
  page = self._pages[page_idx]
465
560
  # Pass first_only down to page._apply_selector
466
561
  page_elements_collection = page._apply_selector(
467
- selector_obj,
468
- apply_exclusions=apply_exclusions,
469
- first_only=first_only,
470
- **kwargs
562
+ selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
471
563
  )
472
564
  if page_elements_collection:
473
- page_elements = page_elements_collection.elements
474
- all_elements.extend(page_elements)
475
- # If we only need the first match overall, and we found one on this page, stop
476
- if first_only and page_elements:
477
- break # Stop iterating through pages
565
+ page_elements = page_elements_collection.elements
566
+ all_elements.extend(page_elements)
567
+ # If we only need the first match overall, and we found one on this page, stop
568
+ if first_only and page_elements:
569
+ break # Stop iterating through pages
478
570
  else:
479
- logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
571
+ logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
480
572
 
481
573
  # Create a combined collection
482
574
  combined = ElementCollection(all_elements)
483
575
 
484
576
  # Sort in document order if requested and not first_only (already sorted by page)
485
- if not first_only and kwargs.get('document_order', True):
577
+ if not first_only and kwargs.get("document_order", True):
486
578
  # Check if elements have page, top, x0 before sorting
487
- if all(hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0') for el in combined.elements):
488
- combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
579
+ if all(
580
+ hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
581
+ for el in combined.elements
582
+ ):
583
+ combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
489
584
  else:
490
- # Elements might be Regions without inherent sorting order yet
491
- # Attempt sorting by page index if possible
492
- try:
493
- combined.sort(key=lambda el: el.page.index)
494
- except AttributeError:
495
- logger.warning("Cannot sort elements in document order: Missing required attributes (e.g., page).")
585
+ # Elements might be Regions without inherent sorting order yet
586
+ # Attempt sorting by page index if possible
587
+ try:
588
+ combined.sort(key=lambda el: el.page.index)
589
+ except AttributeError:
590
+ logger.warning(
591
+ "Cannot sort elements in document order: Missing required attributes (e.g., page)."
592
+ )
496
593
 
497
594
  return combined
498
-
499
- def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
500
- use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
595
+
596
+ def extract_text(
597
+ self,
598
+ selector: Optional[str] = None,
599
+ preserve_whitespace=True,
600
+ use_exclusions=True,
601
+ debug_exclusions=False,
602
+ **kwargs,
603
+ ) -> str:
501
604
  """
502
605
  Extract text from the entire document or matching elements.
503
-
606
+
504
607
  Args:
505
608
  selector: Optional selector to filter elements
506
609
  preserve_whitespace: Whether to keep blank characters (default: True)
507
610
  use_exclusions: Whether to apply exclusion regions (default: True)
508
611
  debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
509
612
  **kwargs: Additional extraction parameters
510
-
613
+
511
614
  Returns:
512
615
  Extracted text as string
513
616
  """
514
617
  # Ensure _pages is initialized
515
- if not hasattr(self, '_pages'):
516
- raise AttributeError("PDF pages not yet initialized.")
618
+ if not hasattr(self, "_pages"):
619
+ raise AttributeError("PDF pages not yet initialized.")
517
620
 
518
621
  # If selector is provided, find elements first
519
622
  if selector:
520
623
  elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
521
624
  return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
522
-
625
+
523
626
  # Otherwise extract from all pages
524
627
  if debug_exclusions:
525
628
  print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
526
629
  print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
527
-
630
+
528
631
  texts = []
529
632
  for page in self.pages:
530
- texts.append(page.extract_text(
531
- preserve_whitespace=preserve_whitespace,
532
- use_exclusions=use_exclusions,
533
- debug_exclusions=debug_exclusions,
534
- **kwargs
535
- ))
536
-
633
+ texts.append(
634
+ page.extract_text(
635
+ preserve_whitespace=preserve_whitespace,
636
+ use_exclusions=use_exclusions,
637
+ debug_exclusions=debug_exclusions,
638
+ **kwargs,
639
+ )
640
+ )
641
+
537
642
  if debug_exclusions:
538
643
  print(f"PDF: Combined {len(texts)} pages of text")
539
-
644
+
540
645
  return "\n".join(texts)
541
-
646
+
542
647
  def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
543
648
  """
544
649
  Shorthand for finding elements and extracting their text.
545
-
650
+
546
651
  Args:
547
652
  selector: CSS-like selector string
548
653
  preserve_whitespace: Whether to keep blank characters (default: True)
549
654
  **kwargs: Additional extraction parameters
550
-
655
+
551
656
  Returns:
552
657
  Extracted text from matching elements
553
658
  """
554
659
  # Ensure _pages is initialized
555
- if not hasattr(self, '_pages'):
556
- raise AttributeError("PDF pages not yet initialized.")
557
- return self.extract_text(selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs) # apply_exclusions is handled by find_all in extract_text
558
-
559
- def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
660
+ if not hasattr(self, "_pages"):
661
+ raise AttributeError("PDF pages not yet initialized.")
662
+ return self.extract_text(
663
+ selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
664
+ ) # apply_exclusions is handled by find_all in extract_text
665
+
666
+ def extract_tables(
667
+ self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
668
+ ) -> List[Any]:
560
669
  """
561
670
  Extract tables from the document or matching elements.
562
-
671
+
563
672
  Args:
564
673
  selector: Optional selector to filter tables
565
674
  merge_across_pages: Whether to merge tables that span across pages
566
675
  **kwargs: Additional extraction parameters
567
-
676
+
568
677
  Returns:
569
678
  List of extracted tables
570
679
  """
571
680
  # Ensure _pages is initialized
572
- if not hasattr(self, '_pages'):
573
- raise AttributeError("PDF pages not yet initialized.")
681
+ if not hasattr(self, "_pages"):
682
+ raise AttributeError("PDF pages not yet initialized.")
574
683
  # TODO: Implement table extraction
575
684
  logger.warning("PDF.extract_tables is not fully implemented yet.")
576
685
  all_tables = []
577
686
  for page in self.pages:
578
- # Assuming page.extract_tables(**kwargs) exists or is added
579
- if hasattr(page, 'extract_tables'):
580
- all_tables.extend(page.extract_tables(**kwargs))
581
- else:
582
- logger.debug(f"Page {page.number} does not have extract_tables method.")
687
+ # Assuming page.extract_tables(**kwargs) exists or is added
688
+ if hasattr(page, "extract_tables"):
689
+ all_tables.extend(page.extract_tables(**kwargs))
690
+ else:
691
+ logger.debug(f"Page {page.number} does not have extract_tables method.")
583
692
  # Placeholder filtering
584
693
  if selector:
585
694
  logger.warning("Filtering extracted tables by selector is not implemented.")
586
695
  # Would need to parse selector and filter the list `all_tables`
587
696
  # Placeholder merging
588
697
  if merge_across_pages:
589
- logger.warning("Merging tables across pages is not implemented.")
590
- # Would need logic to detect and merge related tables
698
+ logger.warning("Merging tables across pages is not implemented.")
699
+ # Would need logic to detect and merge related tables
591
700
  return all_tables
592
-
701
+
593
702
  # --- New Method: save_searchable ---
594
- def save_searchable(self, output_path: Union[str, 'Path'], dpi: int = 300, **kwargs):
703
+ def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
595
704
  """
596
705
  Saves the PDF with an OCR text layer, making content searchable.
597
706
 
@@ -608,15 +717,6 @@ class PDF:
608
717
  # Import moved here, assuming it's always available now
609
718
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
610
719
 
611
- # TODO: Need a reliable way for Page to signal if it has OCR elements.
612
- # This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
613
- # or checking if page.get_elements(source='ocr') returns anything.
614
- # For now, we pass through and let the exporter handle pages without OCR elements.
615
- # if not any(page.get_elements(source='ocr') for page in self.pages):
616
- # logger.warning("No OCR elements found on pages. "
617
- # "Ensure apply_ocr() was called. "
618
- # "Output PDF might not be searchable.")
619
-
620
720
  # Convert pathlib.Path to string if necessary
621
721
  output_path_str = str(output_path)
622
722
 
@@ -625,15 +725,18 @@ class PDF:
625
725
 
626
726
  # --- End New Method ---
627
727
 
628
- def ask(self, question: str,
629
- mode: str = "extractive",
630
- pages: Union[int, List[int], range] = None,
631
- min_confidence: float = 0.1,
632
- model: str = None,
633
- **kwargs) -> Dict[str, Any]:
728
+ def ask(
729
+ self,
730
+ question: str,
731
+ mode: str = "extractive",
732
+ pages: Union[int, List[int], range] = None,
733
+ min_confidence: float = 0.1,
734
+ model: str = None,
735
+ **kwargs,
736
+ ) -> Dict[str, Any]:
634
737
  """
635
738
  Ask a question about the document content.
636
-
739
+
637
740
  Args:
638
741
  question: Question to ask about the document
639
742
  mode: "extractive" to extract answer from document, "generative" to generate
@@ -641,16 +744,16 @@ class PDF:
641
744
  min_confidence: Minimum confidence threshold for answers
642
745
  model: Optional model name for question answering
643
746
  **kwargs: Additional parameters passed to the QA engine
644
-
747
+
645
748
  Returns:
646
749
  A dictionary containing the answer, confidence, and other metadata.
647
750
  Result will have an 'answer' key containing the answer text.
648
751
  """
649
752
  from natural_pdf.qa import get_qa_engine
650
-
753
+
651
754
  # Initialize or get QA engine
652
755
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
653
-
756
+
654
757
  # Determine which pages to query
655
758
  if pages is None:
656
759
  target_pages = list(range(len(self.pages)))
@@ -662,43 +765,40 @@ class PDF:
662
765
  target_pages = pages
663
766
  else:
664
767
  raise ValueError(f"Invalid pages parameter: {pages}")
665
-
768
+
666
769
  # Actually query each page and gather results
667
770
  results = []
668
771
  for page_idx in target_pages:
669
772
  if 0 <= page_idx < len(self.pages):
670
773
  page = self.pages[page_idx]
671
774
  page_result = qa_engine.ask_pdf_page(
672
- page=page,
673
- question=question,
674
- min_confidence=min_confidence,
675
- **kwargs
775
+ page=page, question=question, min_confidence=min_confidence, **kwargs
676
776
  )
677
-
777
+
678
778
  # Add to results if it found an answer
679
779
  if page_result and page_result.get("found", False):
680
780
  results.append(page_result)
681
-
781
+
682
782
  # Sort results by confidence
683
783
  results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
684
-
784
+
685
785
  # Return the best result, or a default result if none found
686
786
  if results:
687
787
  return results[0]
688
788
  else:
689
789
  # Return a structure indicating no answer found
690
790
  return {
691
- "answer": None,
692
- "confidence": 0.0,
693
- "found": False,
694
- "page_num": None, # Or maybe the pages searched?
695
- "source_elements": []
791
+ "answer": None,
792
+ "confidence": 0.0,
793
+ "found": False,
794
+ "page_num": None, # Or maybe the pages searched?
795
+ "source_elements": [],
696
796
  }
697
797
 
698
798
  def search_within_index(
699
799
  self,
700
800
  query: Union[str, Path, Image.Image, Region],
701
- search_service: SearchServiceProtocol, # Now required
801
+ search_service: SearchServiceProtocol, # Now required
702
802
  options: Optional[SearchOptions] = None,
703
803
  ) -> List[Dict[str, Any]]:
704
804
  """
@@ -730,14 +830,16 @@ class PDF:
730
830
  RuntimeError: For other search failures.
731
831
  """
732
832
  if not search_service:
733
- raise ValueError("A configured SearchServiceProtocol instance must be provided.")
833
+ raise ValueError("A configured SearchServiceProtocol instance must be provided.")
734
834
  # Optional stricter check:
735
835
  # if not isinstance(search_service, SearchServiceProtocol):
736
836
  # raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
737
837
 
738
838
  # Get collection name from service for logging
739
- collection_name = getattr(search_service, 'collection_name', '<Unknown Collection>')
740
- logger.info(f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}.")
839
+ collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
840
+ logger.info(
841
+ f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
842
+ )
741
843
 
742
844
  # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
743
845
  # service: SearchServiceProtocol
@@ -748,7 +850,7 @@ class PDF:
748
850
  # factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
749
851
  # # TODO: Pass embedding model from options/pdf config if needed?
750
852
  # service = get_search_service(**factory_args)
751
- service = search_service # Use validated provided service
853
+ service = search_service # Use validated provided service
752
854
 
753
855
  # --- 2. Prepare Query and Options ---
754
856
  query_input = query
@@ -757,119 +859,220 @@ class PDF:
757
859
 
758
860
  # Handle Region query - extract text for now
759
861
  if isinstance(query, Region):
760
- logger.debug("Query is a Region object. Extracting text.")
761
- if not isinstance(effective_options, TextSearchOptions):
762
- logger.warning("Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction.")
763
- query_input = query.extract_text()
764
- if not query_input or query_input.isspace():
765
- logger.error("Region has no extractable text for query.")
766
- return []
862
+ logger.debug("Query is a Region object. Extracting text.")
863
+ if not isinstance(effective_options, TextSearchOptions):
864
+ logger.warning(
865
+ "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
866
+ )
867
+ query_input = query.extract_text()
868
+ if not query_input or query_input.isspace():
869
+ logger.error("Region has no extractable text for query.")
870
+ return []
767
871
 
768
872
  # --- 3. Add Filter to Scope Search to THIS PDF ---
769
873
  # Assume metadata field 'pdf_path' stores the resolved path used during indexing
770
874
  pdf_scope_filter = {
771
- "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
772
- "operator": "eq",
773
- "value": self.path # Use the resolved path of this PDF instance
875
+ "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
876
+ "operator": "eq",
877
+ "value": self.path, # Use the resolved path of this PDF instance
774
878
  }
775
879
  logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
776
880
 
777
881
  # Combine with existing filters in options (if any)
778
882
  if effective_options.filters:
779
- logger.debug(f"Combining PDF scope filter with existing filters: {effective_options.filters}")
883
+ logger.debug(
884
+ f"Combining PDF scope filter with existing filters: {effective_options.filters}"
885
+ )
780
886
  # Assume filters are compatible with the underlying search service
781
887
  # If existing filters aren't already in an AND block, wrap them
782
- if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
783
- # Already an AND block, just append the condition
784
- effective_options.filters["conditions"].append(pdf_scope_filter)
888
+ if (
889
+ isinstance(effective_options.filters, dict)
890
+ and effective_options.filters.get("operator") == "AND"
891
+ ):
892
+ # Already an AND block, just append the condition
893
+ effective_options.filters["conditions"].append(pdf_scope_filter)
785
894
  elif isinstance(effective_options.filters, list):
786
- # Assume list represents implicit AND conditions
787
- effective_options.filters = {"operator": "AND", "conditions": effective_options.filters + [pdf_scope_filter]}
788
- elif isinstance(effective_options.filters, dict): # Single filter dict
789
- effective_options.filters = {"operator": "AND", "conditions": [effective_options.filters, pdf_scope_filter]}
895
+ # Assume list represents implicit AND conditions
896
+ effective_options.filters = {
897
+ "operator": "AND",
898
+ "conditions": effective_options.filters + [pdf_scope_filter],
899
+ }
900
+ elif isinstance(effective_options.filters, dict): # Single filter dict
901
+ effective_options.filters = {
902
+ "operator": "AND",
903
+ "conditions": [effective_options.filters, pdf_scope_filter],
904
+ }
790
905
  else:
791
- logger.warning(f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter.")
792
- effective_options.filters = pdf_scope_filter
906
+ logger.warning(
907
+ f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
908
+ )
909
+ effective_options.filters = pdf_scope_filter
793
910
  else:
794
911
  effective_options.filters = pdf_scope_filter
795
912
 
796
913
  logger.debug(f"Final filters for service search: {effective_options.filters}")
797
914
 
798
-         # --- 4. Call SearchService ---
+         # --- 4. Call SearchService ---
          try:
              # Call the service's search method (no collection_name needed)
              results = service.search(
                  query=query_input,
                  options=effective_options,
              )
-             logger.info(f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'.")
+             logger.info(
+                 f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
+             )
              return results
          except FileNotFoundError as fnf:
-             logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
-             raise # Re-raise specific error
+             logger.error(
+                 f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
+             )
+             raise # Re-raise specific error
+         except Exception as e:
+             logger.error(
+                 f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
+                 exc_info=True,
+             )
+             raise RuntimeError(
+                 f"Search within index failed for PDF '{self.path}'. See logs for details."
+             ) from e
+
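With the reworked error handling above, a missing collection still surfaces as `FileNotFoundError`, while any other service failure is wrapped in a `RuntimeError`. A hedged usage sketch (the exact signature of the PDF-level search helper is assumed here, not shown in this hunk):

```python
# Sketch only: assumes the surrounding method is exposed as pdf.search(query).
try:
    results = pdf.search("total revenue for 2022")
except FileNotFoundError:
    # The collection has not been indexed yet for this search service.
    print("Index not found - build the index before searching.")
except RuntimeError as exc:
    # Any other SearchService failure is re-raised with this wrapper.
    print(f"Search failed: {exc}")
```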
+     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
+         """
+         Exports OCR results from this PDF into a correction task package (zip file).
+
+         Args:
+             output_zip_path: The path to save the output zip file.
+             **kwargs: Additional arguments passed to create_correction_task_package
+                       (e.g., image_render_scale, overwrite).
+         """
+         try:
+             from natural_pdf.utils.packaging import create_correction_task_package
+             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
+         except ImportError:
+             logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+             # Or raise
          except Exception as e:
-             logger.error(f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}", exc_info=True)
-             raise RuntimeError(f"Search within index failed for PDF '{self.path}'. See logs for details.") from e
+             logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
+             raise # Re-raise the exception from the utility function
+
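A minimal usage sketch for the new `export_ocr_correction_task` helper; the keyword arguments shown (`image_render_scale`, `overwrite`) come from the docstring above and are simply forwarded to `create_correction_task_package`, with illustrative values:

```python
# Package this PDF's OCR output into a correction task zip.
pdf.export_ocr_correction_task(
    "correction_task.zip",
    image_render_scale=2.0,  # illustrative value
    overwrite=True,          # illustrative value
)
```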
+     def correct_ocr(
+         self,
+         correction_callback: Callable[[Any], Optional[str]],
+         pages: Optional[Union[Iterable[int], range, slice]] = None,
+     ) -> "PDF": # Return self for chaining
+         """
+         Applies corrections to OCR-generated text elements using a callback function,
+         delegating the core work to the `Page.correct_ocr` method.
+
+         Args:
+             correction_callback: A function that accepts a single argument (an element
+                                  object) and returns `Optional[str]`. It returns the
+                                  corrected text string if an update is needed, otherwise None.
+             pages: Optional page indices/slice to limit the scope of correction
+                    (default: all pages).
+
+         Returns:
+             Self for method chaining.
+         """
+         # Determine target pages
+         target_page_indices: List[int] = []
+         if pages is None:
+             target_page_indices = list(range(len(self._pages)))
+         elif isinstance(pages, slice):
+             target_page_indices = list(range(*pages.indices(len(self._pages))))
+         elif hasattr(pages, "__iter__"):
+             try:
+                 target_page_indices = [int(i) for i in pages]
+                 # Validate indices
+                 for idx in target_page_indices:
+                     if not (0 <= idx < len(self._pages)):
+                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
+             except (IndexError, TypeError, ValueError) as e:
+                 raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
+         else:
+             raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+
+         if not target_page_indices:
+             logger.warning("No pages selected for OCR correction.")
+             return self
+
+         logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
+
+         # Iterate through target pages and call their correct_ocr method
+         for page_idx in target_page_indices:
+             page = self._pages[page_idx]
+             try:
+                 page.correct_ocr(correction_callback=correction_callback)
+             except Exception as e:
+                 logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
+                 # Optionally re-raise or just log and continue
+
+         logger.info(f"OCR correction process finished for requested pages.")
+         return self

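A minimal sketch of a correction callback matching the contract above: it receives one OCR element and returns the corrected string, or `None` to leave the element untouched. Reading the element's text via a `.text` attribute is an assumption about the element API, not something shown in this hunk:

```python
def fix_common_ocr_errors(element):
    # Assumed attribute: element.text holds the OCR-recognized string.
    corrected = element.text.replace("0ffice", "Office")
    return corrected if corrected != element.text else None

# Apply the callback to the first three pages only; returns the PDF for chaining.
pdf.correct_ocr(fix_common_ocr_errors, pages=range(0, 3))
```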
      def __len__(self) -> int:
          """Return the number of pages in the PDF."""
          # Ensure _pages is initialized
-         if not hasattr(self, '_pages'):
+         if not hasattr(self, "_pages"):
              # Return 0 or raise error if not fully initialized? Let's return 0.
-             return 0
+             return 0
          return len(self._pages)
-
-     def __getitem__(self, key) -> Union[Page, 'PageCollection']: # Return PageCollection for slice
+
+     def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
          """Access pages by index or slice."""
          # Check if self._pages has been initialized
-         if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not initialized yet.")
+         if not hasattr(self, "_pages"):
+             raise AttributeError("PDF pages not initialized yet.")
          if isinstance(key, slice):
-             # Return a PageCollection slice
-             from natural_pdf.elements.collections import PageCollection
-             return PageCollection(self._pages[key])
+             # Return a PageCollection slice
+             from natural_pdf.elements.collections import PageCollection
+
+             return PageCollection(self._pages[key])
          # Check index bounds before accessing
          if isinstance(key, int):
              if 0 <= key < len(self._pages):
-                 return self._pages[key]
+                 return self._pages[key]
              else:
-                 raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
+                 raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
          else:
-             raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
-
+             raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
+
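The `__len__`/`__getitem__` changes above keep the established access pattern: integer indexing returns a single `Page`, slicing returns a `PageCollection`. A short sketch:

```python
print(len(pdf))          # number of pages
first_page = pdf[0]      # a single Page
first_three = pdf[0:3]   # a PageCollection built from the slice
```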
      def close(self):
          """Close the underlying PDF file and clean up any temporary files."""
-         if hasattr(self, '_pdf') and self._pdf is not None:
-             try:
-                 self._pdf.close()
-                 logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
-             except Exception as e:
-                 logger.warning(f"Error closing pdfplumber object: {e}")
-             finally:
-                 self._pdf = None
+         if hasattr(self, "_pdf") and self._pdf is not None:
+             try:
+                 self._pdf.close()
+                 logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
+             except Exception as e:
+                 logger.warning(f"Error closing pdfplumber object: {e}")
+             finally:
+                 self._pdf = None

          # Clean up temporary file if it exists
-         if hasattr(self, '_temp_file') and self._temp_file is not None:
+         if hasattr(self, "_temp_file") and self._temp_file is not None:
              temp_file_path = None
              try:
-                 if hasattr(self._temp_file, 'name') and self._temp_file.name:
-                     temp_file_path = self._temp_file.name
-                     if os.path.exists(temp_file_path):
-                         os.unlink(temp_file_path)
-                         logger.debug(f"Removed temporary PDF file: {temp_file_path}")
+                 if hasattr(self._temp_file, "name") and self._temp_file.name:
+                     temp_file_path = self._temp_file.name
+                     if os.path.exists(temp_file_path):
+                         os.unlink(temp_file_path)
+                         logger.debug(f"Removed temporary PDF file: {temp_file_path}")
              except Exception as e:
-                 logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
+                 logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
              finally:
-                 self._temp_file = None
+                 self._temp_file = None

      def __enter__(self):
          """Context manager entry."""
          return self
-
+
      def __exit__(self, exc_type, exc_val, exc_tb):
          """Context manager exit."""
          self.close()

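Because `close()` releases the underlying pdfplumber handle and deletes any temporary file, the context-manager protocol above makes cleanup automatic. A sketch, assuming the top-level `PDF` import and a `Page.extract_text()` method provided elsewhere in the package:

```python
from natural_pdf import PDF  # import path assumed from the package layout

with PDF("document.pdf") as pdf:
    text = pdf[0].extract_text()
# __exit__ has called close(): the pdfplumber object and any temp file are cleaned up.
```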
-     # --- Added TYPE_CHECKING import (if not already present) ---
-     if TYPE_CHECKING:
-         from pathlib import Path # Assuming Path is used for type hint
+
+     # --- Indexable Protocol Methods --- Needed for search/sync
+     def get_id(self) -> str:
+         return self.path
+