natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,11 +1,14 @@
1
- import copy # Add import for deepcopy
1
+ import copy
2
+ import io
2
3
  import logging
3
4
  import os
4
5
  import re
5
6
  import tempfile
7
+ import threading
8
+ import time
6
9
  import urllib.request
7
- from pathlib import Path # Added Path
8
- from typing import ( # Added Iterable and TYPE_CHECKING
10
+ from pathlib import Path
11
+ from typing import (
9
12
  TYPE_CHECKING,
10
13
  Any,
11
14
  Callable,
@@ -16,56 +19,72 @@ from typing import ( # Added Iterable and TYPE_CHECKING
16
19
  Tuple,
17
20
  Type,
18
21
  Union,
22
+ overload,
19
23
  )
20
- from pathlib import Path
21
-
22
24
 
23
25
  import pdfplumber
24
26
  from PIL import Image
25
27
 
26
- from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
27
- LayoutManager,
28
- )
29
- from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
30
- from natural_pdf.core.page import Page
31
- from natural_pdf.elements.collections import ElementCollection
28
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
29
+ from natural_pdf.classification.manager import ClassificationError, ClassificationManager
30
+ from natural_pdf.classification.mixin import ClassificationMixin
31
+ from natural_pdf.classification.results import ClassificationResult
32
+ from natural_pdf.core.highlighting_service import HighlightingService
33
+ from natural_pdf.elements.base import Element
32
34
  from natural_pdf.elements.region import Region
35
+ from natural_pdf.export.mixin import ExportMixin
36
+ from natural_pdf.extraction.manager import StructuredDataManager
37
+ from natural_pdf.extraction.mixin import ExtractionMixin
33
38
  from natural_pdf.ocr import OCRManager, OCROptions
34
39
  from natural_pdf.selectors.parser import parse_selector
40
+ from natural_pdf.utils.locks import pdf_render_lock
41
+ from natural_pdf.utils.tqdm_utils import get_tqdm
35
42
 
36
- # Import the flag directly - this should always work
37
-
38
- # --- Add Search Service Imports (needed for new methods) ---
39
43
  try:
40
- from typing import Any as TypingAny # Import Any if not already
44
+ from typing import Any as TypingAny
41
45
 
42
- from natural_pdf.search import TextSearchOptions # Keep for ask default
43
46
  from natural_pdf.search import (
44
47
  BaseSearchOptions,
45
48
  SearchOptions,
46
49
  SearchServiceProtocol,
50
+ TextSearchOptions,
47
51
  get_search_service,
48
52
  )
49
53
  except ImportError:
50
- # Define dummies if needed for type hints within the class
51
54
  SearchServiceProtocol = object
52
55
  SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
53
56
  TypingAny = object
54
57
 
55
- # Dummy factory needed for default arg in methods
56
58
  def get_search_service(**kwargs) -> SearchServiceProtocol:
57
59
  raise ImportError(
58
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
59
61
  )
60
62
 
61
63
 
62
- # --- End Search Service Imports ---
63
-
64
- # Set up logger early
65
64
  logger = logging.getLogger("natural_pdf.core.pdf")
65
+ tqdm = get_tqdm()
66
66
 
67
+ DEFAULT_MANAGERS = {
68
+ "classification": ClassificationManager,
69
+ "structured_data": StructuredDataManager,
70
+ }
67
71
 
68
- class PDF:
72
+ # Deskew Imports (Conditional)
73
+ import numpy as np
74
+ from PIL import Image
75
+
76
+ try:
77
+ import img2pdf
78
+ from deskew import determine_skew
79
+
80
+ DESKEW_AVAILABLE = True
81
+ except ImportError:
82
+ DESKEW_AVAILABLE = False
83
+ img2pdf = None
84
+ # End Deskew Imports
85
+
86
+
87
+ class PDF(ExtractionMixin, ExportMixin):
69
88
  """
70
89
  Enhanced PDF wrapper built on top of pdfplumber.
71
90
 
@@ -75,7 +94,7 @@ class PDF:
75
94
 
76
95
  def __init__(
77
96
  self,
78
- path_or_url: str,
97
+ path_or_url_or_stream,
79
98
  reading_order: bool = True,
80
99
  font_attrs: Optional[List[str]] = None,
81
100
  keep_spaces: bool = True,
@@ -84,95 +103,132 @@ class PDF:
84
103
  Initialize the enhanced PDF object.
85
104
 
86
105
  Args:
87
- path_or_url: Path to the PDF file or a URL to a PDF
106
+ path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
88
107
  reading_order: Whether to use natural reading order
89
- font_attrs: Font attributes to consider when grouping characters into words.
90
- Default: ['fontname', 'size'] (Group by font name and size)
91
- None: Only consider spatial relationships
92
- List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
93
- keep_spaces: Whether to include spaces in word elements (default: True).
94
- True: Spaces are part of words, better for multi-word searching
95
- False: Break text at spaces, each word is separate (legacy behavior)
108
+ font_attrs: Font attributes for grouping characters into words
109
+ keep_spaces: Whether to include spaces in word elements
96
110
  """
97
- # Check if the input is a URL
98
- is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
99
-
100
- # Initialize path-related attributes
101
- self._original_path = path_or_url
111
+ self._original_path_or_stream = path_or_url_or_stream
102
112
  self._temp_file = None
103
- self._resolved_path = None # Store the actual path used by pdfplumber
104
-
105
- if is_url:
106
- logger.info(f"Downloading PDF from URL: {path_or_url}")
107
- try:
108
- # Create a temporary file to store the downloaded PDF
109
- self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
110
-
111
- # Download the PDF
112
- with urllib.request.urlopen(path_or_url) as response:
113
- self._temp_file.write(response.read())
114
- self._temp_file.flush()
115
- self._temp_file.close()
116
-
117
- # Use the temporary file path
118
- self._resolved_path = self._temp_file.name
119
- logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
120
- except Exception as e:
121
- if self._temp_file and hasattr(self._temp_file, "name"):
122
- try:
123
- os.unlink(self._temp_file.name)
124
- except:
125
- pass
126
- logger.error(f"Failed to download PDF from URL: {e}")
127
- raise ValueError(f"Failed to download PDF from URL: {e}")
113
+ self._resolved_path = None
114
+ self._is_stream = False
115
+ stream_to_open = None
116
+
117
+ if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
118
+ logger.info("Initializing PDF from in-memory stream.")
119
+ self._is_stream = True
120
+ self._resolved_path = None # No resolved file path for streams
121
+ self.source_path = "<stream>" # Identifier for source
122
+ self.path = self.source_path # Use source identifier as path for streams
123
+ stream_to_open = path_or_url_or_stream
124
+ elif isinstance(path_or_url_or_stream, (str, Path)):
125
+ path_or_url = str(path_or_url_or_stream)
126
+ self.source_path = path_or_url # Store original path/URL as source
127
+ is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
128
+
129
+ if is_url:
130
+ logger.info(f"Downloading PDF from URL: {path_or_url}")
131
+ try:
132
+ # Use a context manager for the temporary file
133
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
134
+ self._temp_file = temp_f # Store reference if needed for cleanup
135
+ with urllib.request.urlopen(path_or_url) as response:
136
+ temp_f.write(response.read())
137
+ temp_f.flush()
138
+ self._resolved_path = temp_f.name
139
+ logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
140
+ stream_to_open = self._resolved_path
141
+ except Exception as e:
142
+ if self._temp_file and hasattr(self._temp_file, "name"):
143
+ try:
144
+ os.unlink(self._temp_file.name)
145
+ except: # noqa E722
146
+ pass
147
+ logger.error(f"Failed to download PDF from URL: {e}")
148
+ raise ValueError(f"Failed to download PDF from URL: {e}")
149
+ else:
150
+ self._resolved_path = str(Path(path_or_url).resolve()) # Resolve local paths
151
+ stream_to_open = self._resolved_path
152
+ self.path = self._resolved_path # Use resolved path for file-based PDFs
128
153
  else:
129
- # Use the provided path directly
130
- self._resolved_path = path_or_url
154
+ raise TypeError(
155
+ f"Invalid input type: {type(path_or_url_or_stream)}. "
156
+ f"Expected path (str/Path), URL (str), or file-like object."
157
+ )
131
158
 
132
- logger.info(f"Initializing PDF from {self._resolved_path}")
159
+ logger.info(f"Opening PDF source: {self.source_path}")
133
160
  logger.debug(
134
161
  f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
135
162
  )
136
163
 
137
164
  try:
138
- self._pdf = pdfplumber.open(self._resolved_path)
165
+ self._pdf = pdfplumber.open(stream_to_open)
139
166
  except Exception as e:
140
- logger.error(
141
- f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
142
- exc_info=True,
143
- )
144
- # Clean up temp file if creation failed
145
- self.close()
146
- raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
147
-
148
- self._path = self._resolved_path # Keep original path too?
149
- self.path = self._resolved_path # Public attribute for the resolved path
150
- self.source_path = self._original_path # Public attribute for the user-provided path/URL
167
+ logger.error(f"Failed to open PDF: {e}", exc_info=True)
168
+ self.close() # Attempt cleanup if opening fails
169
+ raise IOError(f"Failed to open PDF source: {self.source_path}") from e
151
170
 
171
+ # Store configuration used for initialization
152
172
  self._reading_order = reading_order
153
173
  self._config = {"keep_spaces": keep_spaces}
174
+ self._font_attrs = font_attrs
154
175
 
155
- self._font_attrs = font_attrs # Store the font attribute configuration
156
-
157
- # Initialize Managers and Services (conditionally available)
158
176
  self._ocr_manager = OCRManager() if OCRManager else None
159
177
  self._layout_manager = LayoutManager() if LayoutManager else None
160
178
  self.highlighter = HighlightingService(self)
179
+ # self._classification_manager_instance = ClassificationManager() # Removed this line
180
+ self._manager_registry = {}
181
+
182
+ from natural_pdf.core.page import Page
161
183
 
162
- # Initialize pages last, passing necessary refs
163
184
  self._pages = [
164
185
  Page(p, parent=self, index=i, font_attrs=font_attrs)
165
186
  for i, p in enumerate(self._pdf.pages)
166
187
  ]
167
188
 
168
- # Other state
169
189
  self._element_cache = {}
170
- self._exclusions = [] # List to store exclusion functions/regions
171
- self._regions = [] # List to store region functions/definitions
190
+ self._exclusions = []
191
+ self._regions = []
172
192
 
173
- logger.info("Initialized HighlightingService.")
174
193
  logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
175
194
 
195
+ self._initialize_managers()
196
+ self._initialize_highlighter()
197
+
198
+ def _initialize_managers(self):
199
+ """Initialize manager instances based on DEFAULT_MANAGERS."""
200
+ self._managers = {}
201
+ for key, manager_class in DEFAULT_MANAGERS.items():
202
+ try:
203
+ self._managers[key] = manager_class()
204
+ logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
205
+ except Exception as e:
206
+ logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
207
+ self._managers[key] = None
208
+
209
+ def get_manager(self, key: str) -> Any:
210
+ """Retrieve a manager instance by its key."""
211
+ if key not in self._managers:
212
+ raise KeyError(
213
+ f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
214
+ )
215
+
216
+ manager_instance = self._managers.get(key)
217
+
218
+ if manager_instance is None:
219
+ manager_class = DEFAULT_MANAGERS.get(key)
220
+ if manager_class:
221
+ raise RuntimeError(
222
+ f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
223
+ )
224
+ else:
225
+ raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
226
+
227
+ return manager_instance
228
+
229
+ def _initialize_highlighter(self):
230
+ pass
231
+
176
232
  @property
177
233
  def metadata(self) -> Dict[str, Any]:
178
234
  """Access metadata as a dictionary."""
@@ -183,7 +239,6 @@ class PDF:
183
239
  """Access pages as a PageCollection object."""
184
240
  from natural_pdf.elements.collections import PageCollection
185
241
 
186
- # Ensure _pages is initialized
187
242
  if not hasattr(self, "_pages"):
188
243
  raise AttributeError("PDF pages not yet initialized.")
189
244
  return PageCollection(self._pages)
@@ -195,12 +250,10 @@ class PDF:
195
250
  Returns:
196
251
  Self for method chaining
197
252
  """
198
- # Ensure _pages is initialized
199
253
  if not hasattr(self, "_pages"):
200
254
  raise AttributeError("PDF pages not yet initialized.")
201
255
 
202
256
  self._exclusions = []
203
- # Also clear from pages
204
257
  for page in self._pages:
205
258
  page.clear_exclusions()
206
259
  return self
@@ -212,99 +265,90 @@ class PDF:
212
265
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
213
266
 
214
267
  Args:
215
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
268
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None
269
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None
216
270
  label: Optional label for this exclusion
217
271
 
218
272
  Returns:
219
273
  Self for method chaining
220
274
  """
221
- # Ensure _pages is initialized
222
275
  if not hasattr(self, "_pages"):
223
276
  raise AttributeError("PDF pages not yet initialized.")
224
277
 
225
- # Store exclusion with its label at PDF level
226
278
  exclusion_data = (exclusion_func, label)
227
279
  self._exclusions.append(exclusion_data)
228
280
 
229
- # Apply this exclusion to all pages
230
281
  for page in self._pages:
231
- # We pass the original function, Page.add_exclusion handles calling it
232
282
  page.add_exclusion(exclusion_func, label=label)
233
283
 
234
284
  return self
235
285
 
236
286
  def apply_ocr(
237
287
  self,
238
- pages: Optional[Union[Iterable[int], range, slice]] = None,
239
288
  engine: Optional[str] = None,
240
- # --- Common OCR Parameters (Direct Arguments) ---
241
289
  languages: Optional[List[str]] = None,
242
- min_confidence: Optional[float] = None, # Min confidence threshold
290
+ min_confidence: Optional[float] = None,
243
291
  device: Optional[str] = None,
244
- resolution: Optional[int] = None, # DPI for rendering before OCR
245
- apply_exclusions: bool = True, # New parameter
292
+ resolution: Optional[int] = None,
293
+ apply_exclusions: bool = True,
246
294
  detect_only: bool = False,
247
- # --- Engine-Specific Options --- Use 'options=' for this
248
- options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
249
- # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
295
+ replace: bool = True,
296
+ options: Optional[Any] = None,
297
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
250
298
  ) -> "PDF":
251
299
  """
252
- Applies OCR to specified pages (or all pages) of the PDF using batch processing.
253
-
254
- This method renders the specified pages to images, sends them as a batch
255
- to the OCRManager, and adds the resulting TextElements to each respective page.
300
+ Applies OCR to specified pages of the PDF using batch processing.
301
+ Applies OCR to specified pages of the PDF using batch processing.
256
302
 
257
303
  Args:
258
- pages: An iterable of 0-based page indices (list, range, tuple),
259
- a slice object, or None to process all pages.
260
- engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
261
- Uses manager's default ('easyocr') if None.
262
- languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
263
- **Must be codes understood by the specific selected engine.**
264
- No mapping is performed. Overrides manager/engine default.
265
- min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
266
- Overrides manager/engine default.
267
- device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
268
- Overrides manager/engine default.
269
- resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
270
- Affects input quality for OCR. Defaults to 150 if not set.
271
- apply_exclusions: If True (default), render page image for OCR with
272
- excluded areas masked (whited out). If False, OCR
273
- the raw page image without masking exclusions.
274
- detect_only: If True, only detect text bounding boxes, don't perform OCR.
275
- options: An engine-specific options object (e.g., EasyOCROptions) or dict
276
- containing parameters specific to the chosen engine.
304
+ engine: Name of the OCR engine
305
+ languages: List of language codes
306
+ min_confidence: Minimum confidence threshold
307
+ device: Device to run OCR on
308
+ resolution: DPI resolution for page images
309
+ apply_exclusions: Whether to mask excluded areas
310
+ detect_only: If True, only detect text boxes
311
+ replace: Whether to replace existing OCR elements
312
+ options: Engine-specific options
313
+ pages: Page indices to process or None for all pages
314
+ engine: Name of the OCR engine
315
+ languages: List of language codes
316
+ min_confidence: Minimum confidence threshold
317
+ device: Device to run OCR on
318
+ resolution: DPI resolution for page images
319
+ apply_exclusions: Whether to mask excluded areas
320
+ detect_only: If True, only detect text boxes
321
+ replace: Whether to replace existing OCR elements
322
+ options: Engine-specific options
323
+ pages: Page indices to process or None for all pages
277
324
 
278
325
  Returns:
279
- Self for method chaining.
280
-
281
- Raises:
282
- ValueError: If page indices are invalid.
283
- TypeError: If 'options' is not compatible with the engine.
284
- RuntimeError: If the OCRManager or selected engine is not available.
326
+ Self for method chaining
327
+ Self for method chaining
285
328
  """
286
329
  if not self._ocr_manager:
287
330
  logger.error("OCRManager not available. Cannot apply OCR.")
288
- # Or raise RuntimeError("OCRManager not initialized.")
289
331
  return self
290
332
 
291
- # --- Determine Target Pages (unchanged) ---
292
- target_pages: List[Page] = []
333
+ thread_id = threading.current_thread().name
334
+ logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
335
+
336
+ target_pages = []
337
+
338
+ target_pages = []
293
339
  if pages is None:
294
340
  target_pages = self._pages
295
341
  elif isinstance(pages, slice):
296
342
  target_pages = self._pages[pages]
297
- elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
343
+ elif hasattr(pages, "__iter__"):
298
344
  try:
299
345
  target_pages = [self._pages[i] for i in pages]
300
346
  except IndexError:
301
347
  raise ValueError("Invalid page index provided in 'pages' iterable.")
302
348
  except TypeError:
303
- raise TypeError(
304
- "'pages' must be None, a slice, or an iterable of page indices (int)."
305
- )
349
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
306
350
  else:
307
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
351
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
308
352
 
309
353
  if not target_pages:
310
354
  logger.warning("No pages selected for OCR processing.")
@@ -312,26 +356,20 @@ class PDF:
312
356
 
313
357
  page_numbers = [p.number for p in target_pages]
314
358
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
315
- # --- Determine Rendering Resolution ---
316
- # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
317
- final_resolution = resolution # Use direct arg if provided
318
- if final_resolution is None:
319
- final_resolution = getattr(self, "_config", {}).get("resolution", 150)
320
359
 
321
- logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
360
+ final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
361
+ logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
362
+
363
+ images_pil = []
364
+ page_image_map = []
365
+ logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
366
+ failed_page_num = "unknown"
367
+ render_start_time = time.monotonic()
322
368
 
323
- # --- Render Images for Batch ---
324
- images_pil: List[Image.Image] = []
325
- page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
326
- logger.info(
327
- f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
328
- )
329
- failed_page_num = "unknown" # Keep track of potentially failing page
330
369
  try:
331
- for i, page in enumerate(target_pages):
332
- failed_page_num = page.number # Update current page number in case of error
370
+ for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
371
+ failed_page_num = page.number
333
372
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
334
- # Use the determined final_resolution and apply exclusions if requested
335
373
  to_image_kwargs = {
336
374
  "resolution": final_resolution,
337
375
  "include_highlights": False,
@@ -340,58 +378,64 @@ class PDF:
340
378
  img = page.to_image(**to_image_kwargs)
341
379
  if img is None:
342
380
  logger.error(f" Failed to render page {page.number} to image.")
343
- # Decide how to handle: skip page, raise error? For now, skip.
344
- continue # Skip this page if rendering failed
381
+ continue
382
+ continue
345
383
  images_pil.append(img)
346
- page_image_map.append((page, img)) # Store pair
384
+ page_image_map.append((page, img))
347
385
  except Exception as e:
348
- logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
386
+ logger.error(f"Failed to render pages for batch OCR: {e}")
387
+ logger.error(f"Failed to render pages for batch OCR: {e}")
349
388
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
350
389
 
390
+ render_end_time = time.monotonic()
391
+ logger.debug(
392
+ f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
393
+ )
394
+ logger.debug(
395
+ f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
396
+ )
397
+
351
398
  if not images_pil or not page_image_map:
352
399
  logger.error("No images were successfully rendered for batch OCR.")
353
400
  return self
354
401
 
355
- # --- Prepare Arguments for Manager ---
356
- # Pass common args directly, engine-specific via options
357
402
  manager_args = {
358
403
  "images": images_pil,
359
404
  "engine": engine,
360
405
  "languages": languages,
361
- "min_confidence": min_confidence, # Use the renamed parameter
406
+ "min_confidence": min_confidence,
407
+ "min_confidence": min_confidence,
362
408
  "device": device,
363
409
  "options": options,
364
410
  "detect_only": detect_only,
365
- # Note: resolution is used for rendering, not passed to OCR manager directly
366
411
  }
367
- # Filter out None values so manager can use its defaults
368
412
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
369
413
 
370
- # --- Call OCR Manager for Batch Processing ---
371
- logger.info(
372
- f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
373
- )
414
+ ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
415
+ logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
416
+ logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
417
+ ocr_start_time = time.monotonic()
418
+
374
419
  try:
375
- # Manager's apply_ocr signature needs to accept common args directly
376
420
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
377
421
 
378
422
  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
379
- logger.error(
380
- f"OCR Manager returned unexpected result format or length for batch processing. "
381
- f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
382
- f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
383
- )
423
+ logger.error(f"OCR Manager returned unexpected result format or length.")
384
424
  return self
385
425
 
386
426
  logger.info("OCR Manager batch processing complete.")
387
-
388
427
  except Exception as e:
389
- logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
428
+ logger.error(f"Batch OCR processing failed: {e}")
390
429
  return self
391
430
 
392
- # --- Distribute Results and Add Elements to Pages (unchanged) ---
431
+ ocr_end_time = time.monotonic()
432
+ logger.debug(
433
+ f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
434
+ )
435
+
393
436
  logger.info("Adding OCR results to respective pages...")
394
437
  total_elements_added = 0
438
+
395
439
  for i, (page, img) in enumerate(page_image_map):
396
440
  results_for_page = batch_results[i]
397
441
  if not isinstance(results_for_page, list):
@@ -402,6 +446,9 @@ class PDF:
402
446
 
403
447
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
404
448
  try:
449
+ if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
450
+ page._element_mgr.remove_ocr_elements()
451
+
405
452
  img_scale_x = page.width / img.width if img.width > 0 else 1
406
453
  img_scale_y = page.height / img.height if img.height > 0 else 1
407
454
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -414,188 +461,225 @@ class PDF:
414
461
  else:
415
462
  logger.debug(f" No valid TextElements created for page {page.number}.")
416
463
  except Exception as e:
417
- logger.error(
418
- f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
419
- )
464
+ logger.error(f" Error adding OCR elements to page {page.number}: {e}")
420
465
 
421
- logger.info(
422
- f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
423
- )
466
+ logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
424
467
  return self
425
468
 
426
469
  def add_region(
427
470
  self, region_func: Callable[["Page"], Optional[Region]], name: str = None
428
471
  ) -> "PDF":
429
472
  """
430
- Add a region function to the PDF. This creates regions on all pages using the provided function.
473
+ Add a region function to the PDF.
431
474
 
432
475
  Args:
433
- region_func: A function that takes a Page and returns a Region, or None.
476
+ region_func: A function that takes a Page and returns a Region, or None
477
+ region_func: A function that takes a Page and returns a Region, or None
434
478
  name: Optional name for the region
435
479
 
436
480
  Returns:
437
481
  Self for method chaining
438
482
  """
439
- # Ensure _pages is initialized
440
483
  if not hasattr(self, "_pages"):
441
484
  raise AttributeError("PDF pages not yet initialized.")
442
485
 
443
- # Store region with its name at PDF level
444
486
  region_data = (region_func, name)
445
487
  self._regions.append(region_data)
446
488
 
447
- # Apply this region to all pages
448
489
  for page in self._pages:
449
490
  try:
450
- # Call the function to get the region for this specific page
451
491
  region_instance = region_func(page)
452
492
  if region_instance and isinstance(region_instance, Region):
453
- # If a valid region is returned, add it to the page
454
493
  page.add_region(region_instance, name=name, source="named")
455
494
  elif region_instance is not None:
456
495
  logger.warning(
457
- f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
496
+ f"Region function did not return a valid Region for page {page.number}"
458
497
  )
459
498
  except Exception as e:
460
- logger.error(
461
- f"Error executing or adding region function for page {page.number}: {e}",
462
- exc_info=True,
463
- )
499
+ logger.error(f"Error adding region for page {page.number}: {e}")
464
500
 
465
501
  return self
466
502
 
503
+ @overload
504
+ def find(
505
+ self,
506
+ *,
507
+ text: str,
508
+ apply_exclusions: bool = True,
509
+ regex: bool = False,
510
+ case: bool = True,
511
+ **kwargs,
512
+ ) -> Optional[Any]: ...
513
+
514
+ @overload
515
+ def find(
516
+ self,
517
+ selector: str,
518
+ *,
519
+ apply_exclusions: bool = True,
520
+ regex: bool = False,
521
+ case: bool = True,
522
+ **kwargs,
523
+ ) -> Optional[Any]: ...
524
+
467
525
  def find(
468
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
526
+ self,
527
+ selector: Optional[str] = None,
528
+ *,
529
+ text: Optional[str] = None,
530
+ apply_exclusions: bool = True,
531
+ regex: bool = False,
532
+ case: bool = True,
533
+ **kwargs,
469
534
  ) -> Optional[Any]:
470
535
  """
471
- Find the first element matching the selector.
536
+ Find the first element matching the selector OR text content across all pages.
537
+
538
+ Provide EITHER `selector` OR `text`, but not both.
472
539
 
473
540
  Args:
474
- selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
475
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
476
- regex: Whether to use regex for text search in :contains (default: False)
477
- case: Whether to do case-sensitive text search (default: True)
478
- **kwargs: Additional filter parameters
541
+ selector: CSS-like selector string.
542
+ text: Text content to search for (equivalent to 'text:contains(...)').
543
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
544
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
545
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
546
+ **kwargs: Additional filter parameters.
479
547
 
480
548
  Returns:
481
- Element object or None if not found
549
+ Element object or None if not found.
482
550
  """
483
- # Ensure _pages is initialized
484
551
  if not hasattr(self, "_pages"):
485
552
  raise AttributeError("PDF pages not yet initialized.")
486
553
 
487
- selector_obj = parse_selector(selector)
554
+ if selector is not None and text is not None:
555
+ raise ValueError("Provide either 'selector' or 'text', not both.")
556
+ if selector is None and text is None:
557
+ raise ValueError("Provide either 'selector' or 'text'.")
488
558
 
489
- # Pass regex and case flags to selector function
559
+ # Construct selector if 'text' is provided
560
+ effective_selector = ""
561
+ if text is not None:
562
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
563
+ effective_selector = f'text:contains("{escaped_text}")'
564
+ logger.debug(
565
+ f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
566
+ )
567
+ elif selector is not None:
568
+ effective_selector = selector
569
+ else:
570
+ raise ValueError("Internal error: No selector or text provided.")
571
+
572
+ selector_obj = parse_selector(effective_selector)
490
573
  kwargs["regex"] = regex
491
574
  kwargs["case"] = case
492
575
 
493
- results = self._apply_selector(
494
- selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
495
- )
496
- return results.first if results else None
576
+ # Search page by page
577
+ for page in self.pages:
578
+ # Note: _apply_selector is on Page, so we call find directly here
579
+ # We pass the constructed/validated effective_selector
580
+ element = page.find(
581
+ selector=effective_selector, # Use the processed selector
582
+ apply_exclusions=apply_exclusions,
583
+ regex=regex, # Pass down flags
584
+ case=case,
585
+ **kwargs,
586
+ )
587
+ if element:
588
+ return element
589
+ return None # Not found on any page
590
+
591
+ @overload
592
+ def find_all(
593
+ self,
594
+ *,
595
+ text: str,
596
+ apply_exclusions: bool = True,
597
+ regex: bool = False,
598
+ case: bool = True,
599
+ **kwargs,
600
+ ) -> "ElementCollection": ...
497
601
 
602
+ @overload
498
603
  def find_all(
499
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
500
- ) -> ElementCollection:
604
+ self,
605
+ selector: str,
606
+ *,
607
+ apply_exclusions: bool = True,
608
+ regex: bool = False,
609
+ case: bool = True,
610
+ **kwargs,
611
+ ) -> "ElementCollection": ...
612
+
613
+ def find_all(
614
+ self,
615
+ selector: Optional[str] = None,
616
+ *,
617
+ text: Optional[str] = None,
618
+ apply_exclusions: bool = True,
619
+ regex: bool = False,
620
+ case: bool = True,
621
+ **kwargs,
622
+ ) -> "ElementCollection":
501
623
  """
502
- Find all elements matching the selector.
624
+ Find all elements matching the selector OR text content across all pages.
625
+
626
+ Provide EITHER `selector` OR `text`, but not both.
503
627
 
504
628
  Args:
505
- selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
506
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
507
- regex: Whether to use regex for text search in :contains (default: False)
508
- case: Whether to do case-sensitive text search (default: True)
509
- **kwargs: Additional filter parameters
629
+ selector: CSS-like selector string.
630
+ text: Text content to search for (equivalent to 'text:contains(...)').
631
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
632
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
633
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
634
+ **kwargs: Additional filter parameters.
510
635
 
511
636
  Returns:
512
- ElementCollection with matching elements
637
+ ElementCollection with matching elements.
513
638
  """
514
- # Ensure _pages is initialized
515
639
  if not hasattr(self, "_pages"):
516
640
  raise AttributeError("PDF pages not yet initialized.")
517
641
 
518
- selector_obj = parse_selector(selector)
519
-
520
- # Pass regex and case flags to selector function
521
- kwargs["regex"] = regex
522
- kwargs["case"] = case
523
-
524
- results = self._apply_selector(
525
- selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
526
- )
527
- return results
528
-
529
- def _apply_selector(
530
- self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
531
- ) -> ElementCollection:
532
- """
533
- Apply selector to PDF elements across all pages.
534
-
535
- Args:
536
- selector_obj: Parsed selector dictionary
537
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
538
- first_only: If True, stop searching after the first match is found.
539
- **kwargs: Additional filter parameters
642
+ if selector is not None and text is not None:
643
+ raise ValueError("Provide either 'selector' or 'text', not both.")
644
+ if selector is None and text is None:
645
+ raise ValueError("Provide either 'selector' or 'text'.")
540
646
 
541
- Returns:
542
- ElementCollection of matching elements
543
- """
544
- from natural_pdf.elements.collections import ElementCollection
647
+ # Construct selector if 'text' is provided
648
+ effective_selector = ""
649
+ if text is not None:
650
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
651
+ effective_selector = f'text:contains("{escaped_text}")'
652
+ logger.debug(
653
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
654
+ )
655
+ elif selector is not None:
656
+ effective_selector = selector
657
+ else:
658
+ raise ValueError("Internal error: No selector or text provided.")
545
659
 
546
- # Determine page range to search
547
- page_indices = kwargs.get("pages", range(len(self._pages)))
548
- if isinstance(page_indices, int):
549
- page_indices = [page_indices]
550
- elif isinstance(page_indices, slice):
551
- page_indices = range(*page_indices.indices(len(self._pages)))
660
+ # Instead of parsing here, let each page parse and apply
661
+ # This avoids parsing the same selector multiple times if not needed
662
+ # selector_obj = parse_selector(effective_selector)
552
663
 
553
- # Check for cross-page pseudo-classes (currently not supported)
554
- for pseudo in selector_obj.get("pseudo_classes", []):
555
- if pseudo.get("name") in ("spans", "continues"):
556
- logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
557
- return ElementCollection([])
664
+ # kwargs["regex"] = regex # Removed: Already passed explicitly
665
+ # kwargs["case"] = case # Removed: Already passed explicitly
558
666
 
559
- # Regular case: collect elements from each page
560
667
  all_elements = []
561
- for page_idx in page_indices:
562
- if 0 <= page_idx < len(self._pages):
563
- page = self._pages[page_idx]
564
- # Pass first_only down to page._apply_selector
565
- page_elements_collection = page._apply_selector(
566
- selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
567
- )
568
- if page_elements_collection:
569
- page_elements = page_elements_collection.elements
570
- all_elements.extend(page_elements)
571
- # If we only need the first match overall, and we found one on this page, stop
572
- if first_only and page_elements:
573
- break # Stop iterating through pages
574
- else:
575
- logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
576
-
577
- # Create a combined collection
578
- combined = ElementCollection(all_elements)
668
+ for page in self.pages:
669
+ # Call page.find_all with the effective selector and flags
670
+ page_elements = page.find_all(
671
+ selector=effective_selector,
672
+ apply_exclusions=apply_exclusions,
673
+ regex=regex,
674
+ case=case,
675
+ **kwargs,
676
+ )
677
+ if page_elements:
678
+ all_elements.extend(page_elements.elements)
579
679
 
580
- # Sort in document order if requested and not first_only (already sorted by page)
581
- if not first_only and kwargs.get("document_order", True):
582
- # Check if elements have page, top, x0 before sorting
583
- if all(
584
- hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
585
- for el in combined.elements
586
- ):
587
- combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
588
- else:
589
- # Elements might be Regions without inherent sorting order yet
590
- # Attempt sorting by page index if possible
591
- try:
592
- combined.sort(key=lambda el: el.page.index)
593
- except AttributeError:
594
- logger.warning(
595
- "Cannot sort elements in document order: Missing required attributes (e.g., page)."
596
- )
680
+ from natural_pdf.elements.collections import ElementCollection
597
681
 
598
- return combined
682
+ return ElementCollection(all_elements)
599
683
 
600
684
  def extract_text(
601
685
  self,
@@ -610,24 +694,24 @@ class PDF:
610
694
 
611
695
  Args:
612
696
  selector: Optional selector to filter elements
613
- preserve_whitespace: Whether to keep blank characters (default: True)
614
- use_exclusions: Whether to apply exclusion regions (default: True)
615
- debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
697
+ preserve_whitespace: Whether to keep blank characters
698
+ use_exclusions: Whether to apply exclusion regions
699
+ debug_exclusions: Whether to output detailed debugging for exclusions
700
+ preserve_whitespace: Whether to keep blank characters
701
+ use_exclusions: Whether to apply exclusion regions
702
+ debug_exclusions: Whether to output detailed debugging for exclusions
616
703
  **kwargs: Additional extraction parameters
617
704
 
618
705
  Returns:
619
706
  Extracted text as string
620
707
  """
621
- # Ensure _pages is initialized
622
708
  if not hasattr(self, "_pages"):
623
709
  raise AttributeError("PDF pages not yet initialized.")
624
710
 
625
- # If selector is provided, find elements first
626
711
  if selector:
627
712
  elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
628
713
  return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
629
714
 
630
- # Otherwise extract from all pages
631
715
  if debug_exclusions:
632
716
  print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
633
717
  print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
@@ -648,25 +732,6 @@ class PDF:
648
732
 
649
733
  return "\n".join(texts)
650
734
 
651
- def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
652
- """
653
- Shorthand for finding elements and extracting their text.
654
-
655
- Args:
656
- selector: CSS-like selector string
657
- preserve_whitespace: Whether to keep blank characters (default: True)
658
- **kwargs: Additional extraction parameters
659
-
660
- Returns:
661
- Extracted text from matching elements
662
- """
663
- # Ensure _pages is initialized
664
- if not hasattr(self, "_pages"):
665
- raise AttributeError("PDF pages not yet initialized.")
666
- return self.extract_text(
667
- selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
668
- ) # apply_exclusions is handled by find_all in extract_text
669
-
670
735
  def extract_tables(
671
736
  self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
672
737
  ) -> List[Any]:
@@ -681,54 +746,46 @@ class PDF:
681
746
  Returns:
682
747
  List of extracted tables
683
748
  """
684
- # Ensure _pages is initialized
685
749
  if not hasattr(self, "_pages"):
686
750
  raise AttributeError("PDF pages not yet initialized.")
687
- # TODO: Implement table extraction
751
+
688
752
  logger.warning("PDF.extract_tables is not fully implemented yet.")
689
753
  all_tables = []
754
+
690
755
  for page in self.pages:
691
- # Assuming page.extract_tables(**kwargs) exists or is added
692
756
  if hasattr(page, "extract_tables"):
693
757
  all_tables.extend(page.extract_tables(**kwargs))
694
758
  else:
695
759
  logger.debug(f"Page {page.number} does not have extract_tables method.")
696
- # Placeholder filtering
760
+
697
761
  if selector:
698
762
  logger.warning("Filtering extracted tables by selector is not implemented.")
699
- # Would need to parse selector and filter the list `all_tables`
700
- # Placeholder merging
763
+
701
764
  if merge_across_pages:
702
765
  logger.warning("Merging tables across pages is not implemented.")
703
- # Would need logic to detect and merge related tables
766
+
704
767
  return all_tables
705
768
 
706
- # --- New Method: save_searchable ---
707
769
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
708
770
  """
709
771
  Saves the PDF with an OCR text layer, making content searchable.
710
772
 
711
773
  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
712
774
 
713
- Note: OCR must have been applied to the pages beforehand
714
- (e.g., using pdf.apply_ocr()).
715
-
716
775
  Args:
717
- output_path: Path to save the searchable PDF.
718
- dpi: Resolution for rendering and OCR overlay (default 300).
719
- **kwargs: Additional keyword arguments passed to the exporter.
776
+ output_path: Path to save the searchable PDF
777
+ dpi: Resolution for rendering and OCR overlay
778
+ **kwargs: Additional keyword arguments passed to the exporter
779
+ output_path: Path to save the searchable PDF
780
+ dpi: Resolution for rendering and OCR overlay
781
+ **kwargs: Additional keyword arguments passed to the exporter
720
782
  """
721
- # Import moved here, assuming it's always available now
722
783
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
723
784
 
724
- # Convert pathlib.Path to string if necessary
725
785
  output_path_str = str(output_path)
726
-
727
786
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
728
787
  logger.info(f"Searchable PDF saved to: {output_path_str}")
729
788
 
730
- # --- End New Method ---
731
-
732
789
  def ask(
733
790
  self,
734
791
  question: str,
@@ -750,27 +807,22 @@ class PDF:
750
807
  **kwargs: Additional parameters passed to the QA engine
751
808
 
752
809
  Returns:
753
- A dictionary containing the answer, confidence, and other metadata.
754
- Result will have an 'answer' key containing the answer text.
810
+ A dictionary containing the answer, confidence, and other metadata
811
+ A dictionary containing the answer, confidence, and other metadata
755
812
  """
756
813
  from natural_pdf.qa import get_qa_engine
757
814
 
758
- # Initialize or get QA engine
759
815
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
760
816
 
761
- # Determine which pages to query
762
817
  if pages is None:
763
818
  target_pages = list(range(len(self.pages)))
764
819
  elif isinstance(pages, int):
765
- # Single page
766
820
  target_pages = [pages]
767
821
  elif isinstance(pages, (list, range)):
768
- # List or range of pages
769
822
  target_pages = pages
770
823
  else:
771
824
  raise ValueError(f"Invalid pages parameter: {pages}")
772
825
 
773
- # Actually query each page and gather results
774
826
  results = []
775
827
  for page_idx in target_pages:
776
828
  if 0 <= page_idx < len(self.pages):
@@ -779,136 +831,110 @@ class PDF:
779
831
  page=page, question=question, min_confidence=min_confidence, **kwargs
780
832
  )
781
833
 
782
- # Add to results if it found an answer
783
834
  if page_result and page_result.get("found", False):
784
835
  results.append(page_result)
785
836
 
786
- # Sort results by confidence
787
837
  results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
788
838
 
789
- # Return the best result, or a default result if none found
790
839
  if results:
791
840
  return results[0]
792
841
  else:
793
- # Return a structure indicating no answer found
794
842
  return {
795
843
  "answer": None,
796
844
  "confidence": 0.0,
797
845
  "found": False,
798
- "page_num": None, # Or maybe the pages searched?
846
+ "page_num": None,
799
847
  "source_elements": [],
800
848
  }
801
849
 
802
850
  def search_within_index(
803
851
  self,
804
852
  query: Union[str, Path, Image.Image, Region],
805
- search_service: SearchServiceProtocol, # Now required
853
+ search_service: SearchServiceProtocol,
806
854
  options: Optional[SearchOptions] = None,
807
855
  ) -> List[Dict[str, Any]]:
808
856
  """
809
- Finds relevant documents specifically originating from THIS PDF document
810
- within a search index managed by the provided SearchService.
811
-
812
- This method uses a pre-configured SearchService instance and adds
813
- a filter to the search query to scope results only to pages from
814
- this specific PDF object (based on its resolved path).
857
+ Finds relevant documents from this PDF within a search index.
858
+ Finds relevant documents from this PDF within a search index.
815
859
 
816
860
  Args:
817
- query: The search query (text, image path, PIL Image, Region).
818
- search_service: A pre-configured SearchService instance pointing to the
819
- index where this PDF's content (or related content)
820
- is expected to be found.
821
- options: Optional SearchOptions to configure the query (top_k, filters, etc.).
822
- Any existing filters in `options` will be combined with the
823
- PDF-scoping filter using an 'AND' condition.
861
+ query: The search query (text, image path, PIL Image, Region)
862
+ search_service: A pre-configured SearchService instance
863
+ options: Optional SearchOptions to configure the query
864
+ query: The search query (text, image path, PIL Image, Region)
865
+ search_service: A pre-configured SearchService instance
866
+ options: Optional SearchOptions to configure the query
824
867
 
825
868
  Returns:
826
- A list of result dictionaries, sorted by relevance, containing only
827
- results originating from this PDF's pages.
869
+ A list of result dictionaries, sorted by relevance
870
+ A list of result dictionaries, sorted by relevance
828
871
 
829
872
  Raises:
830
- ImportError: If search dependencies are not installed.
831
- ValueError: If search_service is None.
832
- TypeError: If search_service does not conform to the protocol.
833
- FileNotFoundError: If the collection managed by the service does not exist.
834
- RuntimeError: For other search failures.
873
+ ImportError: If search dependencies are not installed
874
+ ValueError: If search_service is None
875
+ TypeError: If search_service does not conform to the protocol
876
+ FileNotFoundError: If the collection managed by the service does not exist
877
+ RuntimeError: For other search failures
878
+ ImportError: If search dependencies are not installed
879
+ ValueError: If search_service is None
880
+ TypeError: If search_service does not conform to the protocol
881
+ FileNotFoundError: If the collection managed by the service does not exist
882
+ RuntimeError: For other search failures
835
883
  """
836
884
  if not search_service:
837
885
  raise ValueError("A configured SearchServiceProtocol instance must be provided.")
838
- # Optional stricter check:
839
- # if not isinstance(search_service, SearchServiceProtocol):
840
- # raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
841
886
 
842
- # Get collection name from service for logging
843
887
  collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
844
888
  logger.info(
845
- f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
889
+ f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
846
890
  )
847
891
 
848
- # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
849
- # service: SearchServiceProtocol
850
- # if search_service:
851
- # service = search_service
852
- # else:
853
- # logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
854
- # factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
855
- # # TODO: Pass embedding model from options/pdf config if needed?
856
- # service = get_search_service(**factory_args)
857
- service = search_service # Use validated provided service
858
-
859
- # --- 2. Prepare Query and Options ---
892
+ service = search_service
893
+
860
894
  query_input = query
861
- # Resolve options (use default TextSearch if none provided)
862
895
  effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
863
896
 
864
- # Handle Region query - extract text for now
865
897
  if isinstance(query, Region):
866
898
  logger.debug("Query is a Region object. Extracting text.")
867
899
  if not isinstance(effective_options, TextSearchOptions):
868
900
  logger.warning(
869
- "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
901
+ "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
870
902
  )
871
903
  query_input = query.extract_text()
872
904
  if not query_input or query_input.isspace():
873
905
  logger.error("Region has no extractable text for query.")
874
906
  return []
875
907
 
876
- # --- 3. Add Filter to Scope Search to THIS PDF ---
877
- # Assume metadata field 'pdf_path' stores the resolved path used during indexing
908
+ # Add filter to scope search to THIS PDF
909
+ # Add filter to scope search to THIS PDF
878
910
  pdf_scope_filter = {
879
- "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
911
+ "field": "pdf_path",
880
912
  "operator": "eq",
881
- "value": self.path, # Use the resolved path of this PDF instance
913
+ "value": self.path,
882
914
  }
883
915
  logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
884
916
 
885
917
  # Combine with existing filters in options (if any)
886
918
  if effective_options.filters:
887
- logger.debug(
888
- f"Combining PDF scope filter with existing filters: {effective_options.filters}"
889
- )
890
- # Assume filters are compatible with the underlying search service
891
- # If existing filters aren't already in an AND block, wrap them
919
+ logger.debug(f"Combining PDF scope filter with existing filters")
892
920
  if (
893
921
  isinstance(effective_options.filters, dict)
894
922
  and effective_options.filters.get("operator") == "AND"
895
923
  ):
896
- # Already an AND block, just append the condition
897
924
  effective_options.filters["conditions"].append(pdf_scope_filter)
898
925
  elif isinstance(effective_options.filters, list):
899
- # Assume list represents implicit AND conditions
900
926
  effective_options.filters = {
901
927
  "operator": "AND",
902
928
  "conditions": effective_options.filters + [pdf_scope_filter],
903
929
  }
904
- elif isinstance(effective_options.filters, dict): # Single filter dict
930
+ elif isinstance(effective_options.filters, dict):
905
931
  effective_options.filters = {
906
932
  "operator": "AND",
907
933
  "conditions": [effective_options.filters, pdf_scope_filter],
908
934
  }
909
935
  else:
910
936
  logger.warning(
911
- f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
937
+ f"Unsupported format for existing filters. Overwriting with PDF scope filter."
912
938
  )
913
939
  effective_options.filters = pdf_scope_filter
914
940
  else:
@@ -916,39 +942,33 @@ class PDF:
916
942
 
917
943
  logger.debug(f"Final filters for service search: {effective_options.filters}")
918
944
 
919
- # --- 4. Call SearchService ---
920
945
  try:
921
- # Call the service's search method (no collection_name needed)
922
946
  results = service.search(
923
947
  query=query_input,
924
948
  options=effective_options,
925
949
  )
926
- logger.info(
927
- f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
928
- )
950
+ logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
929
951
  return results
930
952
  except FileNotFoundError as fnf:
931
- logger.error(
932
- f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
933
- )
934
- raise # Re-raise specific error
953
+ logger.error(f"Search failed: Collection not found. Error: {fnf}")
954
+ raise
955
+ logger.error(f"Search failed: Collection not found. Error: {fnf}")
956
+ raise
935
957
  except Exception as e:
936
- logger.error(
937
- f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
938
- exc_info=True,
939
- )
940
- raise RuntimeError(
941
- f"Search within index failed for PDF '{self.path}'. See logs for details."
942
- ) from e
958
+ logger.error(f"SearchService search failed: {e}")
959
+ raise RuntimeError(f"Search within index failed. See logs for details.") from e
960
+ logger.error(f"SearchService search failed: {e}")
961
+ raise RuntimeError(f"Search within index failed. See logs for details.") from e
943
962
 
944
963
  def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
945
964
  """
946
- Exports OCR results from this PDF into a correction task package (zip file).
965
+ Exports OCR results from this PDF into a correction task package.
966
+ Exports OCR results from this PDF into a correction task package.
947
967
 
948
968
  Args:
949
- output_zip_path: The path to save the output zip file.
969
+ output_zip_path: The path to save the output zip file
970
+ output_zip_path: The path to save the output zip file
950
971
  **kwargs: Additional arguments passed to create_correction_task_package
951
- (e.g., image_render_scale, overwrite).
952
972
  """
953
973
  try:
954
974
  from natural_pdf.utils.packaging import create_correction_task_package
@@ -958,32 +978,41 @@ class PDF:
958
978
  logger.error(
959
979
  "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
960
980
  )
961
- # Or raise
981
+ logger.error(
982
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
983
+ )
962
984
  except Exception as e:
963
- logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
964
- raise # Re-raise the exception from the utility function
985
+ logger.error(f"Failed to export correction task: {e}")
986
+ raise
987
+ logger.error(f"Failed to export correction task: {e}")
988
+ raise
965
989
 
966
990
  def correct_ocr(
967
991
  self,
968
992
  correction_callback: Callable[[Any], Optional[str]],
969
993
  pages: Optional[Union[Iterable[int], range, slice]] = None,
970
- ) -> "PDF": # Return self for chaining
994
+ max_workers: Optional[int] = None,
995
+ progress_callback: Optional[Callable[[], None]] = None,
996
+ ) -> "PDF":
971
997
  """
972
- Applies corrections to OCR-generated text elements using a callback function,
973
- delegating the core work to the `Page.correct_ocr` method.
998
+ Applies corrections to OCR text elements using a callback function.
999
+ Applies corrections to OCR text elements using a callback function.
974
1000
 
975
1001
  Args:
976
- correction_callback: A function that accepts a single argument (an element
977
- object) and returns `Optional[str]`. It returns the
978
- corrected text string if an update is needed, otherwise None.
1002
+ correction_callback: Function that takes an element and returns corrected text or None
1003
+ correction_callback: Function that takes an element and returns corrected text or None
979
1004
  pages: Optional page indices/slice to limit the scope of correction
980
- (default: all pages).
1005
+ max_workers: Maximum number of threads to use for parallel execution
1006
+ progress_callback: Optional callback function for progress updates
1007
+ max_workers: Maximum number of threads to use for parallel execution
1008
+ progress_callback: Optional callback function for progress updates
981
1009
 
982
1010
  Returns:
983
- Self for method chaining.
1011
+ Self for method chaining
1012
+ Self for method chaining
984
1013
  """
985
- # Determine target pages
986
- target_page_indices: List[int] = []
1014
+ target_page_indices = []
1015
+ target_page_indices = []
987
1016
  if pages is None:
988
1017
  target_page_indices = list(range(len(self._pages)))
989
1018
  elif isinstance(pages, slice):
@@ -991,56 +1020,55 @@ class PDF:
991
1020
  elif hasattr(pages, "__iter__"):
992
1021
  try:
993
1022
  target_page_indices = [int(i) for i in pages]
994
- # Validate indices
995
1023
  for idx in target_page_indices:
996
1024
  if not (0 <= idx < len(self._pages)):
997
1025
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
998
1026
  except (IndexError, TypeError, ValueError) as e:
999
- raise ValueError(
1000
- f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
1001
- ) from e
1027
+ raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1028
+ raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1002
1029
  else:
1003
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
1030
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1031
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1004
1032
 
1005
1033
  if not target_page_indices:
1006
1034
  logger.warning("No pages selected for OCR correction.")
1007
1035
  return self
1008
1036
 
1009
- logger.info(
1010
- f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
1011
- )
1037
+ logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1038
+ logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1012
1039
 
1013
- # Iterate through target pages and call their correct_ocr method
1014
1040
  for page_idx in target_page_indices:
1015
1041
  page = self._pages[page_idx]
1016
1042
  try:
1017
- page.correct_ocr(correction_callback=correction_callback)
1043
+ page.correct_ocr(
1044
+ correction_callback=correction_callback,
1045
+ max_workers=max_workers,
1046
+ progress_callback=progress_callback,
1047
+ )
1018
1048
  except Exception as e:
1019
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
1020
- # Optionally re-raise or just log and continue
1049
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1050
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1021
1051
 
1022
- logger.info(f"OCR correction process finished for requested pages.")
1052
+ logger.info("OCR correction process finished.")
1053
+ logger.info("OCR correction process finished.")
1023
1054
  return self
1024
1055
 
1025
1056
  def __len__(self) -> int:
1026
1057
  """Return the number of pages in the PDF."""
1027
- # Ensure _pages is initialized
1028
1058
  if not hasattr(self, "_pages"):
1029
- # Return 0 or raise error if not fully initialized? Let's return 0.
1030
1059
  return 0
1031
1060
  return len(self._pages)
1032
1061
 
1033
- def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
1062
+ def __getitem__(self, key) -> Union["Page", "PageCollection"]:
1034
1063
  """Access pages by index or slice."""
1035
- # Check if self._pages has been initialized
1036
1064
  if not hasattr(self, "_pages"):
1037
1065
  raise AttributeError("PDF pages not initialized yet.")
1066
+
1038
1067
  if isinstance(key, slice):
1039
- # Return a PageCollection slice
1040
1068
  from natural_pdf.elements.collections import PageCollection
1041
1069
 
1042
1070
  return PageCollection(self._pages[key])
1043
- # Check index bounds before accessing
1071
+
1044
1072
  if isinstance(key, int):
1045
1073
  if 0 <= key < len(self._pages):
1046
1074
  return self._pages[key]
@@ -1054,25 +1082,23 @@ class PDF:
1054
1082
  if hasattr(self, "_pdf") and self._pdf is not None:
1055
1083
  try:
1056
1084
  self._pdf.close()
1057
- logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
1085
+ logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
1058
1086
  except Exception as e:
1059
1087
  logger.warning(f"Error closing pdfplumber object: {e}")
1060
1088
  finally:
1061
1089
  self._pdf = None
1062
1090
 
1063
- # Clean up temporary file if it exists
1064
1091
  if hasattr(self, "_temp_file") and self._temp_file is not None:
1065
1092
  temp_file_path = None
1066
1093
  try:
1067
1094
  if hasattr(self._temp_file, "name") and self._temp_file.name:
1068
1095
  temp_file_path = self._temp_file.name
1069
- if os.path.exists(temp_file_path):
1096
+ # Only unlink if it exists and _is_stream is False (meaning WE created it)
1097
+ if not self._is_stream and os.path.exists(temp_file_path):
1070
1098
  os.unlink(temp_file_path)
1071
1099
  logger.debug(f"Removed temporary PDF file: {temp_file_path}")
1072
1100
  except Exception as e:
1073
- logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
1074
- finally:
1075
- self._temp_file = None
1101
+ logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
1076
1102
 
1077
1103
  def __enter__(self):
1078
1104
  """Context manager entry."""
@@ -1082,6 +1108,432 @@ class PDF:
1082
1108
  """Context manager exit."""
1083
1109
  self.close()
1084
1110
 
1085
- # --- Indexable Protocol Methods --- Needed for search/sync
1086
1111
  def get_id(self) -> str:
1112
+ """Get unique identifier for this PDF."""
1113
+ """Get unique identifier for this PDF."""
1087
1114
  return self.path
1115
+
1116
+ # --- Deskew Method --- #
1117
+
1118
+ def deskew(
1119
+ self,
1120
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
1121
+ resolution: int = 300,
1122
+ detection_resolution: int = 72,
1123
+ force_overwrite: bool = False,
1124
+ **deskew_kwargs,
1125
+ ) -> "PDF":
1126
+ """
1127
+ Creates a new, in-memory PDF object containing deskewed versions of the
1128
+ specified pages from the original PDF.
1129
+
1130
+ This method renders each selected page, detects and corrects skew using the 'deskew'
1131
+ library, and then combines the resulting images into a new PDF using 'img2pdf'.
1132
+ The new PDF object is returned directly.
1133
+
1134
+ Important: The returned PDF is image-based. Any existing text, OCR results,
1135
+ annotations, or other elements from the original pages will *not* be carried over.
1136
+
1137
+ Args:
1138
+ pages: Page indices/slice to include (0-based). If None, processes all pages.
1139
+ resolution: DPI resolution for rendering the output deskewed pages.
1140
+ detection_resolution: DPI resolution used for skew detection if angles are not
1141
+ already cached on the page objects.
1142
+ force_overwrite: If False (default), raises a ValueError if any target page
1143
+ already contains processed elements (text, OCR, regions) to
1144
+ prevent accidental data loss. Set to True to proceed anyway.
1145
+ **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
1146
+ during automatic detection (e.g., `max_angle`, `num_peaks`).
1147
+
1148
+ Returns:
1149
+ A new PDF object representing the deskewed document.
1150
+
1151
+ Raises:
1152
+ ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
1153
+ ValueError: If `force_overwrite` is False and target pages contain elements.
1154
+ FileNotFoundError: If the source PDF cannot be read (if file-based).
1155
+ IOError: If creating the in-memory PDF fails.
1156
+ RuntimeError: If rendering or deskewing individual pages fails.
1157
+ """
1158
+ if not DESKEW_AVAILABLE:
1159
+ raise ImportError(
1160
+ "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
1161
+ )
1162
+
1163
+ target_pages = self._get_target_pages(pages) # Use helper to resolve pages
1164
+
1165
+ # --- Safety Check --- #
1166
+ if not force_overwrite:
1167
+ for page in target_pages:
1168
+ # Check if the element manager has been initialized and contains any elements
1169
+ if (
1170
+ hasattr(page, "_element_mgr")
1171
+ and page._element_mgr
1172
+ and page._element_mgr.has_elements()
1173
+ ):
1174
+ raise ValueError(
1175
+ f"Page {page.number} contains existing elements (text, OCR, etc.). "
1176
+ f"Deskewing creates an image-only PDF, discarding these elements. "
1177
+ f"Set force_overwrite=True to proceed."
1178
+ )
1179
+
1180
+ # --- Process Pages --- #
1181
+ deskewed_images_bytes = []
1182
+ logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
1183
+
1184
+ # Use tqdm via get_tqdm
1185
+ for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
1186
+ try:
1187
+ # Use page.deskew to get the corrected PIL image
1188
+ # Pass down resolutions and kwargs
1189
+ deskewed_img = page.deskew(
1190
+ resolution=resolution,
1191
+ angle=None, # Let page.deskew handle detection/caching
1192
+ detection_resolution=detection_resolution,
1193
+ **deskew_kwargs,
1194
+ )
1195
+
1196
+ if not deskewed_img:
1197
+ logger.warning(
1198
+ f"Page {page.number}: Failed to generate deskewed image, skipping."
1199
+ )
1200
+ continue
1201
+
1202
+ # Convert image to bytes for img2pdf (use PNG for lossless quality)
1203
+ with io.BytesIO() as buf:
1204
+ deskewed_img.save(buf, format="PNG")
1205
+ deskewed_images_bytes.append(buf.getvalue())
1206
+
1207
+ except Exception as e:
1208
+ logger.error(
1209
+ f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
1210
+ )
1211
+ # Option: Raise a runtime error, or continue and skip the page?
1212
+ # Raising makes the whole operation fail if one page fails.
1213
+ raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
1214
+
1215
+ # --- Create PDF --- #
1216
+ if not deskewed_images_bytes:
1217
+ raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
1218
+
1219
+ logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
1220
+ try:
1221
+ # Use img2pdf to combine image bytes into PDF bytes
1222
+ pdf_bytes = img2pdf.convert(deskewed_images_bytes)
1223
+
1224
+ # Wrap bytes in a stream
1225
+ pdf_stream = io.BytesIO(pdf_bytes)
1226
+
1227
+ # Create a new PDF object from the stream using original config
1228
+ logger.info("Creating new PDF object from deskewed stream...")
1229
+ new_pdf = PDF(
1230
+ pdf_stream,
1231
+ reading_order=self._reading_order,
1232
+ font_attrs=self._font_attrs,
1233
+ keep_spaces=self._config.get("keep_spaces", True),
1234
+ )
1235
+ return new_pdf
1236
+ except Exception as e:
1237
+ logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
1238
+ raise IOError("Failed to create deskewed PDF object from image stream.") from e
1239
+
1240
+ # --- End Deskew Method --- #
1241
+
1242
+ # --- Classification Methods --- #
1243
+
1244
+ def classify_pages(
1245
+ self,
1246
+ categories: List[str],
1247
+ model: Optional[str] = None,
1248
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
1249
+ analysis_key: str = "classification",
1250
+ using: Optional[str] = None,
1251
+ **kwargs,
1252
+ ) -> "PDF":
1253
+ """
1254
+ Classifies specified pages of the PDF.
1255
+
1256
+ Args:
1257
+ categories: List of category names
1258
+ model: Model identifier ('text', 'vision', or specific HF ID)
1259
+ pages: Page indices, slice, or None for all pages
1260
+ analysis_key: Key to store results in page's analyses dict
1261
+ using: Processing mode ('text' or 'vision')
1262
+ **kwargs: Additional arguments for the ClassificationManager
1263
+
1264
+ Returns:
1265
+ Self for method chaining
1266
+ """
1267
+ if not categories:
1268
+ raise ValueError("Categories list cannot be empty.")
1269
+
1270
+ try:
1271
+ manager = self.get_manager("classification")
1272
+ except (ValueError, RuntimeError) as e:
1273
+ raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
1274
+
1275
+ if not manager or not manager.is_available():
1276
+ try:
1277
+ from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
1278
+
1279
+ if not _CLASSIFICATION_AVAILABLE:
1280
+ raise ImportError("Classification dependencies missing.")
1281
+ except ImportError:
1282
+ raise ImportError(
1283
+ "Classification dependencies missing. "
1284
+ 'Install with: pip install "natural-pdf[classification]"'
1285
+ )
1286
+ raise ClassificationError("ClassificationManager not available.")
1287
+
1288
+ target_pages = []
1289
+ if pages is None:
1290
+ target_pages = self._pages
1291
+ elif isinstance(pages, slice):
1292
+ target_pages = self._pages[pages]
1293
+ elif hasattr(pages, "__iter__"):
1294
+ try:
1295
+ target_pages = [self._pages[i] for i in pages]
1296
+ except IndexError:
1297
+ raise ValueError("Invalid page index provided.")
1298
+ except TypeError:
1299
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1300
+ else:
1301
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1302
+
1303
+ if not target_pages:
1304
+ logger.warning("No pages selected for classification.")
1305
+ return self
1306
+
1307
+ inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
1308
+ logger.info(
1309
+ f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
1310
+ )
1311
+
1312
+ page_contents = []
1313
+ pages_to_classify = []
1314
+ logger.debug(f"Gathering content for {len(target_pages)} pages...")
1315
+
1316
+ for page in target_pages:
1317
+ try:
1318
+ content = page._get_classification_content(model_type=inferred_using, **kwargs)
1319
+ page_contents.append(content)
1320
+ pages_to_classify.append(page)
1321
+ except ValueError as e:
1322
+ logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
1323
+ except Exception as e:
1324
+ logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
1325
+
1326
+ if not page_contents:
1327
+ logger.warning("No content could be gathered for batch classification.")
1328
+ return self
1329
+
1330
+ logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
1331
+
1332
+ try:
1333
+ batch_results = manager.classify_batch(
1334
+ item_contents=page_contents,
1335
+ categories=categories,
1336
+ model_id=model,
1337
+ using=inferred_using,
1338
+ **kwargs,
1339
+ )
1340
+ except Exception as e:
1341
+ logger.error(f"Batch classification failed: {e}")
1342
+ raise ClassificationError(f"Batch classification failed: {e}") from e
1343
+
1344
+ if len(batch_results) != len(pages_to_classify):
1345
+ logger.error(
1346
+ f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
1347
+ )
1348
+ return self
1349
+
1350
+ logger.debug(
1351
+ f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
1352
+ )
1353
+ for page, result_obj in zip(pages_to_classify, batch_results):
1354
+ try:
1355
+ if not hasattr(page, "analyses") or page.analyses is None:
1356
+ page.analyses = {}
1357
+ page.analyses[analysis_key] = result_obj
1358
+ except Exception as e:
1359
+ logger.warning(
1360
+ f"Failed to store classification results for page {page.number}: {e}"
1361
+ )
1362
+
1363
+ logger.info(f"Finished classifying PDF pages.")
1364
+ return self
1365
+
1366
+ # --- End Classification Methods --- #
1367
+
1368
+ # --- Extraction Support --- #
1369
+ def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
1370
+ """
1371
+ Retrieves the content for the entire PDF.
1372
+
1373
+ Args:
1374
+ using: 'text' or 'vision'
1375
+ **kwargs: Additional arguments passed to extract_text or page.to_image
1376
+
1377
+ Returns:
1378
+ str: Extracted text if using='text'
1379
+ List[PIL.Image.Image]: List of page images if using='vision'
1380
+ None: If content cannot be retrieved
1381
+ """
1382
+ if using == "text":
1383
+ try:
1384
+ layout = kwargs.pop("layout", True)
1385
+ return self.extract_text(layout=layout, **kwargs)
1386
+ except Exception as e:
1387
+ logger.error(f"Error extracting text from PDF: {e}")
1388
+ return None
1389
+ elif using == "vision":
1390
+ page_images = []
1391
+ logger.info(f"Rendering {len(self.pages)} pages to images...")
1392
+
1393
+ resolution = kwargs.pop("resolution", 72)
1394
+ include_highlights = kwargs.pop("include_highlights", False)
1395
+ labels = kwargs.pop("labels", False)
1396
+
1397
+ try:
1398
+ for page in tqdm(self.pages, desc="Rendering Pages"):
1399
+ img = page.to_image(
1400
+ resolution=resolution,
1401
+ include_highlights=include_highlights,
1402
+ labels=labels,
1403
+ **kwargs,
1404
+ )
1405
+ if img:
1406
+ page_images.append(img)
1407
+ else:
1408
+ logger.warning(f"Failed to render page {page.number}, skipping.")
1409
+ if not page_images:
1410
+ logger.error("Failed to render any pages.")
1411
+ return None
1412
+ return page_images
1413
+ except Exception as e:
1414
+ logger.error(f"Error rendering pages: {e}")
1415
+ return None
1416
+ else:
1417
+ logger.error(f"Unsupported value for 'using': {using}")
1418
+ return None
1419
+
1420
+ # --- End Extraction Support --- #
1421
+
1422
+ def _gather_analysis_data(
1423
+ self,
1424
+ analysis_keys: List[str],
1425
+ include_content: bool,
1426
+ include_images: bool,
1427
+ image_dir: Optional[Path],
1428
+ image_format: str,
1429
+ image_resolution: int,
1430
+ ) -> List[Dict[str, Any]]:
1431
+ """
1432
+ Gather analysis data from all pages in the PDF.
1433
+
1434
+ Args:
1435
+ analysis_keys: Keys in the analyses dictionary to export
1436
+ include_content: Whether to include extracted text
1437
+ include_images: Whether to export images
1438
+ image_dir: Directory to save images
1439
+ image_format: Format to save images
1440
+ image_resolution: Resolution for exported images
1441
+
1442
+ Returns:
1443
+ List of dictionaries containing analysis data
1444
+ """
1445
+ if not hasattr(self, "_pages") or not self._pages:
1446
+ logger.warning(f"No pages found in PDF {self.path}")
1447
+ return []
1448
+
1449
+ all_data = []
1450
+
1451
+ for page in tqdm(self._pages, desc="Gathering page data", leave=False):
1452
+ # Basic page information
1453
+ page_data = {
1454
+ "pdf_path": self.path,
1455
+ "page_number": page.number,
1456
+ "page_index": page.index,
1457
+ }
1458
+
1459
+ # Include extracted text if requested
1460
+ if include_content:
1461
+ try:
1462
+ page_data["content"] = page.extract_text(preserve_whitespace=True)
1463
+ except Exception as e:
1464
+ logger.error(f"Error extracting text from page {page.number}: {e}")
1465
+ page_data["content"] = ""
1466
+
1467
+ # Save image if requested
1468
+ if include_images:
1469
+ try:
1470
+ # Create image filename
1471
+ image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
1472
+ image_path = image_dir / image_filename
1473
+
1474
+ # Save image
1475
+ page.save_image(
1476
+ str(image_path), resolution=image_resolution, include_highlights=True
1477
+ )
1478
+
1479
+ # Add relative path to data
1480
+ page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
1481
+ except Exception as e:
1482
+ logger.error(f"Error saving image for page {page.number}: {e}")
1483
+ page_data["image_path"] = None
1484
+
1485
+ # Add analyses data
1486
+ for key in analysis_keys:
1487
+ if not hasattr(page, "analyses") or not page.analyses:
1488
+ raise ValueError(f"Page {page.number} does not have analyses data")
1489
+
1490
+ if key not in page.analyses:
1491
+ raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
1492
+
1493
+ # Get the analysis result
1494
+ analysis_result = page.analyses[key]
1495
+
1496
+ # If the result has a to_dict method, use it
1497
+ if hasattr(analysis_result, "to_dict"):
1498
+ analysis_data = analysis_result.to_dict()
1499
+ else:
1500
+ # Otherwise, use the result directly if it's dict-like
1501
+ try:
1502
+ analysis_data = dict(analysis_result)
1503
+ except (TypeError, ValueError):
1504
+ # Last resort: convert to string
1505
+ analysis_data = {"raw_result": str(analysis_result)}
1506
+
1507
+ # Add analysis data to page data with the key as prefix
1508
+ for k, v in analysis_data.items():
1509
+ page_data[f"{key}.{k}"] = v
1510
+
1511
+ all_data.append(page_data)
1512
+
1513
+ return all_data
1514
+
1515
+ def _get_target_pages(
1516
+ self, pages: Optional[Union[Iterable[int], range, slice]] = None
1517
+ ) -> List["Page"]:
1518
+ """
1519
+ Helper method to get a list of Page objects based on the input pages.
1520
+
1521
+ Args:
1522
+ pages: Page indices, slice, or None for all pages
1523
+
1524
+ Returns:
1525
+ List of Page objects
1526
+ """
1527
+ if pages is None:
1528
+ return self._pages
1529
+ elif isinstance(pages, slice):
1530
+ return self._pages[pages]
1531
+ elif hasattr(pages, "__iter__"):
1532
+ try:
1533
+ return [self._pages[i] for i in pages]
1534
+ except IndexError:
1535
+ raise ValueError("Invalid page index provided in 'pages' iterable.")
1536
+ except TypeError:
1537
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1538
+ else:
1539
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")