natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,11 +1,13 @@
1
- import copy # Add import for deepcopy
1
+ import copy
2
2
  import logging
3
3
  import os
4
4
  import re
5
5
  import tempfile
6
6
  import urllib.request
7
- from pathlib import Path # Added Path
8
- from typing import ( # Added Iterable and TYPE_CHECKING
7
+ import time
8
+ import threading
9
+ from pathlib import Path
10
+ from typing import (
9
11
  TYPE_CHECKING,
10
12
  Any,
11
13
  Callable,
@@ -17,29 +19,33 @@ from typing import ( # Added Iterable and TYPE_CHECKING
17
19
  Type,
18
20
  Union,
19
21
  )
20
- from pathlib import Path
21
-
22
+ from natural_pdf.utils.tqdm_utils import get_tqdm
22
23
 
23
24
  import pdfplumber
24
25
  from PIL import Image
25
26
 
26
- from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
27
- LayoutManager,
28
- )
29
- from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
27
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
28
+ from natural_pdf.core.highlighting_service import HighlightingService
30
29
  from natural_pdf.core.page import Page
31
30
  from natural_pdf.elements.collections import ElementCollection
32
31
  from natural_pdf.elements.region import Region
33
32
  from natural_pdf.ocr import OCRManager, OCROptions
34
33
  from natural_pdf.selectors.parser import parse_selector
35
34
 
36
- # Import the flag directly - this should always work
35
+ from natural_pdf.classification.manager import ClassificationManager
36
+ from natural_pdf.classification.manager import ClassificationError
37
+ from natural_pdf.classification.results import ClassificationResult
38
+ from natural_pdf.extraction.manager import StructuredDataManager
39
+
40
+ from natural_pdf.utils.locks import pdf_render_lock
41
+ from natural_pdf.elements.base import Element
42
+ from natural_pdf.classification.mixin import ClassificationMixin
43
+ from natural_pdf.extraction.mixin import ExtractionMixin
37
44
 
38
- # --- Add Search Service Imports (needed for new methods) ---
39
45
  try:
40
- from typing import Any as TypingAny # Import Any if not already
46
+ from typing import Any as TypingAny
41
47
 
42
- from natural_pdf.search import TextSearchOptions # Keep for ask default
48
+ from natural_pdf.search import TextSearchOptions
43
49
  from natural_pdf.search import (
44
50
  BaseSearchOptions,
45
51
  SearchOptions,
@@ -47,25 +53,24 @@ try:
47
53
  get_search_service,
48
54
  )
49
55
  except ImportError:
50
- # Define dummies if needed for type hints within the class
51
56
  SearchServiceProtocol = object
52
57
  SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
53
58
  TypingAny = object
54
59
 
55
- # Dummy factory needed for default arg in methods
56
60
  def get_search_service(**kwargs) -> SearchServiceProtocol:
57
61
  raise ImportError(
58
62
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
59
63
  )
60
64
 
61
-
62
- # --- End Search Service Imports ---
63
-
64
- # Set up logger early
65
65
  logger = logging.getLogger("natural_pdf.core.pdf")
66
+ tqdm = get_tqdm()
66
67
 
68
+ DEFAULT_MANAGERS = {
69
+ "classification": ClassificationManager,
70
+ "structured_data": StructuredDataManager,
71
+ }
67
72
 
68
- class PDF:
73
+ class PDF(ExtractionMixin):
69
74
  """
70
75
  Enhanced PDF wrapper built on top of pdfplumber.
71
76
 
@@ -86,35 +91,23 @@ class PDF:
86
91
  Args:
87
92
  path_or_url: Path to the PDF file or a URL to a PDF
88
93
  reading_order: Whether to use natural reading order
89
- font_attrs: Font attributes to consider when grouping characters into words.
90
- Default: ['fontname', 'size'] (Group by font name and size)
91
- None: Only consider spatial relationships
92
- List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
93
- keep_spaces: Whether to include spaces in word elements (default: True).
94
- True: Spaces are part of words, better for multi-word searching
95
- False: Break text at spaces, each word is separate (legacy behavior)
94
+ font_attrs: Font attributes for grouping characters into words
95
+ keep_spaces: Whether to include spaces in word elements
96
96
  """
97
- # Check if the input is a URL
98
97
  is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
99
98
 
100
- # Initialize path-related attributes
101
99
  self._original_path = path_or_url
102
100
  self._temp_file = None
103
- self._resolved_path = None # Store the actual path used by pdfplumber
101
+ self._resolved_path = None
104
102
 
105
103
  if is_url:
106
104
  logger.info(f"Downloading PDF from URL: {path_or_url}")
107
105
  try:
108
- # Create a temporary file to store the downloaded PDF
109
106
  self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
110
-
111
- # Download the PDF
112
107
  with urllib.request.urlopen(path_or_url) as response:
113
108
  self._temp_file.write(response.read())
114
109
  self._temp_file.flush()
115
110
  self._temp_file.close()
116
-
117
- # Use the temporary file path
118
111
  self._resolved_path = self._temp_file.name
119
112
  logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
120
113
  except Exception as e:
@@ -126,7 +119,6 @@ class PDF:
126
119
  logger.error(f"Failed to download PDF from URL: {e}")
127
120
  raise ValueError(f"Failed to download PDF from URL: {e}")
128
121
  else:
129
- # Use the provided path directly
130
122
  self._resolved_path = path_or_url
131
123
 
132
124
  logger.info(f"Initializing PDF from {self._resolved_path}")
@@ -137,42 +129,68 @@ class PDF:
137
129
  try:
138
130
  self._pdf = pdfplumber.open(self._resolved_path)
139
131
  except Exception as e:
140
- logger.error(
141
- f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
142
- exc_info=True,
143
- )
144
- # Clean up temp file if creation failed
132
+ logger.error(f"Failed to open PDF: {e}", exc_info=True)
145
133
  self.close()
146
134
  raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
147
135
 
148
- self._path = self._resolved_path # Keep original path too?
149
- self.path = self._resolved_path # Public attribute for the resolved path
150
- self.source_path = self._original_path # Public attribute for the user-provided path/URL
136
+ self._path = self._resolved_path
137
+ self.path = self._resolved_path
138
+ self.source_path = self._original_path
151
139
 
152
140
  self._reading_order = reading_order
153
141
  self._config = {"keep_spaces": keep_spaces}
142
+ self._font_attrs = font_attrs
154
143
 
155
- self._font_attrs = font_attrs # Store the font attribute configuration
156
-
157
- # Initialize Managers and Services (conditionally available)
158
144
  self._ocr_manager = OCRManager() if OCRManager else None
159
145
  self._layout_manager = LayoutManager() if LayoutManager else None
160
146
  self.highlighter = HighlightingService(self)
147
+ self._classification_manager_instance = ClassificationManager()
148
+ self._manager_registry = {}
161
149
 
162
- # Initialize pages last, passing necessary refs
163
150
  self._pages = [
164
151
  Page(p, parent=self, index=i, font_attrs=font_attrs)
165
152
  for i, p in enumerate(self._pdf.pages)
166
153
  ]
167
154
 
168
- # Other state
169
155
  self._element_cache = {}
170
- self._exclusions = [] # List to store exclusion functions/regions
171
- self._regions = [] # List to store region functions/definitions
156
+ self._exclusions = []
157
+ self._regions = []
172
158
 
173
- logger.info("Initialized HighlightingService.")
174
159
  logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
175
160
 
161
+ self._initialize_managers()
162
+ self._initialize_highlighter()
163
+
164
+ def _initialize_managers(self):
165
+ """Initialize manager instances based on DEFAULT_MANAGERS."""
166
+ self._managers = {}
167
+ for key, manager_class in DEFAULT_MANAGERS.items():
168
+ try:
169
+ self._managers[key] = manager_class()
170
+ logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
171
+ except Exception as e:
172
+ logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
173
+ self._managers[key] = None
174
+
175
+ def get_manager(self, key: str) -> Any:
176
+ """Retrieve a manager instance by its key."""
177
+ if key not in self._managers:
178
+ raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
179
+
180
+ manager_instance = self._managers.get(key)
181
+
182
+ if manager_instance is None:
183
+ manager_class = DEFAULT_MANAGERS.get(key)
184
+ if manager_class:
185
+ raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
186
+ else:
187
+ raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
188
+
189
+ return manager_instance
190
+
191
+ def _initialize_highlighter(self):
192
+ pass
193
+
176
194
  @property
177
195
  def metadata(self) -> Dict[str, Any]:
178
196
  """Access metadata as a dictionary."""
@@ -183,7 +201,6 @@ class PDF:
183
201
  """Access pages as a PageCollection object."""
184
202
  from natural_pdf.elements.collections import PageCollection
185
203
 
186
- # Ensure _pages is initialized
187
204
  if not hasattr(self, "_pages"):
188
205
  raise AttributeError("PDF pages not yet initialized.")
189
206
  return PageCollection(self._pages)
@@ -195,12 +212,10 @@ class PDF:
195
212
  Returns:
196
213
  Self for method chaining
197
214
  """
198
- # Ensure _pages is initialized
199
215
  if not hasattr(self, "_pages"):
200
216
  raise AttributeError("PDF pages not yet initialized.")
201
217
 
202
218
  self._exclusions = []
203
- # Also clear from pages
204
219
  for page in self._pages:
205
220
  page.clear_exclusions()
206
221
  return self
@@ -212,99 +227,75 @@ class PDF:
212
227
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
213
228
 
214
229
  Args:
215
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
230
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None
216
231
  label: Optional label for this exclusion
217
232
 
218
233
  Returns:
219
234
  Self for method chaining
220
235
  """
221
- # Ensure _pages is initialized
222
236
  if not hasattr(self, "_pages"):
223
237
  raise AttributeError("PDF pages not yet initialized.")
224
238
 
225
- # Store exclusion with its label at PDF level
226
239
  exclusion_data = (exclusion_func, label)
227
240
  self._exclusions.append(exclusion_data)
228
241
 
229
- # Apply this exclusion to all pages
230
242
  for page in self._pages:
231
- # We pass the original function, Page.add_exclusion handles calling it
232
243
  page.add_exclusion(exclusion_func, label=label)
233
244
 
234
245
  return self
235
246
 
236
247
  def apply_ocr(
237
248
  self,
238
- pages: Optional[Union[Iterable[int], range, slice]] = None,
239
249
  engine: Optional[str] = None,
240
- # --- Common OCR Parameters (Direct Arguments) ---
241
250
  languages: Optional[List[str]] = None,
242
- min_confidence: Optional[float] = None, # Min confidence threshold
251
+ min_confidence: Optional[float] = None,
243
252
  device: Optional[str] = None,
244
- resolution: Optional[int] = None, # DPI for rendering before OCR
245
- apply_exclusions: bool = True, # New parameter
253
+ resolution: Optional[int] = None,
254
+ apply_exclusions: bool = True,
246
255
  detect_only: bool = False,
247
- # --- Engine-Specific Options --- Use 'options=' for this
248
- options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
249
- # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
256
+ replace: bool = True,
257
+ options: Optional[Any] = None,
258
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
250
259
  ) -> "PDF":
251
260
  """
252
- Applies OCR to specified pages (or all pages) of the PDF using batch processing.
253
-
254
- This method renders the specified pages to images, sends them as a batch
255
- to the OCRManager, and adds the resulting TextElements to each respective page.
261
+ Applies OCR to specified pages of the PDF using batch processing.
256
262
 
257
263
  Args:
258
- pages: An iterable of 0-based page indices (list, range, tuple),
259
- a slice object, or None to process all pages.
260
- engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
261
- Uses manager's default ('easyocr') if None.
262
- languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
263
- **Must be codes understood by the specific selected engine.**
264
- No mapping is performed. Overrides manager/engine default.
265
- min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
266
- Overrides manager/engine default.
267
- device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
268
- Overrides manager/engine default.
269
- resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
270
- Affects input quality for OCR. Defaults to 150 if not set.
271
- apply_exclusions: If True (default), render page image for OCR with
272
- excluded areas masked (whited out). If False, OCR
273
- the raw page image without masking exclusions.
274
- detect_only: If True, only detect text bounding boxes, don't perform OCR.
275
- options: An engine-specific options object (e.g., EasyOCROptions) or dict
276
- containing parameters specific to the chosen engine.
264
+ engine: Name of the OCR engine
265
+ languages: List of language codes
266
+ min_confidence: Minimum confidence threshold
267
+ device: Device to run OCR on
268
+ resolution: DPI resolution for page images
269
+ apply_exclusions: Whether to mask excluded areas
270
+ detect_only: If True, only detect text boxes
271
+ replace: Whether to replace existing OCR elements
272
+ options: Engine-specific options
273
+ pages: Page indices to process or None for all pages
277
274
 
278
275
  Returns:
279
- Self for method chaining.
280
-
281
- Raises:
282
- ValueError: If page indices are invalid.
283
- TypeError: If 'options' is not compatible with the engine.
284
- RuntimeError: If the OCRManager or selected engine is not available.
276
+ Self for method chaining
285
277
  """
286
278
  if not self._ocr_manager:
287
279
  logger.error("OCRManager not available. Cannot apply OCR.")
288
- # Or raise RuntimeError("OCRManager not initialized.")
289
280
  return self
290
281
 
291
- # --- Determine Target Pages (unchanged) ---
292
- target_pages: List[Page] = []
282
+ thread_id = threading.current_thread().name
283
+ logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
284
+
285
+ target_pages = []
293
286
  if pages is None:
294
287
  target_pages = self._pages
295
288
  elif isinstance(pages, slice):
296
289
  target_pages = self._pages[pages]
297
- elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
290
+ elif hasattr(pages, "__iter__"):
298
291
  try:
299
292
  target_pages = [self._pages[i] for i in pages]
300
293
  except IndexError:
301
294
  raise ValueError("Invalid page index provided in 'pages' iterable.")
302
295
  except TypeError:
303
- raise TypeError(
304
- "'pages' must be None, a slice, or an iterable of page indices (int)."
305
- )
296
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
306
297
  else:
307
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
298
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
308
299
 
309
300
  if not target_pages:
310
301
  logger.warning("No pages selected for OCR processing.")
@@ -312,26 +303,20 @@ class PDF:
312
303
 
313
304
  page_numbers = [p.number for p in target_pages]
314
305
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
315
- # --- Determine Rendering Resolution ---
316
- # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
317
- final_resolution = resolution # Use direct arg if provided
318
- if final_resolution is None:
319
- final_resolution = getattr(self, "_config", {}).get("resolution", 150)
320
-
321
- logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
322
-
323
- # --- Render Images for Batch ---
324
- images_pil: List[Image.Image] = []
325
- page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
326
- logger.info(
327
- f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
328
- )
329
- failed_page_num = "unknown" # Keep track of potentially failing page
306
+
307
+ final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
308
+ logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
309
+
310
+ images_pil = []
311
+ page_image_map = []
312
+ logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
313
+ failed_page_num = "unknown"
314
+ render_start_time = time.monotonic()
315
+
330
316
  try:
331
- for i, page in enumerate(target_pages):
332
- failed_page_num = page.number # Update current page number in case of error
317
+ for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
318
+ failed_page_num = page.number
333
319
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
334
- # Use the determined final_resolution and apply exclusions if requested
335
320
  to_image_kwargs = {
336
321
  "resolution": final_resolution,
337
322
  "include_highlights": False,
@@ -340,68 +325,64 @@ class PDF:
340
325
  img = page.to_image(**to_image_kwargs)
341
326
  if img is None:
342
327
  logger.error(f" Failed to render page {page.number} to image.")
343
- # Decide how to handle: skip page, raise error? For now, skip.
344
- continue # Skip this page if rendering failed
328
+ continue
345
329
  images_pil.append(img)
346
- page_image_map.append((page, img)) # Store pair
330
+ page_image_map.append((page, img))
347
331
  except Exception as e:
348
- logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
332
+ logger.error(f"Failed to render pages for batch OCR: {e}")
349
333
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
334
+
335
+ render_end_time = time.monotonic()
336
+ logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
350
337
 
351
338
  if not images_pil or not page_image_map:
352
339
  logger.error("No images were successfully rendered for batch OCR.")
353
340
  return self
354
341
 
355
- # --- Prepare Arguments for Manager ---
356
- # Pass common args directly, engine-specific via options
357
342
  manager_args = {
358
343
  "images": images_pil,
359
344
  "engine": engine,
360
345
  "languages": languages,
361
- "min_confidence": min_confidence, # Use the renamed parameter
346
+ "min_confidence": min_confidence,
362
347
  "device": device,
363
348
  "options": options,
364
349
  "detect_only": detect_only,
365
- # Note: resolution is used for rendering, not passed to OCR manager directly
366
350
  }
367
- # Filter out None values so manager can use its defaults
368
351
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
369
352
 
370
- # --- Call OCR Manager for Batch Processing ---
371
- logger.info(
372
- f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
373
- )
353
+ ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
354
+ logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
355
+ ocr_start_time = time.monotonic()
356
+
374
357
  try:
375
- # Manager's apply_ocr signature needs to accept common args directly
376
358
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
377
359
 
378
360
  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
379
- logger.error(
380
- f"OCR Manager returned unexpected result format or length for batch processing. "
381
- f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
382
- f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
383
- )
361
+ logger.error(f"OCR Manager returned unexpected result format or length.")
384
362
  return self
385
363
 
386
364
  logger.info("OCR Manager batch processing complete.")
387
-
388
365
  except Exception as e:
389
- logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
366
+ logger.error(f"Batch OCR processing failed: {e}")
390
367
  return self
368
+
369
+ ocr_end_time = time.monotonic()
370
+ logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
391
371
 
392
- # --- Distribute Results and Add Elements to Pages (unchanged) ---
393
372
  logger.info("Adding OCR results to respective pages...")
394
373
  total_elements_added = 0
374
+
395
375
  for i, (page, img) in enumerate(page_image_map):
396
376
  results_for_page = batch_results[i]
397
377
  if not isinstance(results_for_page, list):
398
- logger.warning(
399
- f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
400
- )
378
+ logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
401
379
  continue
402
380
 
403
381
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
404
382
  try:
383
+ if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
384
+ page._element_mgr.remove_ocr_elements()
385
+
405
386
  img_scale_x = page.width / img.width if img.width > 0 else 1
406
387
  img_scale_y = page.height / img.height if img.height > 0 else 1
407
388
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -414,53 +395,39 @@ class PDF:
414
395
  else:
415
396
  logger.debug(f" No valid TextElements created for page {page.number}.")
416
397
  except Exception as e:
417
- logger.error(
418
- f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
419
- )
398
+ logger.error(f" Error adding OCR elements to page {page.number}: {e}")
420
399
 
421
- logger.info(
422
- f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
423
- )
400
+ logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
424
401
  return self
425
402
 
426
403
  def add_region(
427
404
  self, region_func: Callable[["Page"], Optional[Region]], name: str = None
428
405
  ) -> "PDF":
429
406
  """
430
- Add a region function to the PDF. This creates regions on all pages using the provided function.
407
+ Add a region function to the PDF.
431
408
 
432
409
  Args:
433
- region_func: A function that takes a Page and returns a Region, or None.
410
+ region_func: A function that takes a Page and returns a Region, or None
434
411
  name: Optional name for the region
435
412
 
436
413
  Returns:
437
414
  Self for method chaining
438
415
  """
439
- # Ensure _pages is initialized
440
416
  if not hasattr(self, "_pages"):
441
417
  raise AttributeError("PDF pages not yet initialized.")
442
418
 
443
- # Store region with its name at PDF level
444
419
  region_data = (region_func, name)
445
420
  self._regions.append(region_data)
446
421
 
447
- # Apply this region to all pages
448
422
  for page in self._pages:
449
423
  try:
450
- # Call the function to get the region for this specific page
451
424
  region_instance = region_func(page)
452
425
  if region_instance and isinstance(region_instance, Region):
453
- # If a valid region is returned, add it to the page
454
426
  page.add_region(region_instance, name=name, source="named")
455
427
  elif region_instance is not None:
456
- logger.warning(
457
- f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
458
- )
428
+ logger.warning(f"Region function did not return a valid Region for page {page.number}")
459
429
  except Exception as e:
460
- logger.error(
461
- f"Error executing or adding region function for page {page.number}: {e}",
462
- exc_info=True,
463
- )
430
+ logger.error(f"Error adding region for page {page.number}: {e}")
464
431
 
465
432
  return self
466
433
 
@@ -471,22 +438,19 @@ class PDF:
471
438
  Find the first element matching the selector.
472
439
 
473
440
  Args:
474
- selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
475
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
476
- regex: Whether to use regex for text search in :contains (default: False)
477
- case: Whether to do case-sensitive text search (default: True)
441
+ selector: CSS-like selector string
442
+ apply_exclusions: Whether to exclude elements in exclusion regions
443
+ regex: Whether to use regex for text search
444
+ case: Whether to do case-sensitive text search
478
445
  **kwargs: Additional filter parameters
479
446
 
480
447
  Returns:
481
448
  Element object or None if not found
482
449
  """
483
- # Ensure _pages is initialized
484
450
  if not hasattr(self, "_pages"):
485
451
  raise AttributeError("PDF pages not yet initialized.")
486
452
 
487
453
  selector_obj = parse_selector(selector)
488
-
489
- # Pass regex and case flags to selector function
490
454
  kwargs["regex"] = regex
491
455
  kwargs["case"] = case
492
456
 
@@ -502,22 +466,19 @@ class PDF:
502
466
  Find all elements matching the selector.
503
467
 
504
468
  Args:
505
- selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
506
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
507
- regex: Whether to use regex for text search in :contains (default: False)
508
- case: Whether to do case-sensitive text search (default: True)
469
+ selector: CSS-like selector string
470
+ apply_exclusions: Whether to exclude elements in exclusion regions
471
+ regex: Whether to use regex for text search
472
+ case: Whether to do case-sensitive text search
509
473
  **kwargs: Additional filter parameters
510
474
 
511
475
  Returns:
512
476
  ElementCollection with matching elements
513
477
  """
514
- # Ensure _pages is initialized
515
478
  if not hasattr(self, "_pages"):
516
479
  raise AttributeError("PDF pages not yet initialized.")
517
480
 
518
481
  selector_obj = parse_selector(selector)
519
-
520
- # Pass regex and case flags to selector function
521
482
  kwargs["regex"] = regex
522
483
  kwargs["case"] = case
523
484
 
@@ -534,8 +495,8 @@ class PDF:
534
495
 
535
496
  Args:
536
497
  selector_obj: Parsed selector dictionary
537
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
538
- first_only: If True, stop searching after the first match is found.
498
+ apply_exclusions: Whether to exclude elements in exclusion regions
499
+ first_only: If True, stop searching after the first match is found
539
500
  **kwargs: Additional filter parameters
540
501
 
541
502
  Returns:
@@ -543,57 +504,45 @@ class PDF:
543
504
  """
544
505
  from natural_pdf.elements.collections import ElementCollection
545
506
 
546
- # Determine page range to search
547
507
  page_indices = kwargs.get("pages", range(len(self._pages)))
548
508
  if isinstance(page_indices, int):
549
509
  page_indices = [page_indices]
550
510
  elif isinstance(page_indices, slice):
551
511
  page_indices = range(*page_indices.indices(len(self._pages)))
552
512
 
553
- # Check for cross-page pseudo-classes (currently not supported)
554
513
  for pseudo in selector_obj.get("pseudo_classes", []):
555
514
  if pseudo.get("name") in ("spans", "continues"):
556
515
  logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
557
516
  return ElementCollection([])
558
517
 
559
- # Regular case: collect elements from each page
560
518
  all_elements = []
561
519
  for page_idx in page_indices:
562
520
  if 0 <= page_idx < len(self._pages):
563
521
  page = self._pages[page_idx]
564
- # Pass first_only down to page._apply_selector
565
522
  page_elements_collection = page._apply_selector(
566
523
  selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
567
524
  )
568
525
  if page_elements_collection:
569
526
  page_elements = page_elements_collection.elements
570
527
  all_elements.extend(page_elements)
571
- # If we only need the first match overall, and we found one on this page, stop
572
528
  if first_only and page_elements:
573
- break # Stop iterating through pages
529
+ break
574
530
  else:
575
531
  logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
576
532
 
577
- # Create a combined collection
578
533
  combined = ElementCollection(all_elements)
579
534
 
580
- # Sort in document order if requested and not first_only (already sorted by page)
581
535
  if not first_only and kwargs.get("document_order", True):
582
- # Check if elements have page, top, x0 before sorting
583
536
  if all(
584
537
  hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
585
538
  for el in combined.elements
586
539
  ):
587
540
  combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
588
541
  else:
589
- # Elements might be Regions without inherent sorting order yet
590
- # Attempt sorting by page index if possible
591
542
  try:
592
543
  combined.sort(key=lambda el: el.page.index)
593
544
  except AttributeError:
594
- logger.warning(
595
- "Cannot sort elements in document order: Missing required attributes (e.g., page)."
596
- )
545
+ logger.warning("Cannot sort elements in document order: Missing required attributes.")
597
546
 
598
547
  return combined
599
548
 
@@ -610,24 +559,21 @@ class PDF:
610
559
 
611
560
  Args:
612
561
  selector: Optional selector to filter elements
613
- preserve_whitespace: Whether to keep blank characters (default: True)
614
- use_exclusions: Whether to apply exclusion regions (default: True)
615
- debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
562
+ preserve_whitespace: Whether to keep blank characters
563
+ use_exclusions: Whether to apply exclusion regions
564
+ debug_exclusions: Whether to output detailed debugging for exclusions
616
565
  **kwargs: Additional extraction parameters
617
566
 
618
567
  Returns:
619
568
  Extracted text as string
620
569
  """
621
- # Ensure _pages is initialized
622
570
  if not hasattr(self, "_pages"):
623
571
  raise AttributeError("PDF pages not yet initialized.")
624
572
 
625
- # If selector is provided, find elements first
626
573
  if selector:
627
574
  elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
628
575
  return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
629
576
 
630
- # Otherwise extract from all pages
631
577
  if debug_exclusions:
632
578
  print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
633
579
  print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
@@ -648,25 +594,6 @@ class PDF:
648
594
 
649
595
  return "\n".join(texts)
650
596
 
651
- def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
652
- """
653
- Shorthand for finding elements and extracting their text.
654
-
655
- Args:
656
- selector: CSS-like selector string
657
- preserve_whitespace: Whether to keep blank characters (default: True)
658
- **kwargs: Additional extraction parameters
659
-
660
- Returns:
661
- Extracted text from matching elements
662
- """
663
- # Ensure _pages is initialized
664
- if not hasattr(self, "_pages"):
665
- raise AttributeError("PDF pages not yet initialized.")
666
- return self.extract_text(
667
- selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
668
- ) # apply_exclusions is handled by find_all in extract_text
669
-
670
597
  def extract_tables(
671
598
  self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
672
599
  ) -> List[Any]:
@@ -681,54 +608,43 @@ class PDF:
681
608
  Returns:
682
609
  List of extracted tables
683
610
  """
684
- # Ensure _pages is initialized
685
611
  if not hasattr(self, "_pages"):
686
612
  raise AttributeError("PDF pages not yet initialized.")
687
- # TODO: Implement table extraction
613
+
688
614
  logger.warning("PDF.extract_tables is not fully implemented yet.")
689
615
  all_tables = []
616
+
690
617
  for page in self.pages:
691
- # Assuming page.extract_tables(**kwargs) exists or is added
692
618
  if hasattr(page, "extract_tables"):
693
619
  all_tables.extend(page.extract_tables(**kwargs))
694
620
  else:
695
621
  logger.debug(f"Page {page.number} does not have extract_tables method.")
696
- # Placeholder filtering
622
+
697
623
  if selector:
698
624
  logger.warning("Filtering extracted tables by selector is not implemented.")
699
- # Would need to parse selector and filter the list `all_tables`
700
- # Placeholder merging
625
+
701
626
  if merge_across_pages:
702
627
  logger.warning("Merging tables across pages is not implemented.")
703
- # Would need logic to detect and merge related tables
628
+
704
629
  return all_tables
705
630
 
706
- # --- New Method: save_searchable ---
707
631
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
708
632
  """
709
633
  Saves the PDF with an OCR text layer, making content searchable.
710
634
 
711
635
  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
712
636
 
713
- Note: OCR must have been applied to the pages beforehand
714
- (e.g., using pdf.apply_ocr()).
715
-
716
637
  Args:
717
- output_path: Path to save the searchable PDF.
718
- dpi: Resolution for rendering and OCR overlay (default 300).
719
- **kwargs: Additional keyword arguments passed to the exporter.
638
+ output_path: Path to save the searchable PDF
639
+ dpi: Resolution for rendering and OCR overlay
640
+ **kwargs: Additional keyword arguments passed to the exporter
720
641
  """
721
- # Import moved here, assuming it's always available now
722
642
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
723
643
 
724
- # Convert pathlib.Path to string if necessary
725
644
  output_path_str = str(output_path)
726
-
727
645
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
728
646
  logger.info(f"Searchable PDF saved to: {output_path_str}")
729
647
 
730
- # --- End New Method ---
731
-
732
648
  def ask(
733
649
  self,
734
650
  question: str,
@@ -750,27 +666,21 @@ class PDF:
750
666
  **kwargs: Additional parameters passed to the QA engine
751
667
 
752
668
  Returns:
753
- A dictionary containing the answer, confidence, and other metadata.
754
- Result will have an 'answer' key containing the answer text.
669
+ A dictionary containing the answer, confidence, and other metadata
755
670
  """
756
671
  from natural_pdf.qa import get_qa_engine
757
672
 
758
- # Initialize or get QA engine
759
673
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
760
674
 
761
- # Determine which pages to query
762
675
  if pages is None:
763
676
  target_pages = list(range(len(self.pages)))
764
677
  elif isinstance(pages, int):
765
- # Single page
766
678
  target_pages = [pages]
767
679
  elif isinstance(pages, (list, range)):
768
- # List or range of pages
769
680
  target_pages = pages
770
681
  else:
771
682
  raise ValueError(f"Invalid pages parameter: {pages}")
772
683
 
773
- # Actually query each page and gather results
774
684
  results = []
775
685
  for page_idx in target_pages:
776
686
  if 0 <= page_idx < len(self.pages):
@@ -779,211 +689,148 @@ class PDF:
779
689
  page=page, question=question, min_confidence=min_confidence, **kwargs
780
690
  )
781
691
 
782
- # Add to results if it found an answer
783
692
  if page_result and page_result.get("found", False):
784
693
  results.append(page_result)
785
694
 
786
- # Sort results by confidence
787
695
  results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
788
696
 
789
- # Return the best result, or a default result if none found
790
697
  if results:
791
698
  return results[0]
792
699
  else:
793
- # Return a structure indicating no answer found
794
700
  return {
795
701
  "answer": None,
796
702
  "confidence": 0.0,
797
703
  "found": False,
798
- "page_num": None, # Or maybe the pages searched?
704
+ "page_num": None,
799
705
  "source_elements": [],
800
706
  }
801
707
 
802
708
  def search_within_index(
803
709
  self,
804
710
  query: Union[str, Path, Image.Image, Region],
805
- search_service: SearchServiceProtocol, # Now required
711
+ search_service: SearchServiceProtocol,
806
712
  options: Optional[SearchOptions] = None,
807
713
  ) -> List[Dict[str, Any]]:
808
714
  """
809
- Finds relevant documents specifically originating from THIS PDF document
810
- within a search index managed by the provided SearchService.
811
-
812
- This method uses a pre-configured SearchService instance and adds
813
- a filter to the search query to scope results only to pages from
814
- this specific PDF object (based on its resolved path).
715
+ Finds relevant documents from this PDF within a search index.
815
716
 
816
717
  Args:
817
- query: The search query (text, image path, PIL Image, Region).
818
- search_service: A pre-configured SearchService instance pointing to the
819
- index where this PDF's content (or related content)
820
- is expected to be found.
821
- options: Optional SearchOptions to configure the query (top_k, filters, etc.).
822
- Any existing filters in `options` will be combined with the
823
- PDF-scoping filter using an 'AND' condition.
718
+ query: The search query (text, image path, PIL Image, Region)
719
+ search_service: A pre-configured SearchService instance
720
+ options: Optional SearchOptions to configure the query
824
721
 
825
722
  Returns:
826
- A list of result dictionaries, sorted by relevance, containing only
827
- results originating from this PDF's pages.
723
+ A list of result dictionaries, sorted by relevance
828
724
 
829
725
  Raises:
830
- ImportError: If search dependencies are not installed.
831
- ValueError: If search_service is None.
832
- TypeError: If search_service does not conform to the protocol.
833
- FileNotFoundError: If the collection managed by the service does not exist.
834
- RuntimeError: For other search failures.
726
+ ImportError: If search dependencies are not installed
727
+ ValueError: If search_service is None
728
+ TypeError: If search_service does not conform to the protocol
729
+ FileNotFoundError: If the collection managed by the service does not exist
730
+ RuntimeError: For other search failures
835
731
  """
836
732
  if not search_service:
837
733
  raise ValueError("A configured SearchServiceProtocol instance must be provided.")
838
- # Optional stricter check:
839
- # if not isinstance(search_service, SearchServiceProtocol):
840
- # raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
841
734
 
842
- # Get collection name from service for logging
843
735
  collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
844
- logger.info(
845
- f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
846
- )
736
+ logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
737
+
738
+ service = search_service
847
739
 
848
- # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
849
- # service: SearchServiceProtocol
850
- # if search_service:
851
- # service = search_service
852
- # else:
853
- # logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
854
- # factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
855
- # # TODO: Pass embedding model from options/pdf config if needed?
856
- # service = get_search_service(**factory_args)
857
- service = search_service # Use validated provided service
858
-
859
- # --- 2. Prepare Query and Options ---
860
740
  query_input = query
861
- # Resolve options (use default TextSearch if none provided)
862
741
  effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
863
742
 
864
- # Handle Region query - extract text for now
865
743
  if isinstance(query, Region):
866
744
  logger.debug("Query is a Region object. Extracting text.")
867
745
  if not isinstance(effective_options, TextSearchOptions):
868
- logger.warning(
869
- "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
870
- )
746
+ logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
871
747
  query_input = query.extract_text()
872
748
  if not query_input or query_input.isspace():
873
749
  logger.error("Region has no extractable text for query.")
874
750
  return []
875
751
 
876
- # --- 3. Add Filter to Scope Search to THIS PDF ---
877
- # Assume metadata field 'pdf_path' stores the resolved path used during indexing
752
+ # Add filter to scope search to THIS PDF
878
753
  pdf_scope_filter = {
879
- "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
754
+ "field": "pdf_path",
880
755
  "operator": "eq",
881
- "value": self.path, # Use the resolved path of this PDF instance
756
+ "value": self.path,
882
757
  }
883
758
  logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
884
759
 
885
760
  # Combine with existing filters in options (if any)
886
761
  if effective_options.filters:
887
- logger.debug(
888
- f"Combining PDF scope filter with existing filters: {effective_options.filters}"
889
- )
890
- # Assume filters are compatible with the underlying search service
891
- # If existing filters aren't already in an AND block, wrap them
892
- if (
893
- isinstance(effective_options.filters, dict)
894
- and effective_options.filters.get("operator") == "AND"
895
- ):
896
- # Already an AND block, just append the condition
762
+ logger.debug(f"Combining PDF scope filter with existing filters")
763
+ if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
897
764
  effective_options.filters["conditions"].append(pdf_scope_filter)
898
765
  elif isinstance(effective_options.filters, list):
899
- # Assume list represents implicit AND conditions
900
766
  effective_options.filters = {
901
767
  "operator": "AND",
902
768
  "conditions": effective_options.filters + [pdf_scope_filter],
903
769
  }
904
- elif isinstance(effective_options.filters, dict): # Single filter dict
770
+ elif isinstance(effective_options.filters, dict):
905
771
  effective_options.filters = {
906
772
  "operator": "AND",
907
773
  "conditions": [effective_options.filters, pdf_scope_filter],
908
774
  }
909
775
  else:
910
- logger.warning(
911
- f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
912
- )
776
+ logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
913
777
  effective_options.filters = pdf_scope_filter
914
778
  else:
915
779
  effective_options.filters = pdf_scope_filter
916
780
 
917
781
  logger.debug(f"Final filters for service search: {effective_options.filters}")
918
782
 
919
- # --- 4. Call SearchService ---
920
783
  try:
921
- # Call the service's search method (no collection_name needed)
922
784
  results = service.search(
923
785
  query=query_input,
924
786
  options=effective_options,
925
787
  )
926
- logger.info(
927
- f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
928
- )
788
+ logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
929
789
  return results
930
790
  except FileNotFoundError as fnf:
931
- logger.error(
932
- f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
933
- )
934
- raise # Re-raise specific error
791
+ logger.error(f"Search failed: Collection not found. Error: {fnf}")
792
+ raise
935
793
  except Exception as e:
936
- logger.error(
937
- f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
938
- exc_info=True,
939
- )
940
- raise RuntimeError(
941
- f"Search within index failed for PDF '{self.path}'. See logs for details."
942
- ) from e
794
+ logger.error(f"SearchService search failed: {e}")
795
+ raise RuntimeError(f"Search within index failed. See logs for details.") from e
943
796
 
944
797
  def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
945
798
  """
946
- Exports OCR results from this PDF into a correction task package (zip file).
799
+ Exports OCR results from this PDF into a correction task package.
947
800
 
948
801
  Args:
949
- output_zip_path: The path to save the output zip file.
802
+ output_zip_path: The path to save the output zip file
950
803
  **kwargs: Additional arguments passed to create_correction_task_package
951
- (e.g., image_render_scale, overwrite).
952
804
  """
953
805
  try:
954
806
  from natural_pdf.utils.packaging import create_correction_task_package
955
-
956
807
  create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
957
808
  except ImportError:
958
- logger.error(
959
- "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
960
- )
961
- # Or raise
809
+ logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
962
810
  except Exception as e:
963
- logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
964
- raise # Re-raise the exception from the utility function
811
+ logger.error(f"Failed to export correction task: {e}")
812
+ raise
965
813
 
966
814
  def correct_ocr(
967
815
  self,
968
816
  correction_callback: Callable[[Any], Optional[str]],
969
817
  pages: Optional[Union[Iterable[int], range, slice]] = None,
970
- ) -> "PDF": # Return self for chaining
818
+ max_workers: Optional[int] = None,
819
+ progress_callback: Optional[Callable[[], None]] = None,
820
+ ) -> "PDF":
971
821
  """
972
- Applies corrections to OCR-generated text elements using a callback function,
973
- delegating the core work to the `Page.correct_ocr` method.
822
+ Applies corrections to OCR text elements using a callback function.
974
823
 
975
824
  Args:
976
- correction_callback: A function that accepts a single argument (an element
977
- object) and returns `Optional[str]`. It returns the
978
- corrected text string if an update is needed, otherwise None.
825
+ correction_callback: Function that takes an element and returns corrected text or None
979
826
  pages: Optional page indices/slice to limit the scope of correction
980
- (default: all pages).
827
+ max_workers: Maximum number of threads to use for parallel execution
828
+ progress_callback: Optional callback function for progress updates
981
829
 
982
830
  Returns:
983
- Self for method chaining.
831
+ Self for method chaining
984
832
  """
985
- # Determine target pages
986
- target_page_indices: List[int] = []
833
+ target_page_indices = []
987
834
  if pages is None:
988
835
  target_page_indices = list(range(len(self._pages)))
989
836
  elif isinstance(pages, slice):
@@ -991,56 +838,49 @@ class PDF:
991
838
  elif hasattr(pages, "__iter__"):
992
839
  try:
993
840
  target_page_indices = [int(i) for i in pages]
994
- # Validate indices
995
841
  for idx in target_page_indices:
996
842
  if not (0 <= idx < len(self._pages)):
997
843
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
998
844
  except (IndexError, TypeError, ValueError) as e:
999
- raise ValueError(
1000
- f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
1001
- ) from e
845
+ raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1002
846
  else:
1003
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
847
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1004
848
 
1005
849
  if not target_page_indices:
1006
850
  logger.warning("No pages selected for OCR correction.")
1007
851
  return self
1008
852
 
1009
- logger.info(
1010
- f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
1011
- )
853
+ logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1012
854
 
1013
- # Iterate through target pages and call their correct_ocr method
1014
855
  for page_idx in target_page_indices:
1015
856
  page = self._pages[page_idx]
1016
857
  try:
1017
- page.correct_ocr(correction_callback=correction_callback)
858
+ page.correct_ocr(
859
+ correction_callback=correction_callback,
860
+ max_workers=max_workers,
861
+ progress_callback=progress_callback,
862
+ )
1018
863
  except Exception as e:
1019
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
1020
- # Optionally re-raise or just log and continue
864
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1021
865
 
1022
- logger.info(f"OCR correction process finished for requested pages.")
866
+ logger.info("OCR correction process finished.")
1023
867
  return self
1024
868
 
1025
869
  def __len__(self) -> int:
1026
870
  """Return the number of pages in the PDF."""
1027
- # Ensure _pages is initialized
1028
871
  if not hasattr(self, "_pages"):
1029
- # Return 0 or raise error if not fully initialized? Let's return 0.
1030
872
  return 0
1031
873
  return len(self._pages)
1032
874
 
1033
- def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
875
+ def __getitem__(self, key) -> Union[Page, "PageCollection"]:
1034
876
  """Access pages by index or slice."""
1035
- # Check if self._pages has been initialized
1036
877
  if not hasattr(self, "_pages"):
1037
878
  raise AttributeError("PDF pages not initialized yet.")
879
+
1038
880
  if isinstance(key, slice):
1039
- # Return a PageCollection slice
1040
881
  from natural_pdf.elements.collections import PageCollection
1041
-
1042
882
  return PageCollection(self._pages[key])
1043
- # Check index bounds before accessing
883
+
1044
884
  if isinstance(key, int):
1045
885
  if 0 <= key < len(self._pages):
1046
886
  return self._pages[key]
@@ -1054,13 +894,12 @@ class PDF:
1054
894
  if hasattr(self, "_pdf") and self._pdf is not None:
1055
895
  try:
1056
896
  self._pdf.close()
1057
- logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
897
+ logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
1058
898
  except Exception as e:
1059
899
  logger.warning(f"Error closing pdfplumber object: {e}")
1060
900
  finally:
1061
901
  self._pdf = None
1062
902
 
1063
- # Clean up temporary file if it exists
1064
903
  if hasattr(self, "_temp_file") and self._temp_file is not None:
1065
904
  temp_file_path = None
1066
905
  try:
@@ -1070,7 +909,7 @@ class PDF:
1070
909
  os.unlink(temp_file_path)
1071
910
  logger.debug(f"Removed temporary PDF file: {temp_file_path}")
1072
911
  except Exception as e:
1073
- logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
912
+ logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
1074
913
  finally:
1075
914
  self._temp_file = None
1076
915
 
@@ -1082,6 +921,176 @@ class PDF:
1082
921
  """Context manager exit."""
1083
922
  self.close()
1084
923
 
1085
- # --- Indexable Protocol Methods --- Needed for search/sync
1086
924
  def get_id(self) -> str:
925
+ """Get unique identifier for this PDF."""
1087
926
  return self.path
927
+
928
+ # --- Classification Methods --- #
929
+
930
+ def classify_pages(
931
+ self,
932
+ categories: List[str],
933
+ model: Optional[str] = None,
934
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
935
+ analysis_key: str = "classification",
936
+ using: Optional[str] = None,
937
+ **kwargs,
938
+ ) -> "PDF":
939
+ """
940
+ Classifies specified pages of the PDF.
941
+
942
+ Args:
943
+ categories: List of category names
944
+ model: Model identifier ('text', 'vision', or specific HF ID)
945
+ pages: Page indices, slice, or None for all pages
946
+ analysis_key: Key to store results in page's analyses dict
947
+ using: Processing mode ('text' or 'vision')
948
+ **kwargs: Additional arguments for the ClassificationManager
949
+
950
+ Returns:
951
+ Self for method chaining
952
+ """
953
+ if not categories:
954
+ raise ValueError("Categories list cannot be empty.")
955
+
956
+ try:
957
+ manager = self.get_manager('classification')
958
+ except (ValueError, RuntimeError) as e:
959
+ raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
960
+
961
+ if not manager or not manager.is_available():
962
+ try:
963
+ from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
964
+ if not _CLASSIFICATION_AVAILABLE:
965
+ raise ImportError("Classification dependencies missing.")
966
+ except ImportError:
967
+ raise ImportError(
968
+ "Classification dependencies missing. "
969
+ "Install with: pip install \"natural-pdf[classification]\""
970
+ )
971
+ raise ClassificationError("ClassificationManager not available.")
972
+
973
+ target_pages = []
974
+ if pages is None:
975
+ target_pages = self._pages
976
+ elif isinstance(pages, slice):
977
+ target_pages = self._pages[pages]
978
+ elif hasattr(pages, "__iter__"):
979
+ try:
980
+ target_pages = [self._pages[i] for i in pages]
981
+ except IndexError:
982
+ raise ValueError("Invalid page index provided.")
983
+ except TypeError:
984
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
985
+ else:
986
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
987
+
988
+ if not target_pages:
989
+ logger.warning("No pages selected for classification.")
990
+ return self
991
+
992
+ inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
993
+ logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
994
+
995
+ page_contents = []
996
+ pages_to_classify = []
997
+ logger.debug(f"Gathering content for {len(target_pages)} pages...")
998
+
999
+ for page in target_pages:
1000
+ try:
1001
+ content = page._get_classification_content(model_type=inferred_using, **kwargs)
1002
+ page_contents.append(content)
1003
+ pages_to_classify.append(page)
1004
+ except ValueError as e:
1005
+ logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
1006
+ except Exception as e:
1007
+ logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
1008
+
1009
+ if not page_contents:
1010
+ logger.warning("No content could be gathered for batch classification.")
1011
+ return self
1012
+
1013
+ logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
1014
+
1015
+ try:
1016
+ batch_results = manager.classify_batch(
1017
+ item_contents=page_contents,
1018
+ categories=categories,
1019
+ model_id=model,
1020
+ using=inferred_using,
1021
+ **kwargs,
1022
+ )
1023
+ except Exception as e:
1024
+ logger.error(f"Batch classification failed: {e}")
1025
+ raise ClassificationError(f"Batch classification failed: {e}") from e
1026
+
1027
+ if len(batch_results) != len(pages_to_classify):
1028
+ logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
1029
+ return self
1030
+
1031
+ logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
1032
+ for page, result_obj in zip(pages_to_classify, batch_results):
1033
+ try:
1034
+ if not hasattr(page, 'analyses') or page.analyses is None:
1035
+ page.analyses = {}
1036
+ page.analyses[analysis_key] = result_obj
1037
+ except Exception as e:
1038
+ logger.warning(f"Failed to store classification results for page {page.number}: {e}")
1039
+
1040
+ logger.info(f"Finished classifying PDF pages.")
1041
+ return self
1042
+
1043
+ # --- End Classification Methods --- #
1044
+
1045
+ # --- Extraction Support --- #
1046
+ def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
1047
+ """
1048
+ Retrieves the content for the entire PDF.
1049
+
1050
+ Args:
1051
+ using: 'text' or 'vision'
1052
+ **kwargs: Additional arguments passed to extract_text or page.to_image
1053
+
1054
+ Returns:
1055
+ str: Extracted text if using='text'
1056
+ List[PIL.Image.Image]: List of page images if using='vision'
1057
+ None: If content cannot be retrieved
1058
+ """
1059
+ if using == 'text':
1060
+ try:
1061
+ layout = kwargs.pop('layout', True)
1062
+ return self.extract_text(layout=layout, **kwargs)
1063
+ except Exception as e:
1064
+ logger.error(f"Error extracting text from PDF: {e}")
1065
+ return None
1066
+ elif using == 'vision':
1067
+ page_images = []
1068
+ logger.info(f"Rendering {len(self.pages)} pages to images...")
1069
+
1070
+ resolution = kwargs.pop('resolution', 72)
1071
+ include_highlights = kwargs.pop('include_highlights', False)
1072
+ labels = kwargs.pop('labels', False)
1073
+
1074
+ try:
1075
+ for page in tqdm(self.pages, desc="Rendering Pages"):
1076
+ img = page.to_image(
1077
+ resolution=resolution,
1078
+ include_highlights=include_highlights,
1079
+ labels=labels,
1080
+ **kwargs
1081
+ )
1082
+ if img:
1083
+ page_images.append(img)
1084
+ else:
1085
+ logger.warning(f"Failed to render page {page.number}, skipping.")
1086
+ if not page_images:
1087
+ logger.error("Failed to render any pages.")
1088
+ return None
1089
+ return page_images
1090
+ except Exception as e:
1091
+ logger.error(f"Error rendering pages: {e}")
1092
+ return None
1093
+ else:
1094
+ logger.error(f"Unsupported value for 'using': {using}")
1095
+ return None
1096
+ # --- End Extraction Support --- #