natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,11 +1,13 @@
- import copy # Add import for deepcopy
+ import copy
  import logging
  import os
  import re
  import tempfile
  import urllib.request
- from pathlib import Path # Added Path
- from typing import ( # Added Iterable and TYPE_CHECKING
+ import time
+ import threading
+ from pathlib import Path
+ from typing import (
  TYPE_CHECKING,
  Any,
  Callable,
@@ -17,29 +19,33 @@ from typing import ( # Added Iterable and TYPE_CHECKING
  Type,
  Union,
  )
- from pathlib import Path
-
+ from natural_pdf.utils.tqdm_utils import get_tqdm

  import pdfplumber
  from PIL import Image

- from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
- LayoutManager,
- )
- from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+ from natural_pdf.core.highlighting_service import HighlightingService
  from natural_pdf.core.page import Page
  from natural_pdf.elements.collections import ElementCollection
  from natural_pdf.elements.region import Region
  from natural_pdf.ocr import OCRManager, OCROptions
  from natural_pdf.selectors.parser import parse_selector

- # Import the flag directly - this should always work
+ from natural_pdf.classification.manager import ClassificationManager
+ from natural_pdf.classification.manager import ClassificationError
+ from natural_pdf.classification.results import ClassificationResult
+ from natural_pdf.extraction.manager import StructuredDataManager
+
+ from natural_pdf.utils.locks import pdf_render_lock
+ from natural_pdf.elements.base import Element
+ from natural_pdf.classification.mixin import ClassificationMixin
+ from natural_pdf.extraction.mixin import ExtractionMixin

- # --- Add Search Service Imports (needed for new methods) ---
  try:
- from typing import Any as TypingAny # Import Any if not already
+ from typing import Any as TypingAny

- from natural_pdf.search import TextSearchOptions # Keep for ask default
+ from natural_pdf.search import TextSearchOptions
  from natural_pdf.search import (
  BaseSearchOptions,
  SearchOptions,
@@ -47,25 +53,24 @@ try:
  get_search_service,
  )
  except ImportError:
- # Define dummies if needed for type hints within the class
  SearchServiceProtocol = object
  SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
  TypingAny = object

- # Dummy factory needed for default arg in methods
  def get_search_service(**kwargs) -> SearchServiceProtocol:
  raise ImportError(
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
  )

-
- # --- End Search Service Imports ---
-
- # Set up logger early
  logger = logging.getLogger("natural_pdf.core.pdf")
+ tqdm = get_tqdm()

+ DEFAULT_MANAGERS = {
+ "classification": ClassificationManager,
+ "structured_data": StructuredDataManager,
+ }

- class PDF:
+ class PDF(ExtractionMixin):
  """
  Enhanced PDF wrapper built on top of pdfplumber.

@@ -86,35 +91,23 @@ class PDF:
  Args:
  path_or_url: Path to the PDF file or a URL to a PDF
  reading_order: Whether to use natural reading order
- font_attrs: Font attributes to consider when grouping characters into words.
- Default: ['fontname', 'size'] (Group by font name and size)
- None: Only consider spatial relationships
- List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
- keep_spaces: Whether to include spaces in word elements (default: True).
- True: Spaces are part of words, better for multi-word searching
- False: Break text at spaces, each word is separate (legacy behavior)
+ font_attrs: Font attributes for grouping characters into words
+ keep_spaces: Whether to include spaces in word elements
  """
- # Check if the input is a URL
  is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")

- # Initialize path-related attributes
  self._original_path = path_or_url
  self._temp_file = None
- self._resolved_path = None # Store the actual path used by pdfplumber
+ self._resolved_path = None

  if is_url:
  logger.info(f"Downloading PDF from URL: {path_or_url}")
  try:
- # Create a temporary file to store the downloaded PDF
  self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
-
- # Download the PDF
  with urllib.request.urlopen(path_or_url) as response:
  self._temp_file.write(response.read())
  self._temp_file.flush()
  self._temp_file.close()
-
- # Use the temporary file path
  self._resolved_path = self._temp_file.name
  logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
  except Exception as e:
@@ -126,7 +119,6 @@ class PDF:
  logger.error(f"Failed to download PDF from URL: {e}")
  raise ValueError(f"Failed to download PDF from URL: {e}")
  else:
- # Use the provided path directly
  self._resolved_path = path_or_url

  logger.info(f"Initializing PDF from {self._resolved_path}")
@@ -137,42 +129,68 @@ class PDF:
  try:
  self._pdf = pdfplumber.open(self._resolved_path)
  except Exception as e:
- logger.error(
- f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
- exc_info=True,
- )
- # Clean up temp file if creation failed
+ logger.error(f"Failed to open PDF: {e}", exc_info=True)
  self.close()
  raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e

- self._path = self._resolved_path # Keep original path too?
- self.path = self._resolved_path # Public attribute for the resolved path
- self.source_path = self._original_path # Public attribute for the user-provided path/URL
+ self._path = self._resolved_path
+ self.path = self._resolved_path
+ self.source_path = self._original_path

  self._reading_order = reading_order
  self._config = {"keep_spaces": keep_spaces}
+ self._font_attrs = font_attrs

- self._font_attrs = font_attrs # Store the font attribute configuration
-
- # Initialize Managers and Services (conditionally available)
  self._ocr_manager = OCRManager() if OCRManager else None
  self._layout_manager = LayoutManager() if LayoutManager else None
  self.highlighter = HighlightingService(self)
+ self._classification_manager_instance = ClassificationManager()
+ self._manager_registry = {}

- # Initialize pages last, passing necessary refs
  self._pages = [
  Page(p, parent=self, index=i, font_attrs=font_attrs)
  for i, p in enumerate(self._pdf.pages)
  ]

- # Other state
  self._element_cache = {}
- self._exclusions = [] # List to store exclusion functions/regions
- self._regions = [] # List to store region functions/definitions
+ self._exclusions = []
+ self._regions = []

- logger.info("Initialized HighlightingService.")
  logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")

+ self._initialize_managers()
+ self._initialize_highlighter()
+
+ def _initialize_managers(self):
+ """Initialize manager instances based on DEFAULT_MANAGERS."""
+ self._managers = {}
+ for key, manager_class in DEFAULT_MANAGERS.items():
+ try:
+ self._managers[key] = manager_class()
+ logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
+ except Exception as e:
+ logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
+ self._managers[key] = None
+
+ def get_manager(self, key: str) -> Any:
+ """Retrieve a manager instance by its key."""
+ if key not in self._managers:
+ raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
+
+ manager_instance = self._managers.get(key)
+
+ if manager_instance is None:
+ manager_class = DEFAULT_MANAGERS.get(key)
+ if manager_class:
+ raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
+ else:
+ raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
+
+ return manager_instance
+
+ def _initialize_highlighter(self):
+ pass
+
  @property
  def metadata(self) -> Dict[str, Any]:
  """Access metadata as a dictionary."""
@@ -183,7 +201,6 @@ class PDF:
  """Access pages as a PageCollection object."""
  from natural_pdf.elements.collections import PageCollection

- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")
  return PageCollection(self._pages)
@@ -195,12 +212,10 @@ class PDF:
  Returns:
  Self for method chaining
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")

  self._exclusions = []
- # Also clear from pages
  for page in self._pages:
  page.clear_exclusions()
  return self
@@ -212,99 +227,75 @@ class PDF:
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

  Args:
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None
  label: Optional label for this exclusion

  Returns:
  Self for method chaining
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")

- # Store exclusion with its label at PDF level
  exclusion_data = (exclusion_func, label)
  self._exclusions.append(exclusion_data)

- # Apply this exclusion to all pages
  for page in self._pages:
- # We pass the original function, Page.add_exclusion handles calling it
  page.add_exclusion(exclusion_func, label=label)

  return self

  def apply_ocr(
  self,
- pages: Optional[Union[Iterable[int], range, slice]] = None,
  engine: Optional[str] = None,
- # --- Common OCR Parameters (Direct Arguments) ---
  languages: Optional[List[str]] = None,
- min_confidence: Optional[float] = None, # Min confidence threshold
+ min_confidence: Optional[float] = None,
  device: Optional[str] = None,
- resolution: Optional[int] = None, # DPI for rendering before OCR
- apply_exclusions: bool = True, # New parameter
+ resolution: Optional[int] = None,
+ apply_exclusions: bool = True,
  detect_only: bool = False,
- # --- Engine-Specific Options --- Use 'options=' for this
- options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
- # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
+ replace: bool = True,
+ options: Optional[Any] = None,
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
  ) -> "PDF":
  """
- Applies OCR to specified pages (or all pages) of the PDF using batch processing.
-
- This method renders the specified pages to images, sends them as a batch
- to the OCRManager, and adds the resulting TextElements to each respective page.
+ Applies OCR to specified pages of the PDF using batch processing.

  Args:
- pages: An iterable of 0-based page indices (list, range, tuple),
- a slice object, or None to process all pages.
- engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
- Uses manager's default ('easyocr') if None.
- languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
- **Must be codes understood by the specific selected engine.**
- No mapping is performed. Overrides manager/engine default.
- min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
- Overrides manager/engine default.
- device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
- Overrides manager/engine default.
- resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
- Affects input quality for OCR. Defaults to 150 if not set.
- apply_exclusions: If True (default), render page image for OCR with
- excluded areas masked (whited out). If False, OCR
- the raw page image without masking exclusions.
- detect_only: If True, only detect text bounding boxes, don't perform OCR.
- options: An engine-specific options object (e.g., EasyOCROptions) or dict
- containing parameters specific to the chosen engine.
+ engine: Name of the OCR engine
+ languages: List of language codes
+ min_confidence: Minimum confidence threshold
+ device: Device to run OCR on
+ resolution: DPI resolution for page images
+ apply_exclusions: Whether to mask excluded areas
+ detect_only: If True, only detect text boxes
+ replace: Whether to replace existing OCR elements
+ options: Engine-specific options
+ pages: Page indices to process or None for all pages

  Returns:
- Self for method chaining.
-
- Raises:
- ValueError: If page indices are invalid.
- TypeError: If 'options' is not compatible with the engine.
- RuntimeError: If the OCRManager or selected engine is not available.
+ Self for method chaining
  """
  if not self._ocr_manager:
  logger.error("OCRManager not available. Cannot apply OCR.")
- # Or raise RuntimeError("OCRManager not initialized.")
  return self

- # --- Determine Target Pages (unchanged) ---
- target_pages: List[Page] = []
+ thread_id = threading.current_thread().name
+ logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
+
+ target_pages = []
  if pages is None:
  target_pages = self._pages
  elif isinstance(pages, slice):
  target_pages = self._pages[pages]
- elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
+ elif hasattr(pages, "__iter__"):
  try:
  target_pages = [self._pages[i] for i in pages]
  except IndexError:
  raise ValueError("Invalid page index provided in 'pages' iterable.")
  except TypeError:
- raise TypeError(
- "'pages' must be None, a slice, or an iterable of page indices (int)."
- )
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
  else:
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")

  if not target_pages:
  logger.warning("No pages selected for OCR processing.")
@@ -312,24 +303,20 @@ class PDF:

  page_numbers = [p.number for p in target_pages]
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
- # --- Determine Rendering Resolution ---
- # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
- final_resolution = resolution # Use direct arg if provided
- if final_resolution is None:
- final_resolution = getattr(self, "_config", {}).get("resolution", 150)
-
- logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
-
- # --- Render Images for Batch ---
- images_pil: List[Image.Image] = []
- page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
- logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
- failed_page_num = "unknown" # Keep track of potentially failing page
+
+ final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
+ logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
+
+ images_pil = []
+ page_image_map = []
+ logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
+ failed_page_num = "unknown"
+ render_start_time = time.monotonic()
+
  try:
- for i, page in enumerate(target_pages):
- failed_page_num = page.number # Update current page number in case of error
+ for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
+ failed_page_num = page.number
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
- # Use the determined final_resolution and apply exclusions if requested
  to_image_kwargs = {
  "resolution": final_resolution,
  "include_highlights": False,
@@ -338,66 +325,64 @@ class PDF:
  img = page.to_image(**to_image_kwargs)
  if img is None:
  logger.error(f" Failed to render page {page.number} to image.")
- # Decide how to handle: skip page, raise error? For now, skip.
- continue # Skip this page if rendering failed
+ continue
  images_pil.append(img)
- page_image_map.append((page, img)) # Store pair
+ page_image_map.append((page, img))
  except Exception as e:
- logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
+ logger.error(f"Failed to render pages for batch OCR: {e}")
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
+
+ render_end_time = time.monotonic()
+ logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")

  if not images_pil or not page_image_map:
  logger.error("No images were successfully rendered for batch OCR.")
  return self

- # --- Prepare Arguments for Manager ---
- # Pass common args directly, engine-specific via options
  manager_args = {
  "images": images_pil,
  "engine": engine,
  "languages": languages,
- "min_confidence": min_confidence, # Use the renamed parameter
+ "min_confidence": min_confidence,
  "device": device,
  "options": options,
  "detect_only": detect_only,
- # Note: resolution is used for rendering, not passed to OCR manager directly
  }
- # Filter out None values so manager can use its defaults
  manager_args = {k: v for k, v in manager_args.items() if v is not None}

- # --- Call OCR Manager for Batch Processing ---
- logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
+ ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
+ logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
+ ocr_start_time = time.monotonic()
+
  try:
- # Manager's apply_ocr signature needs to accept common args directly
  batch_results = self._ocr_manager.apply_ocr(**manager_args)

  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
- logger.error(
- f"OCR Manager returned unexpected result format or length for batch processing. "
- f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
- f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
- )
+ logger.error(f"OCR Manager returned unexpected result format or length.")
  return self

  logger.info("OCR Manager batch processing complete.")
-
  except Exception as e:
- logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
+ logger.error(f"Batch OCR processing failed: {e}")
  return self
+
+ ocr_end_time = time.monotonic()
+ logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")

- # --- Distribute Results and Add Elements to Pages (unchanged) ---
  logger.info("Adding OCR results to respective pages...")
  total_elements_added = 0
+
  for i, (page, img) in enumerate(page_image_map):
  results_for_page = batch_results[i]
  if not isinstance(results_for_page, list):
- logger.warning(
- f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
- )
+ logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
  continue

  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
  try:
+ if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
+ page._element_mgr.remove_ocr_elements()
+
  img_scale_x = page.width / img.width if img.width > 0 else 1
  img_scale_y = page.height / img.height if img.height > 0 else 1
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -410,53 +395,39 @@ class PDF:
  else:
  logger.debug(f" No valid TextElements created for page {page.number}.")
  except Exception as e:
- logger.error(
- f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
- )
+ logger.error(f" Error adding OCR elements to page {page.number}: {e}")

- logger.info(
- f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
- )
+ logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
  return self

  def add_region(
  self, region_func: Callable[["Page"], Optional[Region]], name: str = None
  ) -> "PDF":
  """
- Add a region function to the PDF. This creates regions on all pages using the provided function.
+ Add a region function to the PDF.

  Args:
- region_func: A function that takes a Page and returns a Region, or None.
+ region_func: A function that takes a Page and returns a Region, or None
  name: Optional name for the region

  Returns:
  Self for method chaining
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")

- # Store region with its name at PDF level
  region_data = (region_func, name)
  self._regions.append(region_data)

- # Apply this region to all pages
  for page in self._pages:
  try:
- # Call the function to get the region for this specific page
  region_instance = region_func(page)
  if region_instance and isinstance(region_instance, Region):
- # If a valid region is returned, add it to the page
  page.add_region(region_instance, name=name, source="named")
  elif region_instance is not None:
- logger.warning(
- f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
- )
+ logger.warning(f"Region function did not return a valid Region for page {page.number}")
  except Exception as e:
- logger.error(
- f"Error executing or adding region function for page {page.number}: {e}",
- exc_info=True,
- )
+ logger.error(f"Error adding region for page {page.number}: {e}")

  return self

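The apply_ocr hunks above change the public signature: the pages argument moves to the end of the parameter list and a replace flag is added. An illustrative call, not part of the diff, assuming "scanned.pdf" exists and the chosen OCR engine is installed:

    from natural_pdf.core.pdf import PDF

    pdf = PDF("scanned.pdf")
    pdf.apply_ocr(
        engine="easyocr",     # 'easyocr', 'paddleocr', or 'surya' per the removed docstring
        languages=["en"],
        min_confidence=0.5,
        resolution=150,       # DPI used when rendering pages before OCR
        replace=True,         # new in 0.1.8: drop existing OCR elements before adding new ones
        pages=range(3),       # first three pages only
    )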
@@ -467,22 +438,19 @@ class PDF:
  Find the first element matching the selector.

  Args:
- selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
- regex: Whether to use regex for text search in :contains (default: False)
- case: Whether to do case-sensitive text search (default: True)
+ selector: CSS-like selector string
+ apply_exclusions: Whether to exclude elements in exclusion regions
+ regex: Whether to use regex for text search
+ case: Whether to do case-sensitive text search
  **kwargs: Additional filter parameters

  Returns:
  Element object or None if not found
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")

  selector_obj = parse_selector(selector)
-
- # Pass regex and case flags to selector function
  kwargs["regex"] = regex
  kwargs["case"] = case

@@ -498,22 +466,19 @@ class PDF:
  Find all elements matching the selector.

  Args:
- selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
- regex: Whether to use regex for text search in :contains (default: False)
- case: Whether to do case-sensitive text search (default: True)
+ selector: CSS-like selector string
+ apply_exclusions: Whether to exclude elements in exclusion regions
+ regex: Whether to use regex for text search
+ case: Whether to do case-sensitive text search
  **kwargs: Additional filter parameters

  Returns:
  ElementCollection with matching elements
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")

  selector_obj = parse_selector(selector)
-
- # Pass regex and case flags to selector function
  kwargs["regex"] = regex
  kwargs["case"] = case

@@ -530,8 +495,8 @@ class PDF:

  Args:
  selector_obj: Parsed selector dictionary
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
- first_only: If True, stop searching after the first match is found.
+ apply_exclusions: Whether to exclude elements in exclusion regions
+ first_only: If True, stop searching after the first match is found
  **kwargs: Additional filter parameters

  Returns:
@@ -539,57 +504,45 @@ class PDF:
  """
  from natural_pdf.elements.collections import ElementCollection

- # Determine page range to search
  page_indices = kwargs.get("pages", range(len(self._pages)))
  if isinstance(page_indices, int):
  page_indices = [page_indices]
  elif isinstance(page_indices, slice):
  page_indices = range(*page_indices.indices(len(self._pages)))

- # Check for cross-page pseudo-classes (currently not supported)
  for pseudo in selector_obj.get("pseudo_classes", []):
  if pseudo.get("name") in ("spans", "continues"):
  logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
  return ElementCollection([])

- # Regular case: collect elements from each page
  all_elements = []
  for page_idx in page_indices:
  if 0 <= page_idx < len(self._pages):
  page = self._pages[page_idx]
- # Pass first_only down to page._apply_selector
  page_elements_collection = page._apply_selector(
  selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
  )
  if page_elements_collection:
  page_elements = page_elements_collection.elements
  all_elements.extend(page_elements)
- # If we only need the first match overall, and we found one on this page, stop
  if first_only and page_elements:
- break # Stop iterating through pages
+ break
  else:
  logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")

- # Create a combined collection
  combined = ElementCollection(all_elements)

- # Sort in document order if requested and not first_only (already sorted by page)
  if not first_only and kwargs.get("document_order", True):
- # Check if elements have page, top, x0 before sorting
  if all(
  hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
  for el in combined.elements
  ):
  combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
  else:
- # Elements might be Regions without inherent sorting order yet
- # Attempt sorting by page index if possible
  try:
  combined.sort(key=lambda el: el.page.index)
  except AttributeError:
- logger.warning(
- "Cannot sort elements in document order: Missing required attributes (e.g., page)."
- )
+ logger.warning("Cannot sort elements in document order: Missing required attributes.")

  return combined

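For context on the trimmed find/find_all docstrings above, a short sketch that reuses the selector examples removed from those docstrings (illustrative only, with pdf a PDF instance as in the earlier sketch):

    title = pdf.find('text:contains("Annual Report")', case=True)
    red_text = pdf.find_all('text[color=(1,0,0)]', apply_exclusions=True)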
@@ -606,24 +559,21 @@ class PDF:

  Args:
  selector: Optional selector to filter elements
- preserve_whitespace: Whether to keep blank characters (default: True)
- use_exclusions: Whether to apply exclusion regions (default: True)
- debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
+ preserve_whitespace: Whether to keep blank characters
+ use_exclusions: Whether to apply exclusion regions
+ debug_exclusions: Whether to output detailed debugging for exclusions
  **kwargs: Additional extraction parameters

  Returns:
  Extracted text as string
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")

- # If selector is provided, find elements first
  if selector:
  elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
  return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

- # Otherwise extract from all pages
  if debug_exclusions:
  print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
  print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
@@ -644,25 +594,6 @@ class PDF:

  return "\n".join(texts)

- def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
- """
- Shorthand for finding elements and extracting their text.
-
- Args:
- selector: CSS-like selector string
- preserve_whitespace: Whether to keep blank characters (default: True)
- **kwargs: Additional extraction parameters
-
- Returns:
- Extracted text from matching elements
- """
- # Ensure _pages is initialized
- if not hasattr(self, "_pages"):
- raise AttributeError("PDF pages not yet initialized.")
- return self.extract_text(
- selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
- ) # apply_exclusions is handled by find_all in extract_text
-
  def extract_tables(
  self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
  ) -> List[Any]:
@@ -677,54 +608,43 @@ class PDF:
  Returns:
  List of extracted tables
  """
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not yet initialized.")
- # TODO: Implement table extraction
+
  logger.warning("PDF.extract_tables is not fully implemented yet.")
  all_tables = []
+
  for page in self.pages:
- # Assuming page.extract_tables(**kwargs) exists or is added
  if hasattr(page, "extract_tables"):
  all_tables.extend(page.extract_tables(**kwargs))
  else:
  logger.debug(f"Page {page.number} does not have extract_tables method.")
- # Placeholder filtering
+
  if selector:
  logger.warning("Filtering extracted tables by selector is not implemented.")
- # Would need to parse selector and filter the list `all_tables`
- # Placeholder merging
+
  if merge_across_pages:
  logger.warning("Merging tables across pages is not implemented.")
- # Would need logic to detect and merge related tables
+
  return all_tables

- # --- New Method: save_searchable ---
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
  """
  Saves the PDF with an OCR text layer, making content searchable.

  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"

- Note: OCR must have been applied to the pages beforehand
- (e.g., using pdf.apply_ocr()).
-
  Args:
- output_path: Path to save the searchable PDF.
- dpi: Resolution for rendering and OCR overlay (default 300).
- **kwargs: Additional keyword arguments passed to the exporter.
+ output_path: Path to save the searchable PDF
+ dpi: Resolution for rendering and OCR overlay
+ **kwargs: Additional keyword arguments passed to the exporter
  """
- # Import moved here, assuming it's always available now
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf

- # Convert pathlib.Path to string if necessary
  output_path_str = str(output_path)
-
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
  logger.info(f"Searchable PDF saved to: {output_path_str}")

- # --- End New Method ---
-
  def ask(
  self,
  question: str,
@@ -746,27 +666,21 @@ class PDF:
  **kwargs: Additional parameters passed to the QA engine

  Returns:
- A dictionary containing the answer, confidence, and other metadata.
- Result will have an 'answer' key containing the answer text.
+ A dictionary containing the answer, confidence, and other metadata
  """
  from natural_pdf.qa import get_qa_engine

- # Initialize or get QA engine
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)

- # Determine which pages to query
  if pages is None:
  target_pages = list(range(len(self.pages)))
  elif isinstance(pages, int):
- # Single page
  target_pages = [pages]
  elif isinstance(pages, (list, range)):
- # List or range of pages
  target_pages = pages
  else:
  raise ValueError(f"Invalid pages parameter: {pages}")

- # Actually query each page and gather results
  results = []
  for page_idx in target_pages:
  if 0 <= page_idx < len(self.pages):
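A brief sketch of the ask API shown in this hunk (illustrative; the result keys come from the default return dictionary in the next hunk):

    result = pdf.ask("What is the total amount due?", min_confidence=0.1)
    if result.get("found"):
        print(result["answer"], result["confidence"], result["page_num"])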
@@ -775,208 +689,148 @@ class PDF:
  page=page, question=question, min_confidence=min_confidence, **kwargs
  )

- # Add to results if it found an answer
  if page_result and page_result.get("found", False):
  results.append(page_result)

- # Sort results by confidence
  results.sort(key=lambda x: x.get("confidence", 0), reverse=True)

- # Return the best result, or a default result if none found
  if results:
  return results[0]
  else:
- # Return a structure indicating no answer found
  return {
  "answer": None,
  "confidence": 0.0,
  "found": False,
- "page_num": None, # Or maybe the pages searched?
+ "page_num": None,
  "source_elements": [],
  }

  def search_within_index(
  self,
  query: Union[str, Path, Image.Image, Region],
- search_service: SearchServiceProtocol, # Now required
+ search_service: SearchServiceProtocol,
  options: Optional[SearchOptions] = None,
  ) -> List[Dict[str, Any]]:
  """
- Finds relevant documents specifically originating from THIS PDF document
- within a search index managed by the provided SearchService.
-
- This method uses a pre-configured SearchService instance and adds
- a filter to the search query to scope results only to pages from
- this specific PDF object (based on its resolved path).
+ Finds relevant documents from this PDF within a search index.

  Args:
- query: The search query (text, image path, PIL Image, Region).
- search_service: A pre-configured SearchService instance pointing to the
- index where this PDF's content (or related content)
- is expected to be found.
- options: Optional SearchOptions to configure the query (top_k, filters, etc.).
- Any existing filters in `options` will be combined with the
- PDF-scoping filter using an 'AND' condition.
+ query: The search query (text, image path, PIL Image, Region)
+ search_service: A pre-configured SearchService instance
+ options: Optional SearchOptions to configure the query

  Returns:
- A list of result dictionaries, sorted by relevance, containing only
- results originating from this PDF's pages.
+ A list of result dictionaries, sorted by relevance

  Raises:
- ImportError: If search dependencies are not installed.
- ValueError: If search_service is None.
- TypeError: If search_service does not conform to the protocol.
- FileNotFoundError: If the collection managed by the service does not exist.
- RuntimeError: For other search failures.
+ ImportError: If search dependencies are not installed
+ ValueError: If search_service is None
+ TypeError: If search_service does not conform to the protocol
+ FileNotFoundError: If the collection managed by the service does not exist
+ RuntimeError: For other search failures
  """
  if not search_service:
  raise ValueError("A configured SearchServiceProtocol instance must be provided.")
- # Optional stricter check:
- # if not isinstance(search_service, SearchServiceProtocol):
- # raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")

- # Get collection name from service for logging
  collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
- logger.info(
- f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
- )
+ logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
+
+ service = search_service

- # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
- # service: SearchServiceProtocol
- # if search_service:
- # service = search_service
- # else:
- # logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
- # factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
- # # TODO: Pass embedding model from options/pdf config if needed?
- # service = get_search_service(**factory_args)
- service = search_service # Use validated provided service
-
- # --- 2. Prepare Query and Options ---
  query_input = query
- # Resolve options (use default TextSearch if none provided)
  effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()

- # Handle Region query - extract text for now
  if isinstance(query, Region):
  logger.debug("Query is a Region object. Extracting text.")
  if not isinstance(effective_options, TextSearchOptions):
- logger.warning(
- "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
- )
+ logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
  query_input = query.extract_text()
  if not query_input or query_input.isspace():
  logger.error("Region has no extractable text for query.")
  return []

- # --- 3. Add Filter to Scope Search to THIS PDF ---
- # Assume metadata field 'pdf_path' stores the resolved path used during indexing
+ # Add filter to scope search to THIS PDF
  pdf_scope_filter = {
- "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
+ "field": "pdf_path",
  "operator": "eq",
- "value": self.path, # Use the resolved path of this PDF instance
+ "value": self.path,
  }
  logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")

  # Combine with existing filters in options (if any)
  if effective_options.filters:
- logger.debug(
- f"Combining PDF scope filter with existing filters: {effective_options.filters}"
- )
- # Assume filters are compatible with the underlying search service
- # If existing filters aren't already in an AND block, wrap them
- if (
- isinstance(effective_options.filters, dict)
- and effective_options.filters.get("operator") == "AND"
- ):
- # Already an AND block, just append the condition
+ logger.debug(f"Combining PDF scope filter with existing filters")
+ if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
  effective_options.filters["conditions"].append(pdf_scope_filter)
  elif isinstance(effective_options.filters, list):
- # Assume list represents implicit AND conditions
  effective_options.filters = {
  "operator": "AND",
  "conditions": effective_options.filters + [pdf_scope_filter],
  }
- elif isinstance(effective_options.filters, dict): # Single filter dict
+ elif isinstance(effective_options.filters, dict):
  effective_options.filters = {
  "operator": "AND",
  "conditions": [effective_options.filters, pdf_scope_filter],
  }
  else:
- logger.warning(
- f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
- )
+ logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
  effective_options.filters = pdf_scope_filter
  else:
  effective_options.filters = pdf_scope_filter

  logger.debug(f"Final filters for service search: {effective_options.filters}")

- # --- 4. Call SearchService ---
  try:
- # Call the service's search method (no collection_name needed)
  results = service.search(
  query=query_input,
  options=effective_options,
  )
- logger.info(
- f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
- )
+ logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
  return results
  except FileNotFoundError as fnf:
- logger.error(
- f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
- )
- raise # Re-raise specific error
+ logger.error(f"Search failed: Collection not found. Error: {fnf}")
+ raise
  except Exception as e:
- logger.error(
- f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
- exc_info=True,
- )
- raise RuntimeError(
- f"Search within index failed for PDF '{self.path}'. See logs for details."
- ) from e
+ logger.error(f"SearchService search failed: {e}")
+ raise RuntimeError(f"Search within index failed. See logs for details.") from e

  def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
  """
- Exports OCR results from this PDF into a correction task package (zip file).
+ Exports OCR results from this PDF into a correction task package.

  Args:
- output_zip_path: The path to save the output zip file.
+ output_zip_path: The path to save the output zip file
  **kwargs: Additional arguments passed to create_correction_task_package
- (e.g., image_render_scale, overwrite).
  """
  try:
  from natural_pdf.utils.packaging import create_correction_task_package
  create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
  except ImportError:
  logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
- # Or raise
  except Exception as e:
- logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
- raise # Re-raise the exception from the utility function
+ logger.error(f"Failed to export correction task: {e}")
+ raise

  def correct_ocr(
  self,
  correction_callback: Callable[[Any], Optional[str]],
  pages: Optional[Union[Iterable[int], range, slice]] = None,
- ) -> "PDF": # Return self for chaining
+ max_workers: Optional[int] = None,
+ progress_callback: Optional[Callable[[], None]] = None,
+ ) -> "PDF":
  """
- Applies corrections to OCR-generated text elements using a callback function,
- delegating the core work to the `Page.correct_ocr` method.
+ Applies corrections to OCR text elements using a callback function.

  Args:
- correction_callback: A function that accepts a single argument (an element
- object) and returns `Optional[str]`. It returns the
- corrected text string if an update is needed, otherwise None.
+ correction_callback: Function that takes an element and returns corrected text or None
  pages: Optional page indices/slice to limit the scope of correction
- (default: all pages).
+ max_workers: Maximum number of threads to use for parallel execution
+ progress_callback: Optional callback function for progress updates

  Returns:
- Self for method chaining.
+ Self for method chaining
  """
- # Determine target pages
- target_page_indices: List[int] = []
+ target_page_indices = []
  if pages is None:
  target_page_indices = list(range(len(self._pages)))
  elif isinstance(pages, slice):
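A sketch of search_within_index as simplified above; the factory arguments are assumptions based on the commented-out code that was removed, and the search extra must be installed:

    from natural_pdf.search import get_search_service, TextSearchOptions

    service = get_search_service(collection_name="my_docs", persist=False)  # names illustrative
    hits = pdf.search_within_index("annual revenue", search_service=service,
                                   options=TextSearchOptions())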
@@ -984,52 +838,49 @@ class PDF:
  elif hasattr(pages, "__iter__"):
  try:
  target_page_indices = [int(i) for i in pages]
- # Validate indices
  for idx in target_page_indices:
  if not (0 <= idx < len(self._pages)):
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
  except (IndexError, TypeError, ValueError) as e:
- raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
+ raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
  else:
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")

  if not target_page_indices:
  logger.warning("No pages selected for OCR correction.")
  return self

- logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
+ logger.info(f"Starting OCR correction for pages: {target_page_indices}")

- # Iterate through target pages and call their correct_ocr method
  for page_idx in target_page_indices:
  page = self._pages[page_idx]
  try:
- page.correct_ocr(correction_callback=correction_callback)
+ page.correct_ocr(
+ correction_callback=correction_callback,
+ max_workers=max_workers,
+ progress_callback=progress_callback,
+ )
  except Exception as e:
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
- # Optionally re-raise or just log and continue
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}")

- logger.info(f"OCR correction process finished for requested pages.")
+ logger.info("OCR correction process finished.")
  return self

  def __len__(self) -> int:
  """Return the number of pages in the PDF."""
- # Ensure _pages is initialized
  if not hasattr(self, "_pages"):
- # Return 0 or raise error if not fully initialized? Let's return 0.
  return 0
  return len(self._pages)

- def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
+ def __getitem__(self, key) -> Union[Page, "PageCollection"]:
  """Access pages by index or slice."""
- # Check if self._pages has been initialized
  if not hasattr(self, "_pages"):
  raise AttributeError("PDF pages not initialized yet.")
+
  if isinstance(key, slice):
- # Return a PageCollection slice
  from natural_pdf.elements.collections import PageCollection
-
  return PageCollection(self._pages[key])
- # Check index bounds before accessing
+
  if isinstance(key, int):
  if 0 <= key < len(self._pages):
  return self._pages[key]
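correct_ocr now accepts max_workers and progress_callback and forwards them to Page.correct_ocr. A minimal callback sketch, assuming OCR text elements expose a .text attribute:

    def fix_text(el):
        corrected = el.text.replace("0ffice", "Office")
        return corrected if corrected != el.text else None   # None leaves the element unchanged

    pdf.correct_ocr(fix_text, pages=range(5), max_workers=4)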
@@ -1043,13 +894,12 @@ class PDF:
  if hasattr(self, "_pdf") and self._pdf is not None:
  try:
  self._pdf.close()
- logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
+ logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
  except Exception as e:
  logger.warning(f"Error closing pdfplumber object: {e}")
  finally:
  self._pdf = None

- # Clean up temporary file if it exists
  if hasattr(self, "_temp_file") and self._temp_file is not None:
  temp_file_path = None
  try:
@@ -1059,7 +909,7 @@ class PDF:
  os.unlink(temp_file_path)
  logger.debug(f"Removed temporary PDF file: {temp_file_path}")
  except Exception as e:
- logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
+ logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
  finally:
  self._temp_file = None

@@ -1071,8 +921,176 @@ class PDF:
  """Context manager exit."""
  self.close()

-
- # --- Indexable Protocol Methods --- Needed for search/sync
  def get_id(self) -> str:
+ """Get unique identifier for this PDF."""
  return self.path

+ # --- Classification Methods --- #
+
+ def classify_pages(
+ self,
+ categories: List[str],
+ model: Optional[str] = None,
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
+ analysis_key: str = "classification",
+ using: Optional[str] = None,
+ **kwargs,
+ ) -> "PDF":
+ """
+ Classifies specified pages of the PDF.
+
+ Args:
+ categories: List of category names
+ model: Model identifier ('text', 'vision', or specific HF ID)
+ pages: Page indices, slice, or None for all pages
+ analysis_key: Key to store results in page's analyses dict
+ using: Processing mode ('text' or 'vision')
+ **kwargs: Additional arguments for the ClassificationManager
+
+ Returns:
+ Self for method chaining
+ """
+ if not categories:
+ raise ValueError("Categories list cannot be empty.")
+
+ try:
+ manager = self.get_manager('classification')
+ except (ValueError, RuntimeError) as e:
+ raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
+
+ if not manager or not manager.is_available():
+ try:
+ from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
+ if not _CLASSIFICATION_AVAILABLE:
+ raise ImportError("Classification dependencies missing.")
+ except ImportError:
+ raise ImportError(
+ "Classification dependencies missing. "
+ "Install with: pip install \"natural-pdf[classification]\""
+ )
+ raise ClassificationError("ClassificationManager not available.")
+
+ target_pages = []
+ if pages is None:
+ target_pages = self._pages
+ elif isinstance(pages, slice):
+ target_pages = self._pages[pages]
+ elif hasattr(pages, "__iter__"):
+ try:
+ target_pages = [self._pages[i] for i in pages]
+ except IndexError:
+ raise ValueError("Invalid page index provided.")
+ except TypeError:
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+ else:
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+
+ if not target_pages:
+ logger.warning("No pages selected for classification.")
+ return self
+
+ inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+ logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
+
+ page_contents = []
+ pages_to_classify = []
+ logger.debug(f"Gathering content for {len(target_pages)} pages...")
+
+ for page in target_pages:
+ try:
+ content = page._get_classification_content(model_type=inferred_using, **kwargs)
+ page_contents.append(content)
+ pages_to_classify.append(page)
+ except ValueError as e:
+ logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
+ except Exception as e:
+ logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
+
+ if not page_contents:
+ logger.warning("No content could be gathered for batch classification.")
+ return self
+
+ logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
+
+ try:
+ batch_results = manager.classify_batch(
+ item_contents=page_contents,
+ categories=categories,
+ model_id=model,
+ using=inferred_using,
+ **kwargs,
+ )
+ except Exception as e:
+ logger.error(f"Batch classification failed: {e}")
+ raise ClassificationError(f"Batch classification failed: {e}") from e
+
+ if len(batch_results) != len(pages_to_classify):
+ logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
+ return self
+
+ logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
+ for page, result_obj in zip(pages_to_classify, batch_results):
+ try:
+ if not hasattr(page, 'analyses') or page.analyses is None:
+ page.analyses = {}
+ page.analyses[analysis_key] = result_obj
+ except Exception as e:
+ logger.warning(f"Failed to store classification results for page {page.number}: {e}")
+
+ logger.info(f"Finished classifying PDF pages.")
+ return self
+
+ # --- End Classification Methods --- #
+
+ # --- Extraction Support --- #
+ def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
+ """
+ Retrieves the content for the entire PDF.
+
+ Args:
+ using: 'text' or 'vision'
+ **kwargs: Additional arguments passed to extract_text or page.to_image
+
+ Returns:
+ str: Extracted text if using='text'
+ List[PIL.Image.Image]: List of page images if using='vision'
+ None: If content cannot be retrieved
+ """
+ if using == 'text':
+ try:
+ layout = kwargs.pop('layout', True)
+ return self.extract_text(layout=layout, **kwargs)
+ except Exception as e:
+ logger.error(f"Error extracting text from PDF: {e}")
+ return None
+ elif using == 'vision':
+ page_images = []
+ logger.info(f"Rendering {len(self.pages)} pages to images...")
+
+ resolution = kwargs.pop('resolution', 72)
+ include_highlights = kwargs.pop('include_highlights', False)
+ labels = kwargs.pop('labels', False)
+
+ try:
+ for page in tqdm(self.pages, desc="Rendering Pages"):
+ img = page.to_image(
+ resolution=resolution,
+ include_highlights=include_highlights,
+ labels=labels,
+ **kwargs
+ )
+ if img:
+ page_images.append(img)
+ else:
+ logger.warning(f"Failed to render page {page.number}, skipping.")
+ if not page_images:
+ logger.error("Failed to render any pages.")
+ return None
+ return page_images
+ except Exception as e:
+ logger.error(f"Error rendering pages: {e}")
+ return None
+ else:
+ logger.error(f"Unsupported value for 'using': {using}")
+ return None
+ # --- End Extraction Support --- #