natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,11 +1,12 @@
1
1
  import copy
2
+ import io
2
3
  import logging
3
4
  import os
4
5
  import re
5
6
  import tempfile
6
- import urllib.request
7
- import time
8
7
  import threading
8
+ import time
9
+ import urllib.request
9
10
  from pathlib import Path
10
11
  from typing import (
11
12
  TYPE_CHECKING,
@@ -18,38 +19,35 @@ from typing import (
18
19
  Tuple,
19
20
  Type,
20
21
  Union,
22
+ overload,
21
23
  )
22
- from natural_pdf.utils.tqdm_utils import get_tqdm
23
24
 
24
25
  import pdfplumber
25
26
  from PIL import Image
26
27
 
27
28
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
29
+ from natural_pdf.classification.manager import ClassificationError, ClassificationManager
30
+ from natural_pdf.classification.mixin import ClassificationMixin
31
+ from natural_pdf.classification.results import ClassificationResult
28
32
  from natural_pdf.core.highlighting_service import HighlightingService
29
- from natural_pdf.core.page import Page
30
- from natural_pdf.elements.collections import ElementCollection
33
+ from natural_pdf.elements.base import Element
31
34
  from natural_pdf.elements.region import Region
35
+ from natural_pdf.export.mixin import ExportMixin
36
+ from natural_pdf.extraction.manager import StructuredDataManager
37
+ from natural_pdf.extraction.mixin import ExtractionMixin
32
38
  from natural_pdf.ocr import OCRManager, OCROptions
33
39
  from natural_pdf.selectors.parser import parse_selector
34
-
35
- from natural_pdf.classification.manager import ClassificationManager
36
- from natural_pdf.classification.manager import ClassificationError
37
- from natural_pdf.classification.results import ClassificationResult
38
- from natural_pdf.extraction.manager import StructuredDataManager
39
-
40
40
  from natural_pdf.utils.locks import pdf_render_lock
41
- from natural_pdf.elements.base import Element
42
- from natural_pdf.classification.mixin import ClassificationMixin
43
- from natural_pdf.extraction.mixin import ExtractionMixin
41
+ from natural_pdf.utils.tqdm_utils import get_tqdm
44
42
 
45
43
  try:
46
44
  from typing import Any as TypingAny
47
45
 
48
- from natural_pdf.search import TextSearchOptions
49
46
  from natural_pdf.search import (
50
47
  BaseSearchOptions,
51
48
  SearchOptions,
52
49
  SearchServiceProtocol,
50
+ TextSearchOptions,
53
51
  get_search_service,
54
52
  )
55
53
  except ImportError:
@@ -62,6 +60,7 @@ except ImportError:
62
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
63
61
  )
64
62
 
63
+
65
64
  logger = logging.getLogger("natural_pdf.core.pdf")
66
65
  tqdm = get_tqdm()
67
66
 
@@ -70,7 +69,22 @@ DEFAULT_MANAGERS = {
70
69
  "structured_data": StructuredDataManager,
71
70
  }
72
71
 
73
- class PDF(ExtractionMixin):
72
+ # Deskew Imports (Conditional)
73
+ import numpy as np
74
+ from PIL import Image
75
+
76
+ try:
77
+ import img2pdf
78
+ from deskew import determine_skew
79
+
80
+ DESKEW_AVAILABLE = True
81
+ except ImportError:
82
+ DESKEW_AVAILABLE = False
83
+ img2pdf = None
84
+ # End Deskew Imports
85
+
86
+
87
+ class PDF(ExtractionMixin, ExportMixin):
74
88
  """
75
89
  Enhanced PDF wrapper built on top of pdfplumber.
76
90
 
@@ -80,7 +94,7 @@ class PDF(ExtractionMixin):
80
94
 
81
95
  def __init__(
82
96
  self,
83
- path_or_url: str,
97
+ path_or_url_or_stream,
84
98
  reading_order: bool = True,
85
99
  font_attrs: Optional[List[str]] = None,
86
100
  keep_spaces: bool = True,
@@ -89,54 +103,72 @@ class PDF(ExtractionMixin):
89
103
  Initialize the enhanced PDF object.
90
104
 
91
105
  Args:
92
- path_or_url: Path to the PDF file or a URL to a PDF
106
+ path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
93
107
  reading_order: Whether to use natural reading order
94
108
  font_attrs: Font attributes for grouping characters into words
95
109
  keep_spaces: Whether to include spaces in word elements
96
110
  """
97
- is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
98
-
99
- self._original_path = path_or_url
111
+ self._original_path_or_stream = path_or_url_or_stream
100
112
  self._temp_file = None
101
113
  self._resolved_path = None
102
-
103
- if is_url:
104
- logger.info(f"Downloading PDF from URL: {path_or_url}")
105
- try:
106
- self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
107
- with urllib.request.urlopen(path_or_url) as response:
108
- self._temp_file.write(response.read())
109
- self._temp_file.flush()
110
- self._temp_file.close()
111
- self._resolved_path = self._temp_file.name
112
- logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
113
- except Exception as e:
114
- if self._temp_file and hasattr(self._temp_file, "name"):
115
- try:
116
- os.unlink(self._temp_file.name)
117
- except:
118
- pass
119
- logger.error(f"Failed to download PDF from URL: {e}")
120
- raise ValueError(f"Failed to download PDF from URL: {e}")
114
+ self._is_stream = False
115
+ stream_to_open = None
116
+
117
+ if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
118
+ logger.info("Initializing PDF from in-memory stream.")
119
+ self._is_stream = True
120
+ self._resolved_path = None # No resolved file path for streams
121
+ self.source_path = "<stream>" # Identifier for source
122
+ self.path = self.source_path # Use source identifier as path for streams
123
+ stream_to_open = path_or_url_or_stream
124
+ elif isinstance(path_or_url_or_stream, (str, Path)):
125
+ path_or_url = str(path_or_url_or_stream)
126
+ self.source_path = path_or_url # Store original path/URL as source
127
+ is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
128
+
129
+ if is_url:
130
+ logger.info(f"Downloading PDF from URL: {path_or_url}")
131
+ try:
132
+ # Use a context manager for the temporary file
133
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
134
+ self._temp_file = temp_f # Store reference if needed for cleanup
135
+ with urllib.request.urlopen(path_or_url) as response:
136
+ temp_f.write(response.read())
137
+ temp_f.flush()
138
+ self._resolved_path = temp_f.name
139
+ logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
140
+ stream_to_open = self._resolved_path
141
+ except Exception as e:
142
+ if self._temp_file and hasattr(self._temp_file, "name"):
143
+ try:
144
+ os.unlink(self._temp_file.name)
145
+ except: # noqa E722
146
+ pass
147
+ logger.error(f"Failed to download PDF from URL: {e}")
148
+ raise ValueError(f"Failed to download PDF from URL: {e}")
149
+ else:
150
+ self._resolved_path = str(Path(path_or_url).resolve()) # Resolve local paths
151
+ stream_to_open = self._resolved_path
152
+ self.path = self._resolved_path # Use resolved path for file-based PDFs
121
153
  else:
122
- self._resolved_path = path_or_url
154
+ raise TypeError(
155
+ f"Invalid input type: {type(path_or_url_or_stream)}. "
156
+ f"Expected path (str/Path), URL (str), or file-like object."
157
+ )
123
158
 
124
- logger.info(f"Initializing PDF from {self._resolved_path}")
159
+ logger.info(f"Opening PDF source: {self.source_path}")
125
160
  logger.debug(
126
161
  f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
127
162
  )
128
163
 
129
164
  try:
130
- self._pdf = pdfplumber.open(self._resolved_path)
165
+ self._pdf = pdfplumber.open(stream_to_open)
131
166
  except Exception as e:
132
167
  logger.error(f"Failed to open PDF: {e}", exc_info=True)
133
- self.close()
134
- raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
135
-
136
- self._path = self._resolved_path
137
- self.path = self._resolved_path
138
- self.source_path = self._original_path
168
+ self.close() # Attempt cleanup if opening fails
169
+ raise IOError(f"Failed to open PDF source: {self.source_path}") from e
139
170
 
171
+ # Store configuration used for initialization
140
172
  self._reading_order = reading_order
141
173
  self._config = {"keep_spaces": keep_spaces}
142
174
  self._font_attrs = font_attrs
@@ -144,9 +176,11 @@ class PDF(ExtractionMixin):
144
176
  self._ocr_manager = OCRManager() if OCRManager else None
145
177
  self._layout_manager = LayoutManager() if LayoutManager else None
146
178
  self.highlighter = HighlightingService(self)
147
- self._classification_manager_instance = ClassificationManager()
179
+ # self._classification_manager_instance = ClassificationManager() # Removed this line
148
180
  self._manager_registry = {}
149
181
 
182
+ from natural_pdf.core.page import Page
183
+
150
184
  self._pages = [
151
185
  Page(p, parent=self, index=i, font_attrs=font_attrs)
152
186
  for i, p in enumerate(self._pdf.pages)
@@ -175,16 +209,20 @@ class PDF(ExtractionMixin):
175
209
  def get_manager(self, key: str) -> Any:
176
210
  """Retrieve a manager instance by its key."""
177
211
  if key not in self._managers:
178
- raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
179
-
212
+ raise KeyError(
213
+ f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
214
+ )
215
+
180
216
  manager_instance = self._managers.get(key)
181
-
217
+
182
218
  if manager_instance is None:
183
- manager_class = DEFAULT_MANAGERS.get(key)
184
- if manager_class:
185
- raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
186
- else:
187
- raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
219
+ manager_class = DEFAULT_MANAGERS.get(key)
220
+ if manager_class:
221
+ raise RuntimeError(
222
+ f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
223
+ )
224
+ else:
225
+ raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
188
226
 
189
227
  return manager_instance
190
228
 
@@ -227,6 +265,7 @@ class PDF(ExtractionMixin):
227
265
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
228
266
 
229
267
  Args:
268
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None
230
269
  exclusion_func: A function that takes a Page and returns a Region to exclude, or None
231
270
  label: Optional label for this exclusion
232
271
 
@@ -259,11 +298,22 @@ class PDF(ExtractionMixin):
259
298
  ) -> "PDF":
260
299
  """
261
300
  Applies OCR to specified pages of the PDF using batch processing.
301
+ Applies OCR to specified pages of the PDF using batch processing.
262
302
 
263
303
  Args:
264
304
  engine: Name of the OCR engine
265
305
  languages: List of language codes
266
- min_confidence: Minimum confidence threshold
306
+ min_confidence: Minimum confidence threshold
307
+ device: Device to run OCR on
308
+ resolution: DPI resolution for page images
309
+ apply_exclusions: Whether to mask excluded areas
310
+ detect_only: If True, only detect text boxes
311
+ replace: Whether to replace existing OCR elements
312
+ options: Engine-specific options
313
+ pages: Page indices to process or None for all pages
314
+ engine: Name of the OCR engine
315
+ languages: List of language codes
316
+ min_confidence: Minimum confidence threshold
267
317
  device: Device to run OCR on
268
318
  resolution: DPI resolution for page images
269
319
  apply_exclusions: Whether to mask excluded areas
@@ -274,6 +324,7 @@ class PDF(ExtractionMixin):
274
324
 
275
325
  Returns:
276
326
  Self for method chaining
327
+ Self for method chaining
277
328
  """
278
329
  if not self._ocr_manager:
279
330
  logger.error("OCRManager not available. Cannot apply OCR.")
@@ -281,7 +332,9 @@ class PDF(ExtractionMixin):
281
332
 
282
333
  thread_id = threading.current_thread().name
283
334
  logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
284
-
335
+
336
+ target_pages = []
337
+
285
338
  target_pages = []
286
339
  if pages is None:
287
340
  target_pages = self._pages
@@ -303,7 +356,7 @@ class PDF(ExtractionMixin):
303
356
 
304
357
  page_numbers = [p.number for p in target_pages]
305
358
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
306
-
359
+
307
360
  final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
308
361
  logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
309
362
 
@@ -312,7 +365,7 @@ class PDF(ExtractionMixin):
312
365
  logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
313
366
  failed_page_num = "unknown"
314
367
  render_start_time = time.monotonic()
315
-
368
+
316
369
  try:
317
370
  for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
318
371
  failed_page_num = page.number
@@ -326,14 +379,21 @@ class PDF(ExtractionMixin):
326
379
  if img is None:
327
380
  logger.error(f" Failed to render page {page.number} to image.")
328
381
  continue
382
+ continue
329
383
  images_pil.append(img)
330
384
  page_image_map.append((page, img))
331
385
  except Exception as e:
386
+ logger.error(f"Failed to render pages for batch OCR: {e}")
332
387
  logger.error(f"Failed to render pages for batch OCR: {e}")
333
388
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
334
-
389
+
335
390
  render_end_time = time.monotonic()
336
- logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
391
+ logger.debug(
392
+ f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
393
+ )
394
+ logger.debug(
395
+ f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
396
+ )
337
397
 
338
398
  if not images_pil or not page_image_map:
339
399
  logger.error("No images were successfully rendered for batch OCR.")
@@ -344,16 +404,18 @@ class PDF(ExtractionMixin):
344
404
  "engine": engine,
345
405
  "languages": languages,
346
406
  "min_confidence": min_confidence,
407
+ "min_confidence": min_confidence,
347
408
  "device": device,
348
409
  "options": options,
349
410
  "detect_only": detect_only,
350
411
  }
351
412
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
352
413
 
353
- ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
414
+ ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
415
+ logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
354
416
  logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
355
417
  ocr_start_time = time.monotonic()
356
-
418
+
357
419
  try:
358
420
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
359
421
 
@@ -365,24 +427,28 @@ class PDF(ExtractionMixin):
365
427
  except Exception as e:
366
428
  logger.error(f"Batch OCR processing failed: {e}")
367
429
  return self
368
-
430
+
369
431
  ocr_end_time = time.monotonic()
370
- logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
432
+ logger.debug(
433
+ f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
434
+ )
371
435
 
372
436
  logger.info("Adding OCR results to respective pages...")
373
437
  total_elements_added = 0
374
-
438
+
375
439
  for i, (page, img) in enumerate(page_image_map):
376
440
  results_for_page = batch_results[i]
377
441
  if not isinstance(results_for_page, list):
378
- logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
442
+ logger.warning(
443
+ f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
444
+ )
379
445
  continue
380
446
 
381
447
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
382
448
  try:
383
449
  if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
384
450
  page._element_mgr.remove_ocr_elements()
385
-
451
+
386
452
  img_scale_x = page.width / img.width if img.width > 0 else 1
387
453
  img_scale_y = page.height / img.height if img.height > 0 else 1
388
454
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -407,6 +473,7 @@ class PDF(ExtractionMixin):
407
473
  Add a region function to the PDF.
408
474
 
409
475
  Args:
476
+ region_func: A function that takes a Page and returns a Region, or None
410
477
  region_func: A function that takes a Page and returns a Region, or None
411
478
  name: Optional name for the region
412
479
 
@@ -425,126 +492,194 @@ class PDF(ExtractionMixin):
425
492
  if region_instance and isinstance(region_instance, Region):
426
493
  page.add_region(region_instance, name=name, source="named")
427
494
  elif region_instance is not None:
428
- logger.warning(f"Region function did not return a valid Region for page {page.number}")
495
+ logger.warning(
496
+ f"Region function did not return a valid Region for page {page.number}"
497
+ )
429
498
  except Exception as e:
430
499
  logger.error(f"Error adding region for page {page.number}: {e}")
431
500
 
432
501
  return self
433
502
 
503
+ @overload
504
+ def find(
505
+ self,
506
+ *,
507
+ text: str,
508
+ apply_exclusions: bool = True,
509
+ regex: bool = False,
510
+ case: bool = True,
511
+ **kwargs,
512
+ ) -> Optional[Any]: ...
513
+
514
+ @overload
434
515
  def find(
435
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
516
+ self,
517
+ selector: str,
518
+ *,
519
+ apply_exclusions: bool = True,
520
+ regex: bool = False,
521
+ case: bool = True,
522
+ **kwargs,
523
+ ) -> Optional[Any]: ...
524
+
525
+ def find(
526
+ self,
527
+ selector: Optional[str] = None,
528
+ *,
529
+ text: Optional[str] = None,
530
+ apply_exclusions: bool = True,
531
+ regex: bool = False,
532
+ case: bool = True,
533
+ **kwargs,
436
534
  ) -> Optional[Any]:
437
535
  """
438
- Find the first element matching the selector.
536
+ Find the first element matching the selector OR text content across all pages.
537
+
538
+ Provide EITHER `selector` OR `text`, but not both.
439
539
 
440
540
  Args:
441
- selector: CSS-like selector string
442
- apply_exclusions: Whether to exclude elements in exclusion regions
443
- regex: Whether to use regex for text search
444
- case: Whether to do case-sensitive text search
445
- **kwargs: Additional filter parameters
541
+ selector: CSS-like selector string.
542
+ text: Text content to search for (equivalent to 'text:contains(...)').
543
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
544
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
545
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
546
+ **kwargs: Additional filter parameters.
446
547
 
447
548
  Returns:
448
- Element object or None if not found
549
+ Element object or None if not found.
449
550
  """
450
551
  if not hasattr(self, "_pages"):
451
552
  raise AttributeError("PDF pages not yet initialized.")
452
553
 
453
- selector_obj = parse_selector(selector)
554
+ if selector is not None and text is not None:
555
+ raise ValueError("Provide either 'selector' or 'text', not both.")
556
+ if selector is None and text is None:
557
+ raise ValueError("Provide either 'selector' or 'text'.")
558
+
559
+ # Construct selector if 'text' is provided
560
+ effective_selector = ""
561
+ if text is not None:
562
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
563
+ effective_selector = f'text:contains("{escaped_text}")'
564
+ logger.debug(
565
+ f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
566
+ )
567
+ elif selector is not None:
568
+ effective_selector = selector
569
+ else:
570
+ raise ValueError("Internal error: No selector or text provided.")
571
+
572
+ selector_obj = parse_selector(effective_selector)
454
573
  kwargs["regex"] = regex
455
574
  kwargs["case"] = case
456
575
 
457
- results = self._apply_selector(
458
- selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
459
- )
460
- return results.first if results else None
576
+ # Search page by page
577
+ for page in self.pages:
578
+ # Note: _apply_selector is on Page, so we call find directly here
579
+ # We pass the constructed/validated effective_selector
580
+ element = page.find(
581
+ selector=effective_selector, # Use the processed selector
582
+ apply_exclusions=apply_exclusions,
583
+ regex=regex, # Pass down flags
584
+ case=case,
585
+ **kwargs,
586
+ )
587
+ if element:
588
+ return element
589
+ return None # Not found on any page
461
590
 
591
+ @overload
462
592
  def find_all(
463
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
464
- ) -> ElementCollection:
465
- """
466
- Find all elements matching the selector.
593
+ self,
594
+ *,
595
+ text: str,
596
+ apply_exclusions: bool = True,
597
+ regex: bool = False,
598
+ case: bool = True,
599
+ **kwargs,
600
+ ) -> "ElementCollection": ...
467
601
 
468
- Args:
469
- selector: CSS-like selector string
470
- apply_exclusions: Whether to exclude elements in exclusion regions
471
- regex: Whether to use regex for text search
472
- case: Whether to do case-sensitive text search
473
- **kwargs: Additional filter parameters
602
+ @overload
603
+ def find_all(
604
+ self,
605
+ selector: str,
606
+ *,
607
+ apply_exclusions: bool = True,
608
+ regex: bool = False,
609
+ case: bool = True,
610
+ **kwargs,
611
+ ) -> "ElementCollection": ...
474
612
 
475
- Returns:
476
- ElementCollection with matching elements
613
+ def find_all(
614
+ self,
615
+ selector: Optional[str] = None,
616
+ *,
617
+ text: Optional[str] = None,
618
+ apply_exclusions: bool = True,
619
+ regex: bool = False,
620
+ case: bool = True,
621
+ **kwargs,
622
+ ) -> "ElementCollection":
477
623
  """
478
- if not hasattr(self, "_pages"):
479
- raise AttributeError("PDF pages not yet initialized.")
624
+ Find all elements matching the selector OR text content across all pages.
480
625
 
481
- selector_obj = parse_selector(selector)
482
- kwargs["regex"] = regex
483
- kwargs["case"] = case
484
-
485
- results = self._apply_selector(
486
- selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
487
- )
488
- return results
489
-
490
- def _apply_selector(
491
- self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
492
- ) -> ElementCollection:
493
- """
494
- Apply selector to PDF elements across all pages.
626
+ Provide EITHER `selector` OR `text`, but not both.
495
627
 
496
628
  Args:
497
- selector_obj: Parsed selector dictionary
498
- apply_exclusions: Whether to exclude elements in exclusion regions
499
- first_only: If True, stop searching after the first match is found
500
- **kwargs: Additional filter parameters
629
+ selector: CSS-like selector string.
630
+ text: Text content to search for (equivalent to 'text:contains(...)').
631
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
632
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
633
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
634
+ **kwargs: Additional filter parameters.
501
635
 
502
636
  Returns:
503
- ElementCollection of matching elements
637
+ ElementCollection with matching elements.
504
638
  """
505
- from natural_pdf.elements.collections import ElementCollection
639
+ if not hasattr(self, "_pages"):
640
+ raise AttributeError("PDF pages not yet initialized.")
506
641
 
507
- page_indices = kwargs.get("pages", range(len(self._pages)))
508
- if isinstance(page_indices, int):
509
- page_indices = [page_indices]
510
- elif isinstance(page_indices, slice):
511
- page_indices = range(*page_indices.indices(len(self._pages)))
642
+ if selector is not None and text is not None:
643
+ raise ValueError("Provide either 'selector' or 'text', not both.")
644
+ if selector is None and text is None:
645
+ raise ValueError("Provide either 'selector' or 'text'.")
646
+
647
+ # Construct selector if 'text' is provided
648
+ effective_selector = ""
649
+ if text is not None:
650
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
651
+ effective_selector = f'text:contains("{escaped_text}")'
652
+ logger.debug(
653
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
654
+ )
655
+ elif selector is not None:
656
+ effective_selector = selector
657
+ else:
658
+ raise ValueError("Internal error: No selector or text provided.")
512
659
 
513
- for pseudo in selector_obj.get("pseudo_classes", []):
514
- if pseudo.get("name") in ("spans", "continues"):
515
- logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
516
- return ElementCollection([])
660
+ # Instead of parsing here, let each page parse and apply
661
+ # This avoids parsing the same selector multiple times if not needed
662
+ # selector_obj = parse_selector(effective_selector)
517
663
 
518
- all_elements = []
519
- for page_idx in page_indices:
520
- if 0 <= page_idx < len(self._pages):
521
- page = self._pages[page_idx]
522
- page_elements_collection = page._apply_selector(
523
- selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
524
- )
525
- if page_elements_collection:
526
- page_elements = page_elements_collection.elements
527
- all_elements.extend(page_elements)
528
- if first_only and page_elements:
529
- break
530
- else:
531
- logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
664
+ # kwargs["regex"] = regex # Removed: Already passed explicitly
665
+ # kwargs["case"] = case # Removed: Already passed explicitly
532
666
 
533
- combined = ElementCollection(all_elements)
667
+ all_elements = []
668
+ for page in self.pages:
669
+ # Call page.find_all with the effective selector and flags
670
+ page_elements = page.find_all(
671
+ selector=effective_selector,
672
+ apply_exclusions=apply_exclusions,
673
+ regex=regex,
674
+ case=case,
675
+ **kwargs,
676
+ )
677
+ if page_elements:
678
+ all_elements.extend(page_elements.elements)
534
679
 
535
- if not first_only and kwargs.get("document_order", True):
536
- if all(
537
- hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
538
- for el in combined.elements
539
- ):
540
- combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
541
- else:
542
- try:
543
- combined.sort(key=lambda el: el.page.index)
544
- except AttributeError:
545
- logger.warning("Cannot sort elements in document order: Missing required attributes.")
680
+ from natural_pdf.elements.collections import ElementCollection
546
681
 
547
- return combined
682
+ return ElementCollection(all_elements)
548
683
 
549
684
  def extract_text(
550
685
  self,
@@ -562,6 +697,9 @@ class PDF(ExtractionMixin):
562
697
  preserve_whitespace: Whether to keep blank characters
563
698
  use_exclusions: Whether to apply exclusion regions
564
699
  debug_exclusions: Whether to output detailed debugging for exclusions
700
+ preserve_whitespace: Whether to keep blank characters
701
+ use_exclusions: Whether to apply exclusion regions
702
+ debug_exclusions: Whether to output detailed debugging for exclusions
565
703
  **kwargs: Additional extraction parameters
566
704
 
567
705
  Returns:
@@ -610,22 +748,22 @@ class PDF(ExtractionMixin):
610
748
  """
611
749
  if not hasattr(self, "_pages"):
612
750
  raise AttributeError("PDF pages not yet initialized.")
613
-
751
+
614
752
  logger.warning("PDF.extract_tables is not fully implemented yet.")
615
753
  all_tables = []
616
-
754
+
617
755
  for page in self.pages:
618
756
  if hasattr(page, "extract_tables"):
619
757
  all_tables.extend(page.extract_tables(**kwargs))
620
758
  else:
621
759
  logger.debug(f"Page {page.number} does not have extract_tables method.")
622
-
760
+
623
761
  if selector:
624
762
  logger.warning("Filtering extracted tables by selector is not implemented.")
625
-
763
+
626
764
  if merge_across_pages:
627
765
  logger.warning("Merging tables across pages is not implemented.")
628
-
766
+
629
767
  return all_tables
630
768
 
631
769
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -638,6 +776,9 @@ class PDF(ExtractionMixin):
638
776
  output_path: Path to save the searchable PDF
639
777
  dpi: Resolution for rendering and OCR overlay
640
778
  **kwargs: Additional keyword arguments passed to the exporter
779
+ output_path: Path to save the searchable PDF
780
+ dpi: Resolution for rendering and OCR overlay
781
+ **kwargs: Additional keyword arguments passed to the exporter
641
782
  """
642
783
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
643
784
 
@@ -667,6 +808,7 @@ class PDF(ExtractionMixin):
667
808
 
668
809
  Returns:
669
810
  A dictionary containing the answer, confidence, and other metadata
811
+ A dictionary containing the answer, confidence, and other metadata
670
812
  """
671
813
  from natural_pdf.qa import get_qa_engine
672
814
 
@@ -713,14 +855,19 @@ class PDF(ExtractionMixin):
713
855
  ) -> List[Dict[str, Any]]:
714
856
  """
715
857
  Finds relevant documents from this PDF within a search index.
858
+ Finds relevant documents from this PDF within a search index.
716
859
 
717
860
  Args:
718
861
  query: The search query (text, image path, PIL Image, Region)
719
862
  search_service: A pre-configured SearchService instance
720
863
  options: Optional SearchOptions to configure the query
864
+ query: The search query (text, image path, PIL Image, Region)
865
+ search_service: A pre-configured SearchService instance
866
+ options: Optional SearchOptions to configure the query
721
867
 
722
868
  Returns:
723
869
  A list of result dictionaries, sorted by relevance
870
+ A list of result dictionaries, sorted by relevance
724
871
 
725
872
  Raises:
726
873
  ImportError: If search dependencies are not installed
@@ -728,12 +875,19 @@ class PDF(ExtractionMixin):
728
875
  TypeError: If search_service does not conform to the protocol
729
876
  FileNotFoundError: If the collection managed by the service does not exist
730
877
  RuntimeError: For other search failures
878
+ ImportError: If search dependencies are not installed
879
+ ValueError: If search_service is None
880
+ TypeError: If search_service does not conform to the protocol
881
+ FileNotFoundError: If the collection managed by the service does not exist
882
+ RuntimeError: For other search failures
731
883
  """
732
884
  if not search_service:
733
885
  raise ValueError("A configured SearchServiceProtocol instance must be provided.")
734
886
 
735
887
  collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
736
- logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
888
+ logger.info(
889
+ f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
890
+ )
737
891
 
738
892
  service = search_service
739
893
 
@@ -743,12 +897,15 @@ class PDF(ExtractionMixin):
743
897
  if isinstance(query, Region):
744
898
  logger.debug("Query is a Region object. Extracting text.")
745
899
  if not isinstance(effective_options, TextSearchOptions):
746
- logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
900
+ logger.warning(
901
+ "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
902
+ )
747
903
  query_input = query.extract_text()
748
904
  if not query_input or query_input.isspace():
749
905
  logger.error("Region has no extractable text for query.")
750
906
  return []
751
907
 
908
+ # Add filter to scope search to THIS PDF
752
909
  # Add filter to scope search to THIS PDF
753
910
  pdf_scope_filter = {
754
911
  "field": "pdf_path",
@@ -760,7 +917,10 @@ class PDF(ExtractionMixin):
760
917
  # Combine with existing filters in options (if any)
761
918
  if effective_options.filters:
762
919
  logger.debug(f"Combining PDF scope filter with existing filters")
763
- if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
920
+ if (
921
+ isinstance(effective_options.filters, dict)
922
+ and effective_options.filters.get("operator") == "AND"
923
+ ):
764
924
  effective_options.filters["conditions"].append(pdf_scope_filter)
765
925
  elif isinstance(effective_options.filters, list):
766
926
  effective_options.filters = {
@@ -773,7 +933,9 @@ class PDF(ExtractionMixin):
773
933
  "conditions": [effective_options.filters, pdf_scope_filter],
774
934
  }
775
935
  else:
776
- logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
936
+ logger.warning(
937
+ f"Unsupported format for existing filters. Overwriting with PDF scope filter."
938
+ )
777
939
  effective_options.filters = pdf_scope_filter
778
940
  else:
779
941
  effective_options.filters = pdf_scope_filter
@@ -790,26 +952,40 @@ class PDF(ExtractionMixin):
790
952
  except FileNotFoundError as fnf:
791
953
  logger.error(f"Search failed: Collection not found. Error: {fnf}")
792
954
  raise
955
+ logger.error(f"Search failed: Collection not found. Error: {fnf}")
956
+ raise
793
957
  except Exception as e:
794
958
  logger.error(f"SearchService search failed: {e}")
795
959
  raise RuntimeError(f"Search within index failed. See logs for details.") from e
960
+ logger.error(f"SearchService search failed: {e}")
961
+ raise RuntimeError(f"Search within index failed. See logs for details.") from e
796
962
 
797
963
  def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
798
964
  """
799
965
  Exports OCR results from this PDF into a correction task package.
966
+ Exports OCR results from this PDF into a correction task package.
800
967
 
801
968
  Args:
969
+ output_zip_path: The path to save the output zip file
802
970
  output_zip_path: The path to save the output zip file
803
971
  **kwargs: Additional arguments passed to create_correction_task_package
804
972
  """
805
973
  try:
806
974
  from natural_pdf.utils.packaging import create_correction_task_package
975
+
807
976
  create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
808
977
  except ImportError:
809
- logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
978
+ logger.error(
979
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
980
+ )
981
+ logger.error(
982
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
983
+ )
810
984
  except Exception as e:
811
985
  logger.error(f"Failed to export correction task: {e}")
812
986
  raise
987
+ logger.error(f"Failed to export correction task: {e}")
988
+ raise
813
989
 
814
990
  def correct_ocr(
815
991
  self,
@@ -820,17 +996,23 @@ class PDF(ExtractionMixin):
820
996
  ) -> "PDF":
821
997
  """
822
998
  Applies corrections to OCR text elements using a callback function.
999
+ Applies corrections to OCR text elements using a callback function.
823
1000
 
824
1001
  Args:
1002
+ correction_callback: Function that takes an element and returns corrected text or None
825
1003
  correction_callback: Function that takes an element and returns corrected text or None
826
1004
  pages: Optional page indices/slice to limit the scope of correction
827
1005
  max_workers: Maximum number of threads to use for parallel execution
828
1006
  progress_callback: Optional callback function for progress updates
1007
+ max_workers: Maximum number of threads to use for parallel execution
1008
+ progress_callback: Optional callback function for progress updates
829
1009
 
830
1010
  Returns:
831
1011
  Self for method chaining
1012
+ Self for method chaining
832
1013
  """
833
1014
  target_page_indices = []
1015
+ target_page_indices = []
834
1016
  if pages is None:
835
1017
  target_page_indices = list(range(len(self._pages)))
836
1018
  elif isinstance(pages, slice):
@@ -843,14 +1025,17 @@ class PDF(ExtractionMixin):
843
1025
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
844
1026
  except (IndexError, TypeError, ValueError) as e:
845
1027
  raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1028
+ raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
846
1029
  else:
847
1030
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1031
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
848
1032
 
849
1033
  if not target_page_indices:
850
1034
  logger.warning("No pages selected for OCR correction.")
851
1035
  return self
852
1036
 
853
1037
  logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1038
+ logger.info(f"Starting OCR correction for pages: {target_page_indices}")
854
1039
 
855
1040
  for page_idx in target_page_indices:
856
1041
  page = self._pages[page_idx]
@@ -862,7 +1047,9 @@ class PDF(ExtractionMixin):
862
1047
  )
863
1048
  except Exception as e:
864
1049
  logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1050
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
865
1051
 
1052
+ logger.info("OCR correction process finished.")
866
1053
  logger.info("OCR correction process finished.")
867
1054
  return self
868
1055
 
@@ -872,15 +1059,16 @@ class PDF(ExtractionMixin):
872
1059
  return 0
873
1060
  return len(self._pages)
874
1061
 
875
- def __getitem__(self, key) -> Union[Page, "PageCollection"]:
1062
+ def __getitem__(self, key) -> Union["Page", "PageCollection"]:
876
1063
  """Access pages by index or slice."""
877
1064
  if not hasattr(self, "_pages"):
878
1065
  raise AttributeError("PDF pages not initialized yet.")
879
-
1066
+
880
1067
  if isinstance(key, slice):
881
1068
  from natural_pdf.elements.collections import PageCollection
1069
+
882
1070
  return PageCollection(self._pages[key])
883
-
1071
+
884
1072
  if isinstance(key, int):
885
1073
  if 0 <= key < len(self._pages):
886
1074
  return self._pages[key]
@@ -905,13 +1093,12 @@ class PDF(ExtractionMixin):
905
1093
  try:
906
1094
  if hasattr(self._temp_file, "name") and self._temp_file.name:
907
1095
  temp_file_path = self._temp_file.name
908
- if os.path.exists(temp_file_path):
1096
+ # Only unlink if it exists and _is_stream is False (meaning WE created it)
1097
+ if not self._is_stream and os.path.exists(temp_file_path):
909
1098
  os.unlink(temp_file_path)
910
1099
  logger.debug(f"Removed temporary PDF file: {temp_file_path}")
911
1100
  except Exception as e:
912
1101
  logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
913
- finally:
914
- self._temp_file = None
915
1102
 
916
1103
  def __enter__(self):
917
1104
  """Context manager entry."""
@@ -922,9 +1109,136 @@ class PDF(ExtractionMixin):
922
1109
  self.close()
923
1110
 
924
1111
  def get_id(self) -> str:
1112
+ """Get unique identifier for this PDF."""
925
1113
  """Get unique identifier for this PDF."""
926
1114
  return self.path
927
1115
 
1116
+ # --- Deskew Method --- #
1117
+
1118
def deskew(
    self,
    pages: Optional[Union[Iterable[int], range, slice]] = None,
    resolution: int = 300,
    detection_resolution: int = 72,
    force_overwrite: bool = False,
    **deskew_kwargs,
) -> "PDF":
    """
    Build a new in-memory PDF made of deskewed renderings of the selected pages.

    Each selected page is asked for a deskewed image via `page.deskew(...)`; the
    images are encoded as PNG, merged into one PDF with `img2pdf.convert`, and
    wrapped in a new PDF object that is returned to the caller.

    Important: the returned PDF is built purely from images, so text, OCR results,
    annotations and other elements of the original pages are not carried over.

    Args:
        pages: Page indices/slice to include (0-based). If None, processes all pages.
        resolution: DPI forwarded to `page.deskew` for the output images.
        detection_resolution: DPI forwarded to `page.deskew` for skew detection.
        force_overwrite: If False (default), a ValueError is raised as soon as a
            target page reports existing elements through its element manager.
            Set to True to skip that check.
        **deskew_kwargs: Extra keyword arguments forwarded to `page.deskew`.

    Returns:
        A new PDF object created from the combined image stream, using this PDF's
        reading order, font attributes and `keep_spaces` setting.

    Raises:
        ImportError: If the deskew dependencies are flagged as unavailable.
        ValueError: If `force_overwrite` is False and a target page has elements,
            or if the page selection contains an invalid index.
        TypeError: If `pages` is not a supported selection type.
        RuntimeError: If a page fails while deskewing, or no page yields an image.
        IOError: If combining the images or constructing the new PDF fails.
    """
    if not DESKEW_AVAILABLE:
        raise ImportError(
            "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
        )

    selected_pages = self._get_target_pages(pages)

    # Refuse to proceed when a page already holds elements, unless the caller
    # explicitly opted in: the output is image-only and would not contain them.
    if not force_overwrite:
        for candidate in selected_pages:
            element_mgr = getattr(candidate, "_element_mgr", None)
            if element_mgr and element_mgr.has_elements():
                raise ValueError(
                    f"Page {candidate.number} contains existing elements (text, OCR, etc.). "
                    f"Deskewing creates an image-only PDF, discarding these elements. "
                    f"Set force_overwrite=True to proceed."
                )

    png_payloads = []
    logger.info(f"Deskewing {len(selected_pages)} pages (output resolution={resolution} DPI)...")

    for page in tqdm(selected_pages, desc="Deskewing Pages", leave=False):
        try:
            # angle=None leaves skew detection (and any caching) to page.deskew
            straightened = page.deskew(
                resolution=resolution,
                angle=None,
                detection_resolution=detection_resolution,
                **deskew_kwargs,
            )

            if straightened:
                # PNG keeps the rendered page lossless before it is handed to img2pdf
                with io.BytesIO() as png_buffer:
                    straightened.save(png_buffer, format="PNG")
                    png_payloads.append(png_buffer.getvalue())
            else:
                logger.warning(
                    f"Page {page.number}: Failed to generate deskewed image, skipping."
                )
        except Exception as e:
            logger.error(
                f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
            )
            # One failing page aborts the whole operation
            raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e

    if not png_payloads:
        raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")

    logger.info(f"Combining {len(png_payloads)} deskewed images into in-memory PDF...")
    try:
        combined_stream = io.BytesIO(img2pdf.convert(png_payloads))

        logger.info("Creating new PDF object from deskewed stream...")
        return PDF(
            combined_stream,
            reading_order=self._reading_order,
            font_attrs=self._font_attrs,
            keep_spaces=self._config.get("keep_spaces", True),
        )
    except Exception as e:
        logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
        raise IOError("Failed to create deskewed PDF object from image stream.") from e
+
1240
+ # --- End Deskew Method --- #
1241
+
928
1242
  # --- Classification Methods --- #
929
1243
 
930
1244
  def classify_pages(
@@ -954,19 +1268,20 @@ class PDF(ExtractionMixin):
954
1268
  raise ValueError("Categories list cannot be empty.")
955
1269
 
956
1270
  try:
957
- manager = self.get_manager('classification')
1271
+ manager = self.get_manager("classification")
958
1272
  except (ValueError, RuntimeError) as e:
959
1273
  raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
960
1274
 
961
1275
  if not manager or not manager.is_available():
962
1276
  try:
963
1277
  from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
1278
+
964
1279
  if not _CLASSIFICATION_AVAILABLE:
965
1280
  raise ImportError("Classification dependencies missing.")
966
1281
  except ImportError:
967
1282
  raise ImportError(
968
1283
  "Classification dependencies missing. "
969
- "Install with: pip install \"natural-pdf[classification]\""
1284
+ 'Install with: pip install "natural-pdf[classification]"'
970
1285
  )
971
1286
  raise ClassificationError("ClassificationManager not available.")
972
1287
 
@@ -990,12 +1305,14 @@ class PDF(ExtractionMixin):
990
1305
  return self
991
1306
 
992
1307
  inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
993
- logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
1308
+ logger.info(
1309
+ f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
1310
+ )
994
1311
 
995
1312
  page_contents = []
996
1313
  pages_to_classify = []
997
1314
  logger.debug(f"Gathering content for {len(target_pages)} pages...")
998
-
1315
+
999
1316
  for page in target_pages:
1000
1317
  try:
1001
1318
  content = page._get_classification_content(model_type=inferred_using, **kwargs)
@@ -1009,7 +1326,7 @@ class PDF(ExtractionMixin):
1009
1326
  if not page_contents:
1010
1327
  logger.warning("No content could be gathered for batch classification.")
1011
1328
  return self
1012
-
1329
+
1013
1330
  logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
1014
1331
 
1015
1332
  try:
@@ -1025,17 +1342,23 @@ class PDF(ExtractionMixin):
1025
1342
  raise ClassificationError(f"Batch classification failed: {e}") from e
1026
1343
 
1027
1344
  if len(batch_results) != len(pages_to_classify):
1028
- logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
1345
+ logger.error(
1346
+ f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
1347
+ )
1029
1348
  return self
1030
1349
 
1031
- logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
1350
+ logger.debug(
1351
+ f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
1352
+ )
1032
1353
  for page, result_obj in zip(pages_to_classify, batch_results):
1033
1354
  try:
1034
- if not hasattr(page, 'analyses') or page.analyses is None:
1355
+ if not hasattr(page, "analyses") or page.analyses is None:
1035
1356
  page.analyses = {}
1036
1357
  page.analyses[analysis_key] = result_obj
1037
1358
  except Exception as e:
1038
- logger.warning(f"Failed to store classification results for page {page.number}: {e}")
1359
+ logger.warning(
1360
+ f"Failed to store classification results for page {page.number}: {e}"
1361
+ )
1039
1362
 
1040
1363
  logger.info(f"Finished classifying PDF pages.")
1041
1364
  return self
@@ -1043,7 +1366,7 @@ class PDF(ExtractionMixin):
1043
1366
  # --- End Classification Methods --- #
1044
1367
 
1045
1368
  # --- Extraction Support --- #
1046
- def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
1369
+ def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
1047
1370
  """
1048
1371
  Retrieves the content for the entire PDF.
1049
1372
 
@@ -1056,28 +1379,28 @@ class PDF(ExtractionMixin):
1056
1379
  List[PIL.Image.Image]: List of page images if using='vision'
1057
1380
  None: If content cannot be retrieved
1058
1381
  """
1059
- if using == 'text':
1382
+ if using == "text":
1060
1383
  try:
1061
- layout = kwargs.pop('layout', True)
1384
+ layout = kwargs.pop("layout", True)
1062
1385
  return self.extract_text(layout=layout, **kwargs)
1063
1386
  except Exception as e:
1064
1387
  logger.error(f"Error extracting text from PDF: {e}")
1065
1388
  return None
1066
- elif using == 'vision':
1389
+ elif using == "vision":
1067
1390
  page_images = []
1068
1391
  logger.info(f"Rendering {len(self.pages)} pages to images...")
1069
-
1070
- resolution = kwargs.pop('resolution', 72)
1071
- include_highlights = kwargs.pop('include_highlights', False)
1072
- labels = kwargs.pop('labels', False)
1073
-
1392
+
1393
+ resolution = kwargs.pop("resolution", 72)
1394
+ include_highlights = kwargs.pop("include_highlights", False)
1395
+ labels = kwargs.pop("labels", False)
1396
+
1074
1397
  try:
1075
1398
  for page in tqdm(self.pages, desc="Rendering Pages"):
1076
1399
  img = page.to_image(
1077
1400
  resolution=resolution,
1078
1401
  include_highlights=include_highlights,
1079
1402
  labels=labels,
1080
- **kwargs
1403
+ **kwargs,
1081
1404
  )
1082
1405
  if img:
1083
1406
  page_images.append(img)
@@ -1093,4 +1416,124 @@ class PDF(ExtractionMixin):
1093
1416
  else:
1094
1417
  logger.error(f"Unsupported value for 'using': {using}")
1095
1418
  return None
1419
+
1096
1420
  # --- End Extraction Support --- #
1421
+
1422
def _gather_analysis_data(
    self,
    analysis_keys: List[str],
    include_content: bool,
    include_images: bool,
    image_dir: Optional[Path],
    image_format: str,
    image_resolution: int,
) -> List[Dict[str, Any]]:
    """
    Gather analysis data from all pages in the PDF.

    One flat dictionary is produced per page. It always holds "pdf_path",
    "page_number" and "page_index"; optionally "content" and "image_path"; and one
    "<key>.<field>" entry for every field of every requested analysis result.

    Args:
        analysis_keys: Keys in the analyses dictionary to export
        include_content: Whether to include extracted text
        include_images: Whether to export images
        image_dir: Directory to save images
        image_format: Format to save images
        image_resolution: Resolution for exported images

    Returns:
        List of dictionaries containing analysis data (empty if the PDF has no pages)

    Raises:
        ValueError: If analysis keys were requested and a page has no analyses data
        KeyError: If a requested analysis key is missing from a page
    """
    if not hasattr(self, "_pages") or not self._pages:
        logger.warning(f"No pages found in PDF {self.path}")
        return []

    # Materialize once: the keys are walked twice per page (validate, then export)
    requested_keys = list(analysis_keys)
    all_data = []

    for page in tqdm(self._pages, desc="Gathering page data", leave=False):
        # Validate before doing any expensive work, so a page with missing
        # analyses fails fast instead of after its text extraction and image export.
        if requested_keys:
            if not hasattr(page, "analyses") or not page.analyses:
                raise ValueError(f"Page {page.number} does not have analyses data")
            for key in requested_keys:
                if key not in page.analyses:
                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")

        # Basic page information
        page_data = {
            "pdf_path": self.path,
            "page_number": page.number,
            "page_index": page.index,
        }

        # Include extracted text if requested; a failure degrades to empty content
        if include_content:
            try:
                page_data["content"] = page.extract_text(preserve_whitespace=True)
            except Exception as e:
                logger.error(f"Error extracting text from page {page.number}: {e}")
                page_data["content"] = ""

        # Save image if requested; a failure is logged and recorded as None
        if include_images:
            try:
                image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
                image_path = image_dir / image_filename

                page.save_image(
                    str(image_path), resolution=image_resolution, include_highlights=True
                )

                # Stored relative to the parent of image_dir
                page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
            except Exception as e:
                logger.error(f"Error saving image for page {page.number}: {e}")
                page_data["image_path"] = None

        # Add analyses data
        for key in requested_keys:
            analysis_result = page.analyses[key]

            if hasattr(analysis_result, "to_dict"):
                analysis_data = analysis_result.to_dict()
            else:
                # Otherwise, use the result directly if it's dict-like
                try:
                    analysis_data = dict(analysis_result)
                except (TypeError, ValueError):
                    # Last resort: convert to string
                    analysis_data = {"raw_result": str(analysis_result)}

            # Flatten into the page row, prefixing each field with its analysis key
            for field, value in analysis_data.items():
                page_data[f"{key}.{field}"] = value

        all_data.append(page_data)

    return all_data
1514
+
1515
def _get_target_pages(
    self, pages: Optional[Union[Iterable[int], range, slice]] = None
) -> List["Page"]:
    """
    Helper method to get a list of Page objects based on the input pages.

    Args:
        pages: Page indices, slice, or None for all pages. Indices are used to
            subscript the internal page list directly.

    Returns:
        List of Page objects. For ``pages=None`` this is a copy of the internal
        page list, so callers cannot modify the PDF's own list through it.

    Raises:
        ValueError: If an index in 'pages' is out of range.
        TypeError: If 'pages' is not None, a slice, or an iterable of page indices.
    """
    if pages is None:
        # Copy rather than alias: the other branches already build new lists
        return list(self._pages)

    if isinstance(pages, slice):
        return self._pages[pages]

    if hasattr(pages, "__iter__"):
        try:
            return [self._pages[i] for i in pages]
        except IndexError as e:
            raise ValueError("Invalid page index provided in 'pages' iterable.") from e
        except TypeError as e:
            # Raised when an item cannot be used as an index (e.g. a string)
            raise TypeError(
                "'pages' must be None, a slice, or an iterable of page indices."
            ) from e

    raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")