natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,11 +1,12 @@
1
1
  import copy
2
+ import io
2
3
  import logging
3
4
  import os
4
5
  import re
5
6
  import tempfile
6
- import urllib.request
7
- import time
8
7
  import threading
8
+ import time
9
+ import urllib.request
9
10
  from pathlib import Path
10
11
  from typing import (
11
12
  TYPE_CHECKING,
@@ -18,38 +19,35 @@ from typing import (
18
19
  Tuple,
19
20
  Type,
20
21
  Union,
22
+ overload,
21
23
  )
22
- from natural_pdf.utils.tqdm_utils import get_tqdm
23
24
 
24
25
  import pdfplumber
25
26
  from PIL import Image
26
27
 
27
28
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
29
+ from natural_pdf.classification.manager import ClassificationError, ClassificationManager
30
+ from natural_pdf.classification.mixin import ClassificationMixin
31
+ from natural_pdf.classification.results import ClassificationResult
28
32
  from natural_pdf.core.highlighting_service import HighlightingService
29
- from natural_pdf.core.page import Page
30
- from natural_pdf.elements.collections import ElementCollection
33
+ from natural_pdf.elements.base import Element
31
34
  from natural_pdf.elements.region import Region
35
+ from natural_pdf.export.mixin import ExportMixin
36
+ from natural_pdf.extraction.manager import StructuredDataManager
37
+ from natural_pdf.extraction.mixin import ExtractionMixin
32
38
  from natural_pdf.ocr import OCRManager, OCROptions
33
39
  from natural_pdf.selectors.parser import parse_selector
34
-
35
- from natural_pdf.classification.manager import ClassificationManager
36
- from natural_pdf.classification.manager import ClassificationError
37
- from natural_pdf.classification.results import ClassificationResult
38
- from natural_pdf.extraction.manager import StructuredDataManager
39
-
40
40
  from natural_pdf.utils.locks import pdf_render_lock
41
- from natural_pdf.elements.base import Element
42
- from natural_pdf.classification.mixin import ClassificationMixin
43
- from natural_pdf.extraction.mixin import ExtractionMixin
41
+ from natural_pdf.utils.tqdm_utils import get_tqdm
44
42
 
45
43
  try:
46
44
  from typing import Any as TypingAny
47
45
 
48
- from natural_pdf.search import TextSearchOptions
49
46
  from natural_pdf.search import (
50
47
  BaseSearchOptions,
51
48
  SearchOptions,
52
49
  SearchServiceProtocol,
50
+ TextSearchOptions,
53
51
  get_search_service,
54
52
  )
55
53
  except ImportError:
@@ -62,6 +60,7 @@ except ImportError:
62
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
63
61
  )
64
62
 
63
+
65
64
  logger = logging.getLogger("natural_pdf.core.pdf")
66
65
  tqdm = get_tqdm()
67
66
 
@@ -70,7 +69,22 @@ DEFAULT_MANAGERS = {
70
69
  "structured_data": StructuredDataManager,
71
70
  }
72
71
 
73
- class PDF(ExtractionMixin):
72
+ # Deskew Imports (Conditional)
73
+ import numpy as np
74
+ from PIL import Image
75
+
76
+ try:
77
+ import img2pdf
78
+ from deskew import determine_skew
79
+
80
+ DESKEW_AVAILABLE = True
81
+ except ImportError:
82
+ DESKEW_AVAILABLE = False
83
+ img2pdf = None
84
+ # End Deskew Imports
85
+
86
+
87
+ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
74
88
  """
75
89
  Enhanced PDF wrapper built on top of pdfplumber.
76
90
 
@@ -80,7 +94,7 @@ class PDF(ExtractionMixin):
80
94
 
81
95
  def __init__(
82
96
  self,
83
- path_or_url: str,
97
+ path_or_url_or_stream,
84
98
  reading_order: bool = True,
85
99
  font_attrs: Optional[List[str]] = None,
86
100
  keep_spaces: bool = True,
@@ -89,54 +103,72 @@ class PDF(ExtractionMixin):
89
103
  Initialize the enhanced PDF object.
90
104
 
91
105
  Args:
92
- path_or_url: Path to the PDF file or a URL to a PDF
106
+ path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
93
107
  reading_order: Whether to use natural reading order
94
108
  font_attrs: Font attributes for grouping characters into words
95
109
  keep_spaces: Whether to include spaces in word elements
96
110
  """
97
- is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
98
-
99
- self._original_path = path_or_url
111
+ self._original_path_or_stream = path_or_url_or_stream
100
112
  self._temp_file = None
101
113
  self._resolved_path = None
102
-
103
- if is_url:
104
- logger.info(f"Downloading PDF from URL: {path_or_url}")
105
- try:
106
- self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
107
- with urllib.request.urlopen(path_or_url) as response:
108
- self._temp_file.write(response.read())
109
- self._temp_file.flush()
110
- self._temp_file.close()
111
- self._resolved_path = self._temp_file.name
112
- logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
113
- except Exception as e:
114
- if self._temp_file and hasattr(self._temp_file, "name"):
115
- try:
116
- os.unlink(self._temp_file.name)
117
- except:
118
- pass
119
- logger.error(f"Failed to download PDF from URL: {e}")
120
- raise ValueError(f"Failed to download PDF from URL: {e}")
114
+ self._is_stream = False
115
+ stream_to_open = None
116
+
117
+ if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
118
+ logger.info("Initializing PDF from in-memory stream.")
119
+ self._is_stream = True
120
+ self._resolved_path = None # No resolved file path for streams
121
+ self.source_path = "<stream>" # Identifier for source
122
+ self.path = self.source_path # Use source identifier as path for streams
123
+ stream_to_open = path_or_url_or_stream
124
+ elif isinstance(path_or_url_or_stream, (str, Path)):
125
+ path_or_url = str(path_or_url_or_stream)
126
+ self.source_path = path_or_url # Store original path/URL as source
127
+ is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
128
+
129
+ if is_url:
130
+ logger.info(f"Downloading PDF from URL: {path_or_url}")
131
+ try:
132
+ # Use a context manager for the temporary file
133
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
134
+ self._temp_file = temp_f # Store reference if needed for cleanup
135
+ with urllib.request.urlopen(path_or_url) as response:
136
+ temp_f.write(response.read())
137
+ temp_f.flush()
138
+ self._resolved_path = temp_f.name
139
+ logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
140
+ stream_to_open = self._resolved_path
141
+ except Exception as e:
142
+ if self._temp_file and hasattr(self._temp_file, "name"):
143
+ try:
144
+ os.unlink(self._temp_file.name)
145
+ except: # noqa E722
146
+ pass
147
+ logger.error(f"Failed to download PDF from URL: {e}")
148
+ raise ValueError(f"Failed to download PDF from URL: {e}")
149
+ else:
150
+ self._resolved_path = str(Path(path_or_url).resolve()) # Resolve local paths
151
+ stream_to_open = self._resolved_path
152
+ self.path = self._resolved_path # Use resolved path for file-based PDFs
121
153
  else:
122
- self._resolved_path = path_or_url
154
+ raise TypeError(
155
+ f"Invalid input type: {type(path_or_url_or_stream)}. "
156
+ f"Expected path (str/Path), URL (str), or file-like object."
157
+ )
123
158
 
124
- logger.info(f"Initializing PDF from {self._resolved_path}")
159
+ logger.info(f"Opening PDF source: {self.source_path}")
125
160
  logger.debug(
126
161
  f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
127
162
  )
128
163
 
129
164
  try:
130
- self._pdf = pdfplumber.open(self._resolved_path)
165
+ self._pdf = pdfplumber.open(stream_to_open)
131
166
  except Exception as e:
132
167
  logger.error(f"Failed to open PDF: {e}", exc_info=True)
133
- self.close()
134
- raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
135
-
136
- self._path = self._resolved_path
137
- self.path = self._resolved_path
138
- self.source_path = self._original_path
168
+ self.close() # Attempt cleanup if opening fails
169
+ raise IOError(f"Failed to open PDF source: {self.source_path}") from e
139
170
 
171
+ # Store configuration used for initialization
140
172
  self._reading_order = reading_order
141
173
  self._config = {"keep_spaces": keep_spaces}
142
174
  self._font_attrs = font_attrs
@@ -144,9 +176,11 @@ class PDF(ExtractionMixin):
144
176
  self._ocr_manager = OCRManager() if OCRManager else None
145
177
  self._layout_manager = LayoutManager() if LayoutManager else None
146
178
  self.highlighter = HighlightingService(self)
147
- self._classification_manager_instance = ClassificationManager()
179
+ # self._classification_manager_instance = ClassificationManager() # Removed this line
148
180
  self._manager_registry = {}
149
181
 
182
+ from natural_pdf.core.page import Page
183
+
150
184
  self._pages = [
151
185
  Page(p, parent=self, index=i, font_attrs=font_attrs)
152
186
  for i, p in enumerate(self._pdf.pages)
@@ -160,6 +194,7 @@ class PDF(ExtractionMixin):
160
194
 
161
195
  self._initialize_managers()
162
196
  self._initialize_highlighter()
197
+ self.analyses: Dict[str, Any] = {}
163
198
 
164
199
  def _initialize_managers(self):
165
200
  """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -175,16 +210,20 @@ class PDF(ExtractionMixin):
175
210
  def get_manager(self, key: str) -> Any:
176
211
  """Retrieve a manager instance by its key."""
177
212
  if key not in self._managers:
178
- raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
179
-
213
+ raise KeyError(
214
+ f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
215
+ )
216
+
180
217
  manager_instance = self._managers.get(key)
181
-
218
+
182
219
  if manager_instance is None:
183
- manager_class = DEFAULT_MANAGERS.get(key)
184
- if manager_class:
185
- raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
186
- else:
187
- raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
220
+ manager_class = DEFAULT_MANAGERS.get(key)
221
+ if manager_class:
222
+ raise RuntimeError(
223
+ f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
224
+ )
225
+ else:
226
+ raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
188
227
 
189
228
  return manager_instance
190
229
 
@@ -227,6 +266,7 @@ class PDF(ExtractionMixin):
227
266
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
228
267
 
229
268
  Args:
269
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None
230
270
  exclusion_func: A function that takes a Page and returns a Region to exclude, or None
231
271
  label: Optional label for this exclusion
232
272
 
@@ -259,11 +299,22 @@ class PDF(ExtractionMixin):
259
299
  ) -> "PDF":
260
300
  """
261
301
  Applies OCR to specified pages of the PDF using batch processing.
302
+ Applies OCR to specified pages of the PDF using batch processing.
262
303
 
263
304
  Args:
264
305
  engine: Name of the OCR engine
265
306
  languages: List of language codes
266
- min_confidence: Minimum confidence threshold
307
+ min_confidence: Minimum confidence threshold
308
+ device: Device to run OCR on
309
+ resolution: DPI resolution for page images
310
+ apply_exclusions: Whether to mask excluded areas
311
+ detect_only: If True, only detect text boxes
312
+ replace: Whether to replace existing OCR elements
313
+ options: Engine-specific options
314
+ pages: Page indices to process or None for all pages
315
+ engine: Name of the OCR engine
316
+ languages: List of language codes
317
+ min_confidence: Minimum confidence threshold
267
318
  device: Device to run OCR on
268
319
  resolution: DPI resolution for page images
269
320
  apply_exclusions: Whether to mask excluded areas
@@ -274,6 +325,7 @@ class PDF(ExtractionMixin):
274
325
 
275
326
  Returns:
276
327
  Self for method chaining
328
+ Self for method chaining
277
329
  """
278
330
  if not self._ocr_manager:
279
331
  logger.error("OCRManager not available. Cannot apply OCR.")
@@ -281,7 +333,9 @@ class PDF(ExtractionMixin):
281
333
 
282
334
  thread_id = threading.current_thread().name
283
335
  logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
284
-
336
+
337
+ target_pages = []
338
+
285
339
  target_pages = []
286
340
  if pages is None:
287
341
  target_pages = self._pages
@@ -303,7 +357,7 @@ class PDF(ExtractionMixin):
303
357
 
304
358
  page_numbers = [p.number for p in target_pages]
305
359
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
306
-
360
+
307
361
  final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
308
362
  logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
309
363
 
@@ -312,7 +366,7 @@ class PDF(ExtractionMixin):
312
366
  logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
313
367
  failed_page_num = "unknown"
314
368
  render_start_time = time.monotonic()
315
-
369
+
316
370
  try:
317
371
  for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
318
372
  failed_page_num = page.number
@@ -326,14 +380,21 @@ class PDF(ExtractionMixin):
326
380
  if img is None:
327
381
  logger.error(f" Failed to render page {page.number} to image.")
328
382
  continue
383
+ continue
329
384
  images_pil.append(img)
330
385
  page_image_map.append((page, img))
331
386
  except Exception as e:
387
+ logger.error(f"Failed to render pages for batch OCR: {e}")
332
388
  logger.error(f"Failed to render pages for batch OCR: {e}")
333
389
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
334
-
390
+
335
391
  render_end_time = time.monotonic()
336
- logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
392
+ logger.debug(
393
+ f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
394
+ )
395
+ logger.debug(
396
+ f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
397
+ )
337
398
 
338
399
  if not images_pil or not page_image_map:
339
400
  logger.error("No images were successfully rendered for batch OCR.")
@@ -344,16 +405,18 @@ class PDF(ExtractionMixin):
344
405
  "engine": engine,
345
406
  "languages": languages,
346
407
  "min_confidence": min_confidence,
408
+ "min_confidence": min_confidence,
347
409
  "device": device,
348
410
  "options": options,
349
411
  "detect_only": detect_only,
350
412
  }
351
413
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
352
414
 
353
- ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
415
+ ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
416
+ logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
354
417
  logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
355
418
  ocr_start_time = time.monotonic()
356
-
419
+
357
420
  try:
358
421
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
359
422
 
@@ -365,24 +428,28 @@ class PDF(ExtractionMixin):
365
428
  except Exception as e:
366
429
  logger.error(f"Batch OCR processing failed: {e}")
367
430
  return self
368
-
431
+
369
432
  ocr_end_time = time.monotonic()
370
- logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
433
+ logger.debug(
434
+ f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
435
+ )
371
436
 
372
437
  logger.info("Adding OCR results to respective pages...")
373
438
  total_elements_added = 0
374
-
439
+
375
440
  for i, (page, img) in enumerate(page_image_map):
376
441
  results_for_page = batch_results[i]
377
442
  if not isinstance(results_for_page, list):
378
- logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
443
+ logger.warning(
444
+ f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
445
+ )
379
446
  continue
380
447
 
381
448
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
382
449
  try:
383
450
  if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
384
451
  page._element_mgr.remove_ocr_elements()
385
-
452
+
386
453
  img_scale_x = page.width / img.width if img.width > 0 else 1
387
454
  img_scale_y = page.height / img.height if img.height > 0 else 1
388
455
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -407,6 +474,7 @@ class PDF(ExtractionMixin):
407
474
  Add a region function to the PDF.
408
475
 
409
476
  Args:
477
+ region_func: A function that takes a Page and returns a Region, or None
410
478
  region_func: A function that takes a Page and returns a Region, or None
411
479
  name: Optional name for the region
412
480
 
@@ -425,126 +493,194 @@ class PDF(ExtractionMixin):
425
493
  if region_instance and isinstance(region_instance, Region):
426
494
  page.add_region(region_instance, name=name, source="named")
427
495
  elif region_instance is not None:
428
- logger.warning(f"Region function did not return a valid Region for page {page.number}")
496
+ logger.warning(
497
+ f"Region function did not return a valid Region for page {page.number}"
498
+ )
429
499
  except Exception as e:
430
500
  logger.error(f"Error adding region for page {page.number}: {e}")
431
501
 
432
502
  return self
433
503
 
504
+ @overload
434
505
  def find(
435
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
506
+ self,
507
+ *,
508
+ text: str,
509
+ apply_exclusions: bool = True,
510
+ regex: bool = False,
511
+ case: bool = True,
512
+ **kwargs,
513
+ ) -> Optional[Any]: ...
514
+
515
+ @overload
516
+ def find(
517
+ self,
518
+ selector: str,
519
+ *,
520
+ apply_exclusions: bool = True,
521
+ regex: bool = False,
522
+ case: bool = True,
523
+ **kwargs,
524
+ ) -> Optional[Any]: ...
525
+
526
+ def find(
527
+ self,
528
+ selector: Optional[str] = None,
529
+ *,
530
+ text: Optional[str] = None,
531
+ apply_exclusions: bool = True,
532
+ regex: bool = False,
533
+ case: bool = True,
534
+ **kwargs,
436
535
  ) -> Optional[Any]:
437
536
  """
438
- Find the first element matching the selector.
537
+ Find the first element matching the selector OR text content across all pages.
538
+
539
+ Provide EITHER `selector` OR `text`, but not both.
439
540
 
440
541
  Args:
441
- selector: CSS-like selector string
442
- apply_exclusions: Whether to exclude elements in exclusion regions
443
- regex: Whether to use regex for text search
444
- case: Whether to do case-sensitive text search
445
- **kwargs: Additional filter parameters
542
+ selector: CSS-like selector string.
543
+ text: Text content to search for (equivalent to 'text:contains(...)').
544
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
545
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
546
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
547
+ **kwargs: Additional filter parameters.
446
548
 
447
549
  Returns:
448
- Element object or None if not found
550
+ Element object or None if not found.
449
551
  """
450
552
  if not hasattr(self, "_pages"):
451
553
  raise AttributeError("PDF pages not yet initialized.")
452
554
 
453
- selector_obj = parse_selector(selector)
555
+ if selector is not None and text is not None:
556
+ raise ValueError("Provide either 'selector' or 'text', not both.")
557
+ if selector is None and text is None:
558
+ raise ValueError("Provide either 'selector' or 'text'.")
559
+
560
+ # Construct selector if 'text' is provided
561
+ effective_selector = ""
562
+ if text is not None:
563
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
564
+ effective_selector = f'text:contains("{escaped_text}")'
565
+ logger.debug(
566
+ f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
567
+ )
568
+ elif selector is not None:
569
+ effective_selector = selector
570
+ else:
571
+ raise ValueError("Internal error: No selector or text provided.")
572
+
573
+ selector_obj = parse_selector(effective_selector)
454
574
  kwargs["regex"] = regex
455
575
  kwargs["case"] = case
456
576
 
457
- results = self._apply_selector(
458
- selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
459
- )
460
- return results.first if results else None
577
+ # Search page by page
578
+ for page in self.pages:
579
+ # Note: _apply_selector is on Page, so we call find directly here
580
+ # We pass the constructed/validated effective_selector
581
+ element = page.find(
582
+ selector=effective_selector, # Use the processed selector
583
+ apply_exclusions=apply_exclusions,
584
+ regex=regex, # Pass down flags
585
+ case=case,
586
+ **kwargs,
587
+ )
588
+ if element:
589
+ return element
590
+ return None # Not found on any page
461
591
 
592
+ @overload
462
593
  def find_all(
463
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
464
- ) -> ElementCollection:
465
- """
466
- Find all elements matching the selector.
594
+ self,
595
+ *,
596
+ text: str,
597
+ apply_exclusions: bool = True,
598
+ regex: bool = False,
599
+ case: bool = True,
600
+ **kwargs,
601
+ ) -> "ElementCollection": ...
467
602
 
468
- Args:
469
- selector: CSS-like selector string
470
- apply_exclusions: Whether to exclude elements in exclusion regions
471
- regex: Whether to use regex for text search
472
- case: Whether to do case-sensitive text search
473
- **kwargs: Additional filter parameters
603
+ @overload
604
+ def find_all(
605
+ self,
606
+ selector: str,
607
+ *,
608
+ apply_exclusions: bool = True,
609
+ regex: bool = False,
610
+ case: bool = True,
611
+ **kwargs,
612
+ ) -> "ElementCollection": ...
474
613
 
475
- Returns:
476
- ElementCollection with matching elements
614
+ def find_all(
615
+ self,
616
+ selector: Optional[str] = None,
617
+ *,
618
+ text: Optional[str] = None,
619
+ apply_exclusions: bool = True,
620
+ regex: bool = False,
621
+ case: bool = True,
622
+ **kwargs,
623
+ ) -> "ElementCollection":
477
624
  """
478
- if not hasattr(self, "_pages"):
479
- raise AttributeError("PDF pages not yet initialized.")
480
-
481
- selector_obj = parse_selector(selector)
482
- kwargs["regex"] = regex
483
- kwargs["case"] = case
484
-
485
- results = self._apply_selector(
486
- selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
487
- )
488
- return results
625
+ Find all elements matching the selector OR text content across all pages.
489
626
 
490
- def _apply_selector(
491
- self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
492
- ) -> ElementCollection:
493
- """
494
- Apply selector to PDF elements across all pages.
627
+ Provide EITHER `selector` OR `text`, but not both.
495
628
 
496
629
  Args:
497
- selector_obj: Parsed selector dictionary
498
- apply_exclusions: Whether to exclude elements in exclusion regions
499
- first_only: If True, stop searching after the first match is found
500
- **kwargs: Additional filter parameters
630
+ selector: CSS-like selector string.
631
+ text: Text content to search for (equivalent to 'text:contains(...)').
632
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
633
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
634
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
635
+ **kwargs: Additional filter parameters.
501
636
 
502
637
  Returns:
503
- ElementCollection of matching elements
638
+ ElementCollection with matching elements.
504
639
  """
505
- from natural_pdf.elements.collections import ElementCollection
640
+ if not hasattr(self, "_pages"):
641
+ raise AttributeError("PDF pages not yet initialized.")
506
642
 
507
- page_indices = kwargs.get("pages", range(len(self._pages)))
508
- if isinstance(page_indices, int):
509
- page_indices = [page_indices]
510
- elif isinstance(page_indices, slice):
511
- page_indices = range(*page_indices.indices(len(self._pages)))
643
+ if selector is not None and text is not None:
644
+ raise ValueError("Provide either 'selector' or 'text', not both.")
645
+ if selector is None and text is None:
646
+ raise ValueError("Provide either 'selector' or 'text'.")
647
+
648
+ # Construct selector if 'text' is provided
649
+ effective_selector = ""
650
+ if text is not None:
651
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
652
+ effective_selector = f'text:contains("{escaped_text}")'
653
+ logger.debug(
654
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
655
+ )
656
+ elif selector is not None:
657
+ effective_selector = selector
658
+ else:
659
+ raise ValueError("Internal error: No selector or text provided.")
512
660
 
513
- for pseudo in selector_obj.get("pseudo_classes", []):
514
- if pseudo.get("name") in ("spans", "continues"):
515
- logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
516
- return ElementCollection([])
661
+ # Instead of parsing here, let each page parse and apply
662
+ # This avoids parsing the same selector multiple times if not needed
663
+ # selector_obj = parse_selector(effective_selector)
517
664
 
518
- all_elements = []
519
- for page_idx in page_indices:
520
- if 0 <= page_idx < len(self._pages):
521
- page = self._pages[page_idx]
522
- page_elements_collection = page._apply_selector(
523
- selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
524
- )
525
- if page_elements_collection:
526
- page_elements = page_elements_collection.elements
527
- all_elements.extend(page_elements)
528
- if first_only and page_elements:
529
- break
530
- else:
531
- logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
665
+ # kwargs["regex"] = regex # Removed: Already passed explicitly
666
+ # kwargs["case"] = case # Removed: Already passed explicitly
532
667
 
533
- combined = ElementCollection(all_elements)
668
+ all_elements = []
669
+ for page in self.pages:
670
+ # Call page.find_all with the effective selector and flags
671
+ page_elements = page.find_all(
672
+ selector=effective_selector,
673
+ apply_exclusions=apply_exclusions,
674
+ regex=regex,
675
+ case=case,
676
+ **kwargs,
677
+ )
678
+ if page_elements:
679
+ all_elements.extend(page_elements.elements)
534
680
 
535
- if not first_only and kwargs.get("document_order", True):
536
- if all(
537
- hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
538
- for el in combined.elements
539
- ):
540
- combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
541
- else:
542
- try:
543
- combined.sort(key=lambda el: el.page.index)
544
- except AttributeError:
545
- logger.warning("Cannot sort elements in document order: Missing required attributes.")
681
+ from natural_pdf.elements.collections import ElementCollection
546
682
 
547
- return combined
683
+ return ElementCollection(all_elements)
548
684
 
549
685
  def extract_text(
550
686
  self,
@@ -562,6 +698,9 @@ class PDF(ExtractionMixin):
562
698
  preserve_whitespace: Whether to keep blank characters
563
699
  use_exclusions: Whether to apply exclusion regions
564
700
  debug_exclusions: Whether to output detailed debugging for exclusions
701
+ preserve_whitespace: Whether to keep blank characters
702
+ use_exclusions: Whether to apply exclusion regions
703
+ debug_exclusions: Whether to output detailed debugging for exclusions
565
704
  **kwargs: Additional extraction parameters
566
705
 
567
706
  Returns:
@@ -610,22 +749,22 @@ class PDF(ExtractionMixin):
610
749
  """
611
750
  if not hasattr(self, "_pages"):
612
751
  raise AttributeError("PDF pages not yet initialized.")
613
-
752
+
614
753
  logger.warning("PDF.extract_tables is not fully implemented yet.")
615
754
  all_tables = []
616
-
755
+
617
756
  for page in self.pages:
618
757
  if hasattr(page, "extract_tables"):
619
758
  all_tables.extend(page.extract_tables(**kwargs))
620
759
  else:
621
760
  logger.debug(f"Page {page.number} does not have extract_tables method.")
622
-
761
+
623
762
  if selector:
624
763
  logger.warning("Filtering extracted tables by selector is not implemented.")
625
-
764
+
626
765
  if merge_across_pages:
627
766
  logger.warning("Merging tables across pages is not implemented.")
628
-
767
+
629
768
  return all_tables
630
769
 
631
770
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -638,6 +777,9 @@ class PDF(ExtractionMixin):
638
777
  output_path: Path to save the searchable PDF
639
778
  dpi: Resolution for rendering and OCR overlay
640
779
  **kwargs: Additional keyword arguments passed to the exporter
780
+ output_path: Path to save the searchable PDF
781
+ dpi: Resolution for rendering and OCR overlay
782
+ **kwargs: Additional keyword arguments passed to the exporter
641
783
  """
642
784
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
643
785
 
@@ -667,6 +809,7 @@ class PDF(ExtractionMixin):
667
809
 
668
810
  Returns:
669
811
  A dictionary containing the answer, confidence, and other metadata
812
+ A dictionary containing the answer, confidence, and other metadata
670
813
  """
671
814
  from natural_pdf.qa import get_qa_engine
672
815
 
@@ -713,14 +856,19 @@ class PDF(ExtractionMixin):
713
856
  ) -> List[Dict[str, Any]]:
714
857
  """
715
858
  Finds relevant documents from this PDF within a search index.
859
+ Finds relevant documents from this PDF within a search index.
716
860
 
717
861
  Args:
718
862
  query: The search query (text, image path, PIL Image, Region)
719
863
  search_service: A pre-configured SearchService instance
720
864
  options: Optional SearchOptions to configure the query
865
+ query: The search query (text, image path, PIL Image, Region)
866
+ search_service: A pre-configured SearchService instance
867
+ options: Optional SearchOptions to configure the query
721
868
 
722
869
  Returns:
723
870
  A list of result dictionaries, sorted by relevance
871
+ A list of result dictionaries, sorted by relevance
724
872
 
725
873
  Raises:
726
874
  ImportError: If search dependencies are not installed
@@ -728,12 +876,19 @@ class PDF(ExtractionMixin):
728
876
  TypeError: If search_service does not conform to the protocol
729
877
  FileNotFoundError: If the collection managed by the service does not exist
730
878
  RuntimeError: For other search failures
879
+ ImportError: If search dependencies are not installed
880
+ ValueError: If search_service is None
881
+ TypeError: If search_service does not conform to the protocol
882
+ FileNotFoundError: If the collection managed by the service does not exist
883
+ RuntimeError: For other search failures
731
884
  """
732
885
  if not search_service:
733
886
  raise ValueError("A configured SearchServiceProtocol instance must be provided.")
734
887
 
735
888
  collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
736
- logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
889
+ logger.info(
890
+ f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
891
+ )
737
892
 
738
893
  service = search_service
739
894
 
@@ -743,12 +898,15 @@ class PDF(ExtractionMixin):
743
898
  if isinstance(query, Region):
744
899
  logger.debug("Query is a Region object. Extracting text.")
745
900
  if not isinstance(effective_options, TextSearchOptions):
746
- logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
901
+ logger.warning(
902
+ "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
903
+ )
747
904
  query_input = query.extract_text()
748
905
  if not query_input or query_input.isspace():
749
906
  logger.error("Region has no extractable text for query.")
750
907
  return []
751
908
 
909
+ # Add filter to scope search to THIS PDF
752
910
  # Add filter to scope search to THIS PDF
753
911
  pdf_scope_filter = {
754
912
  "field": "pdf_path",
@@ -760,7 +918,10 @@ class PDF(ExtractionMixin):
760
918
  # Combine with existing filters in options (if any)
761
919
  if effective_options.filters:
762
920
  logger.debug(f"Combining PDF scope filter with existing filters")
763
- if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
921
+ if (
922
+ isinstance(effective_options.filters, dict)
923
+ and effective_options.filters.get("operator") == "AND"
924
+ ):
764
925
  effective_options.filters["conditions"].append(pdf_scope_filter)
765
926
  elif isinstance(effective_options.filters, list):
766
927
  effective_options.filters = {
@@ -773,7 +934,9 @@ class PDF(ExtractionMixin):
773
934
  "conditions": [effective_options.filters, pdf_scope_filter],
774
935
  }
775
936
  else:
776
- logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
937
+ logger.warning(
938
+ f"Unsupported format for existing filters. Overwriting with PDF scope filter."
939
+ )
777
940
  effective_options.filters = pdf_scope_filter
778
941
  else:
779
942
  effective_options.filters = pdf_scope_filter
@@ -790,26 +953,40 @@ class PDF(ExtractionMixin):
790
953
  except FileNotFoundError as fnf:
791
954
  logger.error(f"Search failed: Collection not found. Error: {fnf}")
792
955
  raise
956
+ logger.error(f"Search failed: Collection not found. Error: {fnf}")
957
+ raise
793
958
  except Exception as e:
794
959
  logger.error(f"SearchService search failed: {e}")
795
960
  raise RuntimeError(f"Search within index failed. See logs for details.") from e
961
+ logger.error(f"SearchService search failed: {e}")
962
+ raise RuntimeError(f"Search within index failed. See logs for details.") from e
796
963
 
797
964
  def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
798
965
  """
799
966
  Exports OCR results from this PDF into a correction task package.
967
+ Exports OCR results from this PDF into a correction task package.
800
968
 
801
969
  Args:
970
+ output_zip_path: The path to save the output zip file
802
971
  output_zip_path: The path to save the output zip file
803
972
  **kwargs: Additional arguments passed to create_correction_task_package
804
973
  """
805
974
  try:
806
975
  from natural_pdf.utils.packaging import create_correction_task_package
976
+
807
977
  create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
808
978
  except ImportError:
809
- logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
979
+ logger.error(
980
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
981
+ )
982
+ logger.error(
983
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
984
+ )
810
985
  except Exception as e:
811
986
  logger.error(f"Failed to export correction task: {e}")
812
987
  raise
988
+ logger.error(f"Failed to export correction task: {e}")
989
+ raise
813
990
 
814
991
  def correct_ocr(
815
992
  self,
@@ -820,17 +997,23 @@ class PDF(ExtractionMixin):
820
997
  ) -> "PDF":
821
998
  """
822
999
  Applies corrections to OCR text elements using a callback function.
1000
+ Applies corrections to OCR text elements using a callback function.
823
1001
 
824
1002
  Args:
1003
+ correction_callback: Function that takes an element and returns corrected text or None
825
1004
  correction_callback: Function that takes an element and returns corrected text or None
826
1005
  pages: Optional page indices/slice to limit the scope of correction
827
1006
  max_workers: Maximum number of threads to use for parallel execution
828
1007
  progress_callback: Optional callback function for progress updates
1008
+ max_workers: Maximum number of threads to use for parallel execution
1009
+ progress_callback: Optional callback function for progress updates
829
1010
 
830
1011
  Returns:
831
1012
  Self for method chaining
1013
+ Self for method chaining
832
1014
  """
833
1015
  target_page_indices = []
1016
+ target_page_indices = []
834
1017
  if pages is None:
835
1018
  target_page_indices = list(range(len(self._pages)))
836
1019
  elif isinstance(pages, slice):
@@ -843,14 +1026,17 @@ class PDF(ExtractionMixin):
843
1026
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
844
1027
  except (IndexError, TypeError, ValueError) as e:
845
1028
  raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1029
+ raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
846
1030
  else:
847
1031
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1032
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
848
1033
 
849
1034
  if not target_page_indices:
850
1035
  logger.warning("No pages selected for OCR correction.")
851
1036
  return self
852
1037
 
853
1038
  logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1039
+ logger.info(f"Starting OCR correction for pages: {target_page_indices}")
854
1040
 
855
1041
  for page_idx in target_page_indices:
856
1042
  page = self._pages[page_idx]
@@ -862,7 +1048,9 @@ class PDF(ExtractionMixin):
862
1048
  )
863
1049
  except Exception as e:
864
1050
  logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1051
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
865
1052
 
1053
+ logger.info("OCR correction process finished.")
866
1054
  logger.info("OCR correction process finished.")
867
1055
  return self
868
1056
 
@@ -872,15 +1060,16 @@ class PDF(ExtractionMixin):
872
1060
  return 0
873
1061
  return len(self._pages)
874
1062
 
875
- def __getitem__(self, key) -> Union[Page, "PageCollection"]:
1063
+ def __getitem__(self, key) -> Union["Page", "PageCollection"]:
876
1064
  """Access pages by index or slice."""
877
1065
  if not hasattr(self, "_pages"):
878
1066
  raise AttributeError("PDF pages not initialized yet.")
879
-
1067
+
880
1068
  if isinstance(key, slice):
881
1069
  from natural_pdf.elements.collections import PageCollection
1070
+
882
1071
  return PageCollection(self._pages[key])
883
-
1072
+
884
1073
  if isinstance(key, int):
885
1074
  if 0 <= key < len(self._pages):
886
1075
  return self._pages[key]
@@ -905,13 +1094,12 @@ class PDF(ExtractionMixin):
905
1094
  try:
906
1095
  if hasattr(self._temp_file, "name") and self._temp_file.name:
907
1096
  temp_file_path = self._temp_file.name
908
- if os.path.exists(temp_file_path):
1097
+ # Only unlink if it exists and _is_stream is False (meaning WE created it)
1098
+ if not self._is_stream and os.path.exists(temp_file_path):
909
1099
  os.unlink(temp_file_path)
910
1100
  logger.debug(f"Removed temporary PDF file: {temp_file_path}")
911
1101
  except Exception as e:
912
1102
  logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
913
- finally:
914
- self._temp_file = None
915
1103
 
916
1104
  def __enter__(self):
917
1105
  """Context manager entry."""
@@ -922,14 +1110,141 @@ class PDF(ExtractionMixin):
922
1110
  self.close()
923
1111
 
924
1112
  def get_id(self) -> str:
1113
+ """Get unique identifier for this PDF."""
925
1114
  """Get unique identifier for this PDF."""
926
1115
  return self.path
927
1116
 
1117
+ # --- Deskew Method --- #
1118
+
1119
+ def deskew(
1120
+ self,
1121
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
1122
+ resolution: int = 300,
1123
+ detection_resolution: int = 72,
1124
+ force_overwrite: bool = False,
1125
+ **deskew_kwargs,
1126
+ ) -> "PDF":
1127
+ """
1128
+ Creates a new, in-memory PDF object containing deskewed versions of the
1129
+ specified pages from the original PDF.
1130
+
1131
+ This method renders each selected page, detects and corrects skew using the 'deskew'
1132
+ library, and then combines the resulting images into a new PDF using 'img2pdf'.
1133
+ The new PDF object is returned directly.
1134
+
1135
+ Important: The returned PDF is image-based. Any existing text, OCR results,
1136
+ annotations, or other elements from the original pages will *not* be carried over.
1137
+
1138
+ Args:
1139
+ pages: Page indices/slice to include (0-based). If None, processes all pages.
1140
+ resolution: DPI resolution for rendering the output deskewed pages.
1141
+ detection_resolution: DPI resolution used for skew detection if angles are not
1142
+ already cached on the page objects.
1143
+ force_overwrite: If False (default), raises a ValueError if any target page
1144
+ already contains processed elements (text, OCR, regions) to
1145
+ prevent accidental data loss. Set to True to proceed anyway.
1146
+ **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
1147
+ during automatic detection (e.g., `max_angle`, `num_peaks`).
1148
+
1149
+ Returns:
1150
+ A new PDF object representing the deskewed document.
1151
+
1152
+ Raises:
1153
+ ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
1154
+ ValueError: If `force_overwrite` is False and target pages contain elements.
1155
+ FileNotFoundError: If the source PDF cannot be read (if file-based).
1156
+ IOError: If creating the in-memory PDF fails.
1157
+ RuntimeError: If rendering or deskewing individual pages fails.
1158
+ """
1159
+ if not DESKEW_AVAILABLE:
1160
+ raise ImportError(
1161
+ "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
1162
+ )
1163
+
1164
+ target_pages = self._get_target_pages(pages) # Use helper to resolve pages
1165
+
1166
+ # --- Safety Check --- #
1167
+ if not force_overwrite:
1168
+ for page in target_pages:
1169
+ # Check if the element manager has been initialized and contains any elements
1170
+ if (
1171
+ hasattr(page, "_element_mgr")
1172
+ and page._element_mgr
1173
+ and page._element_mgr.has_elements()
1174
+ ):
1175
+ raise ValueError(
1176
+ f"Page {page.number} contains existing elements (text, OCR, etc.). "
1177
+ f"Deskewing creates an image-only PDF, discarding these elements. "
1178
+ f"Set force_overwrite=True to proceed."
1179
+ )
1180
+
1181
+ # --- Process Pages --- #
1182
+ deskewed_images_bytes = []
1183
+ logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
1184
+
1185
+ # Use tqdm via get_tqdm
1186
+ for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
1187
+ try:
1188
+ # Use page.deskew to get the corrected PIL image
1189
+ # Pass down resolutions and kwargs
1190
+ deskewed_img = page.deskew(
1191
+ resolution=resolution,
1192
+ angle=None, # Let page.deskew handle detection/caching
1193
+ detection_resolution=detection_resolution,
1194
+ **deskew_kwargs,
1195
+ )
1196
+
1197
+ if not deskewed_img:
1198
+ logger.warning(
1199
+ f"Page {page.number}: Failed to generate deskewed image, skipping."
1200
+ )
1201
+ continue
1202
+
1203
+ # Convert image to bytes for img2pdf (use PNG for lossless quality)
1204
+ with io.BytesIO() as buf:
1205
+ deskewed_img.save(buf, format="PNG")
1206
+ deskewed_images_bytes.append(buf.getvalue())
1207
+
1208
+ except Exception as e:
1209
+ logger.error(
1210
+ f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
1211
+ )
1212
+ # Option: Raise a runtime error, or continue and skip the page?
1213
+ # Raising makes the whole operation fail if one page fails.
1214
+ raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
1215
+
1216
+ # --- Create PDF --- #
1217
+ if not deskewed_images_bytes:
1218
+ raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
1219
+
1220
+ logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
1221
+ try:
1222
+ # Use img2pdf to combine image bytes into PDF bytes
1223
+ pdf_bytes = img2pdf.convert(deskewed_images_bytes)
1224
+
1225
+ # Wrap bytes in a stream
1226
+ pdf_stream = io.BytesIO(pdf_bytes)
1227
+
1228
+ # Create a new PDF object from the stream using original config
1229
+ logger.info("Creating new PDF object from deskewed stream...")
1230
+ new_pdf = PDF(
1231
+ pdf_stream,
1232
+ reading_order=self._reading_order,
1233
+ font_attrs=self._font_attrs,
1234
+ keep_spaces=self._config.get("keep_spaces", True),
1235
+ )
1236
+ return new_pdf
1237
+ except Exception as e:
1238
+ logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
1239
+ raise IOError("Failed to create deskewed PDF object from image stream.") from e
1240
+
1241
+ # --- End Deskew Method --- #
1242
+
928
1243
  # --- Classification Methods --- #
929
1244
 
930
1245
  def classify_pages(
931
1246
  self,
932
- categories: List[str],
1247
+ labels: List[str],
933
1248
  model: Optional[str] = None,
934
1249
  pages: Optional[Union[Iterable[int], range, slice]] = None,
935
1250
  analysis_key: str = "classification",
@@ -940,7 +1255,7 @@ class PDF(ExtractionMixin):
940
1255
  Classifies specified pages of the PDF.
941
1256
 
942
1257
  Args:
943
- categories: List of category names
1258
+ labels: List of category names
944
1259
  model: Model identifier ('text', 'vision', or specific HF ID)
945
1260
  pages: Page indices, slice, or None for all pages
946
1261
  analysis_key: Key to store results in page's analyses dict
@@ -950,23 +1265,24 @@ class PDF(ExtractionMixin):
950
1265
  Returns:
951
1266
  Self for method chaining
952
1267
  """
953
- if not categories:
954
- raise ValueError("Categories list cannot be empty.")
1268
+ if not labels:
1269
+ raise ValueError("Labels list cannot be empty.")
955
1270
 
956
1271
  try:
957
- manager = self.get_manager('classification')
1272
+ manager = self.get_manager("classification")
958
1273
  except (ValueError, RuntimeError) as e:
959
1274
  raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
960
1275
 
961
1276
  if not manager or not manager.is_available():
962
1277
  try:
963
1278
  from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
1279
+
964
1280
  if not _CLASSIFICATION_AVAILABLE:
965
1281
  raise ImportError("Classification dependencies missing.")
966
1282
  except ImportError:
967
1283
  raise ImportError(
968
1284
  "Classification dependencies missing. "
969
- "Install with: pip install \"natural-pdf[classification]\""
1285
+ 'Install with: pip install "natural-pdf[classification]"'
970
1286
  )
971
1287
  raise ClassificationError("ClassificationManager not available.")
972
1288
 
@@ -990,12 +1306,14 @@ class PDF(ExtractionMixin):
990
1306
  return self
991
1307
 
992
1308
  inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
993
- logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
1309
+ logger.info(
1310
+ f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
1311
+ )
994
1312
 
995
1313
  page_contents = []
996
1314
  pages_to_classify = []
997
1315
  logger.debug(f"Gathering content for {len(target_pages)} pages...")
998
-
1316
+
999
1317
  for page in target_pages:
1000
1318
  try:
1001
1319
  content = page._get_classification_content(model_type=inferred_using, **kwargs)
@@ -1009,13 +1327,13 @@ class PDF(ExtractionMixin):
1009
1327
  if not page_contents:
1010
1328
  logger.warning("No content could be gathered for batch classification.")
1011
1329
  return self
1012
-
1330
+
1013
1331
  logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
1014
1332
 
1015
1333
  try:
1016
1334
  batch_results = manager.classify_batch(
1017
1335
  item_contents=page_contents,
1018
- categories=categories,
1336
+ labels=labels,
1019
1337
  model_id=model,
1020
1338
  using=inferred_using,
1021
1339
  **kwargs,
@@ -1025,17 +1343,23 @@ class PDF(ExtractionMixin):
1025
1343
  raise ClassificationError(f"Batch classification failed: {e}") from e
1026
1344
 
1027
1345
  if len(batch_results) != len(pages_to_classify):
1028
- logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
1346
+ logger.error(
1347
+ f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
1348
+ )
1029
1349
  return self
1030
1350
 
1031
- logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
1351
+ logger.debug(
1352
+ f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
1353
+ )
1032
1354
  for page, result_obj in zip(pages_to_classify, batch_results):
1033
1355
  try:
1034
- if not hasattr(page, 'analyses') or page.analyses is None:
1356
+ if not hasattr(page, "analyses") or page.analyses is None:
1035
1357
  page.analyses = {}
1036
1358
  page.analyses[analysis_key] = result_obj
1037
1359
  except Exception as e:
1038
- logger.warning(f"Failed to store classification results for page {page.number}: {e}")
1360
+ logger.warning(
1361
+ f"Failed to store classification results for page {page.number}: {e}"
1362
+ )
1039
1363
 
1040
1364
  logger.info(f"Finished classifying PDF pages.")
1041
1365
  return self
@@ -1043,7 +1367,7 @@ class PDF(ExtractionMixin):
1043
1367
  # --- End Classification Methods --- #
1044
1368
 
1045
1369
  # --- Extraction Support --- #
1046
- def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
1370
+ def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
1047
1371
  """
1048
1372
  Retrieves the content for the entire PDF.
1049
1373
 
@@ -1056,28 +1380,28 @@ class PDF(ExtractionMixin):
1056
1380
  List[PIL.Image.Image]: List of page images if using='vision'
1057
1381
  None: If content cannot be retrieved
1058
1382
  """
1059
- if using == 'text':
1383
+ if using == "text":
1060
1384
  try:
1061
- layout = kwargs.pop('layout', True)
1385
+ layout = kwargs.pop("layout", True)
1062
1386
  return self.extract_text(layout=layout, **kwargs)
1063
1387
  except Exception as e:
1064
1388
  logger.error(f"Error extracting text from PDF: {e}")
1065
1389
  return None
1066
- elif using == 'vision':
1390
+ elif using == "vision":
1067
1391
  page_images = []
1068
1392
  logger.info(f"Rendering {len(self.pages)} pages to images...")
1069
-
1070
- resolution = kwargs.pop('resolution', 72)
1071
- include_highlights = kwargs.pop('include_highlights', False)
1072
- labels = kwargs.pop('labels', False)
1073
-
1393
+
1394
+ resolution = kwargs.pop("resolution", 72)
1395
+ include_highlights = kwargs.pop("include_highlights", False)
1396
+ labels = kwargs.pop("labels", False)
1397
+
1074
1398
  try:
1075
1399
  for page in tqdm(self.pages, desc="Rendering Pages"):
1076
1400
  img = page.to_image(
1077
1401
  resolution=resolution,
1078
1402
  include_highlights=include_highlights,
1079
1403
  labels=labels,
1080
- **kwargs
1404
+ **kwargs,
1081
1405
  )
1082
1406
  if img:
1083
1407
  page_images.append(img)
@@ -1093,4 +1417,179 @@ class PDF(ExtractionMixin):
1093
1417
  else:
1094
1418
  logger.error(f"Unsupported value for 'using': {using}")
1095
1419
  return None
1420
+
1096
1421
  # --- End Extraction Support --- #
1422
+
1423
+ def _gather_analysis_data(
1424
+ self,
1425
+ analysis_keys: List[str],
1426
+ include_content: bool,
1427
+ include_images: bool,
1428
+ image_dir: Optional[Path],
1429
+ image_format: str,
1430
+ image_resolution: int,
1431
+ ) -> List[Dict[str, Any]]:
1432
+ """
1433
+ Gather analysis data from all pages in the PDF.
1434
+
1435
+ Args:
1436
+ analysis_keys: Keys in the analyses dictionary to export
1437
+ include_content: Whether to include extracted text
1438
+ include_images: Whether to export images
1439
+ image_dir: Directory to save images
1440
+ image_format: Format to save images
1441
+ image_resolution: Resolution for exported images
1442
+
1443
+ Returns:
1444
+ List of dictionaries containing analysis data
1445
+ """
1446
+ if not hasattr(self, "_pages") or not self._pages:
1447
+ logger.warning(f"No pages found in PDF {self.path}")
1448
+ return []
1449
+
1450
+ all_data = []
1451
+
1452
+ for page in tqdm(self._pages, desc="Gathering page data", leave=False):
1453
+ # Basic page information
1454
+ page_data = {
1455
+ "pdf_path": self.path,
1456
+ "page_number": page.number,
1457
+ "page_index": page.index,
1458
+ }
1459
+
1460
+ # Include extracted text if requested
1461
+ if include_content:
1462
+ try:
1463
+ page_data["content"] = page.extract_text(preserve_whitespace=True)
1464
+ except Exception as e:
1465
+ logger.error(f"Error extracting text from page {page.number}: {e}")
1466
+ page_data["content"] = ""
1467
+
1468
+ # Save image if requested
1469
+ if include_images:
1470
+ try:
1471
+ # Create image filename
1472
+ image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
1473
+ image_path = image_dir / image_filename
1474
+
1475
+ # Save image
1476
+ page.save_image(
1477
+ str(image_path), resolution=image_resolution, include_highlights=True
1478
+ )
1479
+
1480
+ # Add relative path to data
1481
+ page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
1482
+ except Exception as e:
1483
+ logger.error(f"Error saving image for page {page.number}: {e}")
1484
+ page_data["image_path"] = None
1485
+
1486
+ # Add analyses data
1487
+ for key in analysis_keys:
1488
+ if not hasattr(page, "analyses") or not page.analyses:
1489
+ raise ValueError(f"Page {page.number} does not have analyses data")
1490
+
1491
+ if key not in page.analyses:
1492
+ raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
1493
+
1494
+ # Get the analysis result
1495
+ analysis_result = page.analyses[key]
1496
+
1497
+ # If the result has a to_dict method, use it
1498
+ if hasattr(analysis_result, "to_dict"):
1499
+ analysis_data = analysis_result.to_dict()
1500
+ else:
1501
+ # Otherwise, use the result directly if it's dict-like
1502
+ try:
1503
+ analysis_data = dict(analysis_result)
1504
+ except (TypeError, ValueError):
1505
+ # Last resort: convert to string
1506
+ analysis_data = {"raw_result": str(analysis_result)}
1507
+
1508
+ # Add analysis data to page data with the key as prefix
1509
+ for k, v in analysis_data.items():
1510
+ page_data[f"{key}.{k}"] = v
1511
+
1512
+ all_data.append(page_data)
1513
+
1514
+ return all_data
1515
+
1516
+ def _get_target_pages(
1517
+ self, pages: Optional[Union[Iterable[int], range, slice]] = None
1518
+ ) -> List["Page"]:
1519
+ """
1520
+ Helper method to get a list of Page objects based on the input pages.
1521
+
1522
+ Args:
1523
+ pages: Page indices, slice, or None for all pages
1524
+
1525
+ Returns:
1526
+ List of Page objects
1527
+ """
1528
+ if pages is None:
1529
+ return self._pages
1530
+ elif isinstance(pages, slice):
1531
+ return self._pages[pages]
1532
+ elif hasattr(pages, "__iter__"):
1533
+ try:
1534
+ return [self._pages[i] for i in pages]
1535
+ except IndexError:
1536
+ raise ValueError("Invalid page index provided in 'pages' iterable.")
1537
+ except TypeError:
1538
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1539
+ else:
1540
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1541
+
1542
+ # --- Classification Mixin Implementation --- #
1543
+
1544
+ def _get_classification_manager(self) -> "ClassificationManager":
1545
+ """Returns the ClassificationManager instance for this PDF."""
1546
+ try:
1547
+ return self.get_manager("classification")
1548
+ except (KeyError, RuntimeError) as e:
1549
+ raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
1550
+
1551
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
1552
+ """
1553
+ Provides the content for classifying the entire PDF.
1554
+
1555
+ Args:
1556
+ model_type: 'text' or 'vision'.
1557
+ **kwargs: Additional arguments (e.g., for text extraction or image rendering).
1558
+
1559
+ Returns:
1560
+ Extracted text (str) or the first page's image (PIL.Image).
1561
+
1562
+ Raises:
1563
+ ValueError: If model_type is 'vision' and PDF has != 1 page,
1564
+ or if model_type is unsupported, or if content cannot be generated.
1565
+ """
1566
+ if model_type == "text":
1567
+ try:
1568
+ # Extract text from the whole document
1569
+ text = self.extract_text(**kwargs) # Pass relevant kwargs
1570
+ if not text or text.isspace():
1571
+ raise ValueError("PDF contains no extractable text for classification.")
1572
+ return text
1573
+ except Exception as e:
1574
+ logger.error(f"Error extracting text for PDF classification: {e}")
1575
+ raise ValueError("Failed to extract text for classification.") from e
1576
+
1577
+ elif model_type == "vision":
1578
+ if len(self.pages) == 1:
1579
+ # Use the single page's content method
1580
+ try:
1581
+ return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
1582
+ except Exception as e:
1583
+ logger.error(f"Error getting image from single page for classification: {e}")
1584
+ raise ValueError("Failed to get image from single page.") from e
1585
+ elif len(self.pages) == 0:
1586
+ raise ValueError("Cannot classify empty PDF using vision model.")
1587
+ else:
1588
+ raise ValueError(
1589
+ f"Vision classification for a PDF object is only supported for single-page PDFs. "
1590
+ f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
1591
+ )
1592
+ else:
1593
+ raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
1594
+
1595
+ # --- End Classification Mixin Implementation ---