natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -1,62 +1,86 @@
1
- import pdfplumber
1
+ import copy # Add import for deepcopy
2
2
  import logging
3
- import tempfile
4
3
  import os
5
4
  import re
5
+ import tempfile
6
6
  import urllib.request
7
- from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable, TYPE_CHECKING # Added Iterable and TYPE_CHECKING
8
- from pathlib import Path # Added Path
9
- import copy # Add import for deepcopy
7
+ from pathlib import Path # Added Path
8
+ from typing import ( # Added Iterable and TYPE_CHECKING
9
+ TYPE_CHECKING,
10
+ Any,
11
+ Callable,
12
+ Dict,
13
+ Iterable,
14
+ List,
15
+ Optional,
16
+ Tuple,
17
+ Type,
18
+ Union,
19
+ )
20
+
21
+ import pdfplumber
10
22
  from PIL import Image
11
23
 
24
+ from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
25
+ LayoutManager,
26
+ )
27
+ from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
12
28
  from natural_pdf.core.page import Page
13
- from natural_pdf.selectors.parser import parse_selector
14
29
  from natural_pdf.elements.collections import ElementCollection
15
30
  from natural_pdf.elements.region import Region
16
31
  from natural_pdf.ocr import OCRManager, OCROptions
17
- from natural_pdf.analyzers.layout.layout_manager import LayoutManager # Import the new LayoutManager
18
- from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
32
+ from natural_pdf.selectors.parser import parse_selector
19
33
 
20
34
  # Import the flag directly - this should always work
21
35
 
22
36
  # --- Add Search Service Imports (needed for new methods) ---
23
37
  try:
38
+ from typing import Any as TypingAny # Import Any if not already
39
+
40
+ from natural_pdf.search import TextSearchOptions # Keep for ask default
24
41
  from natural_pdf.search import (
25
- get_search_service,
26
- SearchServiceProtocol,
42
+ BaseSearchOptions,
27
43
  SearchOptions,
28
- TextSearchOptions, # Keep for ask default
29
- BaseSearchOptions
44
+ SearchServiceProtocol,
45
+ get_search_service,
30
46
  )
31
- from typing import Any as TypingAny # Import Any if not already
32
47
  except ImportError:
33
48
  # Define dummies if needed for type hints within the class
34
49
  SearchServiceProtocol = object
35
50
  SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
36
51
  TypingAny = object
52
+
37
53
  # Dummy factory needed for default arg in methods
38
54
  def get_search_service(**kwargs) -> SearchServiceProtocol:
39
- raise ImportError("Search dependencies are not installed. Install with: pip install natural-pdf[search]")
55
+ raise ImportError(
56
+ "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
57
+ )
58
+
40
59
 
41
60
  # --- End Search Service Imports ---
42
61
 
43
62
  # Set up logger early
44
63
  logger = logging.getLogger("natural_pdf.core.pdf")
45
64
 
65
+
46
66
  class PDF:
47
67
  """
48
68
  Enhanced PDF wrapper built on top of pdfplumber.
49
-
69
+
50
70
  This class provides a fluent interface for working with PDF documents,
51
71
  with improved selection, navigation, and extraction capabilities.
52
72
  """
53
-
54
- def __init__(self, path_or_url: str, reading_order: bool = True,
55
- font_attrs: Optional[List[str]] = None,
56
- keep_spaces: bool = True):
73
+
74
+ def __init__(
75
+ self,
76
+ path_or_url: str,
77
+ reading_order: bool = True,
78
+ font_attrs: Optional[List[str]] = None,
79
+ keep_spaces: bool = True,
80
+ ):
57
81
  """
58
82
  Initialize the enhanced PDF object.
59
-
83
+
60
84
  Args:
61
85
  path_or_url: Path to the PDF file or a URL to a PDF
62
86
  reading_order: Whether to use natural reading order
@@ -69,30 +93,30 @@ class PDF:
69
93
  False: Break text at spaces, each word is separate (legacy behavior)
70
94
  """
71
95
  # Check if the input is a URL
72
- is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
73
-
96
+ is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
97
+
74
98
  # Initialize path-related attributes
75
99
  self._original_path = path_or_url
76
100
  self._temp_file = None
77
- self._resolved_path = None # Store the actual path used by pdfplumber
101
+ self._resolved_path = None # Store the actual path used by pdfplumber
78
102
 
79
103
  if is_url:
80
104
  logger.info(f"Downloading PDF from URL: {path_or_url}")
81
105
  try:
82
106
  # Create a temporary file to store the downloaded PDF
83
- self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
84
-
107
+ self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
108
+
85
109
  # Download the PDF
86
110
  with urllib.request.urlopen(path_or_url) as response:
87
111
  self._temp_file.write(response.read())
88
112
  self._temp_file.flush()
89
113
  self._temp_file.close()
90
-
114
+
91
115
  # Use the temporary file path
92
116
  self._resolved_path = self._temp_file.name
93
117
  logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
94
118
  except Exception as e:
95
- if self._temp_file and hasattr(self._temp_file, 'name'):
119
+ if self._temp_file and hasattr(self._temp_file, "name"):
96
120
  try:
97
121
  os.unlink(self._temp_file.name)
98
122
  except:
@@ -104,40 +128,46 @@ class PDF:
104
128
  self._resolved_path = path_or_url
105
129
 
106
130
  logger.info(f"Initializing PDF from {self._resolved_path}")
107
- logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
108
-
131
+ logger.debug(
132
+ f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
133
+ )
134
+
109
135
  try:
110
136
  self._pdf = pdfplumber.open(self._resolved_path)
111
137
  except Exception as e:
112
- logger.error(f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}", exc_info=True)
113
- # Clean up temp file if creation failed
114
- self.close()
115
- raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
138
+ logger.error(
139
+ f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
140
+ exc_info=True,
141
+ )
142
+ # Clean up temp file if creation failed
143
+ self.close()
144
+ raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
116
145
 
117
- self._path = self._resolved_path # Keep original path too?
118
- self.path = self._resolved_path # Public attribute for the resolved path
119
- self.source_path = self._original_path # Public attribute for the user-provided path/URL
146
+ self._path = self._resolved_path # Keep original path too?
147
+ self.path = self._resolved_path # Public attribute for the resolved path
148
+ self.source_path = self._original_path # Public attribute for the user-provided path/URL
120
149
 
121
150
  self._reading_order = reading_order
122
- self._config = {
123
- 'keep_spaces': keep_spaces
124
- }
151
+ self._config = {"keep_spaces": keep_spaces}
125
152
 
126
153
  self._font_attrs = font_attrs # Store the font attribute configuration
127
154
 
128
155
  # Initialize Managers and Services (conditionally available)
129
156
  self._ocr_manager = OCRManager() if OCRManager else None
130
157
  self._layout_manager = LayoutManager() if LayoutManager else None
131
- self.highlighter = HighlightingService(self)
158
+ self.highlighter = HighlightingService(self)
132
159
 
133
160
  # Initialize pages last, passing necessary refs
134
- self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
161
+ self._pages = [
162
+ Page(p, parent=self, index=i, font_attrs=font_attrs)
163
+ for i, p in enumerate(self._pdf.pages)
164
+ ]
135
165
 
136
166
  # Other state
137
167
  self._element_cache = {}
138
168
  self._exclusions = [] # List to store exclusion functions/regions
139
169
  self._regions = [] # List to store region functions/definitions
140
-
170
+
141
171
  logger.info("Initialized HighlightingService.")
142
172
  logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
143
173
 
@@ -147,45 +177,48 @@ class PDF:
147
177
  return self._pdf.metadata
148
178
 
149
179
  @property
150
- def pages(self) -> 'PageCollection':
180
+ def pages(self) -> "PageCollection":
151
181
  """Access pages as a PageCollection object."""
152
182
  from natural_pdf.elements.collections import PageCollection
183
+
153
184
  # Ensure _pages is initialized
154
- if not hasattr(self, '_pages'):
155
- raise AttributeError("PDF pages not yet initialized.")
185
+ if not hasattr(self, "_pages"):
186
+ raise AttributeError("PDF pages not yet initialized.")
156
187
  return PageCollection(self._pages)
157
-
158
- def clear_exclusions(self) -> 'PDF':
188
+
189
+ def clear_exclusions(self) -> "PDF":
159
190
  """
160
191
  Clear all exclusion functions from the PDF.
161
-
192
+
162
193
  Returns:
163
194
  Self for method chaining
164
195
  """
165
196
  # Ensure _pages is initialized
166
- if not hasattr(self, '_pages'):
167
- raise AttributeError("PDF pages not yet initialized.")
197
+ if not hasattr(self, "_pages"):
198
+ raise AttributeError("PDF pages not yet initialized.")
168
199
 
169
200
  self._exclusions = []
170
201
  # Also clear from pages
171
202
  for page in self._pages:
172
203
  page.clear_exclusions()
173
204
  return self
174
-
175
- def add_exclusion(self, exclusion_func: Callable[['Page'], Optional[Region]], label: str = None) -> 'PDF':
205
+
206
+ def add_exclusion(
207
+ self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
208
+ ) -> "PDF":
176
209
  """
177
210
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
178
-
211
+
179
212
  Args:
180
213
  exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
181
214
  label: Optional label for this exclusion
182
-
215
+
183
216
  Returns:
184
217
  Self for method chaining
185
218
  """
186
219
  # Ensure _pages is initialized
187
- if not hasattr(self, '_pages'):
188
- raise AttributeError("PDF pages not yet initialized.")
220
+ if not hasattr(self, "_pages"):
221
+ raise AttributeError("PDF pages not yet initialized.")
189
222
 
190
223
  # Store exclusion with its label at PDF level
191
224
  exclusion_data = (exclusion_func, label)
@@ -198,16 +231,16 @@ class PDF:
198
231
 
199
232
  return self
200
233
 
201
- def apply_ocr_to_pages(
234
+ def apply_ocr(
202
235
  self,
203
236
  pages: Optional[Union[Iterable[int], range, slice]] = None,
204
237
  engine: Optional[str] = None,
205
- options: Optional['OCROptions'] = None,
238
+ options: Optional["OCROptions"] = None,
206
239
  languages: Optional[List[str]] = None,
207
240
  min_confidence: Optional[float] = None,
208
241
  device: Optional[str] = None,
209
242
  # Add other simple mode args if needed
210
- ) -> 'PDF':
243
+ ) -> "PDF":
211
244
  """
212
245
  Applies OCR to specified pages (or all pages) of the PDF using batch processing.
213
246
 
@@ -234,9 +267,9 @@ class PDF:
234
267
  RuntimeError: If the OCRManager or selected engine is not available.
235
268
  """
236
269
  if not self._ocr_manager:
237
- logger.error("OCRManager not available. Cannot apply OCR.")
238
- # Or raise RuntimeError("OCRManager not initialized.")
239
- return self
270
+ logger.error("OCRManager not available. Cannot apply OCR.")
271
+ # Or raise RuntimeError("OCRManager not initialized.")
272
+ return self
240
273
 
241
274
  # --- Determine Target Pages ---
242
275
  target_pages: List[Page] = []
@@ -244,15 +277,17 @@ class PDF:
244
277
  target_pages = self._pages
245
278
  elif isinstance(pages, slice):
246
279
  target_pages = self._pages[pages]
247
- elif hasattr(pages, '__iter__'): # Check if it's iterable (list, range, tuple, etc.)
280
+ elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
248
281
  try:
249
282
  target_pages = [self._pages[i] for i in pages]
250
283
  except IndexError:
251
284
  raise ValueError("Invalid page index provided in 'pages' iterable.")
252
285
  except TypeError:
253
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
286
+ raise TypeError(
287
+ "'pages' must be None, a slice, or an iterable of page indices (int)."
288
+ )
254
289
  else:
255
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
290
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
256
291
 
257
292
  if not target_pages:
258
293
  logger.warning("No pages selected for OCR processing.")
@@ -263,33 +298,36 @@ class PDF:
263
298
 
264
299
  # --- Render Images for Batch ---
265
300
  images_pil: List[Image.Image] = []
266
- page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
301
+ page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
267
302
  logger.info(f"Rendering {len(target_pages)} pages to images...")
268
- failed_page_num = 'unknown' # Keep track of potentially failing page
303
+ failed_page_num = "unknown" # Keep track of potentially failing page
269
304
  try:
270
- ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
305
+ ocr_scale = getattr(self, "_config", {}).get("ocr_image_scale", 2.0)
271
306
  for i, page in enumerate(target_pages):
272
- failed_page_num = page.number # Update current page number in case of error
307
+ failed_page_num = page.number # Update current page number in case of error
273
308
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
274
309
  # Use page.to_image but ensure highlights are off for OCR base image
275
310
  img = page.to_image(scale=ocr_scale, include_highlights=False)
276
311
  images_pil.append(img)
277
- page_image_map.append((page, img)) # Store pair
312
+ page_image_map.append((page, img)) # Store pair
278
313
  except Exception as e:
279
314
  logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
280
315
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
281
316
 
282
317
  if not images_pil:
283
- logger.error("No images were successfully rendered for batch OCR.")
284
- return self
318
+ logger.error("No images were successfully rendered for batch OCR.")
319
+ return self
285
320
 
286
321
  # --- Prepare Arguments for Manager ---
287
- manager_args = {'images': images_pil, 'options': options, 'engine': engine}
322
+ manager_args = {"images": images_pil, "options": options, "engine": engine}
288
323
  simple_args = {}
289
- if languages is not None: simple_args['languages'] = languages
290
- if min_confidence is not None: simple_args['min_confidence'] = min_confidence
291
- if device is not None: simple_args['device'] = device
292
- manager_args.update(simple_args) # Add simple args if options not provided
324
+ if languages is not None:
325
+ simple_args["languages"] = languages
326
+ if min_confidence is not None:
327
+ simple_args["min_confidence"] = min_confidence
328
+ if device is not None:
329
+ simple_args["device"] = device
330
+ manager_args.update(simple_args) # Add simple args if options not provided
293
331
 
294
332
  # --- Call OCR Manager for Batch Processing ---
295
333
  logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
@@ -298,17 +336,19 @@ class PDF:
298
336
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
299
337
 
300
338
  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
301
- logger.error(f"OCR Manager returned unexpected result format or length for batch processing. "
302
- f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
303
- f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}.")
339
+ logger.error(
340
+ f"OCR Manager returned unexpected result format or length for batch processing. "
341
+ f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
342
+ f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
343
+ )
304
344
  # Handle error - maybe return early or try processing valid parts?
305
- return self # Return self without adding elements
345
+ return self # Return self without adding elements
306
346
 
307
347
  logger.info("OCR Manager batch processing complete.")
308
348
 
309
349
  except Exception as e:
310
- logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
311
- return self # Return self without adding elements
350
+ logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
351
+ return self # Return self without adding elements
312
352
 
313
353
  # --- Distribute Results and Add Elements to Pages ---
314
354
  logger.info("Adding OCR results to respective pages...")
@@ -316,45 +356,55 @@ class PDF:
316
356
  for i, (page, img) in enumerate(page_image_map):
317
357
  results_for_page = batch_results[i]
318
358
  if not isinstance(results_for_page, list):
319
- logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
320
- continue
359
+ logger.warning(
360
+ f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
361
+ )
362
+ continue
321
363
 
322
364
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
323
365
  # Use the page's element manager to create elements from its results
324
366
  # Changed from page._create_text_elements_from_ocr to use element_mgr
325
367
  try:
326
368
  # Calculate scale factors based on rendered image vs page dims
327
- img_scale_x = page.width / img.width if img.width > 0 else 1
328
- img_scale_y = page.height / img.height if img.height > 0 else 1
329
- elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img_scale_x, img_scale_y)
330
-
331
- if elements:
332
- # Note: element_mgr.create_text_elements_from_ocr already adds them
333
- total_elements_added += len(elements)
334
- logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
335
- else:
336
- logger.debug(f" No valid TextElements created for page {page.number}.")
369
+ img_scale_x = page.width / img.width if img.width > 0 else 1
370
+ img_scale_y = page.height / img.height if img.height > 0 else 1
371
+ elements = page._element_mgr.create_text_elements_from_ocr(
372
+ results_for_page, img_scale_x, img_scale_y
373
+ )
374
+
375
+ if elements:
376
+ # Note: element_mgr.create_text_elements_from_ocr already adds them
377
+ total_elements_added += len(elements)
378
+ logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
379
+ else:
380
+ logger.debug(f" No valid TextElements created for page {page.number}.")
337
381
  except Exception as e:
338
- logger.error(f" Error adding OCR elements to page {page.number}: {e}", exc_info=True)
339
- # Continue to next page
382
+ logger.error(
383
+ f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
384
+ )
385
+ # Continue to next page
340
386
 
341
- logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
387
+ logger.info(
388
+ f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
389
+ )
342
390
  return self
343
-
344
- def add_region(self, region_func: Callable[['Page'], Optional[Region]], name: str = None) -> 'PDF':
391
+
392
+ def add_region(
393
+ self, region_func: Callable[["Page"], Optional[Region]], name: str = None
394
+ ) -> "PDF":
345
395
  """
346
396
  Add a region function to the PDF. This creates regions on all pages using the provided function.
347
-
397
+
348
398
  Args:
349
399
  region_func: A function that takes a Page and returns a Region, or None.
350
400
  name: Optional name for the region
351
-
401
+
352
402
  Returns:
353
403
  Self for method chaining
354
404
  """
355
405
  # Ensure _pages is initialized
356
- if not hasattr(self, '_pages'):
357
- raise AttributeError("PDF pages not yet initialized.")
406
+ if not hasattr(self, "_pages"):
407
+ raise AttributeError("PDF pages not yet initialized.")
358
408
 
359
409
  # Store region with its name at PDF level
360
410
  region_data = (region_func, name)
@@ -367,93 +417,108 @@ class PDF:
367
417
  region_instance = region_func(page)
368
418
  if region_instance and isinstance(region_instance, Region):
369
419
  # If a valid region is returned, add it to the page
370
- page.add_region(region_instance, name=name, source='named')
420
+ page.add_region(region_instance, name=name, source="named")
371
421
  elif region_instance is not None:
372
- logger.warning(f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}")
422
+ logger.warning(
423
+ f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
424
+ )
373
425
  except Exception as e:
374
- logger.error(f"Error executing or adding region function for page {page.number}: {e}", exc_info=True)
426
+ logger.error(
427
+ f"Error executing or adding region function for page {page.number}: {e}",
428
+ exc_info=True,
429
+ )
375
430
 
376
431
  return self
377
-
378
- def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Optional[Any]:
432
+
433
+ def find(
434
+ self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
435
+ ) -> Optional[Any]:
379
436
  """
380
437
  Find the first element matching the selector.
381
-
438
+
382
439
  Args:
383
440
  selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
384
441
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
385
442
  regex: Whether to use regex for text search in :contains (default: False)
386
443
  case: Whether to do case-sensitive text search (default: True)
387
444
  **kwargs: Additional filter parameters
388
-
445
+
389
446
  Returns:
390
447
  Element object or None if not found
391
448
  """
392
449
  # Ensure _pages is initialized
393
- if not hasattr(self, '_pages'):
394
- raise AttributeError("PDF pages not yet initialized.")
450
+ if not hasattr(self, "_pages"):
451
+ raise AttributeError("PDF pages not yet initialized.")
395
452
 
396
453
  selector_obj = parse_selector(selector)
397
-
454
+
398
455
  # Pass regex and case flags to selector function
399
- kwargs['regex'] = regex
400
- kwargs['case'] = case
401
-
402
- results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs)
456
+ kwargs["regex"] = regex
457
+ kwargs["case"] = case
458
+
459
+ results = self._apply_selector(
460
+ selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
461
+ )
403
462
  return results.first if results else None
404
-
405
- def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
463
+
464
+ def find_all(
465
+ self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
466
+ ) -> ElementCollection:
406
467
  """
407
468
  Find all elements matching the selector.
408
-
469
+
409
470
  Args:
410
471
  selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
411
472
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
412
473
  regex: Whether to use regex for text search in :contains (default: False)
413
474
  case: Whether to do case-sensitive text search (default: True)
414
475
  **kwargs: Additional filter parameters
415
-
476
+
416
477
  Returns:
417
478
  ElementCollection with matching elements
418
479
  """
419
480
  # Ensure _pages is initialized
420
- if not hasattr(self, '_pages'):
421
- raise AttributeError("PDF pages not yet initialized.")
481
+ if not hasattr(self, "_pages"):
482
+ raise AttributeError("PDF pages not yet initialized.")
422
483
 
423
484
  selector_obj = parse_selector(selector)
424
-
485
+
425
486
  # Pass regex and case flags to selector function
426
- kwargs['regex'] = regex
427
- kwargs['case'] = case
428
-
429
- results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs)
487
+ kwargs["regex"] = regex
488
+ kwargs["case"] = case
489
+
490
+ results = self._apply_selector(
491
+ selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
492
+ )
430
493
  return results
431
-
432
- def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs) -> ElementCollection:
494
+
495
+ def _apply_selector(
496
+ self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
497
+ ) -> ElementCollection:
433
498
  """
434
499
  Apply selector to PDF elements across all pages.
435
-
500
+
436
501
  Args:
437
502
  selector_obj: Parsed selector dictionary
438
503
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
439
504
  first_only: If True, stop searching after the first match is found.
440
505
  **kwargs: Additional filter parameters
441
-
506
+
442
507
  Returns:
443
508
  ElementCollection of matching elements
444
509
  """
445
510
  from natural_pdf.elements.collections import ElementCollection
446
-
511
+
447
512
  # Determine page range to search
448
- page_indices = kwargs.get('pages', range(len(self._pages)))
513
+ page_indices = kwargs.get("pages", range(len(self._pages)))
449
514
  if isinstance(page_indices, int):
450
515
  page_indices = [page_indices]
451
516
  elif isinstance(page_indices, slice):
452
517
  page_indices = range(*page_indices.indices(len(self._pages)))
453
518
 
454
519
  # Check for cross-page pseudo-classes (currently not supported)
455
- for pseudo in selector_obj.get('pseudo_classes', []):
456
- if pseudo.get('name') in ('spans', 'continues'):
520
+ for pseudo in selector_obj.get("pseudo_classes", []):
521
+ if pseudo.get("name") in ("spans", "continues"):
457
522
  logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
458
523
  return ElementCollection([])
459
524
 
@@ -464,141 +529,155 @@ class PDF:
464
529
  page = self._pages[page_idx]
465
530
  # Pass first_only down to page._apply_selector
466
531
  page_elements_collection = page._apply_selector(
467
- selector_obj,
468
- apply_exclusions=apply_exclusions,
469
- first_only=first_only,
470
- **kwargs
532
+ selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
471
533
  )
472
534
  if page_elements_collection:
473
- page_elements = page_elements_collection.elements
474
- all_elements.extend(page_elements)
475
- # If we only need the first match overall, and we found one on this page, stop
476
- if first_only and page_elements:
477
- break # Stop iterating through pages
535
+ page_elements = page_elements_collection.elements
536
+ all_elements.extend(page_elements)
537
+ # If we only need the first match overall, and we found one on this page, stop
538
+ if first_only and page_elements:
539
+ break # Stop iterating through pages
478
540
  else:
479
- logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
541
+ logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
480
542
 
481
543
  # Create a combined collection
482
544
  combined = ElementCollection(all_elements)
483
545
 
484
546
  # Sort in document order if requested and not first_only (already sorted by page)
485
- if not first_only and kwargs.get('document_order', True):
547
+ if not first_only and kwargs.get("document_order", True):
486
548
  # Check if elements have page, top, x0 before sorting
487
- if all(hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0') for el in combined.elements):
488
- combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
549
+ if all(
550
+ hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
551
+ for el in combined.elements
552
+ ):
553
+ combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
489
554
  else:
490
- # Elements might be Regions without inherent sorting order yet
491
- # Attempt sorting by page index if possible
492
- try:
493
- combined.sort(key=lambda el: el.page.index)
494
- except AttributeError:
495
- logger.warning("Cannot sort elements in document order: Missing required attributes (e.g., page).")
555
+ # Elements might be Regions without inherent sorting order yet
556
+ # Attempt sorting by page index if possible
557
+ try:
558
+ combined.sort(key=lambda el: el.page.index)
559
+ except AttributeError:
560
+ logger.warning(
561
+ "Cannot sort elements in document order: Missing required attributes (e.g., page)."
562
+ )
496
563
 
497
564
  return combined
498
-
499
- def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
500
- use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
565
+
566
+ def extract_text(
567
+ self,
568
+ selector: Optional[str] = None,
569
+ preserve_whitespace=True,
570
+ use_exclusions=True,
571
+ debug_exclusions=False,
572
+ **kwargs,
573
+ ) -> str:
501
574
  """
502
575
  Extract text from the entire document or matching elements.
503
-
576
+
504
577
  Args:
505
578
  selector: Optional selector to filter elements
506
579
  preserve_whitespace: Whether to keep blank characters (default: True)
507
580
  use_exclusions: Whether to apply exclusion regions (default: True)
508
581
  debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
509
582
  **kwargs: Additional extraction parameters
510
-
583
+
511
584
  Returns:
512
585
  Extracted text as string
513
586
  """
514
587
  # Ensure _pages is initialized
515
- if not hasattr(self, '_pages'):
516
- raise AttributeError("PDF pages not yet initialized.")
588
+ if not hasattr(self, "_pages"):
589
+ raise AttributeError("PDF pages not yet initialized.")
517
590
 
518
591
  # If selector is provided, find elements first
519
592
  if selector:
520
593
  elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
521
594
  return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
522
-
595
+
523
596
  # Otherwise extract from all pages
524
597
  if debug_exclusions:
525
598
  print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
526
599
  print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
527
-
600
+
528
601
  texts = []
529
602
  for page in self.pages:
530
- texts.append(page.extract_text(
531
- preserve_whitespace=preserve_whitespace,
532
- use_exclusions=use_exclusions,
533
- debug_exclusions=debug_exclusions,
534
- **kwargs
535
- ))
536
-
603
+ texts.append(
604
+ page.extract_text(
605
+ preserve_whitespace=preserve_whitespace,
606
+ use_exclusions=use_exclusions,
607
+ debug_exclusions=debug_exclusions,
608
+ **kwargs,
609
+ )
610
+ )
611
+
537
612
  if debug_exclusions:
538
613
  print(f"PDF: Combined {len(texts)} pages of text")
539
-
614
+
540
615
  return "\n".join(texts)
541
-
616
+
542
617
  def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
543
618
  """
544
619
  Shorthand for finding elements and extracting their text.
545
-
620
+
546
621
  Args:
547
622
  selector: CSS-like selector string
548
623
  preserve_whitespace: Whether to keep blank characters (default: True)
549
624
  **kwargs: Additional extraction parameters
550
-
625
+
551
626
  Returns:
552
627
  Extracted text from matching elements
553
628
  """
554
629
  # Ensure _pages is initialized
555
- if not hasattr(self, '_pages'):
556
- raise AttributeError("PDF pages not yet initialized.")
557
- return self.extract_text(selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs) # apply_exclusions is handled by find_all in extract_text
558
-
559
- def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
630
+ if not hasattr(self, "_pages"):
631
+ raise AttributeError("PDF pages not yet initialized.")
632
+ return self.extract_text(
633
+ selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
634
+ ) # apply_exclusions is handled by find_all in extract_text
635
+
636
+ def extract_tables(
637
+ self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
638
+ ) -> List[Any]:
560
639
  """
561
640
  Extract tables from the document or matching elements.
562
-
641
+
563
642
  Args:
564
643
  selector: Optional selector to filter tables
565
644
  merge_across_pages: Whether to merge tables that span across pages
566
645
  **kwargs: Additional extraction parameters
567
-
646
+
568
647
  Returns:
569
648
  List of extracted tables
570
649
  """
571
650
  # Ensure _pages is initialized
572
- if not hasattr(self, '_pages'):
573
- raise AttributeError("PDF pages not yet initialized.")
651
+ if not hasattr(self, "_pages"):
652
+ raise AttributeError("PDF pages not yet initialized.")
574
653
  # TODO: Implement table extraction
575
654
  logger.warning("PDF.extract_tables is not fully implemented yet.")
576
655
  all_tables = []
577
656
  for page in self.pages:
578
- # Assuming page.extract_tables(**kwargs) exists or is added
579
- if hasattr(page, 'extract_tables'):
580
- all_tables.extend(page.extract_tables(**kwargs))
581
- else:
582
- logger.debug(f"Page {page.number} does not have extract_tables method.")
657
+ # Assuming page.extract_tables(**kwargs) exists or is added
658
+ if hasattr(page, "extract_tables"):
659
+ all_tables.extend(page.extract_tables(**kwargs))
660
+ else:
661
+ logger.debug(f"Page {page.number} does not have extract_tables method.")
583
662
  # Placeholder filtering
584
663
  if selector:
585
664
  logger.warning("Filtering extracted tables by selector is not implemented.")
586
665
  # Would need to parse selector and filter the list `all_tables`
587
666
  # Placeholder merging
588
667
  if merge_across_pages:
589
- logger.warning("Merging tables across pages is not implemented.")
590
- # Would need logic to detect and merge related tables
668
+ logger.warning("Merging tables across pages is not implemented.")
669
+ # Would need logic to detect and merge related tables
591
670
  return all_tables
592
-
671
+
593
672
  # --- New Method: save_searchable ---
594
- def save_searchable(self, output_path: Union[str, 'Path'], dpi: int = 300, **kwargs):
673
+ def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
595
674
  """
596
675
  Saves the PDF with an OCR text layer, making content searchable.
597
676
 
598
677
  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
599
678
 
600
679
  Note: OCR must have been applied to the pages beforehand
601
- (e.g., using pdf.apply_ocr_to_pages()).
680
+ (e.g., using pdf.apply_ocr()).
602
681
 
603
682
  Args:
604
683
  output_path: Path to save the searchable PDF.
@@ -608,15 +687,6 @@ class PDF:
608
687
  # Import moved here, assuming it's always available now
609
688
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
610
689
 
611
- # TODO: Need a reliable way for Page to signal if it has OCR elements.
612
- # This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
613
- # or checking if page.get_elements(source='ocr') returns anything.
614
- # For now, we pass through and let the exporter handle pages without OCR elements.
615
- # if not any(page.get_elements(source='ocr') for page in self.pages):
616
- # logger.warning("No OCR elements found on pages. "
617
- # "Ensure apply_ocr_to_pages() was called. "
618
- # "Output PDF might not be searchable.")
619
-
620
690
  # Convert pathlib.Path to string if necessary
621
691
  output_path_str = str(output_path)
622
692
 
@@ -625,15 +695,18 @@ class PDF:
625
695
 
626
696
  # --- End New Method ---
627
697
 
628
- def ask(self, question: str,
629
- mode: str = "extractive",
630
- pages: Union[int, List[int], range] = None,
631
- min_confidence: float = 0.1,
632
- model: str = None,
633
- **kwargs) -> Dict[str, Any]:
698
+ def ask(
699
+ self,
700
+ question: str,
701
+ mode: str = "extractive",
702
+ pages: Union[int, List[int], range] = None,
703
+ min_confidence: float = 0.1,
704
+ model: str = None,
705
+ **kwargs,
706
+ ) -> Dict[str, Any]:
634
707
  """
635
708
  Ask a question about the document content.
636
-
709
+
637
710
  Args:
638
711
  question: Question to ask about the document
639
712
  mode: "extractive" to extract answer from document, "generative" to generate
@@ -641,16 +714,16 @@ class PDF:
641
714
  min_confidence: Minimum confidence threshold for answers
642
715
  model: Optional model name for question answering
643
716
  **kwargs: Additional parameters passed to the QA engine
644
-
717
+
645
718
  Returns:
646
719
  A dictionary containing the answer, confidence, and other metadata.
647
720
  Result will have an 'answer' key containing the answer text.
648
721
  """
649
722
  from natural_pdf.qa import get_qa_engine
650
-
723
+
651
724
  # Initialize or get QA engine
652
725
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
653
-
726
+
654
727
  # Determine which pages to query
655
728
  if pages is None:
656
729
  target_pages = list(range(len(self.pages)))
@@ -662,43 +735,40 @@ class PDF:
662
735
  target_pages = pages
663
736
  else:
664
737
  raise ValueError(f"Invalid pages parameter: {pages}")
665
-
738
+
666
739
  # Actually query each page and gather results
667
740
  results = []
668
741
  for page_idx in target_pages:
669
742
  if 0 <= page_idx < len(self.pages):
670
743
  page = self.pages[page_idx]
671
744
  page_result = qa_engine.ask_pdf_page(
672
- page=page,
673
- question=question,
674
- min_confidence=min_confidence,
675
- **kwargs
745
+ page=page, question=question, min_confidence=min_confidence, **kwargs
676
746
  )
677
-
747
+
678
748
  # Add to results if it found an answer
679
749
  if page_result and page_result.get("found", False):
680
750
  results.append(page_result)
681
-
751
+
682
752
  # Sort results by confidence
683
753
  results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
684
-
754
+
685
755
  # Return the best result, or a default result if none found
686
756
  if results:
687
757
  return results[0]
688
758
  else:
689
759
  # Return a structure indicating no answer found
690
760
  return {
691
- "answer": None,
692
- "confidence": 0.0,
693
- "found": False,
694
- "page_num": None, # Or maybe the pages searched?
695
- "source_elements": []
761
+ "answer": None,
762
+ "confidence": 0.0,
763
+ "found": False,
764
+ "page_num": None, # Or maybe the pages searched?
765
+ "source_elements": [],
696
766
  }
697
767
 
698
768
  def search_within_index(
699
769
  self,
700
770
  query: Union[str, Path, Image.Image, Region],
701
- search_service: SearchServiceProtocol, # Now required
771
+ search_service: SearchServiceProtocol, # Now required
702
772
  options: Optional[SearchOptions] = None,
703
773
  ) -> List[Dict[str, Any]]:
704
774
  """
@@ -730,14 +800,16 @@ class PDF:
730
800
  RuntimeError: For other search failures.
731
801
  """
732
802
  if not search_service:
733
- raise ValueError("A configured SearchServiceProtocol instance must be provided.")
803
+ raise ValueError("A configured SearchServiceProtocol instance must be provided.")
734
804
  # Optional stricter check:
735
805
  # if not isinstance(search_service, SearchServiceProtocol):
736
806
  # raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
737
807
 
738
808
  # Get collection name from service for logging
739
- collection_name = getattr(search_service, 'collection_name', '<Unknown Collection>')
740
- logger.info(f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}.")
809
+ collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
810
+ logger.info(
811
+ f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
812
+ )
741
813
 
742
814
  # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
743
815
  # service: SearchServiceProtocol
@@ -748,7 +820,7 @@ class PDF:
748
820
  # factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
749
821
  # # TODO: Pass embedding model from options/pdf config if needed?
750
822
  # service = get_search_service(**factory_args)
751
- service = search_service # Use validated provided service
823
+ service = search_service # Use validated provided service
752
824
 
753
825
  # --- 2. Prepare Query and Options ---
754
826
  query_input = query
@@ -757,119 +829,145 @@ class PDF:
757
829
 
758
830
  # Handle Region query - extract text for now
759
831
  if isinstance(query, Region):
760
- logger.debug("Query is a Region object. Extracting text.")
761
- if not isinstance(effective_options, TextSearchOptions):
762
- logger.warning("Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction.")
763
- query_input = query.extract_text()
764
- if not query_input or query_input.isspace():
765
- logger.error("Region has no extractable text for query.")
766
- return []
832
+ logger.debug("Query is a Region object. Extracting text.")
833
+ if not isinstance(effective_options, TextSearchOptions):
834
+ logger.warning(
835
+ "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
836
+ )
837
+ query_input = query.extract_text()
838
+ if not query_input or query_input.isspace():
839
+ logger.error("Region has no extractable text for query.")
840
+ return []
767
841
 
768
842
  # --- 3. Add Filter to Scope Search to THIS PDF ---
769
843
  # Assume metadata field 'pdf_path' stores the resolved path used during indexing
770
844
  pdf_scope_filter = {
771
- "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
772
- "operator": "eq",
773
- "value": self.path # Use the resolved path of this PDF instance
845
+ "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
846
+ "operator": "eq",
847
+ "value": self.path, # Use the resolved path of this PDF instance
774
848
  }
775
849
  logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
776
850
 
777
851
  # Combine with existing filters in options (if any)
778
852
  if effective_options.filters:
779
- logger.debug(f"Combining PDF scope filter with existing filters: {effective_options.filters}")
853
+ logger.debug(
854
+ f"Combining PDF scope filter with existing filters: {effective_options.filters}"
855
+ )
780
856
  # Assume filters are compatible with the underlying search service
781
857
  # If existing filters aren't already in an AND block, wrap them
782
- if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
783
- # Already an AND block, just append the condition
784
- effective_options.filters["conditions"].append(pdf_scope_filter)
858
+ if (
859
+ isinstance(effective_options.filters, dict)
860
+ and effective_options.filters.get("operator") == "AND"
861
+ ):
862
+ # Already an AND block, just append the condition
863
+ effective_options.filters["conditions"].append(pdf_scope_filter)
785
864
  elif isinstance(effective_options.filters, list):
786
- # Assume list represents implicit AND conditions
787
- effective_options.filters = {"operator": "AND", "conditions": effective_options.filters + [pdf_scope_filter]}
788
- elif isinstance(effective_options.filters, dict): # Single filter dict
789
- effective_options.filters = {"operator": "AND", "conditions": [effective_options.filters, pdf_scope_filter]}
865
+ # Assume list represents implicit AND conditions
866
+ effective_options.filters = {
867
+ "operator": "AND",
868
+ "conditions": effective_options.filters + [pdf_scope_filter],
869
+ }
870
+ elif isinstance(effective_options.filters, dict): # Single filter dict
871
+ effective_options.filters = {
872
+ "operator": "AND",
873
+ "conditions": [effective_options.filters, pdf_scope_filter],
874
+ }
790
875
  else:
791
- logger.warning(f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter.")
792
- effective_options.filters = pdf_scope_filter
876
+ logger.warning(
877
+ f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
878
+ )
879
+ effective_options.filters = pdf_scope_filter
793
880
  else:
794
881
  effective_options.filters = pdf_scope_filter
795
882
 
796
883
  logger.debug(f"Final filters for service search: {effective_options.filters}")
797
884
 
798
- # --- 4. Call SearchService ---
885
+ # --- 4. Call SearchService ---
799
886
  try:
800
887
  # Call the service's search method (no collection_name needed)
801
888
  results = service.search(
802
889
  query=query_input,
803
890
  options=effective_options,
804
891
  )
805
- logger.info(f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'.")
892
+ logger.info(
893
+ f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
894
+ )
806
895
  return results
807
896
  except FileNotFoundError as fnf:
808
- logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
809
- raise # Re-raise specific error
897
+ logger.error(
898
+ f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
899
+ )
900
+ raise # Re-raise specific error
810
901
  except Exception as e:
811
- logger.error(f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}", exc_info=True)
812
- raise RuntimeError(f"Search within index failed for PDF '{self.path}'. See logs for details.") from e
902
+ logger.error(
903
+ f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
904
+ exc_info=True,
905
+ )
906
+ raise RuntimeError(
907
+ f"Search within index failed for PDF '{self.path}'. See logs for details."
908
+ ) from e
813
909
 
814
910
  def __len__(self) -> int:
815
911
  """Return the number of pages in the PDF."""
816
912
  # Ensure _pages is initialized
817
- if not hasattr(self, '_pages'):
913
+ if not hasattr(self, "_pages"):
818
914
  # Return 0 or raise error if not fully initialized? Let's return 0.
819
- return 0
915
+ return 0
820
916
  return len(self._pages)
821
-
822
- def __getitem__(self, key) -> Union[Page, 'PageCollection']: # Return PageCollection for slice
917
+
918
+ def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
823
919
  """Access pages by index or slice."""
824
920
  # Check if self._pages has been initialized
825
- if not hasattr(self, '_pages'):
826
- raise AttributeError("PDF pages not initialized yet.")
921
+ if not hasattr(self, "_pages"):
922
+ raise AttributeError("PDF pages not initialized yet.")
827
923
  if isinstance(key, slice):
828
- # Return a PageCollection slice
829
- from natural_pdf.elements.collections import PageCollection
830
- return PageCollection(self._pages[key])
924
+ # Return a PageCollection slice
925
+ from natural_pdf.elements.collections import PageCollection
926
+
927
+ return PageCollection(self._pages[key])
831
928
  # Check index bounds before accessing
832
929
  if isinstance(key, int):
833
930
  if 0 <= key < len(self._pages):
834
- return self._pages[key]
931
+ return self._pages[key]
835
932
  else:
836
- raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
933
+ raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
837
934
  else:
838
- raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
839
-
935
+ raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
936
+
840
937
  def close(self):
841
938
  """Close the underlying PDF file and clean up any temporary files."""
842
- if hasattr(self, '_pdf') and self._pdf is not None:
843
- try:
844
- self._pdf.close()
845
- logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
846
- except Exception as e:
847
- logger.warning(f"Error closing pdfplumber object: {e}")
848
- finally:
849
- self._pdf = None
939
+ if hasattr(self, "_pdf") and self._pdf is not None:
940
+ try:
941
+ self._pdf.close()
942
+ logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
943
+ except Exception as e:
944
+ logger.warning(f"Error closing pdfplumber object: {e}")
945
+ finally:
946
+ self._pdf = None
850
947
 
851
948
  # Clean up temporary file if it exists
852
- if hasattr(self, '_temp_file') and self._temp_file is not None:
949
+ if hasattr(self, "_temp_file") and self._temp_file is not None:
853
950
  temp_file_path = None
854
951
  try:
855
- if hasattr(self._temp_file, 'name') and self._temp_file.name:
856
- temp_file_path = self._temp_file.name
857
- if os.path.exists(temp_file_path):
858
- os.unlink(temp_file_path)
859
- logger.debug(f"Removed temporary PDF file: {temp_file_path}")
952
+ if hasattr(self._temp_file, "name") and self._temp_file.name:
953
+ temp_file_path = self._temp_file.name
954
+ if os.path.exists(temp_file_path):
955
+ os.unlink(temp_file_path)
956
+ logger.debug(f"Removed temporary PDF file: {temp_file_path}")
860
957
  except Exception as e:
861
- logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
958
+ logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
862
959
  finally:
863
- self._temp_file = None
960
+ self._temp_file = None
864
961
 
865
962
  def __enter__(self):
866
963
  """Context manager entry."""
867
964
  return self
868
-
965
+
869
966
  def __exit__(self, exc_type, exc_val, exc_tb):
870
967
  """Context manager exit."""
871
968
  self.close()
872
969
 
970
+
873
971
  # --- Added TYPE_CHECKING import (if not already present) ---
874
972
  if TYPE_CHECKING:
875
- from pathlib import Path # Assuming Path is used for type hint
973
+ from pathlib import Path # Assuming Path is used for type hint