natural-pdf 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/pdf.py CHANGED
@@ -4,7 +4,9 @@ import tempfile
4
4
  import os
5
5
  import re
6
6
  import urllib.request
7
- from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable # Added Iterable
7
+ from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable, TYPE_CHECKING # Added Iterable and TYPE_CHECKING
8
+ from pathlib import Path # Added Path
9
+ import copy # Add import for deepcopy
8
10
  from PIL import Image
9
11
 
10
12
  from natural_pdf.core.page import Page
@@ -15,9 +17,31 @@ from natural_pdf.ocr import OCRManager, OCROptions
15
17
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager # Import the new LayoutManager
16
18
  from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
17
19
 
18
- # Set up module logger
19
- logger = logging.getLogger("natural_pdf.core.pdf")
20
+ # Import the flag directly - this should always work
21
+
22
+ # --- Add Search Service Imports (needed for new methods) ---
23
+ try:
24
+ from natural_pdf.search import (
25
+ get_search_service,
26
+ SearchServiceProtocol,
27
+ SearchOptions,
28
+ TextSearchOptions, # Keep for ask default
29
+ BaseSearchOptions
30
+ )
31
+ from typing import Any as TypingAny # Import Any if not already
32
+ except ImportError:
33
+ # Define dummies if needed for type hints within the class
34
+ SearchServiceProtocol = object
35
+ SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
36
+ TypingAny = object
37
+ # Dummy factory needed for default arg in methods
38
+ def get_search_service(**kwargs) -> SearchServiceProtocol:
39
+ raise ImportError("Search dependencies are not installed. Install with: pip install natural-pdf[search]")
20
40
 
41
+ # --- End Search Service Imports ---
42
+
43
+ # Set up logger early
44
+ logger = logging.getLogger("natural_pdf.core.pdf")
21
45
 
22
46
  class PDF:
23
47
  """
@@ -50,7 +74,8 @@ class PDF:
50
74
  # Initialize path-related attributes
51
75
  self._original_path = path_or_url
52
76
  self._temp_file = None
53
-
77
+ self._resolved_path = None # Store the actual path used by pdfplumber
78
+
54
79
  if is_url:
55
80
  logger.info(f"Downloading PDF from URL: {path_or_url}")
56
81
  try:
@@ -64,8 +89,8 @@ class PDF:
64
89
  self._temp_file.close()
65
90
 
66
91
  # Use the temporary file path
67
- path = self._temp_file.name
68
- logger.info(f"PDF downloaded to temporary file: {path}")
92
+ self._resolved_path = self._temp_file.name
93
+ logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
69
94
  except Exception as e:
70
95
  if self._temp_file and hasattr(self._temp_file, 'name'):
71
96
  try:
@@ -76,43 +101,45 @@ class PDF:
76
101
  raise ValueError(f"Failed to download PDF from URL: {e}")
77
102
  else:
78
103
  # Use the provided path directly
79
- path = path_or_url
80
-
81
- logger.info(f"Initializing PDF from {path}")
104
+ self._resolved_path = path_or_url
105
+
106
+ logger.info(f"Initializing PDF from {self._resolved_path}")
82
107
  logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
83
108
 
84
- self._pdf = pdfplumber.open(path)
85
- self._path = path
109
+ try:
110
+ self._pdf = pdfplumber.open(self._resolved_path)
111
+ except Exception as e:
112
+ logger.error(f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}", exc_info=True)
113
+ # Clean up temp file if creation failed
114
+ self.close()
115
+ raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
116
+
117
+ self._path = self._resolved_path # Keep original path too?
118
+ self.path = self._resolved_path # Public attribute for the resolved path
119
+ self.source_path = self._original_path # Public attribute for the user-provided path/URL
120
+
86
121
  self._reading_order = reading_order
87
122
  self._config = {
88
123
  'keep_spaces': keep_spaces
89
124
  }
90
- self.path = path
91
-
92
- self._font_attrs = font_attrs # Store the font attribute configuration
93
125
 
94
- if OCRManager:
95
- self._ocr_manager = OCRManager()
96
- logger.info(f"Initialized OCRManager. Available engines: {self._ocr_manager.get_available_engines()}")
97
- else:
98
- self._ocr_manager = None
99
- logger.warning("OCRManager could not be imported. OCR functionality disabled.")
126
+ self._font_attrs = font_attrs # Store the font attribute configuration
100
127
 
101
- if LayoutManager:
102
- self._layout_manager = LayoutManager()
103
- logger.info(f"Initialized LayoutManager. Available engines: {self._layout_manager.get_available_engines()}")
104
- else:
105
- self._layout_manager = None
106
- logger.warning("LayoutManager could not be imported. Layout analysis disabled.")
128
+ # Initialize Managers and Services (conditionally available)
129
+ self._ocr_manager = OCRManager() if OCRManager else None
130
+ self._layout_manager = LayoutManager() if LayoutManager else None
131
+ self.highlighter = HighlightingService(self)
107
132
 
133
+ # Initialize pages last, passing necessary refs
108
134
  self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
135
+
136
+ # Other state
109
137
  self._element_cache = {}
110
138
  self._exclusions = [] # List to store exclusion functions/regions
111
139
  self._regions = [] # List to store region functions/definitions
112
140
 
113
- # Initialize the Highlighting Service
114
- self.highlighter = HighlightingService(self)
115
141
  logger.info("Initialized HighlightingService.")
142
+ logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
116
143
 
117
144
  @property
118
145
  def metadata(self) -> Dict[str, Any]:
@@ -123,6 +150,9 @@ class PDF:
123
150
  def pages(self) -> 'PageCollection':
124
151
  """Access pages as a PageCollection object."""
125
152
  from natural_pdf.elements.collections import PageCollection
153
+ # Ensure _pages is initialized
154
+ if not hasattr(self, '_pages'):
155
+ raise AttributeError("PDF pages not yet initialized.")
126
156
  return PageCollection(self._pages)
127
157
 
128
158
  def clear_exclusions(self) -> 'PDF':
@@ -132,41 +162,43 @@ class PDF:
132
162
  Returns:
133
163
  Self for method chaining
134
164
  """
165
+ # Ensure _pages is initialized
166
+ if not hasattr(self, '_pages'):
167
+ raise AttributeError("PDF pages not yet initialized.")
135
168
 
136
169
  self._exclusions = []
170
+ # Also clear from pages
171
+ for page in self._pages:
172
+ page.clear_exclusions()
137
173
  return self
138
174
 
139
- def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
175
+ def add_exclusion(self, exclusion_func: Callable[['Page'], Optional[Region]], label: str = None) -> 'PDF':
140
176
  """
141
177
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
142
178
 
143
179
  Args:
144
- exclusion_func: A function that takes a Page and returns a Region to exclude
180
+ exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
145
181
  label: Optional label for this exclusion
146
182
 
147
183
  Returns:
148
184
  Self for method chaining
149
185
  """
186
+ # Ensure _pages is initialized
187
+ if not hasattr(self, '_pages'):
188
+ raise AttributeError("PDF pages not yet initialized.")
189
+
150
190
  # Store exclusion with its label at PDF level
151
191
  exclusion_data = (exclusion_func, label)
152
192
  self._exclusions.append(exclusion_data)
153
-
154
- # Create a wrapper function that properly evaluates on each page
155
- def exclusion_wrapper(page):
156
- try:
157
- region = exclusion_func(page)
158
- return region
159
- except Exception as e:
160
- print(f"Error in PDF-level exclusion for page {page.index}: {e}")
161
- return None
162
-
163
- # Apply this exclusion to all pages using the wrapper
193
+
194
+ # Apply this exclusion to all pages
164
195
  for page in self._pages:
165
- page.add_exclusion(exclusion_wrapper)
166
-
196
+ # We pass the original function, Page.add_exclusion handles calling it
197
+ page.add_exclusion(exclusion_func, label=label)
198
+
167
199
  return self
168
200
 
169
- def apply_ocr_to_pages(
201
+ def apply_ocr(
170
202
  self,
171
203
  pages: Optional[Union[Iterable[int], range, slice]] = None,
172
204
  engine: Optional[str] = None,
@@ -233,9 +265,11 @@ class PDF:
233
265
  images_pil: List[Image.Image] = []
234
266
  page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
235
267
  logger.info(f"Rendering {len(target_pages)} pages to images...")
268
+ failed_page_num = 'unknown' # Keep track of potentially failing page
236
269
  try:
237
270
  ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
238
271
  for i, page in enumerate(target_pages):
272
+ failed_page_num = page.number # Update current page number in case of error
239
273
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
240
274
  # Use page.to_image but ensure highlights are off for OCR base image
241
275
  img = page.to_image(scale=ocr_scale, include_highlights=False)
@@ -243,9 +277,7 @@ class PDF:
243
277
  page_image_map.append((page, img)) # Store pair
244
278
  except Exception as e:
245
279
  logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
246
- # Decide whether to continue with successfully rendered pages or fail completely
247
- # For now, let's fail if any page rendering fails.
248
- raise RuntimeError(f"Failed to render page {page.number} for OCR.") from e
280
+ raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
249
281
 
250
282
  if not images_pil:
251
283
  logger.error("No images were successfully rendered for batch OCR.")
@@ -253,9 +285,11 @@ class PDF:
253
285
 
254
286
  # --- Prepare Arguments for Manager ---
255
287
  manager_args = {'images': images_pil, 'options': options, 'engine': engine}
256
- if languages is not None: manager_args['languages'] = languages
257
- if min_confidence is not None: manager_args['min_confidence'] = min_confidence
258
- if device is not None: manager_args['device'] = device
288
+ simple_args = {}
289
+ if languages is not None: simple_args['languages'] = languages
290
+ if min_confidence is not None: simple_args['min_confidence'] = min_confidence
291
+ if device is not None: simple_args['device'] = device
292
+ manager_args.update(simple_args) # Add simple args if options not provided
259
293
 
260
294
  # --- Call OCR Manager for Batch Processing ---
261
295
  logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
@@ -288,59 +322,60 @@ class PDF:
288
322
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
289
323
  # Use the page's element manager to create elements from its results
290
324
  # Changed from page._create_text_elements_from_ocr to use element_mgr
291
- elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img.width, img.height)
325
+ try:
326
+ # Calculate scale factors based on rendered image vs page dims
327
+ img_scale_x = page.width / img.width if img.width > 0 else 1
328
+ img_scale_y = page.height / img.height if img.height > 0 else 1
329
+ elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img_scale_x, img_scale_y)
292
330
 
293
- if elements:
294
- # Note: element_mgr.create_text_elements_from_ocr already adds them
295
- total_elements_added += len(elements)
296
- logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
297
- else:
298
- logger.debug(f" No valid TextElements created for page {page.number}.")
331
+ if elements:
332
+ # Note: element_mgr.create_text_elements_from_ocr already adds them
333
+ total_elements_added += len(elements)
334
+ logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
335
+ else:
336
+ logger.debug(f" No valid TextElements created for page {page.number}.")
337
+ except Exception as e:
338
+ logger.error(f" Error adding OCR elements to page {page.number}: {e}", exc_info=True)
339
+ # Continue to next page
299
340
 
300
341
  logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
301
342
  return self
302
343
 
303
- def add_region(self, region_func: Callable[[Page], Region], name: str = None) -> 'PDF':
344
+ def add_region(self, region_func: Callable[['Page'], Optional[Region]], name: str = None) -> 'PDF':
304
345
  """
305
346
  Add a region function to the PDF. This creates regions on all pages using the provided function.
306
347
 
307
348
  Args:
308
- region_func: A function that takes a Page and returns a Region
349
+ region_func: A function that takes a Page and returns a Region, or None.
309
350
  name: Optional name for the region
310
351
 
311
352
  Returns:
312
353
  Self for method chaining
313
354
  """
355
+ # Ensure _pages is initialized
356
+ if not hasattr(self, '_pages'):
357
+ raise AttributeError("PDF pages not yet initialized.")
358
+
314
359
  # Store region with its name at PDF level
315
360
  region_data = (region_func, name)
316
361
  self._regions.append(region_data)
317
-
318
- # Create a wrapper function that properly evaluates on each page
319
- def region_wrapper(page):
320
- try:
321
- region = region_func(page)
322
- if region:
323
- # Apply name if provided
324
- if name:
325
- region.name = name
326
- region.source = 'named'
327
- return region
328
- except Exception as e:
329
- print(f"Error in PDF-level region for page {page.index}: {e}")
330
- return None
331
-
362
+
332
363
  # Apply this region to all pages
333
364
  for page in self._pages:
334
365
  try:
335
- region = region_wrapper(page)
336
- if region:
337
- page.add_region(region, name=name)
366
+ # Call the function to get the region for this specific page
367
+ region_instance = region_func(page)
368
+ if region_instance and isinstance(region_instance, Region):
369
+ # If a valid region is returned, add it to the page
370
+ page.add_region(region_instance, name=name, source='named')
371
+ elif region_instance is not None:
372
+ logger.warning(f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}")
338
373
  except Exception as e:
339
- print(f"Error adding region to page {page.index}: {e}")
340
-
374
+ logger.error(f"Error executing or adding region function for page {page.number}: {e}", exc_info=True)
375
+
341
376
  return self
342
377
 
343
- def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
378
+ def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Optional[Any]:
344
379
  """
345
380
  Find the first element matching the selector.
346
381
 
@@ -354,13 +389,17 @@ class PDF:
354
389
  Returns:
355
390
  Element object or None if not found
356
391
  """
392
+ # Ensure _pages is initialized
393
+ if not hasattr(self, '_pages'):
394
+ raise AttributeError("PDF pages not yet initialized.")
395
+
357
396
  selector_obj = parse_selector(selector)
358
397
 
359
398
  # Pass regex and case flags to selector function
360
399
  kwargs['regex'] = regex
361
400
  kwargs['case'] = case
362
401
 
363
- results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
402
+ results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs)
364
403
  return results.first if results else None
365
404
 
366
405
  def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
@@ -377,22 +416,27 @@ class PDF:
377
416
  Returns:
378
417
  ElementCollection with matching elements
379
418
  """
419
+ # Ensure _pages is initialized
420
+ if not hasattr(self, '_pages'):
421
+ raise AttributeError("PDF pages not yet initialized.")
422
+
380
423
  selector_obj = parse_selector(selector)
381
424
 
382
425
  # Pass regex and case flags to selector function
383
426
  kwargs['regex'] = regex
384
427
  kwargs['case'] = case
385
428
 
386
- results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
429
+ results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs)
387
430
  return results
388
431
 
389
- def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> ElementCollection:
432
+ def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs) -> ElementCollection:
390
433
  """
391
434
  Apply selector to PDF elements across all pages.
392
435
 
393
436
  Args:
394
437
  selector_obj: Parsed selector dictionary
395
438
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
439
+ first_only: If True, stop searching after the first match is found.
396
440
  **kwargs: Additional filter parameters
397
441
 
398
442
  Returns:
@@ -401,48 +445,55 @@ class PDF:
401
445
  from natural_pdf.elements.collections import ElementCollection
402
446
 
403
447
  # Determine page range to search
404
- page_range = kwargs.get('pages', range(len(self.pages)))
405
- if isinstance(page_range, (int, slice)):
406
- # Convert int or slice to range
407
- if isinstance(page_range, int):
408
- page_range = [page_range]
409
- elif isinstance(page_range, slice):
410
- start = page_range.start or 0
411
- stop = page_range.stop or len(self.pages)
412
- step = page_range.step or 1
413
- page_range = range(start, stop, step)
414
-
415
- # Check for cross-page pseudo-classes
416
- cross_page = False
448
+ page_indices = kwargs.get('pages', range(len(self._pages)))
449
+ if isinstance(page_indices, int):
450
+ page_indices = [page_indices]
451
+ elif isinstance(page_indices, slice):
452
+ page_indices = range(*page_indices.indices(len(self._pages)))
453
+
454
+ # Check for cross-page pseudo-classes (currently not supported)
417
455
  for pseudo in selector_obj.get('pseudo_classes', []):
418
456
  if pseudo.get('name') in ('spans', 'continues'):
419
- cross_page = True
420
- break
421
-
422
- # If searching across pages, handle specially
423
- if cross_page:
424
- # TODO: Implement cross-page element matching
425
- return ElementCollection([])
426
-
457
+ logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
458
+ return ElementCollection([])
459
+
427
460
  # Regular case: collect elements from each page
428
461
  all_elements = []
429
- for page_idx in page_range:
430
- if 0 <= page_idx < len(self.pages):
431
- page = self.pages[page_idx]
432
- page_elements = page._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
433
- all_elements.extend(page_elements.elements)
434
-
462
+ for page_idx in page_indices:
463
+ if 0 <= page_idx < len(self._pages):
464
+ page = self._pages[page_idx]
465
+ # Pass first_only down to page._apply_selector
466
+ page_elements_collection = page._apply_selector(
467
+ selector_obj,
468
+ apply_exclusions=apply_exclusions,
469
+ first_only=first_only,
470
+ **kwargs
471
+ )
472
+ if page_elements_collection:
473
+ page_elements = page_elements_collection.elements
474
+ all_elements.extend(page_elements)
475
+ # If we only need the first match overall, and we found one on this page, stop
476
+ if first_only and page_elements:
477
+ break # Stop iterating through pages
478
+ else:
479
+ logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
480
+
435
481
  # Create a combined collection
436
482
  combined = ElementCollection(all_elements)
437
-
438
- # Sort in document order if requested
439
- if kwargs.get('document_order', True):
483
+
484
+ # Sort in document order if requested and not first_only (already sorted by page)
485
+ if not first_only and kwargs.get('document_order', True):
440
486
  # Check if elements have page, top, x0 before sorting
441
487
  if all(hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0') for el in combined.elements):
442
488
  combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
443
489
  else:
444
- logger.warning("Cannot sort elements in document order: Missing required attributes (page, top, x0).")
445
-
490
+ # Elements might be Regions without inherent sorting order yet
491
+ # Attempt sorting by page index if possible
492
+ try:
493
+ combined.sort(key=lambda el: el.page.index)
494
+ except AttributeError:
495
+ logger.warning("Cannot sort elements in document order: Missing required attributes (e.g., page).")
496
+
446
497
  return combined
447
498
 
448
499
  def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
@@ -460,9 +511,13 @@ class PDF:
460
511
  Returns:
461
512
  Extracted text as string
462
513
  """
514
+ # Ensure _pages is initialized
515
+ if not hasattr(self, '_pages'):
516
+ raise AttributeError("PDF pages not yet initialized.")
517
+
463
518
  # If selector is provided, find elements first
464
519
  if selector:
465
- elements = self.find_all(selector)
520
+ elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
466
521
  return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
467
522
 
468
523
  # Otherwise extract from all pages
@@ -484,8 +539,6 @@ class PDF:
484
539
 
485
540
  return "\n".join(texts)
486
541
 
487
- # Note: extract_text_compat method removed
488
-
489
542
  def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
490
543
  """
491
544
  Shorthand for finding elements and extracting their text.
@@ -498,45 +551,11 @@ class PDF:
498
551
  Returns:
499
552
  Extracted text from matching elements
500
553
  """
501
- return self.extract_text(selector, preserve_whitespace=preserve_whitespace, **kwargs)
502
-
503
- # def debug_ocr(self, output_path, pages=None):
504
- # """
505
- # Generate an interactive HTML debug report for OCR results.
506
-
507
- # This creates a single-file HTML report with:
508
- # - Side-by-side view of image regions and OCR text
509
- # - Confidence scores with color coding
510
- # - Editable correction fields
511
- # - Filtering and sorting options
512
- # - Export functionality for corrected text
513
-
514
- # Args:
515
- # output_path: Path to save the HTML report
516
- # pages: Pages to include in the report (default: all pages)
517
- # Can be a page index, slice, or list of page indices
518
-
519
- # Returns:
520
- # Self for method chaining
521
- # """
522
- # from natural_pdf.utils.ocr import debug_ocr_to_html
554
+ # Ensure _pages is initialized
555
+ if not hasattr(self, '_pages'):
556
+ raise AttributeError("PDF pages not yet initialized.")
557
+ return self.extract_text(selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs) # apply_exclusions is handled by find_all in extract_text
523
558
 
524
- # if pages is None:
525
- # # Include all pages
526
- # target_pages = self.pages
527
- # elif isinstance(pages, int):
528
- # # Single page index
529
- # target_pages = [self.pages[pages]]
530
- # elif isinstance(pages, slice):
531
- # # Slice of pages
532
- # target_pages = self.pages[pages]
533
- # else:
534
- # # Assume it's an iterable of page indices
535
- # target_pages = [self.pages[i] for i in pages]
536
-
537
- # debug_ocr_to_html(target_pages, output_path)
538
- # return self
539
-
540
559
  def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
541
560
  """
542
561
  Extract tables from the document or matching elements.
@@ -549,9 +568,63 @@ class PDF:
549
568
  Returns:
550
569
  List of extracted tables
551
570
  """
571
+ # Ensure _pages is initialized
572
+ if not hasattr(self, '_pages'):
573
+ raise AttributeError("PDF pages not yet initialized.")
552
574
  # TODO: Implement table extraction
553
- return [] # Placeholder
575
+ logger.warning("PDF.extract_tables is not fully implemented yet.")
576
+ all_tables = []
577
+ for page in self.pages:
578
+ # Assuming page.extract_tables(**kwargs) exists or is added
579
+ if hasattr(page, 'extract_tables'):
580
+ all_tables.extend(page.extract_tables(**kwargs))
581
+ else:
582
+ logger.debug(f"Page {page.number} does not have extract_tables method.")
583
+ # Placeholder filtering
584
+ if selector:
585
+ logger.warning("Filtering extracted tables by selector is not implemented.")
586
+ # Would need to parse selector and filter the list `all_tables`
587
+ # Placeholder merging
588
+ if merge_across_pages:
589
+ logger.warning("Merging tables across pages is not implemented.")
590
+ # Would need logic to detect and merge related tables
591
+ return all_tables
554
592
 
593
+ # --- New Method: save_searchable ---
594
+ def save_searchable(self, output_path: Union[str, 'Path'], dpi: int = 300, **kwargs):
595
+ """
596
+ Saves the PDF with an OCR text layer, making content searchable.
597
+
598
+ Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
599
+
600
+ Note: OCR must have been applied to the pages beforehand
601
+ (e.g., using pdf.apply_ocr()).
602
+
603
+ Args:
604
+ output_path: Path to save the searchable PDF.
605
+ dpi: Resolution for rendering and OCR overlay (default 300).
606
+ **kwargs: Additional keyword arguments passed to the exporter.
607
+ """
608
+ # Import moved here, assuming it's always available now
609
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
610
+
611
+ # TODO: Need a reliable way for Page to signal if it has OCR elements.
612
+ # This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
613
+ # or checking if page.get_elements(source='ocr') returns anything.
614
+ # For now, we pass through and let the exporter handle pages without OCR elements.
615
+ # if not any(page.get_elements(source='ocr') for page in self.pages):
616
+ # logger.warning("No OCR elements found on pages. "
617
+ # "Ensure apply_ocr() was called. "
618
+ # "Output PDF might not be searchable.")
619
+
620
+ # Convert pathlib.Path to string if necessary
621
+ output_path_str = str(output_path)
622
+
623
+ create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
624
+ logger.info(f"Searchable PDF saved to: {output_path_str}")
625
+
626
+ # --- End New Method ---
627
+
555
628
  def ask(self, question: str,
556
629
  mode: str = "extractive",
557
630
  pages: Union[int, List[int], range] = None,
@@ -621,12 +694,132 @@ class PDF:
621
694
  "page_num": None, # Or maybe the pages searched?
622
695
  "source_elements": []
623
696
  }
624
-
697
+
698
+ def search_within_index(
699
+ self,
700
+ query: Union[str, Path, Image.Image, Region],
701
+ search_service: SearchServiceProtocol, # Now required
702
+ options: Optional[SearchOptions] = None,
703
+ ) -> List[Dict[str, Any]]:
704
+ """
705
+ Finds relevant documents specifically originating from THIS PDF document
706
+ within a search index managed by the provided SearchService.
707
+
708
+ This method uses a pre-configured SearchService instance and adds
709
+ a filter to the search query to scope results only to pages from
710
+ this specific PDF object (based on its resolved path).
711
+
712
+ Args:
713
+ query: The search query (text, image path, PIL Image, Region).
714
+ search_service: A pre-configured SearchService instance pointing to the
715
+ index where this PDF's content (or related content)
716
+ is expected to be found.
717
+ options: Optional SearchOptions to configure the query (top_k, filters, etc.).
718
+ Any existing filters in `options` will be combined with the
719
+ PDF-scoping filter using an 'AND' condition.
720
+
721
+ Returns:
722
+ A list of result dictionaries, sorted by relevance, containing only
723
+ results originating from this PDF's pages.
724
+
725
+ Raises:
726
+ ImportError: If search dependencies are not installed.
727
+ ValueError: If search_service is None.
728
+ TypeError: If search_service does not conform to the protocol.
729
+ FileNotFoundError: If the collection managed by the service does not exist.
730
+ RuntimeError: For other search failures.
731
+ """
732
+ if not search_service:
733
+ raise ValueError("A configured SearchServiceProtocol instance must be provided.")
734
+ # Optional stricter check:
735
+ # if not isinstance(search_service, SearchServiceProtocol):
736
+ # raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
737
+
738
+ # Get collection name from service for logging
739
+ collection_name = getattr(search_service, 'collection_name', '<Unknown Collection>')
740
+ logger.info(f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}.")
741
+
742
+ # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
743
+ # service: SearchServiceProtocol
744
+ # if search_service:
745
+ # service = search_service
746
+ # else:
747
+ # logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
748
+ # factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
749
+ # # TODO: Pass embedding model from options/pdf config if needed?
750
+ # service = get_search_service(**factory_args)
751
+ service = search_service # Use validated provided service
752
+
753
+ # --- 2. Prepare Query and Options ---
754
+ query_input = query
755
+ # Resolve options (use default TextSearch if none provided)
756
+ effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
757
+
758
+ # Handle Region query - extract text for now
759
+ if isinstance(query, Region):
760
+ logger.debug("Query is a Region object. Extracting text.")
761
+ if not isinstance(effective_options, TextSearchOptions):
762
+ logger.warning("Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction.")
763
+ query_input = query.extract_text()
764
+ if not query_input or query_input.isspace():
765
+ logger.error("Region has no extractable text for query.")
766
+ return []
767
+
768
+ # --- 3. Add Filter to Scope Search to THIS PDF ---
769
+ # Assume metadata field 'pdf_path' stores the resolved path used during indexing
770
+ pdf_scope_filter = {
771
+ "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
772
+ "operator": "eq",
773
+ "value": self.path # Use the resolved path of this PDF instance
774
+ }
775
+ logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
776
+
777
+ # Combine with existing filters in options (if any)
778
+ if effective_options.filters:
779
+ logger.debug(f"Combining PDF scope filter with existing filters: {effective_options.filters}")
780
+ # Assume filters are compatible with the underlying search service
781
+ # If existing filters aren't already in an AND block, wrap them
782
+ if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
783
+ # Already an AND block, just append the condition
784
+ effective_options.filters["conditions"].append(pdf_scope_filter)
785
+ elif isinstance(effective_options.filters, list):
786
+ # Assume list represents implicit AND conditions
787
+ effective_options.filters = {"operator": "AND", "conditions": effective_options.filters + [pdf_scope_filter]}
788
+ elif isinstance(effective_options.filters, dict): # Single filter dict
789
+ effective_options.filters = {"operator": "AND", "conditions": [effective_options.filters, pdf_scope_filter]}
790
+ else:
791
+ logger.warning(f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter.")
792
+ effective_options.filters = pdf_scope_filter
793
+ else:
794
+ effective_options.filters = pdf_scope_filter
795
+
796
+ logger.debug(f"Final filters for service search: {effective_options.filters}")
797
+
798
+ # --- 4. Call SearchService ---
799
+ try:
800
+ # Call the service's search method (no collection_name needed)
801
+ results = service.search(
802
+ query=query_input,
803
+ options=effective_options,
804
+ )
805
+ logger.info(f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'.")
806
+ return results
807
+ except FileNotFoundError as fnf:
808
+ logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
809
+ raise # Re-raise specific error
810
+ except Exception as e:
811
+ logger.error(f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}", exc_info=True)
812
+ raise RuntimeError(f"Search within index failed for PDF '{self.path}'. See logs for details.") from e
813
+
625
814
  def __len__(self) -> int:
626
815
  """Return the number of pages in the PDF."""
627
- return len(self.pages)
816
+ # Ensure _pages is initialized
817
+ if not hasattr(self, '_pages'):
818
+ # Return 0 or raise error if not fully initialized? Let's return 0.
819
+ return 0
820
+ return len(self._pages)
628
821
 
629
- def __getitem__(self, key) -> Union[Page, List[Page]]:
822
+ def __getitem__(self, key) -> Union[Page, 'PageCollection']: # Return PageCollection for slice
630
823
  """Access pages by index or slice."""
631
824
  # Check if self._pages has been initialized
632
825
  if not hasattr(self, '_pages'):
@@ -635,30 +828,48 @@ class PDF:
635
828
  # Return a PageCollection slice
636
829
  from natural_pdf.elements.collections import PageCollection
637
830
  return PageCollection(self._pages[key])
638
- # Return a single Page object
639
- return self._pages[key]
831
+ # Check index bounds before accessing
832
+ if isinstance(key, int):
833
+ if 0 <= key < len(self._pages):
834
+ return self._pages[key]
835
+ else:
836
+ raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
837
+ else:
838
+ raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
640
839
 
641
840
  def close(self):
642
841
  """Close the underlying PDF file and clean up any temporary files."""
643
842
  if hasattr(self, '_pdf') and self._pdf is not None:
644
- self._pdf.close()
645
- self._pdf = None
646
-
843
+ try:
844
+ self._pdf.close()
845
+ logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
846
+ except Exception as e:
847
+ logger.warning(f"Error closing pdfplumber object: {e}")
848
+ finally:
849
+ self._pdf = None
850
+
647
851
  # Clean up temporary file if it exists
648
852
  if hasattr(self, '_temp_file') and self._temp_file is not None:
853
+ temp_file_path = None
649
854
  try:
650
- if hasattr(self._temp_file, 'name') and self._temp_file.name and os.path.exists(self._temp_file.name):
651
- os.unlink(self._temp_file.name)
652
- logger.debug(f"Removed temporary PDF file: {self._temp_file.name}")
855
+ if hasattr(self._temp_file, 'name') and self._temp_file.name:
856
+ temp_file_path = self._temp_file.name
857
+ if os.path.exists(temp_file_path):
858
+ os.unlink(temp_file_path)
859
+ logger.debug(f"Removed temporary PDF file: {temp_file_path}")
653
860
  except Exception as e:
654
- logger.warning(f"Failed to clean up temporary PDF file: {e}")
861
+ logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
655
862
  finally:
656
- self._temp_file = None
657
-
863
+ self._temp_file = None
864
+
658
865
  def __enter__(self):
659
866
  """Context manager entry."""
660
867
  return self
661
868
 
662
869
  def __exit__(self, exc_type, exc_val, exc_tb):
663
870
  """Context manager exit."""
664
- self.close()
871
+ self.close()
872
+
873
+ # --- Added TYPE_CHECKING import (if not already present) ---
874
+ if TYPE_CHECKING:
875
+ from pathlib import Path # Assuming Path is used for type hint