natural-pdf 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -52,4 +52,36 @@ __version__ = "0.1.1"
 if HAS_QA:
     __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
 else:
-    __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
+    __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
+
+# Core classes
+from .core.pdf import PDF
+from .collections.pdf_collection import PDFCollection
+from .elements.region import Region
+
+# Search options (if extras installed)
+try:
+    from .search.search_options import TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
+except ImportError:
+    # Define dummy classes if extras not installed, so imports don't break,
+    # but using them will raise the ImportError from check_haystack_availability
+    class TextSearchOptions:
+        def __init__(self, *args, **kwargs): pass
+    class MultiModalSearchOptions:
+        def __init__(self, *args, **kwargs): pass
+    class BaseSearchOptions:
+        def __init__(self, *args, **kwargs): pass
+
+# Expose logging setup? (Optional)
+# from . import logging_config
+# logging_config.setup_logging()
+
+# Explicitly define what gets imported with 'from natural_pdf import *'
+__all__ = [
+    'PDF',
+    'PDFCollection',
+    'Region',
+    'TextSearchOptions',  # Include search options
+    'MultiModalSearchOptions',
+    'BaseSearchOptions'
+]
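For orientation, a minimal sketch of the top-level API this change re-exports. The file path is a placeholder, and without the search extras installed `TextSearchOptions` is only the no-op dummy defined above.

```python
# Names re-exported by natural_pdf/__init__.py after this release.
# "docs/report.pdf" is an illustrative path, not a file shipped with the package.
from natural_pdf import PDF, PDFCollection, Region, TextSearchOptions

pdf = PDF("docs/report.pdf")   # open a single document
opts = TextSearchOptions()     # no-op dummy unless the search extras are installed
print(type(pdf).__name__, type(opts).__name__)
```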
natural_pdf/collections/pdf_collection.py ADDED
@@ -0,0 +1,259 @@
+import os
+import glob as py_glob
+import logging
+from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
+from pathlib import Path
+from PIL import Image
+import re  # Added for safe path generation
+import copy  # Added for copying options
+from tqdm import tqdm
+
+# Set up logger early
+logger = logging.getLogger(__name__)
+
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.region import Region
+
+# --- Search Imports ---
+try:
+    from natural_pdf.search.search_service_protocol import (
+        SearchServiceProtocol, SearchOptions, Indexable
+    )
+    from natural_pdf.search.searchable_mixin import SearchableMixin
+except ImportError as e:
+    logger_init = logging.getLogger(__name__)
+    logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)
+    # Dummy definitions
+    class SearchableMixin: pass
+    SearchServiceProtocol, SearchOptions, Indexable = object, object, object
+
+from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
+
+class PDFCollection(SearchableMixin):  # Inherit from the mixin
+    def __init__(self,
+                 source: Union[str, Iterable[Union[str, 'PDF']]],
+                 recursive: bool = True,
+                 **pdf_options: Any):
+        """
+        Initializes a collection of PDF documents from various sources.
+
+        Args:
+            source: The source of PDF documents. Can be:
+                - An iterable (e.g., list) of existing PDF objects.
+                - An iterable (e.g., list) of file paths/URLs/globs (strings).
+                - A single file path/URL/directory/glob string.
+            recursive: If source involves directories or glob patterns,
+                       whether to search recursively (default: True).
+            **pdf_options: Keyword arguments passed to the PDF constructor.
+        """
+        self._pdfs: List['PDF'] = []
+        self._pdf_options = pdf_options  # Store options for potential slicing later
+        self._recursive = recursive  # Store setting for potential slicing
+
+        # Dynamically import PDF class within methods to avoid circular import at module load time
+        PDF = self._get_pdf_class()
+
+        if hasattr(source, '__iter__') and not isinstance(source, str):
+            source_list = list(source)
+            if not source_list: return  # Empty list source
+            if isinstance(source_list[0], PDF):
+                if all(isinstance(item, PDF) for item in source_list):
+                    self._pdfs = source_list  # Direct assignment
+                    # Don't adopt search context anymore
+                    return
+                else: raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
+            # If it's an iterable but not PDFs, fall through to resolve sources
+
+        # Resolve string, iterable of strings, or single string source to paths/URLs
+        resolved_paths_or_urls = self._resolve_sources_to_paths(source)
+        self._initialize_pdfs(resolved_paths_or_urls, PDF)  # Pass PDF class
+
+        self._iter_index = 0
+
+        # Initialize internal search service reference
+        self._search_service: Optional[SearchServiceProtocol] = None
+
+    @staticmethod
+    def _get_pdf_class():
+        """Helper method to dynamically import the PDF class."""
+        try:
+            # Import needs to resolve path correctly
+            from natural_pdf.core.pdf import PDF
+            return PDF
+        except ImportError as e:
+            logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
+            raise ImportError("PDF class is required but could not be imported.") from e
+
+    # --- Internal Helpers ---
+
+    def _is_url(self, s: str) -> bool: return s.startswith(('http://', 'https://'))
+    def _has_glob_magic(self, s: str) -> bool: return py_glob.has_magic(s)
+
+    def _execute_glob(self, pattern: str) -> Set[str]:
+        """Glob for paths and return a set of valid PDF paths."""
+        found_paths = set()
+        try:
+            # Use iglob for potentially large directories/matches
+            paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
+            for path_str in paths_iter:
+                # Use Path object for easier checking
+                p = Path(path_str)
+                if p.is_file() and p.suffix.lower() == ".pdf":
+                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
+        except Exception as e:
+            logger.error(f"Error processing glob pattern '{pattern}': {e}")
+        return found_paths
+
+    def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
+        """Resolves various source types into a list of unique PDF paths/URLs."""
+        final_paths = set()
+        sources_to_process = []
+
+        if isinstance(source, str):
+            sources_to_process.append(source)
+        elif hasattr(source, '__iter__'):
+            sources_to_process.extend(list(source))
+        else:  # Should not happen based on __init__ checks, but safeguard
+            raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
+
+        for item in sources_to_process:
+            if not isinstance(item, str):
+                logger.warning(f"Skipping non-string item in source list: {type(item)}")
+                continue
+
+            item_path = Path(item)
+
+            if self._is_url(item):
+                final_paths.add(item)  # Add URL directly
+            elif self._has_glob_magic(item):
+                glob_results = self._execute_glob(item)
+                final_paths.update(glob_results)
+            elif item_path.is_dir():
+                # Use glob to find PDFs in directory, respecting recursive flag
+                dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
+                dir_glob_results = self._execute_glob(dir_pattern)
+                final_paths.update(dir_glob_results)
+            elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
+                final_paths.add(str(item_path.resolve()))  # Add resolved file path
+            else:
+                logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")
+
+        return sorted(list(final_paths))
+
+    def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
+        """Initializes PDF objects from a list of paths/URLs."""
+        logger.info(f"Initializing {len(paths_or_urls)} PDF objects...")
+        failed_count = 0
+        for path_or_url in tqdm(paths_or_urls, desc="Loading PDFs"):
+            try:
+                pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
+                self._pdfs.append(pdf_instance)
+            except Exception as e:
+                logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False)  # Keep log concise
+                failed_count += 1
+        logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
+
+    # --- Public Factory Class Methods (Simplified) ---
+
+    @classmethod
+    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a list of file paths or URLs."""
+        # __init__ can handle List[str] directly now
+        return cls(paths_or_urls, **pdf_options)
+
+    @classmethod
+    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a single glob pattern."""
+        # __init__ can handle single glob string directly
+        return cls(pattern, recursive=recursive, **pdf_options)
+
+    @classmethod
+    def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a list of glob patterns."""
+        # __init__ can handle List[str] containing globs directly
+        return cls(patterns, recursive=recursive, **pdf_options)
+
+    @classmethod
+    def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from PDF files within a directory."""
+        # __init__ can handle single directory string directly
+        return cls(directory_path, recursive=recursive, **pdf_options)
+
+    # --- Core Collection Methods ---
+    def __len__(self) -> int:
+        return len(self._pdfs)
+
+    def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
+        # Use dynamic import here as well
+        PDF = self._get_pdf_class()
+        if isinstance(key, slice):
+            # Create a new collection with the sliced PDFs and original options
+            new_collection = PDFCollection.__new__(PDFCollection)  # Create blank instance
+            new_collection._pdfs = self._pdfs[key]
+            new_collection._pdf_options = self._pdf_options
+            new_collection._recursive = self._recursive
+            # Search context is not copied/inherited anymore
+            return new_collection
+        elif isinstance(key, int):
+            # Check bounds
+            if 0 <= key < len(self._pdfs):
+                return self._pdfs[key]
+            else:
+                raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
+        else:
+            raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
+
+    def __iter__(self):
+        return iter(self._pdfs)
+
+    def __repr__(self) -> str:
+        # Removed search status
+        return f"<PDFCollection(count={len(self)})>"
+
+    @property
+    def pdfs(self) -> List['PDF']:
+        """Returns the list of PDF objects held by the collection."""
+        return self._pdfs
+
+    # --- Other Methods (e.g., apply_ocr - could leverage service in future?) ---
+    def apply_ocr(self, *args, **kwargs):
+        PDF = self._get_pdf_class()
+        # Delegate to individual PDF objects
+        logger.info("Applying OCR to relevant PDFs in collection...")
+        results = []
+        for pdf in self._pdfs:
+            # We need to figure out which pages belong to which PDF if batching here
+            # For now, simpler to call on each PDF
+            try:
+                # Assume apply_ocr exists on PDF and accepts similar args
+                pdf.apply_ocr(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+        return self
+
+    # --- Advanced Method Placeholders ---
+    # Placeholder for categorize removed as find_relevant is now implemented
+
+    def categorize(self, categories: List[str], **kwargs):
+        """Categorizes PDFs in the collection based on content or features."""
+        # Implementation requires integrating with classification models or logic
+        raise NotImplementedError("categorize requires classification implementation.")
+
+    # --- Mixin Required Implementation ---
+    def get_indexable_items(self) -> Iterable[Indexable]:
+        """Yields Page objects from the collection, conforming to Indexable."""
+        if not self._pdfs:
+            return  # Return empty iterator if no PDFs
+
+        for pdf in self._pdfs:
+            if not pdf.pages:  # Handle case where a PDF might have 0 pages after loading
+                logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
+                continue
+            for page in pdf.pages:
+                # Optional: Add filtering here if needed (e.g., skip empty pages)
+                # Assuming Page object conforms to Indexable
+                # We might still want the empty page check here for efficiency
+                # if not page.extract_text(use_exclusions=False).strip():
+                #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
+                #     continue
+                yield page
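Taken together, the new PDFCollection accepts globs, directories, URLs, or explicit paths and behaves like a list of PDF objects. A short usage sketch based on the constructor and dunder methods above; the paths are placeholders:

```python
from natural_pdf import PDFCollection

collection = PDFCollection("reports/**/*.pdf", recursive=True)  # single glob string
# equivalently: PDFCollection.from_directory("reports/") or
#               PDFCollection.from_paths(["a.pdf", "https://example.com/b.pdf"])

print(len(collection))     # __len__
first = collection[0]      # int index returns a PDF
subset = collection[1:3]   # slice returns a new PDFCollection with the same options
for pdf in collection:     # iteration yields PDF objects
    print(pdf.path)

collection.apply_ocr()     # delegates to each PDF's apply_ocr and returns the collection
```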
natural_pdf/core/page.py CHANGED
@@ -7,6 +7,8 @@ from PIL import Image
 import base64
 import io
 import json
+import re
+import hashlib

 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
@@ -96,6 +98,11 @@ class Page:
         """Get page number (1-based)."""
         return self._page.page_number

+    @property
+    def page_number(self) -> int:
+        """Get page number (1-based)."""
+        return self._page.page_number
+
     @property
     def index(self) -> int:
         """Get page index (0-based)."""
@@ -127,7 +134,7 @@ class Page:
         self._exclusions = []
         return self

-    def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
+    def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
         """
         Add an exclusion to the page. Text from these regions will be excluded from extraction.
         Ensures non-callable items are stored as Region objects if possible.
@@ -135,6 +142,7 @@ class Page:
         Args:
             exclusion_func_or_region: Either a callable function returning a Region,
                                       a Region object, or another object with a valid .bbox attribute.
+            label: Optional label for this exclusion (e.g., 'header', 'footer').

         Returns:
             Self for method chaining
@@ -142,28 +150,36 @@ class Page:
         Raises:
             TypeError: If a non-callable, non-Region object without a valid bbox is provided.
         """
+        exclusion_data = None  # Initialize exclusion data
+
         if callable(exclusion_func_or_region):
-            # Store callable functions directly
-            self._exclusions.append(exclusion_func_or_region)
-            logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
+            # Store callable functions along with their label
+            exclusion_data = (exclusion_func_or_region, label)
+            logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
         elif isinstance(exclusion_func_or_region, Region):
-            # Store Region objects directly
-            self._exclusions.append(exclusion_func_or_region)
-            logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
+            # Store Region objects directly, assigning the label
+            exclusion_func_or_region.label = label  # Assign label
+            exclusion_data = (exclusion_func_or_region, label)  # Store as tuple for consistency
+            logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
         elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
             # Convert objects with a valid bbox to a Region before storing
             try:
                 bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
-                region_to_add = Region(self, bbox_coords)
-                self._exclusions.append(region_to_add)
-                logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
+                # Pass the label to the Region constructor
+                region_to_add = Region(self, bbox_coords, label=label)
+                exclusion_data = (region_to_add, label)  # Store as tuple
+                logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
             except (ValueError, TypeError, Exception) as e:
                 # Raise an error if conversion fails
                 raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
         else:
             # Reject invalid types
             raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
-
+
+        # Append the stored data (tuple of object/callable and label)
+        if exclusion_data:
+            self._exclusions.append(exclusion_data)
+
         return self

     def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
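A hedged sketch of how the labelled exclusions above could be used; `page` is assumed to be an already-loaded Page, and the coordinates are arbitrary illustrations rather than values taken from this diff:

```python
from natural_pdf.elements.region import Region

# Static exclusion: stored as (Region, label); the label is also set on the Region.
page.add_exclusion(Region(page, (0, 0, page.width, 50)), label="header")

# Callable exclusion: stored as (callable, label) and evaluated lazily per page.
page.add_exclusion(lambda p: Region(p, (0, p.height - 40, p.width, p.height)),
                   label="footer")

text = page.extract_text()  # text inside excluded regions is dropped from extraction
```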
@@ -222,75 +238,66 @@ class Page:
     def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
         """
         Get all exclusion regions for this page.
-        Assumes self._exclusions contains only callables or Region objects.
+        Assumes self._exclusions contains tuples of (callable/Region, label).

         Args:
             include_callable: Whether to evaluate callable exclusion functions
             debug: Enable verbose debug logging for exclusion evaluation

         Returns:
-            List of Region objects to exclude
+            List of Region objects to exclude, with labels assigned.
         """
         regions = []

-        # Track exclusion results for debugging
         if debug:
             print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
-
-        for i, exclusion in enumerate(self._exclusions):
-            # Get exclusion label if it's a tuple from PDF level
-            exclusion_label = f"exclusion {i}"
-            original_exclusion = exclusion  # Keep track for debugging
-
-            # Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
-            if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
-                exclusion_func, label = exclusion
-                if label:
-                    exclusion_label = label
-                exclusion = exclusion_func  # Use the function part
-
+
+        for i, exclusion_data in enumerate(self._exclusions):
+            # Unpack the exclusion object/callable and its label
+            exclusion_item, label = exclusion_data
+            exclusion_label = label if label else f"exclusion {i}"
+
             # Process callable exclusion functions
-            if callable(exclusion) and include_callable:
-                # It's a function, call it with this page
+            if callable(exclusion_item) and include_callable:
                 try:
                     if debug:
-                        print(f" - Evaluating callable {exclusion_label}...")
-
-                    # Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
-                    # This might be overly cautious depending on use case, but safer.
+                        print(f" - Evaluating callable '{exclusion_label}'...")
+
+                    # Temporarily clear exclusions (consider if really needed)
                     temp_original_exclusions = self._exclusions
-                    self._exclusions = []
-
+                    self._exclusions = []
+
                     # Call the function - Expects it to return a Region or None
-                    region_result = exclusion(self)
-
+                    region_result = exclusion_item(self)
+
                     # Restore exclusions
                     self._exclusions = temp_original_exclusions
-
+
                     if isinstance(region_result, Region):
+                        # Assign the label to the returned region
+                        region_result.label = label
                         regions.append(region_result)
                         if debug:
-                            print(f" ✓ Added region from callable: {region_result}")
+                            print(f" ✓ Added region from callable '{label}': {region_result}")
                     elif region_result:
-                        # Log warning if callable returned something other than Region/None
-                        logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
+                        logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
                         if debug:
                             print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
                     else:
                         if debug:
-                            print(f" ✗ Callable returned None, no region added")
-
+                            print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
+
                 except Exception as e:
-                    error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
+                    error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
                     print(error_msg)
                     import traceback
                     print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
-
-            # Process direct Region objects (already validated by add_exclusion)
-            elif isinstance(exclusion, Region):
-                regions.append(exclusion)
+
+            # Process direct Region objects (label was assigned in add_exclusion)
+            elif isinstance(exclusion_item, Region):
+                regions.append(exclusion_item)  # Label is already on the Region object
                 if debug:
-                    print(f" - Added direct region: {exclusion}")
+                    print(f" - Added direct region '{label}': {exclusion_item}")
             # No else needed, add_exclusion should prevent invalid types

         if debug:
@@ -1067,19 +1074,19 @@ class Page:
         device: Optional[str] = None,
     ) -> List[TextElement]:
         """
-        Apply OCR to THIS page and add results to page elements via PDF.apply_ocr_to_pages.
+        Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.

         Returns:
             List of created TextElements derived from OCR results for this page.
         """
-        if not hasattr(self._parent, 'apply_ocr_to_pages'):
-            logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr_to_pages'. Cannot apply OCR.")
+        if not hasattr(self._parent, 'apply_ocr'):
+            logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
             return []

-        logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr_to_pages.")
+        logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
         try:
             # Delegate to parent PDF, targeting only this page's index
-            self._parent.apply_ocr_to_pages(
+            self._parent.apply_ocr(
                 pages=[self.index],
                 engine=engine, options=options, languages=languages,
                 min_confidence=min_confidence, device=device
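For context, a sketch of the renamed delegation path; the engine name and language code below are illustrative assumptions, not values taken from this diff:

```python
# Page.apply_ocr now forwards to the parent PDF's apply_ocr, scoped to this page.
# "easyocr" and "en" are placeholder arguments for illustration only.
ocr_elements = page.apply_ocr(engine="easyocr", languages=["en"], min_confidence=0.5)

# Roughly equivalent call made on the parent PDF directly:
pdf.apply_ocr(pages=[page.index], engine="easyocr", languages=["en"], min_confidence=0.5)
```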
@@ -1485,25 +1492,46 @@ class Page:
             RuntimeError: If required dependencies (ipywidgets) are missing.
             ValueError: If image rendering or data preparation fails within from_page.
         """
-        # Import the widget class (might need to be moved to top if used elsewhere)
-        from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
+        # Dynamically import here if needed, or ensure it's globally available
+        try:
+            from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
+        except ImportError:
+            logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
+            raise
+
+        # Pass self (the Page object) to the factory method
+        return SimpleInteractiveViewerWidget.from_page(self)
+
+    # --- Indexable Protocol Methods ---
+    def get_id(self) -> str:
+        """Returns a unique identifier for the page (required by Indexable protocol)."""
+        # Ensure path is safe for use in IDs (replace problematic chars)
+        safe_path = re.sub(r'[^a-zA-Z0-9_-]', '_', str(self.pdf.path))
+        return f"pdf_{safe_path}_page_{self.page_number}"
+
+    def get_metadata(self) -> Dict[str, Any]:
+        """Returns metadata associated with the page (required by Indexable protocol)."""
+        # Add content hash here for sync
+        metadata = {
+            "pdf_path": str(self.pdf.path),
+            "page_number": self.page_number,
+            "width": self.width,
+            "height": self.height,
+            "content_hash": self.get_content_hash()  # Include the hash
+        }
+        return metadata

-        logger.info(f"Generating interactive viewer for Page {self.number} using SimpleInteractiveViewerWidget.from_page...")
+    def get_content(self) -> 'Page':
+        """
+        Returns the primary content object (self) for indexing (required by Indexable protocol).
+        SearchService implementations decide how to process this (e.g., call extract_text).
+        """
+        return self  # Return the Page object itself

-        try:
-            # Delegate creation entirely to the from_page class method
-            viewer_widget = SimpleInteractiveViewerWidget.from_page(self)
-            if viewer_widget is None:
-                # This case might happen if from_page had error handling to return None, though we removed most.
-                # Keeping a check here just in case.
-                raise RuntimeError("SimpleInteractiveViewerWidget.from_page returned None, indicating an issue during widget creation.")
-
-            logger.info("Interactive viewer widget created successfully.")
-            return viewer_widget
-        except ImportError as e:
-            logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
-            raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
-        except Exception as e:
-            logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
-            # Re-raise the exception to make it visible to the user
-            raise RuntimeError(f"Failed to create interactive viewer: {e}") from e
+    def get_content_hash(self) -> str:
+        """Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
+        # Hash the extracted text (without exclusions for consistency)
+        # Consider if exclusions should be part of the hash? For now, hash raw text.
+        # Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
+        text_content = self.extract_text(use_exclusions=False, preserve_whitespace=False)  # Normalize whitespace?
+        return hashlib.sha256(text_content.encode('utf-8')).hexdigest()
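Finally, a small sketch of how a search service might call the new Indexable protocol methods; `page` is assumed to be any Page from an opened PDF:

```python
doc_id = page.get_id()        # e.g. "pdf_<sanitized_path>_page_<n>"
meta = page.get_metadata()    # pdf_path, page_number, width, height, content_hash
content = page.get_content()  # the Page itself; the SearchService extracts text from it

# The metadata hash matches a fresh hash of the page text, enabling change detection.
assert meta["content_hash"] == page.get_content_hash()
```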