natural-pdf 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -52,4 +52,36 @@ __version__ = "0.1.1"
 if HAS_QA:
     __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
 else:
-    __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
+    __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
+
+# Core classes
+from .core.pdf import PDF
+from .collections.pdf_collection import PDFCollection
+from .elements.region import Region
+
+# Search options (if extras installed)
+try:
+    from .search.search_options import TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
+except ImportError:
+    # Define dummy classes if extras not installed, so imports don't break,
+    # but using them will raise the ImportError from check_haystack_availability
+    class TextSearchOptions:
+        def __init__(self, *args, **kwargs): pass
+    class MultiModalSearchOptions:
+        def __init__(self, *args, **kwargs): pass
+    class BaseSearchOptions:
+        def __init__(self, *args, **kwargs): pass
+
+# Expose logging setup? (Optional)
+# from . import logging_config
+# logging_config.setup_logging()
+
+# Explicitly define what gets imported with 'from natural_pdf import *'
+__all__ = [
+    'PDF',
+    'PDFCollection',
+    'Region',
+    'TextSearchOptions',  # Include search options
+    'MultiModalSearchOptions',
+    'BaseSearchOptions'
+]
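For orientation, a minimal sketch of the top-level API this change re-exports. The file path is a placeholder, and without the search extras installed `TextSearchOptions` is only the no-op dummy defined above.

```python
# Names re-exported by natural_pdf/__init__.py after this release.
# "docs/report.pdf" is an illustrative path, not a file shipped with the package.
from natural_pdf import PDF, PDFCollection, Region, TextSearchOptions

pdf = PDF("docs/report.pdf")   # open a single document
opts = TextSearchOptions()     # no-op dummy unless the search extras are installed
print(type(pdf).__name__, type(opts).__name__)
```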
natural_pdf/collections/pdf_collection.py ADDED
@@ -0,0 +1,259 @@
+import os
+import glob as py_glob
+import logging
+from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
+from pathlib import Path
+from PIL import Image
+import re  # Added for safe path generation
+import copy  # Added for copying options
+from tqdm import tqdm
+
+# Set up logger early
+logger = logging.getLogger(__name__)
+
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.region import Region
+
+# --- Search Imports ---
+try:
+    from natural_pdf.search.search_service_protocol import (
+        SearchServiceProtocol, SearchOptions, Indexable
+    )
+    from natural_pdf.search.searchable_mixin import SearchableMixin
+except ImportError as e:
+    logger_init = logging.getLogger(__name__)
+    logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)
+    # Dummy definitions
+    class SearchableMixin: pass
+    SearchServiceProtocol, SearchOptions, Indexable = object, object, object
+
+from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
+
+class PDFCollection(SearchableMixin):  # Inherit from the mixin
+    def __init__(self,
+                 source: Union[str, Iterable[Union[str, 'PDF']]],
+                 recursive: bool = True,
+                 **pdf_options: Any):
+        """
+        Initializes a collection of PDF documents from various sources.
+
+        Args:
+            source: The source of PDF documents. Can be:
+                - An iterable (e.g., list) of existing PDF objects.
+                - An iterable (e.g., list) of file paths/URLs/globs (strings).
+                - A single file path/URL/directory/glob string.
+            recursive: If source involves directories or glob patterns,
+                       whether to search recursively (default: True).
+            **pdf_options: Keyword arguments passed to the PDF constructor.
+        """
+        self._pdfs: List['PDF'] = []
+        self._pdf_options = pdf_options  # Store options for potential slicing later
+        self._recursive = recursive  # Store setting for potential slicing
+
+        # Dynamically import PDF class within methods to avoid circular import at module load time
+        PDF = self._get_pdf_class()
+
+        if hasattr(source, '__iter__') and not isinstance(source, str):
+            source_list = list(source)
+            if not source_list: return  # Empty list source
+            if isinstance(source_list[0], PDF):
+                if all(isinstance(item, PDF) for item in source_list):
+                    self._pdfs = source_list  # Direct assignment
+                    # Don't adopt search context anymore
+                    return
+                else: raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
+            # If it's an iterable but not PDFs, fall through to resolve sources
+
+        # Resolve string, iterable of strings, or single string source to paths/URLs
+        resolved_paths_or_urls = self._resolve_sources_to_paths(source)
+        self._initialize_pdfs(resolved_paths_or_urls, PDF)  # Pass PDF class
+
+        self._iter_index = 0
+
+        # Initialize internal search service reference
+        self._search_service: Optional[SearchServiceProtocol] = None
+
+    @staticmethod
+    def _get_pdf_class():
+        """Helper method to dynamically import the PDF class."""
+        try:
+            # Import needs to resolve path correctly
+            from natural_pdf.core.pdf import PDF
+            return PDF
+        except ImportError as e:
+            logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
+            raise ImportError("PDF class is required but could not be imported.") from e
+
+    # --- Internal Helpers ---
+
+    def _is_url(self, s: str) -> bool: return s.startswith(('http://', 'https://'))
+    def _has_glob_magic(self, s: str) -> bool: return py_glob.has_magic(s)
+
+    def _execute_glob(self, pattern: str) -> Set[str]:
+        """Glob for paths and return a set of valid PDF paths."""
+        found_paths = set()
+        try:
+            # Use iglob for potentially large directories/matches
+            paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
+            for path_str in paths_iter:
+                # Use Path object for easier checking
+                p = Path(path_str)
+                if p.is_file() and p.suffix.lower() == ".pdf":
+                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
+        except Exception as e:
+            logger.error(f"Error processing glob pattern '{pattern}': {e}")
+        return found_paths
+
+    def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
+        """Resolves various source types into a list of unique PDF paths/URLs."""
+        final_paths = set()
+        sources_to_process = []
+
+        if isinstance(source, str):
+            sources_to_process.append(source)
+        elif hasattr(source, '__iter__'):
+            sources_to_process.extend(list(source))
+        else:  # Should not happen based on __init__ checks, but safeguard
+            raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
+
+        for item in sources_to_process:
+            if not isinstance(item, str):
+                logger.warning(f"Skipping non-string item in source list: {type(item)}")
+                continue
+
+            item_path = Path(item)
+
+            if self._is_url(item):
+                final_paths.add(item)  # Add URL directly
+            elif self._has_glob_magic(item):
+                glob_results = self._execute_glob(item)
+                final_paths.update(glob_results)
+            elif item_path.is_dir():
+                # Use glob to find PDFs in directory, respecting recursive flag
+                dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
+                dir_glob_results = self._execute_glob(dir_pattern)
+                final_paths.update(dir_glob_results)
+            elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
+                final_paths.add(str(item_path.resolve()))  # Add resolved file path
+            else:
+                logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")
+
+        return sorted(list(final_paths))
+
+    def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
+        """Initializes PDF objects from a list of paths/URLs."""
+        logger.info(f"Initializing {len(paths_or_urls)} PDF objects...")
+        failed_count = 0
+        for path_or_url in tqdm(paths_or_urls, desc="Loading PDFs"):
+            try:
+                pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
+                self._pdfs.append(pdf_instance)
+            except Exception as e:
+                logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False)  # Keep log concise
+                failed_count += 1
+        logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
+
+    # --- Public Factory Class Methods (Simplified) ---
+
+    @classmethod
+    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a list of file paths or URLs."""
+        # __init__ can handle List[str] directly now
+        return cls(paths_or_urls, **pdf_options)
+
+    @classmethod
+    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a single glob pattern."""
+        # __init__ can handle single glob string directly
+        return cls(pattern, recursive=recursive, **pdf_options)
+
+    @classmethod
+    def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a list of glob patterns."""
+        # __init__ can handle List[str] containing globs directly
+        return cls(patterns, recursive=recursive, **pdf_options)
+
+    @classmethod
+    def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from PDF files within a directory."""
+        # __init__ can handle single directory string directly
+        return cls(directory_path, recursive=recursive, **pdf_options)
+
+    # --- Core Collection Methods ---
+    def __len__(self) -> int:
+        return len(self._pdfs)
+
+    def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
+        # Use dynamic import here as well
+        PDF = self._get_pdf_class()
+        if isinstance(key, slice):
+            # Create a new collection with the sliced PDFs and original options
+            new_collection = PDFCollection.__new__(PDFCollection)  # Create blank instance
+            new_collection._pdfs = self._pdfs[key]
+            new_collection._pdf_options = self._pdf_options
+            new_collection._recursive = self._recursive
+            # Search context is not copied/inherited anymore
+            return new_collection
+        elif isinstance(key, int):
+            # Check bounds
+            if 0 <= key < len(self._pdfs):
+                return self._pdfs[key]
+            else:
+                raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
+        else:
+            raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
+
+    def __iter__(self):
+        return iter(self._pdfs)
+
+    def __repr__(self) -> str:
+        # Removed search status
+        return f"<PDFCollection(count={len(self)})>"
+
+    @property
+    def pdfs(self) -> List['PDF']:
+        """Returns the list of PDF objects held by the collection."""
+        return self._pdfs
+
+    # --- Other Methods (e.g., apply_ocr - could leverage service in future?) ---
+    def apply_ocr(self, *args, **kwargs):
+        PDF = self._get_pdf_class()
+        # Delegate to individual PDF objects
+        logger.info("Applying OCR to relevant PDFs in collection...")
+        results = []
+        for pdf in self._pdfs:
+            # We need to figure out which pages belong to which PDF if batching here
+            # For now, simpler to call on each PDF
+            try:
+                # Assume apply_ocr exists on PDF and accepts similar args
+                pdf.apply_ocr(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+        return self
+
+    # --- Advanced Method Placeholders ---
+    # Placeholder for categorize removed as find_relevant is now implemented
+
+    def categorize(self, categories: List[str], **kwargs):
+        """Categorizes PDFs in the collection based on content or features."""
+        # Implementation requires integrating with classification models or logic
+        raise NotImplementedError("categorize requires classification implementation.")
+
+    # --- Mixin Required Implementation ---
+    def get_indexable_items(self) -> Iterable[Indexable]:
+        """Yields Page objects from the collection, conforming to Indexable."""
+        if not self._pdfs:
+            return  # Return empty iterator if no PDFs
+
+        for pdf in self._pdfs:
+            if not pdf.pages:  # Handle case where a PDF might have 0 pages after loading
+                logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
+                continue
+            for page in pdf.pages:
+                # Optional: Add filtering here if needed (e.g., skip empty pages)
+                # Assuming Page object conforms to Indexable
+                # We might still want the empty page check here for efficiency
+                # if not page.extract_text(use_exclusions=False).strip():
+                #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
+                #     continue
+                yield page
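Taken together, the new PDFCollection accepts globs, directories, URLs, or explicit paths and behaves like a list of PDF objects. A short usage sketch based on the constructor and dunder methods above; the paths are placeholders:

```python
from natural_pdf import PDFCollection

collection = PDFCollection("reports/**/*.pdf", recursive=True)  # single glob string
# equivalently: PDFCollection.from_directory("reports/") or
#               PDFCollection.from_paths(["a.pdf", "https://example.com/b.pdf"])

print(len(collection))     # __len__
first = collection[0]      # int index returns a PDF
subset = collection[1:3]   # slice returns a new PDFCollection with the same options
for pdf in collection:     # iteration yields PDF objects
    print(pdf.path)

collection.apply_ocr()     # delegates to each PDF's apply_ocr and returns the collection
```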
natural_pdf/core/page.py CHANGED
@@ -7,6 +7,8 @@ from PIL import Image
 import base64
 import io
 import json
+import re
+import hashlib

 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
@@ -96,6 +98,11 @@ class Page:
         """Get page number (1-based)."""
         return self._page.page_number

+    @property
+    def page_number(self) -> int:
+        """Get page number (1-based)."""
+        return self._page.page_number
+
     @property
     def index(self) -> int:
         """Get page index (0-based)."""
@@ -127,7 +134,7 @@ class Page:
         self._exclusions = []
         return self

-    def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
+    def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
         """
         Add an exclusion to the page. Text from these regions will be excluded from extraction.
         Ensures non-callable items are stored as Region objects if possible.
@@ -135,6 +142,7 @@ class Page:
         Args:
             exclusion_func_or_region: Either a callable function returning a Region,
                                       a Region object, or another object with a valid .bbox attribute.
+            label: Optional label for this exclusion (e.g., 'header', 'footer').

         Returns:
             Self for method chaining
@@ -142,28 +150,36 @@ class Page:
         Raises:
             TypeError: If a non-callable, non-Region object without a valid bbox is provided.
         """
+        exclusion_data = None  # Initialize exclusion data
+
         if callable(exclusion_func_or_region):
-            # Store callable functions directly
-            self._exclusions.append(exclusion_func_or_region)
-            logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
+            # Store callable functions along with their label
+            exclusion_data = (exclusion_func_or_region, label)
+            logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
         elif isinstance(exclusion_func_or_region, Region):
-            # Store Region objects directly
-            self._exclusions.append(exclusion_func_or_region)
-            logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
+            # Store Region objects directly, assigning the label
+            exclusion_func_or_region.label = label  # Assign label
+            exclusion_data = (exclusion_func_or_region, label)  # Store as tuple for consistency
+            logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
         elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
             # Convert objects with a valid bbox to a Region before storing
             try:
                 bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
-                region_to_add = Region(self, bbox_coords)
-                self._exclusions.append(region_to_add)
-                logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
+                # Pass the label to the Region constructor
+                region_to_add = Region(self, bbox_coords, label=label)
+                exclusion_data = (region_to_add, label)  # Store as tuple
+                logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
             except (ValueError, TypeError, Exception) as e:
                 # Raise an error if conversion fails
                 raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
         else:
             # Reject invalid types
             raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
-
+
+        # Append the stored data (tuple of object/callable and label)
+        if exclusion_data:
+            self._exclusions.append(exclusion_data)
+
         return self

     def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
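A hedged sketch of how the labelled exclusions above could be used; `page` is assumed to be an already-loaded Page, and the coordinates are arbitrary illustrations rather than values taken from this diff:

```python
from natural_pdf.elements.region import Region

# Static exclusion: stored as (Region, label); the label is also set on the Region.
page.add_exclusion(Region(page, (0, 0, page.width, 50)), label="header")

# Callable exclusion: stored as (callable, label) and evaluated lazily per page.
page.add_exclusion(lambda p: Region(p, (0, p.height - 40, p.width, p.height)),
                   label="footer")

text = page.extract_text()  # text inside excluded regions is dropped from extraction
```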
@@ -222,75 +238,66 @@ class Page:
     def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
         """
         Get all exclusion regions for this page.
-        Assumes self._exclusions contains only callables or Region objects.
+        Assumes self._exclusions contains tuples of (callable/Region, label).

         Args:
             include_callable: Whether to evaluate callable exclusion functions
             debug: Enable verbose debug logging for exclusion evaluation

         Returns:
-            List of Region objects to exclude
+            List of Region objects to exclude, with labels assigned.
         """
         regions = []

-        # Track exclusion results for debugging
         if debug:
             print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
-
-        for i, exclusion in enumerate(self._exclusions):
-            # Get exclusion label if it's a tuple from PDF level
-            exclusion_label = f"exclusion {i}"
-            original_exclusion = exclusion  # Keep track for debugging
-
-            # Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
-            if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
-                exclusion_func, label = exclusion
-                if label:
-                    exclusion_label = label
-                exclusion = exclusion_func  # Use the function part
-
+
+        for i, exclusion_data in enumerate(self._exclusions):
+            # Unpack the exclusion object/callable and its label
+            exclusion_item, label = exclusion_data
+            exclusion_label = label if label else f"exclusion {i}"
+
             # Process callable exclusion functions
-            if callable(exclusion) and include_callable:
-                # It's a function, call it with this page
+            if callable(exclusion_item) and include_callable:
                 try:
                     if debug:
-                        print(f" - Evaluating callable {exclusion_label}...")
-
-                    # Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
-                    # This might be overly cautious depending on use case, but safer.
+                        print(f" - Evaluating callable '{exclusion_label}'...")
+
+                    # Temporarily clear exclusions (consider if really needed)
                     temp_original_exclusions = self._exclusions
-                    self._exclusions = []
-
+                    self._exclusions = []
+
                     # Call the function - Expects it to return a Region or None
-                    region_result = exclusion(self)
-
+                    region_result = exclusion_item(self)
+
                     # Restore exclusions
                     self._exclusions = temp_original_exclusions
-
+
                     if isinstance(region_result, Region):
+                        # Assign the label to the returned region
+                        region_result.label = label
                         regions.append(region_result)
                         if debug:
-                            print(f" ✓ Added region from callable: {region_result}")
+                            print(f" ✓ Added region from callable '{label}': {region_result}")
                     elif region_result:
-                        # Log warning if callable returned something other than Region/None
-                        logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
+                        logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
                         if debug:
                             print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
                     else:
                         if debug:
-                            print(f" ✗ Callable returned None, no region added")
-
+                            print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
+
                 except Exception as e:
-                    error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
+                    error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
                     print(error_msg)
                     import traceback
                     print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
-
-            # Process direct Region objects (already validated by add_exclusion)
-            elif isinstance(exclusion, Region):
-                regions.append(exclusion)
+
+            # Process direct Region objects (label was assigned in add_exclusion)
+            elif isinstance(exclusion_item, Region):
+                regions.append(exclusion_item)  # Label is already on the Region object
                 if debug:
-                    print(f" - Added direct region: {exclusion}")
+                    print(f" - Added direct region '{label}': {exclusion_item}")
             # No else needed, add_exclusion should prevent invalid types

         if debug:
@@ -1067,19 +1074,19 @@ class Page:
         device: Optional[str] = None,
     ) -> List[TextElement]:
         """
-        Apply OCR to THIS page and add results to page elements via PDF.apply_ocr_to_pages.
+        Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.

         Returns:
             List of created TextElements derived from OCR results for this page.
         """
-        if not hasattr(self._parent, 'apply_ocr_to_pages'):
-            logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr_to_pages'. Cannot apply OCR.")
+        if not hasattr(self._parent, 'apply_ocr'):
+            logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
             return []

-        logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr_to_pages.")
+        logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
         try:
             # Delegate to parent PDF, targeting only this page's index
-            self._parent.apply_ocr_to_pages(
+            self._parent.apply_ocr(
                 pages=[self.index],
                 engine=engine, options=options, languages=languages,
                 min_confidence=min_confidence, device=device
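For context, a sketch of the renamed delegation path; the engine name and language code below are illustrative assumptions, not values taken from this diff:

```python
# Page.apply_ocr now forwards to the parent PDF's apply_ocr, scoped to this page.
# "easyocr" and "en" are placeholder arguments for illustration only.
ocr_elements = page.apply_ocr(engine="easyocr", languages=["en"], min_confidence=0.5)

# Roughly equivalent call made on the parent PDF directly:
pdf.apply_ocr(pages=[page.index], engine="easyocr", languages=["en"], min_confidence=0.5)
```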
@@ -1485,25 +1492,46 @@ class Page:
             RuntimeError: If required dependencies (ipywidgets) are missing.
             ValueError: If image rendering or data preparation fails within from_page.
         """
-        # Import the widget class (might need to be moved to top if used elsewhere)
-        from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
+        # Dynamically import here if needed, or ensure it's globally available
+        try:
+            from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
+        except ImportError:
+            logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
+            raise
+
+        # Pass self (the Page object) to the factory method
+        return SimpleInteractiveViewerWidget.from_page(self)
+
+    # --- Indexable Protocol Methods ---
+    def get_id(self) -> str:
+        """Returns a unique identifier for the page (required by Indexable protocol)."""
+        # Ensure path is safe for use in IDs (replace problematic chars)
+        safe_path = re.sub(r'[^a-zA-Z0-9_-]', '_', str(self.pdf.path))
+        return f"pdf_{safe_path}_page_{self.page_number}"
+
+    def get_metadata(self) -> Dict[str, Any]:
+        """Returns metadata associated with the page (required by Indexable protocol)."""
+        # Add content hash here for sync
+        metadata = {
+            "pdf_path": str(self.pdf.path),
+            "page_number": self.page_number,
+            "width": self.width,
+            "height": self.height,
+            "content_hash": self.get_content_hash()  # Include the hash
+        }
+        return metadata

-        logger.info(f"Generating interactive viewer for Page {self.number} using SimpleInteractiveViewerWidget.from_page...")
+    def get_content(self) -> 'Page':
+        """
+        Returns the primary content object (self) for indexing (required by Indexable protocol).
+        SearchService implementations decide how to process this (e.g., call extract_text).
+        """
+        return self  # Return the Page object itself

-        try:
-            # Delegate creation entirely to the from_page class method
-            viewer_widget = SimpleInteractiveViewerWidget.from_page(self)
-            if viewer_widget is None:
-                # This case might happen if from_page had error handling to return None, though we removed most.
-                # Keeping a check here just in case.
-                raise RuntimeError("SimpleInteractiveViewerWidget.from_page returned None, indicating an issue during widget creation.")
-
-            logger.info("Interactive viewer widget created successfully.")
-            return viewer_widget
-        except ImportError as e:
-            logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
-            raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
-        except Exception as e:
-            logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
-            # Re-raise the exception to make it visible to the user
-            raise RuntimeError(f"Failed to create interactive viewer: {e}") from e
+    def get_content_hash(self) -> str:
+        """Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
+        # Hash the extracted text (without exclusions for consistency)
+        # Consider if exclusions should be part of the hash? For now, hash raw text.
+        # Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
+        text_content = self.extract_text(use_exclusions=False, preserve_whitespace=False)  # Normalize whitespace?
+        return hashlib.sha256(text_content.encode('utf-8')).hexdigest()
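Finally, a small sketch of how a search service might call the new Indexable protocol methods; `page` is assumed to be any Page from an opened PDF:

```python
doc_id = page.get_id()        # e.g. "pdf_<sanitized_path>_page_<n>"
meta = page.get_metadata()    # pdf_path, page_number, width, height, content_hash
content = page.get_content()  # the Page itself; the SearchService extracts text from it

# The metadata hash matches a fresh hash of the page text, enabling change detection.
assert meta["content_hash"] == page.get_content_hash()
```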