PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

natural-pdf 0.1.4py3-none-any.whl → 0.1.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +209 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +288 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +413 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +512 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +604 -0
docs/tutorials/12-ocr-integration.md +175 -0
docs/tutorials/13-semantic-search.ipynb +1328 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +50 -33
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/gemini.py +264 -0
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +125 -58
natural_pdf/analyzers/layout/layout_options.py +43 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +89 -45
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +146 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +419 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +1044 -521
natural_pdf/core/pdf.py +516 -313
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +307 -225
natural_pdf/elements/collections.py +805 -543
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +889 -879
natural_pdf/elements/text.py +127 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +57 -35
natural_pdf/ocr/engine.py +150 -46
natural_pdf/ocr/engine_easyocr.py +146 -150
natural_pdf/ocr/engine_paddle.py +118 -175
natural_pdf/ocr/engine_surya.py +78 -141
natural_pdf/ocr/ocr_factory.py +114 -0
natural_pdf/ocr/ocr_manager.py +122 -124
natural_pdf/ocr/ocr_options.py +16 -20
natural_pdf/ocr/utils.py +98 -0
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +32 -0
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/identifiers.py +29 -0
natural_pdf/utils/packaging.py +418 -0
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
natural_pdf-0.1.6.dist-info/RECORD +141 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
natural_pdf/templates/ocr_debug.html +0 -517
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/pdf.py CHANGED Viewed

@@ -1,62 +1,88 @@
-import pdfplumber
+import copy  # Add import for deepcopy
 import logging
-import tempfile
 import os
 import re
+import tempfile
 import urllib.request
-from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable, TYPE_CHECKING # Added Iterable and TYPE_CHECKING
-from pathlib import Path # Added Path
-import copy # Add import for deepcopy
+from pathlib import Path  # Added Path
+from typing import (  # Added Iterable and TYPE_CHECKING
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
+from pathlib import Path
+import pdfplumber
 from PIL import Image
+from natural_pdf.analyzers.layout.layout_manager import (  # Import the new LayoutManager
+    LayoutManager,
+)
+from natural_pdf.core.highlighting_service import HighlightingService  # <-- Import the new service
 from natural_pdf.core.page import Page
-from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
 from natural_pdf.ocr import OCRManager, OCROptions
-from natural_pdf.analyzers.layout.layout_manager import LayoutManager # Import the new LayoutManager
-from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
+from natural_pdf.selectors.parser import parse_selector
 # Import the flag directly - this should always work
 # --- Add Search Service Imports (needed for new methods) ---
 try:
+    from typing import Any as TypingAny  # Import Any if not already
+    from natural_pdf.search import TextSearchOptions  # Keep for ask default
     from natural_pdf.search import (
-        get_search_service,
-        SearchServiceProtocol,
+        BaseSearchOptions,
         SearchOptions,
-        TextSearchOptions, # Keep for ask default
-        BaseSearchOptions
+        SearchServiceProtocol,
+        get_search_service,
     )
-    from typing import Any as TypingAny # Import Any if not already
 except ImportError:
     # Define dummies if needed for type hints within the class
     SearchServiceProtocol = object
     SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
     TypingAny = object
     # Dummy factory needed for default arg in methods
     def get_search_service(**kwargs) -> SearchServiceProtocol:
-        raise ImportError("Search dependencies are not installed. Install with: pip install natural-pdf[search]")
+        raise ImportError(
+            "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
+        )
 # --- End Search Service Imports ---
 # Set up logger early
 logger = logging.getLogger("natural_pdf.core.pdf")
 class PDF:
     """
     Enhanced PDF wrapper built on top of pdfplumber.
     This class provides a fluent interface for working with PDF documents,
     with improved selection, navigation, and extraction capabilities.
     """
-    def __init__(self, path_or_url: str, reading_order: bool = True,
-                 font_attrs: Optional[List[str]] = None,
-                 keep_spaces: bool = True):
+    def __init__(
+        self,
+        path_or_url: str,
+        reading_order: bool = True,
+        font_attrs: Optional[List[str]] = None,
+        keep_spaces: bool = True,
+    ):
         """
         Initialize the enhanced PDF object.
         Args:
             path_or_url: Path to the PDF file or a URL to a PDF
             reading_order: Whether to use natural reading order
@@ -69,30 +95,30 @@ class PDF:
                        False: Break text at spaces, each word is separate (legacy behavior)
         """
         # Check if the input is a URL
-        is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
+        is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
         # Initialize path-related attributes
         self._original_path = path_or_url
         self._temp_file = None
-        self._resolved_path = None # Store the actual path used by pdfplumber
+        self._resolved_path = None  # Store the actual path used by pdfplumber
         if is_url:
             logger.info(f"Downloading PDF from URL: {path_or_url}")
             try:
                 # Create a temporary file to store the downloaded PDF
-                self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+                self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
                 # Download the PDF
                 with urllib.request.urlopen(path_or_url) as response:
                     self._temp_file.write(response.read())
                     self._temp_file.flush()
                     self._temp_file.close()
                 # Use the temporary file path
                 self._resolved_path = self._temp_file.name
                 logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
             except Exception as e:
-                if self._temp_file and hasattr(self._temp_file, 'name'):
+                if self._temp_file and hasattr(self._temp_file, "name"):
                     try:
                         os.unlink(self._temp_file.name)
                     except:
@@ -104,40 +130,46 @@ class PDF:
             self._resolved_path = path_or_url
         logger.info(f"Initializing PDF from {self._resolved_path}")
-        logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
+        logger.debug(
+            f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
+        )
         try:
             self._pdf = pdfplumber.open(self._resolved_path)
         except Exception as e:
-             logger.error(f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}", exc_info=True)
-             # Clean up temp file if creation failed
-             self.close()
-             raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
+            logger.error(
+                f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
+                exc_info=True,
+            )
+            # Clean up temp file if creation failed
+            self.close()
+            raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
-        self._path = self._resolved_path # Keep original path too?
-        self.path = self._resolved_path # Public attribute for the resolved path
-        self.source_path = self._original_path # Public attribute for the user-provided path/URL
+        self._path = self._resolved_path  # Keep original path too?
+        self.path = self._resolved_path  # Public attribute for the resolved path
+        self.source_path = self._original_path  # Public attribute for the user-provided path/URL
         self._reading_order = reading_order
-        self._config = {
-            'keep_spaces': keep_spaces
-        }
+        self._config = {"keep_spaces": keep_spaces}
         self._font_attrs = font_attrs  # Store the font attribute configuration
         # Initialize Managers and Services (conditionally available)
         self._ocr_manager = OCRManager() if OCRManager else None
         self._layout_manager = LayoutManager() if LayoutManager else None
-        self.highlighter = HighlightingService(self)
+        self.highlighter = HighlightingService(self)
         # Initialize pages last, passing necessary refs
-        self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
+        self._pages = [
+            Page(p, parent=self, index=i, font_attrs=font_attrs)
+            for i, p in enumerate(self._pdf.pages)
+        ]
         # Other state
         self._element_cache = {}
         self._exclusions = []  # List to store exclusion functions/regions
         self._regions = []  # List to store region functions/definitions
         logger.info("Initialized HighlightingService.")
         logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
@@ -147,45 +179,48 @@ class PDF:
         return self._pdf.metadata
     @property
-    def pages(self) -> 'PageCollection':
+    def pages(self) -> "PageCollection":
         """Access pages as a PageCollection object."""
         from natural_pdf.elements.collections import PageCollection
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         return PageCollection(self._pages)
-    def clear_exclusions(self) -> 'PDF':
+    def clear_exclusions(self) -> "PDF":
         """
         Clear all exclusion functions from the PDF.
         Returns:
             Self for method chaining
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         self._exclusions = []
         # Also clear from pages
         for page in self._pages:
             page.clear_exclusions()
         return self
-    def add_exclusion(self, exclusion_func: Callable[['Page'], Optional[Region]], label: str = None) -> 'PDF':
+    def add_exclusion(
+        self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
+    ) -> "PDF":
         """
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
         Args:
             exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
             label: Optional label for this exclusion
         Returns:
             Self for method chaining
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         # Store exclusion with its label at PDF level
         exclusion_data = (exclusion_func, label)
@@ -202,12 +237,17 @@ class PDF:
         self,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         engine: Optional[str] = None,
-        options: Optional['OCROptions'] = None,
+        # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None,
+        min_confidence: Optional[float] = None, # Min confidence threshold
         device: Optional[str] = None,
-        # Add other simple mode args if needed
-    ) -> 'PDF':
+        resolution: Optional[int] = None, # DPI for rendering before OCR
+        apply_exclusions: bool = True, # New parameter
+        detect_only: bool = False,
+        # --- Engine-Specific Options --- Use 'options=' for this
+        options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
+        # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
+    ) -> "PDF":
         """
         Applies OCR to specified pages (or all pages) of the PDF using batch processing.
@@ -217,42 +257,54 @@ class PDF:
         Args:
             pages: An iterable of 0-based page indices (list, range, tuple),
                    a slice object, or None to process all pages.
-            engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
-                    Uses manager's default if None. Ignored if 'options' is provided.
-            options: An specific Options object (e.g., EasyOCROptions) for
-                     advanced configuration. Overrides simple arguments.
-            languages: List of language codes for simple mode.
-            min_confidence: Minimum confidence threshold for simple mode.
-            device: Device string ('cpu', 'cuda', etc.) for simple mode.
+            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
+                    Uses manager's default ('easyocr') if None.
+            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
+                       **Must be codes understood by the specific selected engine.**
+                       No mapping is performed. Overrides manager/engine default.
+            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
+                            Overrides manager/engine default.
+            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
+                    Overrides manager/engine default.
+            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
+                        Affects input quality for OCR. Defaults to 150 if not set.
+            apply_exclusions: If True (default), render page image for OCR with
+                              excluded areas masked (whited out). If False, OCR
+                              the raw page image without masking exclusions.
+            detect_only: If True, only detect text bounding boxes, don't perform OCR.
+            options: An engine-specific options object (e.g., EasyOCROptions) or dict
+                     containing parameters specific to the chosen engine.
         Returns:
             Self for method chaining.
         Raises:
-            ValueError: If page indices are invalid or the engine name is invalid.
-            TypeError: If unexpected keyword arguments are provided in simple mode.
+            ValueError: If page indices are invalid.
+            TypeError: If 'options' is not compatible with the engine.
             RuntimeError: If the OCRManager or selected engine is not available.
         """
         if not self._ocr_manager:
-             logger.error("OCRManager not available. Cannot apply OCR.")
-             # Or raise RuntimeError("OCRManager not initialized.")
-             return self
+            logger.error("OCRManager not available. Cannot apply OCR.")
+            # Or raise RuntimeError("OCRManager not initialized.")
+            return self
-        # --- Determine Target Pages ---
+        # --- Determine Target Pages (unchanged) ---
         target_pages: List[Page] = []
         if pages is None:
             target_pages = self._pages
         elif isinstance(pages, slice):
             target_pages = self._pages[pages]
-        elif hasattr(pages, '__iter__'): # Check if it's iterable (list, range, tuple, etc.)
+        elif hasattr(pages, "__iter__"):  # Check if it's iterable (list, range, tuple, etc.)
             try:
                 target_pages = [self._pages[i] for i in pages]
             except IndexError:
                 raise ValueError("Invalid page index provided in 'pages' iterable.")
             except TypeError:
-                 raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+                raise TypeError(
+                    "'pages' must be None, a slice, or an iterable of page indices (int)."
+                )
         else:
-             raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
         if not target_pages:
             logger.warning("No pages selected for OCR processing.")
@@ -260,101 +312,129 @@ class PDF:
         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
+        # --- Determine Rendering Resolution ---
+        # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
+        final_resolution = resolution # Use direct arg if provided
+        if final_resolution is None:
+            final_resolution = getattr(self, "_config", {}).get("resolution", 150)
+        logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
         # --- Render Images for Batch ---
         images_pil: List[Image.Image] = []
-        page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
-        logger.info(f"Rendering {len(target_pages)} pages to images...")
-        failed_page_num = 'unknown' # Keep track of potentially failing page
+        page_image_map: List[Tuple[Page, Image.Image]] = []  # Store page and its image
+        logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
+        failed_page_num = "unknown"  # Keep track of potentially failing page
         try:
-            ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
             for i, page in enumerate(target_pages):
-                failed_page_num = page.number # Update current page number in case of error
+                failed_page_num = page.number  # Update current page number in case of error
                 logger.debug(f"  Rendering page {page.number} (index {page.index})...")
-                # Use page.to_image but ensure highlights are off for OCR base image
-                img = page.to_image(scale=ocr_scale, include_highlights=False)
+                # Use the determined final_resolution and apply exclusions if requested
+                to_image_kwargs = {
+                    "resolution": final_resolution,
+                    "include_highlights": False,
+                    "exclusions": "mask" if apply_exclusions else None,
+                }
+                img = page.to_image(**to_image_kwargs)
+                if img is None:
+                    logger.error(f"  Failed to render page {page.number} to image.")
+                    # Decide how to handle: skip page, raise error? For now, skip.
+                    continue # Skip this page if rendering failed
                 images_pil.append(img)
-                page_image_map.append((page, img)) # Store pair
+                page_image_map.append((page, img))  # Store pair
         except Exception as e:
             logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
-        if not images_pil:
-             logger.error("No images were successfully rendered for batch OCR.")
-             return self
+        if not images_pil or not page_image_map:
+            logger.error("No images were successfully rendered for batch OCR.")
+            return self
         # --- Prepare Arguments for Manager ---
-        manager_args = {'images': images_pil, 'options': options, 'engine': engine}
-        simple_args = {}
-        if languages is not None: simple_args['languages'] = languages
-        if min_confidence is not None: simple_args['min_confidence'] = min_confidence
-        if device is not None: simple_args['device'] = device
-        manager_args.update(simple_args) # Add simple args if options not provided
+        # Pass common args directly, engine-specific via options
+        manager_args = {
+            "images": images_pil,
+            "engine": engine,
+            "languages": languages,
+            "min_confidence": min_confidence, # Use the renamed parameter
+            "device": device,
+            "options": options,
+            "detect_only": detect_only,
+            # Note: resolution is used for rendering, not passed to OCR manager directly
+        }
+        # Filter out None values so manager can use its defaults
+        manager_args = {k: v for k, v in manager_args.items() if v is not None}
         # --- Call OCR Manager for Batch Processing ---
-        logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
+        logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
         try:
-            # The manager's apply_ocr handles the batch input and returns List[List[Dict]]
+            # Manager's apply_ocr signature needs to accept common args directly
             batch_results = self._ocr_manager.apply_ocr(**manager_args)
             if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
-                logger.error(f"OCR Manager returned unexpected result format or length for batch processing. "
-                             f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
-                             f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}.")
-                # Handle error - maybe return early or try processing valid parts?
-                return self # Return self without adding elements
+                logger.error(
+                    f"OCR Manager returned unexpected result format or length for batch processing. "
+                    f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
+                    f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
+                )
+                return self
             logger.info("OCR Manager batch processing complete.")
         except Exception as e:
-             logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
-             return self # Return self without adding elements
+            logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
+            return self
-        # --- Distribute Results and Add Elements to Pages ---
+        # --- Distribute Results and Add Elements to Pages (unchanged) ---
         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
         for i, (page, img) in enumerate(page_image_map):
             results_for_page = batch_results[i]
             if not isinstance(results_for_page, list):
-                 logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
-                 continue
+                logger.warning(
+                    f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
+                )
+                continue
             logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
-            # Use the page's element manager to create elements from its results
-            # Changed from page._create_text_elements_from_ocr to use element_mgr
             try:
-                # Calculate scale factors based on rendered image vs page dims
-                 img_scale_x = page.width / img.width if img.width > 0 else 1
-                 img_scale_y = page.height / img.height if img.height > 0 else 1
-                 elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img_scale_x, img_scale_y)
-                 if elements:
-                      # Note: element_mgr.create_text_elements_from_ocr already adds them
-                      total_elements_added += len(elements)
-                      logger.debug(f"  Added {len(elements)} OCR TextElements to page {page.number}.")
-                 else:
-                      logger.debug(f"  No valid TextElements created for page {page.number}.")
+                img_scale_x = page.width / img.width if img.width > 0 else 1
+                img_scale_y = page.height / img.height if img.height > 0 else 1
+                elements = page._element_mgr.create_text_elements_from_ocr(
+                    results_for_page, img_scale_x, img_scale_y
+                )
+                if elements:
+                    total_elements_added += len(elements)
+                    logger.debug(f"  Added {len(elements)} OCR TextElements to page {page.number}.")
+                else:
+                    logger.debug(f"  No valid TextElements created for page {page.number}.")
             except Exception as e:
-                 logger.error(f"  Error adding OCR elements to page {page.number}: {e}", exc_info=True)
-                 # Continue to next page
+                logger.error(
+                    f"  Error adding OCR elements to page {page.number}: {e}", exc_info=True
+                )
-        logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
+        logger.info(
+            f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
+        )
         return self
-    def add_region(self, region_func: Callable[['Page'], Optional[Region]], name: str = None) -> 'PDF':
+    def add_region(
+        self, region_func: Callable[["Page"], Optional[Region]], name: str = None
+    ) -> "PDF":
         """
         Add a region function to the PDF. This creates regions on all pages using the provided function.
         Args:
             region_func: A function that takes a Page and returns a Region, or None.
             name: Optional name for the region
         Returns:
             Self for method chaining
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         # Store region with its name at PDF level
         region_data = (region_func, name)
@@ -367,93 +447,108 @@ class PDF:
                 region_instance = region_func(page)
                 if region_instance and isinstance(region_instance, Region):
                     # If a valid region is returned, add it to the page
-                    page.add_region(region_instance, name=name, source='named')
+                    page.add_region(region_instance, name=name, source="named")
                 elif region_instance is not None:
-                     logger.warning(f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}")
+                    logger.warning(
+                        f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
+                    )
             except Exception as e:
-                logger.error(f"Error executing or adding region function for page {page.number}: {e}", exc_info=True)
+                logger.error(
+                    f"Error executing or adding region function for page {page.number}: {e}",
+                    exc_info=True,
+                )
         return self
-    def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Optional[Any]:
+    def find(
+        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
+    ) -> Optional[Any]:
         """
         Find the first element matching the selector.
         Args:
             selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
             regex: Whether to use regex for text search in :contains (default: False)
             case: Whether to do case-sensitive text search (default: True)
             **kwargs: Additional filter parameters
         Returns:
             Element object or None if not found
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         selector_obj = parse_selector(selector)
         # Pass regex and case flags to selector function
-        kwargs['regex'] = regex
-        kwargs['case'] = case
-        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs)
+        kwargs["regex"] = regex
+        kwargs["case"] = case
+        results = self._apply_selector(
+            selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
+        )
         return results.first if results else None
-    def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
+    def find_all(
+        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
+    ) -> ElementCollection:
         """
         Find all elements matching the selector.
         Args:
             selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
             regex: Whether to use regex for text search in :contains (default: False)
             case: Whether to do case-sensitive text search (default: True)
             **kwargs: Additional filter parameters
         Returns:
             ElementCollection with matching elements
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         selector_obj = parse_selector(selector)
         # Pass regex and case flags to selector function
-        kwargs['regex'] = regex
-        kwargs['case'] = case
-        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs)
+        kwargs["regex"] = regex
+        kwargs["case"] = case
+        results = self._apply_selector(
+            selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
+        )
         return results
-    def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs) -> ElementCollection:
+    def _apply_selector(
+        self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
+    ) -> ElementCollection:
         """
         Apply selector to PDF elements across all pages.
         Args:
             selector_obj: Parsed selector dictionary
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
             first_only: If True, stop searching after the first match is found.
             **kwargs: Additional filter parameters
         Returns:
             ElementCollection of matching elements
         """
         from natural_pdf.elements.collections import ElementCollection
         # Determine page range to search
-        page_indices = kwargs.get('pages', range(len(self._pages)))
+        page_indices = kwargs.get("pages", range(len(self._pages)))
         if isinstance(page_indices, int):
             page_indices = [page_indices]
         elif isinstance(page_indices, slice):
             page_indices = range(*page_indices.indices(len(self._pages)))
         # Check for cross-page pseudo-classes (currently not supported)
-        for pseudo in selector_obj.get('pseudo_classes', []):
-            if pseudo.get('name') in ('spans', 'continues'):
+        for pseudo in selector_obj.get("pseudo_classes", []):
+            if pseudo.get("name") in ("spans", "continues"):
                 logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
                 return ElementCollection([])
@@ -464,134 +559,148 @@ class PDF:
                 page = self._pages[page_idx]
                 # Pass first_only down to page._apply_selector
                 page_elements_collection = page._apply_selector(
-                    selector_obj,
-                    apply_exclusions=apply_exclusions,
-                    first_only=first_only,
-                    **kwargs
+                    selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
                 )
                 if page_elements_collection:
-                     page_elements = page_elements_collection.elements
-                     all_elements.extend(page_elements)
-                     # If we only need the first match overall, and we found one on this page, stop
-                     if first_only and page_elements:
-                          break # Stop iterating through pages
+                    page_elements = page_elements_collection.elements
+                    all_elements.extend(page_elements)
+                    # If we only need the first match overall, and we found one on this page, stop
+                    if first_only and page_elements:
+                        break  # Stop iterating through pages
             else:
-                 logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
+                logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
         # Create a combined collection
         combined = ElementCollection(all_elements)
         # Sort in document order if requested and not first_only (already sorted by page)
-        if not first_only and kwargs.get('document_order', True):
+        if not first_only and kwargs.get("document_order", True):
             # Check if elements have page, top, x0 before sorting
-            if all(hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0') for el in combined.elements):
-                 combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
+            if all(
+                hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
+                for el in combined.elements
+            ):
+                combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
             else:
-                 # Elements might be Regions without inherent sorting order yet
-                 # Attempt sorting by page index if possible
-                 try:
-                     combined.sort(key=lambda el: el.page.index)
-                 except AttributeError:
-                     logger.warning("Cannot sort elements in document order: Missing required attributes (e.g., page).")
+                # Elements might be Regions without inherent sorting order yet
+                # Attempt sorting by page index if possible
+                try:
+                    combined.sort(key=lambda el: el.page.index)
+                except AttributeError:
+                    logger.warning(
+                        "Cannot sort elements in document order: Missing required attributes (e.g., page)."
+                    )
         return combined
-    def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
-                  use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
+    def extract_text(
+        self,
+        selector: Optional[str] = None,
+        preserve_whitespace=True,
+        use_exclusions=True,
+        debug_exclusions=False,
+        **kwargs,
+    ) -> str:
         """
         Extract text from the entire document or matching elements.
         Args:
             selector: Optional selector to filter elements
             preserve_whitespace: Whether to keep blank characters (default: True)
             use_exclusions: Whether to apply exclusion regions (default: True)
             debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
             **kwargs: Additional extraction parameters
         Returns:
             Extracted text as string
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         # If selector is provided, find elements first
         if selector:
             elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
             return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
         # Otherwise extract from all pages
         if debug_exclusions:
             print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
             print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
         texts = []
         for page in self.pages:
-            texts.append(page.extract_text(
-                preserve_whitespace=preserve_whitespace,
-                use_exclusions=use_exclusions,
-                debug_exclusions=debug_exclusions,
-                **kwargs
-            ))
+            texts.append(
+                page.extract_text(
+                    preserve_whitespace=preserve_whitespace,
+                    use_exclusions=use_exclusions,
+                    debug_exclusions=debug_exclusions,
+                    **kwargs,
+                )
+            )
         if debug_exclusions:
             print(f"PDF: Combined {len(texts)} pages of text")
         return "\n".join(texts)
     def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
         """
         Shorthand for finding elements and extracting their text.
         Args:
             selector: CSS-like selector string
             preserve_whitespace: Whether to keep blank characters (default: True)
             **kwargs: Additional extraction parameters
         Returns:
             Extracted text from matching elements
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
-        return self.extract_text(selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs) # apply_exclusions is handled by find_all in extract_text
-    def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
+        return self.extract_text(
+            selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
+        )  # apply_exclusions is handled by find_all in extract_text
+    def extract_tables(
+        self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
+    ) -> List[Any]:
         """
         Extract tables from the document or matching elements.
         Args:
             selector: Optional selector to filter tables
             merge_across_pages: Whether to merge tables that span across pages
             **kwargs: Additional extraction parameters
         Returns:
             List of extracted tables
         """
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not yet initialized.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
         # TODO: Implement table extraction
         logger.warning("PDF.extract_tables is not fully implemented yet.")
         all_tables = []
         for page in self.pages:
-             # Assuming page.extract_tables(**kwargs) exists or is added
-             if hasattr(page, 'extract_tables'):
-                  all_tables.extend(page.extract_tables(**kwargs))
-             else:
-                  logger.debug(f"Page {page.number} does not have extract_tables method.")
+            # Assuming page.extract_tables(**kwargs) exists or is added
+            if hasattr(page, "extract_tables"):
+                all_tables.extend(page.extract_tables(**kwargs))
+            else:
+                logger.debug(f"Page {page.number} does not have extract_tables method.")
         # Placeholder filtering
         if selector:
             logger.warning("Filtering extracted tables by selector is not implemented.")
             # Would need to parse selector and filter the list `all_tables`
         # Placeholder merging
         if merge_across_pages:
-             logger.warning("Merging tables across pages is not implemented.")
-             # Would need logic to detect and merge related tables
+            logger.warning("Merging tables across pages is not implemented.")
+            # Would need logic to detect and merge related tables
         return all_tables
     # --- New Method: save_searchable ---
-    def save_searchable(self, output_path: Union[str, 'Path'], dpi: int = 300, **kwargs):
+    def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         Saves the PDF with an OCR text layer, making content searchable.
@@ -608,15 +717,6 @@ class PDF:
         # Import moved here, assuming it's always available now
         from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
-        # TODO: Need a reliable way for Page to signal if it has OCR elements.
-        # This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
-        # or checking if page.get_elements(source='ocr') returns anything.
-        # For now, we pass through and let the exporter handle pages without OCR elements.
-        # if not any(page.get_elements(source='ocr') for page in self.pages):
-        #      logger.warning("No OCR elements found on pages. "
-        #                   "Ensure apply_ocr() was called. "
-        #                   "Output PDF might not be searchable.")
         # Convert pathlib.Path to string if necessary
         output_path_str = str(output_path)
@@ -625,15 +725,18 @@ class PDF:
     # --- End New Method ---
-    def ask(self, question: str,
-           mode: str = "extractive",
-           pages: Union[int, List[int], range] = None,
-           min_confidence: float = 0.1,
-           model: str = None,
-           **kwargs) -> Dict[str, Any]:
+    def ask(
+        self,
+        question: str,
+        mode: str = "extractive",
+        pages: Union[int, List[int], range] = None,
+        min_confidence: float = 0.1,
+        model: str = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
         """
         Ask a question about the document content.
         Args:
             question: Question to ask about the document
             mode: "extractive" to extract answer from document, "generative" to generate
@@ -641,16 +744,16 @@ class PDF:
             min_confidence: Minimum confidence threshold for answers
             model: Optional model name for question answering
             **kwargs: Additional parameters passed to the QA engine
         Returns:
             A dictionary containing the answer, confidence, and other metadata.
             Result will have an 'answer' key containing the answer text.
         """
         from natural_pdf.qa import get_qa_engine
         # Initialize or get QA engine
         qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
         # Determine which pages to query
         if pages is None:
             target_pages = list(range(len(self.pages)))
@@ -662,43 +765,40 @@ class PDF:
             target_pages = pages
         else:
             raise ValueError(f"Invalid pages parameter: {pages}")
         # Actually query each page and gather results
         results = []
         for page_idx in target_pages:
             if 0 <= page_idx < len(self.pages):
                 page = self.pages[page_idx]
                 page_result = qa_engine.ask_pdf_page(
-                    page=page,
-                    question=question,
-                    min_confidence=min_confidence,
-                    **kwargs
+                    page=page, question=question, min_confidence=min_confidence, **kwargs
                 )
                 # Add to results if it found an answer
                 if page_result and page_result.get("found", False):
                     results.append(page_result)
         # Sort results by confidence
         results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
         # Return the best result, or a default result if none found
         if results:
             return results[0]
         else:
             # Return a structure indicating no answer found
             return {
-                 "answer": None,
-                 "confidence": 0.0,
-                 "found": False,
-                 "page_num": None, # Or maybe the pages searched?
-                 "source_elements": []
+                "answer": None,
+                "confidence": 0.0,
+                "found": False,
+                "page_num": None,  # Or maybe the pages searched?
+                "source_elements": [],
             }
     def search_within_index(
         self,
         query: Union[str, Path, Image.Image, Region],
-        search_service: SearchServiceProtocol, # Now required
+        search_service: SearchServiceProtocol,  # Now required
         options: Optional[SearchOptions] = None,
     ) -> List[Dict[str, Any]]:
         """
@@ -730,14 +830,16 @@ class PDF:
             RuntimeError: For other search failures.
         """
         if not search_service:
-             raise ValueError("A configured SearchServiceProtocol instance must be provided.")
+            raise ValueError("A configured SearchServiceProtocol instance must be provided.")
         # Optional stricter check:
         # if not isinstance(search_service, SearchServiceProtocol):
         #     raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
         # Get collection name from service for logging
-        collection_name = getattr(search_service, 'collection_name', '<Unknown Collection>')
-        logger.info(f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}.")
+        collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
+        logger.info(
+            f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
+        )
         # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
         # service: SearchServiceProtocol
@@ -748,7 +850,7 @@ class PDF:
         #     factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
         #     # TODO: Pass embedding model from options/pdf config if needed?
         #     service = get_search_service(**factory_args)
-        service = search_service # Use validated provided service
+        service = search_service  # Use validated provided service
         # --- 2. Prepare Query and Options ---
         query_input = query
@@ -757,119 +859,220 @@ class PDF:
         # Handle Region query - extract text for now
         if isinstance(query, Region):
-             logger.debug("Query is a Region object. Extracting text.")
-             if not isinstance(effective_options, TextSearchOptions):
-                 logger.warning("Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction.")
-             query_input = query.extract_text()
-             if not query_input or query_input.isspace():
-                   logger.error("Region has no extractable text for query.")
-                   return []
+            logger.debug("Query is a Region object. Extracting text.")
+            if not isinstance(effective_options, TextSearchOptions):
+                logger.warning(
+                    "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
+                )
+            query_input = query.extract_text()
+            if not query_input or query_input.isspace():
+                logger.error("Region has no extractable text for query.")
+                return []
         # --- 3. Add Filter to Scope Search to THIS PDF ---
         # Assume metadata field 'pdf_path' stores the resolved path used during indexing
         pdf_scope_filter = {
-             "field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
-             "operator": "eq",
-             "value": self.path # Use the resolved path of this PDF instance
+            "field": "pdf_path",  # Or potentially "source_path" depending on indexing metadata
+            "operator": "eq",
+            "value": self.path,  # Use the resolved path of this PDF instance
         }
         logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
         # Combine with existing filters in options (if any)
         if effective_options.filters:
-            logger.debug(f"Combining PDF scope filter with existing filters: {effective_options.filters}")
+            logger.debug(
+                f"Combining PDF scope filter with existing filters: {effective_options.filters}"
+            )
             # Assume filters are compatible with the underlying search service
             # If existing filters aren't already in an AND block, wrap them
-            if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
-                 # Already an AND block, just append the condition
-                 effective_options.filters["conditions"].append(pdf_scope_filter)
+            if (
+                isinstance(effective_options.filters, dict)
+                and effective_options.filters.get("operator") == "AND"
+            ):
+                # Already an AND block, just append the condition
+                effective_options.filters["conditions"].append(pdf_scope_filter)
             elif isinstance(effective_options.filters, list):
-                 # Assume list represents implicit AND conditions
-                 effective_options.filters = {"operator": "AND", "conditions": effective_options.filters + [pdf_scope_filter]}
-            elif isinstance(effective_options.filters, dict): # Single filter dict
-                 effective_options.filters = {"operator": "AND", "conditions": [effective_options.filters, pdf_scope_filter]}
+                # Assume list represents implicit AND conditions
+                effective_options.filters = {
+                    "operator": "AND",
+                    "conditions": effective_options.filters + [pdf_scope_filter],
+                }
+            elif isinstance(effective_options.filters, dict):  # Single filter dict
+                effective_options.filters = {
+                    "operator": "AND",
+                    "conditions": [effective_options.filters, pdf_scope_filter],
+                }
             else:
-                 logger.warning(f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter.")
-                 effective_options.filters = pdf_scope_filter
+                logger.warning(
+                    f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
+                )
+                effective_options.filters = pdf_scope_filter
         else:
             effective_options.filters = pdf_scope_filter
         logger.debug(f"Final filters for service search: {effective_options.filters}")
-        # --- 4. Call SearchService ---
+        # --- 4. Call SearchService ---
         try:
             # Call the service's search method (no collection_name needed)
             results = service.search(
                 query=query_input,
                 options=effective_options,
             )
-            logger.info(f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'.")
+            logger.info(
+                f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
+            )
             return results
         except FileNotFoundError as fnf:
-             logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
-             raise # Re-raise specific error
+            logger.error(
+                f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
+            )
+            raise  # Re-raise specific error
+        except Exception as e:
+            logger.error(
+                f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
+                exc_info=True,
+            )
+            raise RuntimeError(
+                f"Search within index failed for PDF '{self.path}'. See logs for details."
+            ) from e
+    def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
+        """
+        Exports OCR results from this PDF into a correction task package (zip file).
+        Args:
+            output_zip_path: The path to save the output zip file.
+            **kwargs: Additional arguments passed to create_correction_task_package
+                      (e.g., image_render_scale, overwrite).
+        """
+        try:
+            from natural_pdf.utils.packaging import create_correction_task_package
+            create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
+        except ImportError:
+            logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+            # Or raise
         except Exception as e:
-             logger.error(f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}", exc_info=True)
-             raise RuntimeError(f"Search within index failed for PDF '{self.path}'. See logs for details.") from e
+            logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
+            raise # Re-raise the exception from the utility function
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+    ) -> "PDF": # Return self for chaining
+        """
+        Applies corrections to OCR-generated text elements using a callback function,
+        delegating the core work to the `Page.correct_ocr` method.
+        Args:
+            correction_callback: A function that accepts a single argument (an element
+                                object) and returns `Optional[str]`. It returns the
+                                corrected text string if an update is needed, otherwise None.
+            pages: Optional page indices/slice to limit the scope of correction
+                (default: all pages).
+        Returns:
+            Self for method chaining.
+        """
+        # Determine target pages
+        target_page_indices: List[int] = []
+        if pages is None:
+            target_page_indices = list(range(len(self._pages)))
+        elif isinstance(pages, slice):
+            target_page_indices = list(range(*pages.indices(len(self._pages))))
+        elif hasattr(pages, "__iter__"):
+            try:
+                target_page_indices = [int(i) for i in pages]
+                # Validate indices
+                for idx in target_page_indices:
+                    if not (0 <= idx < len(self._pages)):
+                        raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
+            except (IndexError, TypeError, ValueError) as e:
+                raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+        if not target_page_indices:
+            logger.warning("No pages selected for OCR correction.")
+            return self
+        logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
+        # Iterate through target pages and call their correct_ocr method
+        for page_idx in target_page_indices:
+            page = self._pages[page_idx]
+            try:
+                page.correct_ocr(correction_callback=correction_callback)
+            except Exception as e:
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
+                # Optionally re-raise or just log and continue
+        logger.info(f"OCR correction process finished for requested pages.")
+        return self
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
         # Ensure _pages is initialized
-        if not hasattr(self, '_pages'):
+        if not hasattr(self, "_pages"):
             # Return 0 or raise error if not fully initialized? Let's return 0.
-             return 0
+            return 0
         return len(self._pages)
-    def __getitem__(self, key) -> Union[Page, 'PageCollection']: # Return PageCollection for slice
+    def __getitem__(self, key) -> Union[Page, "PageCollection"]:  # Return PageCollection for slice
         """Access pages by index or slice."""
         # Check if self._pages has been initialized
-        if not hasattr(self, '_pages'):
-             raise AttributeError("PDF pages not initialized yet.")
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not initialized yet.")
         if isinstance(key, slice):
-             # Return a PageCollection slice
-             from natural_pdf.elements.collections import PageCollection
-             return PageCollection(self._pages[key])
+            # Return a PageCollection slice
+            from natural_pdf.elements.collections import PageCollection
+            return PageCollection(self._pages[key])
         # Check index bounds before accessing
         if isinstance(key, int):
             if 0 <= key < len(self._pages):
-                 return self._pages[key]
+                return self._pages[key]
             else:
-                 raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
+                raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
         else:
-             raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
+            raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
     def close(self):
         """Close the underlying PDF file and clean up any temporary files."""
-        if hasattr(self, '_pdf') and self._pdf is not None:
-             try:
-                  self._pdf.close()
-                  logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
-             except Exception as e:
-                  logger.warning(f"Error closing pdfplumber object: {e}")
-             finally:
-                  self._pdf = None
+        if hasattr(self, "_pdf") and self._pdf is not None:
+            try:
+                self._pdf.close()
+                logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
+            except Exception as e:
+                logger.warning(f"Error closing pdfplumber object: {e}")
+            finally:
+                self._pdf = None
         # Clean up temporary file if it exists
-        if hasattr(self, '_temp_file') and self._temp_file is not None:
+        if hasattr(self, "_temp_file") and self._temp_file is not None:
             temp_file_path = None
             try:
-                 if hasattr(self._temp_file, 'name') and self._temp_file.name:
-                      temp_file_path = self._temp_file.name
-                      if os.path.exists(temp_file_path):
-                           os.unlink(temp_file_path)
-                           logger.debug(f"Removed temporary PDF file: {temp_file_path}")
+                if hasattr(self._temp_file, "name") and self._temp_file.name:
+                    temp_file_path = self._temp_file.name
+                    if os.path.exists(temp_file_path):
+                        os.unlink(temp_file_path)
+                        logger.debug(f"Removed temporary PDF file: {temp_file_path}")
             except Exception as e:
-                 logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
+                logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
             finally:
-                 self._temp_file = None
+                self._temp_file = None
     def __enter__(self):
         """Context manager entry."""
         return self
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit."""
         self.close()
-# --- Added TYPE_CHECKING import (if not already present) ---
-if TYPE_CHECKING:
-    from pathlib import Path # Assuming Path is used for type hint
+    # --- Indexable Protocol Methods --- Needed for search/sync
+    def get_id(self) -> str:
+        return self.path

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

natural-pdf 0.1.4py3-none-any.whl → 0.1.6py3-none-any.whl