PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf: 0.1.7 (py3-none-any.whl) → 0.1.9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134)

natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +422 -0
natural_pdf/classification/mixin.py +163 -0
natural_pdf/classification/results.py +80 -0
natural_pdf/collections/mixins.py +111 -0
natural_pdf/collections/pdf_collection.py +434 -15
natural_pdf/core/element_manager.py +83 -0
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +578 -93
natural_pdf/core/pdf.py +912 -460
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +712 -109
natural_pdf/elements/region.py +722 -69
natural_pdf/elements/text.py +4 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +5 -4
natural_pdf/extraction/manager.py +135 -0
natural_pdf/extraction/mixin.py +279 -0
natural_pdf/extraction/result.py +23 -0
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +122 -26
natural_pdf/ocr/ocr_options.py +94 -11
natural_pdf/ocr/utils.py +19 -6
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +431 -230
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +60 -1
natural_pdf/utils/tqdm_utils.py +51 -0
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -915
docs/element-selection/index.md +0 -229
docs/finetuning/index.md +0 -176
docs/index.md +0 -170
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -209
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -194
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -340
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -147
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -114
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -270
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -332
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -288
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -413
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -508
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2434
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -512
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -604
docs/tutorials/12-ocr-integration.md +0 -175
docs/tutorials/13-semantic-search.ipynb +0 -1328
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.7.dist-info/RECORD +0 -145
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/pdf.py CHANGED

@@ -1,11 +1,14 @@
-import copy  # Add import for deepcopy
+import copy
+import io
 import logging
 import os
 import re
 import tempfile
+import threading
+import time
 import urllib.request
-from pathlib import Path  # Added Path
-from typing import (  # Added Iterable and TYPE_CHECKING
+from pathlib import Path
+from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
@@ -16,56 +19,72 @@ from typing import (  # Added Iterable and TYPE_CHECKING
     Tuple,
     Type,
     Union,
+    overload,
 )
-from pathlib import Path
 import pdfplumber
 from PIL import Image
-from natural_pdf.analyzers.layout.layout_manager import (  # Import the new LayoutManager
-    LayoutManager,
-)
-from natural_pdf.core.highlighting_service import HighlightingService  # <-- Import the new service
-from natural_pdf.core.page import Page
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+from natural_pdf.classification.manager import ClassificationError, ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.results import ClassificationResult
+from natural_pdf.core.highlighting_service import HighlightingService
+from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
+from natural_pdf.export.mixin import ExportMixin
+from natural_pdf.extraction.manager import StructuredDataManager
+from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
+from natural_pdf.utils.locks import pdf_render_lock
+from natural_pdf.utils.tqdm_utils import get_tqdm
-# Import the flag directly - this should always work
-# --- Add Search Service Imports (needed for new methods) ---
 try:
-    from typing import Any as TypingAny  # Import Any if not already
+    from typing import Any as TypingAny
-    from natural_pdf.search import TextSearchOptions  # Keep for ask default
     from natural_pdf.search import (
         BaseSearchOptions,
         SearchOptions,
         SearchServiceProtocol,
+        TextSearchOptions,
         get_search_service,
     )
 except ImportError:
-    # Define dummies if needed for type hints within the class
     SearchServiceProtocol = object
     SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
     TypingAny = object
-    # Dummy factory needed for default arg in methods
     def get_search_service(**kwargs) -> SearchServiceProtocol:
         raise ImportError(
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
-# --- End Search Service Imports ---
-# Set up logger early
 logger = logging.getLogger("natural_pdf.core.pdf")
+tqdm = get_tqdm()
+DEFAULT_MANAGERS = {
+    "classification": ClassificationManager,
+    "structured_data": StructuredDataManager,
+}
-class PDF:
+# Deskew Imports (Conditional)
+import numpy as np
+from PIL import Image
+try:
+    import img2pdf
+    from deskew import determine_skew
+    DESKEW_AVAILABLE = True
+except ImportError:
+    DESKEW_AVAILABLE = False
+    img2pdf = None
+# End Deskew Imports
+class PDF(ExtractionMixin, ExportMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.
@@ -75,7 +94,7 @@ class PDF:
     def __init__(
         self,
-        path_or_url: str,
+        path_or_url_or_stream,
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
@@ -84,95 +103,132 @@ class PDF:
         Initialize the enhanced PDF object.
         Args:
-            path_or_url: Path to the PDF file or a URL to a PDF
+            path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
             reading_order: Whether to use natural reading order
-            font_attrs: Font attributes to consider when grouping characters into words.
-                       Default: ['fontname', 'size'] (Group by font name and size)
-                       None: Only consider spatial relationships
-                       List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
-            keep_spaces: Whether to include spaces in word elements (default: True).
-                       True: Spaces are part of words, better for multi-word searching
-                       False: Break text at spaces, each word is separate (legacy behavior)
+            font_attrs: Font attributes for grouping characters into words
+            keep_spaces: Whether to include spaces in word elements
         """
-        # Check if the input is a URL
-        is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
-        # Initialize path-related attributes
-        self._original_path = path_or_url
+        self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
-        self._resolved_path = None  # Store the actual path used by pdfplumber
-        if is_url:
-            logger.info(f"Downloading PDF from URL: {path_or_url}")
-            try:
-                # Create a temporary file to store the downloaded PDF
-                self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
-                # Download the PDF
-                with urllib.request.urlopen(path_or_url) as response:
-                    self._temp_file.write(response.read())
-                    self._temp_file.flush()
-                    self._temp_file.close()
-                # Use the temporary file path
-                self._resolved_path = self._temp_file.name
-                logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
-            except Exception as e:
-                if self._temp_file and hasattr(self._temp_file, "name"):
-                    try:
-                        os.unlink(self._temp_file.name)
-                    except:
-                        pass
-                logger.error(f"Failed to download PDF from URL: {e}")
-                raise ValueError(f"Failed to download PDF from URL: {e}")
+        self._resolved_path = None
+        self._is_stream = False
+        stream_to_open = None
+        if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
+            logger.info("Initializing PDF from in-memory stream.")
+            self._is_stream = True
+            self._resolved_path = None  # No resolved file path for streams
+            self.source_path = "<stream>"  # Identifier for source
+            self.path = self.source_path  # Use source identifier as path for streams
+            stream_to_open = path_or_url_or_stream
+        elif isinstance(path_or_url_or_stream, (str, Path)):
+            path_or_url = str(path_or_url_or_stream)
+            self.source_path = path_or_url  # Store original path/URL as source
+            is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
+            if is_url:
+                logger.info(f"Downloading PDF from URL: {path_or_url}")
+                try:
+                    # Use a context manager for the temporary file
+                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
+                        self._temp_file = temp_f  # Store reference if needed for cleanup
+                        with urllib.request.urlopen(path_or_url) as response:
+                            temp_f.write(response.read())
+                            temp_f.flush()
+                        self._resolved_path = temp_f.name
+                        logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
+                        stream_to_open = self._resolved_path
+                except Exception as e:
+                    if self._temp_file and hasattr(self._temp_file, "name"):
+                        try:
+                            os.unlink(self._temp_file.name)
+                        except:  # noqa E722
+                            pass
+                    logger.error(f"Failed to download PDF from URL: {e}")
+                    raise ValueError(f"Failed to download PDF from URL: {e}")
+            else:
+                self._resolved_path = str(Path(path_or_url).resolve())  # Resolve local paths
+                stream_to_open = self._resolved_path
+            self.path = self._resolved_path  # Use resolved path for file-based PDFs
         else:
-            # Use the provided path directly
-            self._resolved_path = path_or_url
+            raise TypeError(
+                f"Invalid input type: {type(path_or_url_or_stream)}. "
+                f"Expected path (str/Path), URL (str), or file-like object."
+            )
-        logger.info(f"Initializing PDF from {self._resolved_path}")
+        logger.info(f"Opening PDF source: {self.source_path}")
         logger.debug(
             f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
         )
         try:
-            self._pdf = pdfplumber.open(self._resolved_path)
+            self._pdf = pdfplumber.open(stream_to_open)
         except Exception as e:
-            logger.error(
-                f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
-                exc_info=True,
-            )
-            # Clean up temp file if creation failed
-            self.close()
-            raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
-        self._path = self._resolved_path  # Keep original path too?
-        self.path = self._resolved_path  # Public attribute for the resolved path
-        self.source_path = self._original_path  # Public attribute for the user-provided path/URL
+            logger.error(f"Failed to open PDF: {e}", exc_info=True)
+            self.close()  # Attempt cleanup if opening fails
+            raise IOError(f"Failed to open PDF source: {self.source_path}") from e
+        # Store configuration used for initialization
         self._reading_order = reading_order
         self._config = {"keep_spaces": keep_spaces}
+        self._font_attrs = font_attrs
-        self._font_attrs = font_attrs  # Store the font attribute configuration
-        # Initialize Managers and Services (conditionally available)
         self._ocr_manager = OCRManager() if OCRManager else None
         self._layout_manager = LayoutManager() if LayoutManager else None
         self.highlighter = HighlightingService(self)
+        # self._classification_manager_instance = ClassificationManager() # Removed this line
+        self._manager_registry = {}
+        from natural_pdf.core.page import Page
-        # Initialize pages last, passing necessary refs
         self._pages = [
             Page(p, parent=self, index=i, font_attrs=font_attrs)
             for i, p in enumerate(self._pdf.pages)
         ]
-        # Other state
         self._element_cache = {}
-        self._exclusions = []  # List to store exclusion functions/regions
-        self._regions = []  # List to store region functions/definitions
+        self._exclusions = []
+        self._regions = []
-        logger.info("Initialized HighlightingService.")
         logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
+        self._initialize_managers()
+        self._initialize_highlighter()
+    def _initialize_managers(self):
+        """Initialize manager instances based on DEFAULT_MANAGERS."""
+        self._managers = {}
+        for key, manager_class in DEFAULT_MANAGERS.items():
+            try:
+                self._managers[key] = manager_class()
+                logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
+            except Exception as e:
+                logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
+                self._managers[key] = None
+    def get_manager(self, key: str) -> Any:
+        """Retrieve a manager instance by its key."""
+        if key not in self._managers:
+            raise KeyError(
+                f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
+            )
+        manager_instance = self._managers.get(key)
+        if manager_instance is None:
+            manager_class = DEFAULT_MANAGERS.get(key)
+            if manager_class:
+                raise RuntimeError(
+                    f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
+                )
+            else:
+                raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
+        return manager_instance
+    def _initialize_highlighter(self):
+        pass
     @property
     def metadata(self) -> Dict[str, Any]:
         """Access metadata as a dictionary."""
@@ -183,7 +239,6 @@ class PDF:
         """Access pages as a PageCollection object."""
         from natural_pdf.elements.collections import PageCollection
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         return PageCollection(self._pages)
@@ -195,12 +250,10 @@ class PDF:
         Returns:
             Self for method chaining
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         self._exclusions = []
-        # Also clear from pages
         for page in self._pages:
             page.clear_exclusions()
         return self
@@ -212,99 +265,90 @@ class PDF:
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
         Args:
-            exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             label: Optional label for this exclusion
         Returns:
             Self for method chaining
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # Store exclusion with its label at PDF level
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        # Apply this exclusion to all pages
         for page in self._pages:
-            # We pass the original function, Page.add_exclusion handles calling it
             page.add_exclusion(exclusion_func, label=label)
         return self
     def apply_ocr(
         self,
-        pages: Optional[Union[Iterable[int], range, slice]] = None,
         engine: Optional[str] = None,
-        # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None,  # Min confidence threshold
+        min_confidence: Optional[float] = None,
         device: Optional[str] = None,
-        resolution: Optional[int] = None,  # DPI for rendering before OCR
-        apply_exclusions: bool = True,  # New parameter
+        resolution: Optional[int] = None,
+        apply_exclusions: bool = True,
         detect_only: bool = False,
-        # --- Engine-Specific Options --- Use 'options=' for this
-        options: Optional[Any] = None,  # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
-        # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
+        replace: bool = True,
+        options: Optional[Any] = None,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
     ) -> "PDF":
         """
-        Applies OCR to specified pages (or all pages) of the PDF using batch processing.
-        This method renders the specified pages to images, sends them as a batch
-        to the OCRManager, and adds the resulting TextElements to each respective page.
+        Applies OCR to specified pages of the PDF using batch processing.
+        Applies OCR to specified pages of the PDF using batch processing.
         Args:
-            pages: An iterable of 0-based page indices (list, range, tuple),
-                   a slice object, or None to process all pages.
-            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
-                    Uses manager's default ('easyocr') if None.
-            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
-                       **Must be codes understood by the specific selected engine.**
-                       No mapping is performed. Overrides manager/engine default.
-            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
-                            Overrides manager/engine default.
-            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
-                    Overrides manager/engine default.
-            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
-                        Affects input quality for OCR. Defaults to 150 if not set.
-            apply_exclusions: If True (default), render page image for OCR with
-                              excluded areas masked (whited out). If False, OCR
-                              the raw page image without masking exclusions.
-            detect_only: If True, only detect text bounding boxes, don't perform OCR.
-            options: An engine-specific options object (e.g., EasyOCROptions) or dict
-                     containing parameters specific to the chosen engine.
+            engine: Name of the OCR engine
+            languages: List of language codes
+            min_confidence: Minimum confidence threshold
+            device: Device to run OCR on
+            resolution: DPI resolution for page images
+            apply_exclusions: Whether to mask excluded areas
+            detect_only: If True, only detect text boxes
+            replace: Whether to replace existing OCR elements
+            options: Engine-specific options
+            pages: Page indices to process or None for all pages
+            engine: Name of the OCR engine
+            languages: List of language codes
+            min_confidence: Minimum confidence threshold
+            device: Device to run OCR on
+            resolution: DPI resolution for page images
+            apply_exclusions: Whether to mask excluded areas
+            detect_only: If True, only detect text boxes
+            replace: Whether to replace existing OCR elements
+            options: Engine-specific options
+            pages: Page indices to process or None for all pages
         Returns:
-            Self for method chaining.
-        Raises:
-            ValueError: If page indices are invalid.
-            TypeError: If 'options' is not compatible with the engine.
-            RuntimeError: If the OCRManager or selected engine is not available.
+            Self for method chaining
+            Self for method chaining
         """
         if not self._ocr_manager:
             logger.error("OCRManager not available. Cannot apply OCR.")
-            # Or raise RuntimeError("OCRManager not initialized.")
             return self
-        # --- Determine Target Pages (unchanged) ---
-        target_pages: List[Page] = []
+        thread_id = threading.current_thread().name
+        logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
+        target_pages = []
+        target_pages = []
         if pages is None:
             target_pages = self._pages
         elif isinstance(pages, slice):
             target_pages = self._pages[pages]
-        elif hasattr(pages, "__iter__"):  # Check if it's iterable (list, range, tuple, etc.)
+        elif hasattr(pages, "__iter__"):
             try:
                 target_pages = [self._pages[i] for i in pages]
             except IndexError:
                 raise ValueError("Invalid page index provided in 'pages' iterable.")
             except TypeError:
-                raise TypeError(
-                    "'pages' must be None, a slice, or an iterable of page indices (int)."
-                )
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         else:
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         if not target_pages:
             logger.warning("No pages selected for OCR processing.")
@@ -312,26 +356,20 @@ class PDF:
         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
-        # --- Determine Rendering Resolution ---
-        # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
-        final_resolution = resolution  # Use direct arg if provided
-        if final_resolution is None:
-            final_resolution = getattr(self, "_config", {}).get("resolution", 150)
-        logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
+        final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
+        logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
+        images_pil = []
+        page_image_map = []
+        logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
+        failed_page_num = "unknown"
+        render_start_time = time.monotonic()
-        # --- Render Images for Batch ---
-        images_pil: List[Image.Image] = []
-        page_image_map: List[Tuple[Page, Image.Image]] = []  # Store page and its image
-        logger.info(
-            f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
-        )
-        failed_page_num = "unknown"  # Keep track of potentially failing page
         try:
-            for i, page in enumerate(target_pages):
-                failed_page_num = page.number  # Update current page number in case of error
+            for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
+                failed_page_num = page.number
                 logger.debug(f"  Rendering page {page.number} (index {page.index})...")
-                # Use the determined final_resolution and apply exclusions if requested
                 to_image_kwargs = {
                     "resolution": final_resolution,
                     "include_highlights": False,
@@ -340,58 +378,64 @@ class PDF:
                 img = page.to_image(**to_image_kwargs)
                 if img is None:
                     logger.error(f"  Failed to render page {page.number} to image.")
-                    # Decide how to handle: skip page, raise error? For now, skip.
-                    continue  # Skip this page if rendering failed
+                    continue
+                    continue
                 images_pil.append(img)
-                page_image_map.append((page, img))  # Store pair
+                page_image_map.append((page, img))
         except Exception as e:
-            logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
+            logger.error(f"Failed to render pages for batch OCR: {e}")
+            logger.error(f"Failed to render pages for batch OCR: {e}")
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
+        render_end_time = time.monotonic()
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )
         if not images_pil or not page_image_map:
             logger.error("No images were successfully rendered for batch OCR.")
             return self
-        # --- Prepare Arguments for Manager ---
-        # Pass common args directly, engine-specific via options
         manager_args = {
             "images": images_pil,
             "engine": engine,
             "languages": languages,
-            "min_confidence": min_confidence,  # Use the renamed parameter
+            "min_confidence": min_confidence,
+            "min_confidence": min_confidence,
             "device": device,
             "options": options,
             "detect_only": detect_only,
-            # Note: resolution is used for rendering, not passed to OCR manager directly
         }
-        # Filter out None values so manager can use its defaults
         manager_args = {k: v for k, v in manager_args.items() if v is not None}
-        # --- Call OCR Manager for Batch Processing ---
-        logger.info(
-            f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
-        )
+        ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
+        logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
+        logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
+        ocr_start_time = time.monotonic()
         try:
-            # Manager's apply_ocr signature needs to accept common args directly
             batch_results = self._ocr_manager.apply_ocr(**manager_args)
             if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
-                logger.error(
-                    f"OCR Manager returned unexpected result format or length for batch processing. "
-                    f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
-                    f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
-                )
+                logger.error(f"OCR Manager returned unexpected result format or length.")
                 return self
             logger.info("OCR Manager batch processing complete.")
         except Exception as e:
-            logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
+            logger.error(f"Batch OCR processing failed: {e}")
             return self
-        # --- Distribute Results and Add Elements to Pages (unchanged) ---
+        ocr_end_time = time.monotonic()
+        logger.debug(
+            f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
+        )
         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
         for i, (page, img) in enumerate(page_image_map):
             results_for_page = batch_results[i]
             if not isinstance(results_for_page, list):
@@ -402,6 +446,9 @@ class PDF:
             logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
             try:
+                if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
+                    page._element_mgr.remove_ocr_elements()
                 img_scale_x = page.width / img.width if img.width > 0 else 1
                 img_scale_y = page.height / img.height if img.height > 0 else 1
                 elements = page._element_mgr.create_text_elements_from_ocr(
@@ -414,188 +461,225 @@ class PDF:
                 else:
                     logger.debug(f"  No valid TextElements created for page {page.number}.")
             except Exception as e:
-                logger.error(
-                    f"  Error adding OCR elements to page {page.number}: {e}", exc_info=True
-                )
+                logger.error(f"  Error adding OCR elements to page {page.number}: {e}")
-        logger.info(
-            f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
-        )
+        logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
         return self
     def add_region(
         self, region_func: Callable[["Page"], Optional[Region]], name: str = None
     ) -> "PDF":
         """
-        Add a region function to the PDF. This creates regions on all pages using the provided function.
+        Add a region function to the PDF.
         Args:
-            region_func: A function that takes a Page and returns a Region, or None.
+            region_func: A function that takes a Page and returns a Region, or None
+            region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region
         Returns:
             Self for method chaining
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # Store region with its name at PDF level
         region_data = (region_func, name)
         self._regions.append(region_data)
-        # Apply this region to all pages
         for page in self._pages:
             try:
-                # Call the function to get the region for this specific page
                 region_instance = region_func(page)
                 if region_instance and isinstance(region_instance, Region):
-                    # If a valid region is returned, add it to the page
                     page.add_region(region_instance, name=name, source="named")
                 elif region_instance is not None:
                     logger.warning(
-                        f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
+                        f"Region function did not return a valid Region for page {page.number}"
                     )
             except Exception as e:
-                logger.error(
-                    f"Error executing or adding region function for page {page.number}: {e}",
-                    exc_info=True,
-                )
+                logger.error(f"Error adding region for page {page.number}: {e}")
         return self
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
     def find(
-        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> Optional[Any]:
         """
-        Find the first element matching the selector.
+        Find the first element matching the selector OR text content across all pages.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
-            **kwargs: Additional filter parameters
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
         Returns:
-            Element object or None if not found
+            Element object or None if not found.
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        selector_obj = parse_selector(selector)
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
-        # Pass regex and case flags to selector function
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")
+        selector_obj = parse_selector(effective_selector)
         kwargs["regex"] = regex
         kwargs["case"] = case
-        results = self._apply_selector(
-            selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
-        )
-        return results.first if results else None
+        # Search page by page
+        for page in self.pages:
+            # Note: _apply_selector is on Page, so we call find directly here
+            # We pass the constructed/validated effective_selector
+            element = page.find(
+                selector=effective_selector,  # Use the processed selector
+                apply_exclusions=apply_exclusions,
+                regex=regex,  # Pass down flags
+                case=case,
+                **kwargs,
+            )
+            if element:
+                return element
+        return None  # Not found on any page
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    @overload
     def find_all(
-        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
-    ) -> ElementCollection:
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        Find all elements matching the selector.
+        Find all elements matching the selector OR text content across all pages.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
-            **kwargs: Additional filter parameters
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
         Returns:
-            ElementCollection with matching elements
+            ElementCollection with matching elements.
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        selector_obj = parse_selector(selector)
-        # Pass regex and case flags to selector function
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-        results = self._apply_selector(
-            selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
-        )
-        return results
-    def _apply_selector(
-        self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
-    ) -> ElementCollection:
-        """
-        Apply selector to PDF elements across all pages.
-        Args:
-            selector_obj: Parsed selector dictionary
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            first_only: If True, stop searching after the first match is found.
-            **kwargs: Additional filter parameters
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
-        Returns:
-            ElementCollection of matching elements
-        """
-        from natural_pdf.elements.collections import ElementCollection
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")
-        # Determine page range to search
-        page_indices = kwargs.get("pages", range(len(self._pages)))
-        if isinstance(page_indices, int):
-            page_indices = [page_indices]
-        elif isinstance(page_indices, slice):
-            page_indices = range(*page_indices.indices(len(self._pages)))
+        # Instead of parsing here, let each page parse and apply
+        # This avoids parsing the same selector multiple times if not needed
+        # selector_obj = parse_selector(effective_selector)
-        # Check for cross-page pseudo-classes (currently not supported)
-        for pseudo in selector_obj.get("pseudo_classes", []):
-            if pseudo.get("name") in ("spans", "continues"):
-                logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
-                return ElementCollection([])
+        # kwargs["regex"] = regex # Removed: Already passed explicitly
+        # kwargs["case"] = case   # Removed: Already passed explicitly
-        # Regular case: collect elements from each page
         all_elements = []
-        for page_idx in page_indices:
-            if 0 <= page_idx < len(self._pages):
-                page = self._pages[page_idx]
-                # Pass first_only down to page._apply_selector
-                page_elements_collection = page._apply_selector(
-                    selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
-                )
-                if page_elements_collection:
-                    page_elements = page_elements_collection.elements
-                    all_elements.extend(page_elements)
-                    # If we only need the first match overall, and we found one on this page, stop
-                    if first_only and page_elements:
-                        break  # Stop iterating through pages
-            else:
-                logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
-        # Create a combined collection
-        combined = ElementCollection(all_elements)
+        for page in self.pages:
+            # Call page.find_all with the effective selector and flags
+            page_elements = page.find_all(
+                selector=effective_selector,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
+            if page_elements:
+                all_elements.extend(page_elements.elements)
-        # Sort in document order if requested and not first_only (already sorted by page)
-        if not first_only and kwargs.get("document_order", True):
-            # Check if elements have page, top, x0 before sorting
-            if all(
-                hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
-                for el in combined.elements
-            ):
-                combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
-            else:
-                # Elements might be Regions without inherent sorting order yet
-                # Attempt sorting by page index if possible
-                try:
-                    combined.sort(key=lambda el: el.page.index)
-                except AttributeError:
-                    logger.warning(
-                        "Cannot sort elements in document order: Missing required attributes (e.g., page)."
-                    )
+        from natural_pdf.elements.collections import ElementCollection
-        return combined
+        return ElementCollection(all_elements)
     def extract_text(
         self,
@@ -610,24 +694,24 @@ class PDF:
         Args:
             selector: Optional selector to filter elements
-            preserve_whitespace: Whether to keep blank characters (default: True)
-            use_exclusions: Whether to apply exclusion regions (default: True)
-            debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
+            preserve_whitespace: Whether to keep blank characters
+            use_exclusions: Whether to apply exclusion regions
+            debug_exclusions: Whether to output detailed debugging for exclusions
+            preserve_whitespace: Whether to keep blank characters
+            use_exclusions: Whether to apply exclusion regions
+            debug_exclusions: Whether to output detailed debugging for exclusions
             **kwargs: Additional extraction parameters
         Returns:
             Extracted text as string
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # If selector is provided, find elements first
         if selector:
             elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
             return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
-        # Otherwise extract from all pages
         if debug_exclusions:
             print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
             print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
@@ -648,25 +732,6 @@ class PDF:
         return "\n".join(texts)
-    def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
-        """
-        Shorthand for finding elements and extracting their text.
-        Args:
-            selector: CSS-like selector string
-            preserve_whitespace: Whether to keep blank characters (default: True)
-            **kwargs: Additional extraction parameters
-        Returns:
-            Extracted text from matching elements
-        """
-        # Ensure _pages is initialized
-        if not hasattr(self, "_pages"):
-            raise AttributeError("PDF pages not yet initialized.")
-        return self.extract_text(
-            selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
-        )  # apply_exclusions is handled by find_all in extract_text
     def extract_tables(
         self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
     ) -> List[Any]:
@@ -681,54 +746,46 @@ class PDF:
         Returns:
             List of extracted tables
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # TODO: Implement table extraction
         logger.warning("PDF.extract_tables is not fully implemented yet.")
         all_tables = []
         for page in self.pages:
-            # Assuming page.extract_tables(**kwargs) exists or is added
             if hasattr(page, "extract_tables"):
                 all_tables.extend(page.extract_tables(**kwargs))
             else:
                 logger.debug(f"Page {page.number} does not have extract_tables method.")
-        # Placeholder filtering
         if selector:
             logger.warning("Filtering extracted tables by selector is not implemented.")
-            # Would need to parse selector and filter the list `all_tables`
-        # Placeholder merging
         if merge_across_pages:
             logger.warning("Merging tables across pages is not implemented.")
-            # Would need logic to detect and merge related tables
         return all_tables
-    # --- New Method: save_searchable ---
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         Saves the PDF with an OCR text layer, making content searchable.
         Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
-        Note: OCR must have been applied to the pages beforehand
-              (e.g., using pdf.apply_ocr()).
         Args:
-            output_path: Path to save the searchable PDF.
-            dpi: Resolution for rendering and OCR overlay (default 300).
-            **kwargs: Additional keyword arguments passed to the exporter.
+            output_path: Path to save the searchable PDF
+            dpi: Resolution for rendering and OCR overlay
+            **kwargs: Additional keyword arguments passed to the exporter
+            output_path: Path to save the searchable PDF
+            dpi: Resolution for rendering and OCR overlay
+            **kwargs: Additional keyword arguments passed to the exporter
         """
-        # Import moved here, assuming it's always available now
         from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
-        # Convert pathlib.Path to string if necessary
         output_path_str = str(output_path)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
         logger.info(f"Searchable PDF saved to: {output_path_str}")
-    # --- End New Method ---
     def ask(
         self,
         question: str,
@@ -750,27 +807,22 @@ class PDF:
             **kwargs: Additional parameters passed to the QA engine
         Returns:
-            A dictionary containing the answer, confidence, and other metadata.
-            Result will have an 'answer' key containing the answer text.
+            A dictionary containing the answer, confidence, and other metadata
+            A dictionary containing the answer, confidence, and other metadata
         """
         from natural_pdf.qa import get_qa_engine
-        # Initialize or get QA engine
         qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
-        # Determine which pages to query
         if pages is None:
             target_pages = list(range(len(self.pages)))
         elif isinstance(pages, int):
-            # Single page
             target_pages = [pages]
         elif isinstance(pages, (list, range)):
-            # List or range of pages
             target_pages = pages
         else:
             raise ValueError(f"Invalid pages parameter: {pages}")
-        # Actually query each page and gather results
         results = []
         for page_idx in target_pages:
             if 0 <= page_idx < len(self.pages):
@@ -779,136 +831,110 @@ class PDF:
                     page=page, question=question, min_confidence=min_confidence, **kwargs
                 )
-                # Add to results if it found an answer
                 if page_result and page_result.get("found", False):
                     results.append(page_result)
-        # Sort results by confidence
         results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
-        # Return the best result, or a default result if none found
         if results:
             return results[0]
         else:
-            # Return a structure indicating no answer found
             return {
                 "answer": None,
                 "confidence": 0.0,
                 "found": False,
-                "page_num": None,  # Or maybe the pages searched?
+                "page_num": None,
                 "source_elements": [],
             }
     def search_within_index(
         self,
         query: Union[str, Path, Image.Image, Region],
-        search_service: SearchServiceProtocol,  # Now required
+        search_service: SearchServiceProtocol,
         options: Optional[SearchOptions] = None,
     ) -> List[Dict[str, Any]]:
         """
-        Finds relevant documents specifically originating from THIS PDF document
-        within a search index managed by the provided SearchService.
-        This method uses a pre-configured SearchService instance and adds
-        a filter to the search query to scope results only to pages from
-        this specific PDF object (based on its resolved path).
+        Finds relevant documents from this PDF within a search index.
+        Finds relevant documents from this PDF within a search index.
         Args:
-            query: The search query (text, image path, PIL Image, Region).
-            search_service: A pre-configured SearchService instance pointing to the
-                            index where this PDF's content (or related content)
-                            is expected to be found.
-            options: Optional SearchOptions to configure the query (top_k, filters, etc.).
-                     Any existing filters in `options` will be combined with the
-                     PDF-scoping filter using an 'AND' condition.
+            query: The search query (text, image path, PIL Image, Region)
+            search_service: A pre-configured SearchService instance
+            options: Optional SearchOptions to configure the query
+            query: The search query (text, image path, PIL Image, Region)
+            search_service: A pre-configured SearchService instance
+            options: Optional SearchOptions to configure the query
         Returns:
-            A list of result dictionaries, sorted by relevance, containing only
-            results originating from this PDF's pages.
+            A list of result dictionaries, sorted by relevance
+            A list of result dictionaries, sorted by relevance
         Raises:
-            ImportError: If search dependencies are not installed.
-            ValueError: If search_service is None.
-            TypeError: If search_service does not conform to the protocol.
-            FileNotFoundError: If the collection managed by the service does not exist.
-            RuntimeError: For other search failures.
+            ImportError: If search dependencies are not installed
+            ValueError: If search_service is None
+            TypeError: If search_service does not conform to the protocol
+            FileNotFoundError: If the collection managed by the service does not exist
+            RuntimeError: For other search failures
+            ImportError: If search dependencies are not installed
+            ValueError: If search_service is None
+            TypeError: If search_service does not conform to the protocol
+            FileNotFoundError: If the collection managed by the service does not exist
+            RuntimeError: For other search failures
         """
         if not search_service:
             raise ValueError("A configured SearchServiceProtocol instance must be provided.")
-        # Optional stricter check:
-        # if not isinstance(search_service, SearchServiceProtocol):
-        #     raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
-        # Get collection name from service for logging
         collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
         logger.info(
-            f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
+            f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
         )
-        # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
-        # service: SearchServiceProtocol
-        # if search_service:
-        #     service = search_service
-        # else:
-        #     logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
-        #     factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
-        #     # TODO: Pass embedding model from options/pdf config if needed?
-        #     service = get_search_service(**factory_args)
-        service = search_service  # Use validated provided service
-        # --- 2. Prepare Query and Options ---
+        service = search_service
         query_input = query
-        # Resolve options (use default TextSearch if none provided)
         effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
-        # Handle Region query - extract text for now
         if isinstance(query, Region):
             logger.debug("Query is a Region object. Extracting text.")
             if not isinstance(effective_options, TextSearchOptions):
                 logger.warning(
-                    "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
+                    "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
                 )
             query_input = query.extract_text()
             if not query_input or query_input.isspace():
                 logger.error("Region has no extractable text for query.")
                 return []
-        # --- 3. Add Filter to Scope Search to THIS PDF ---
-        # Assume metadata field 'pdf_path' stores the resolved path used during indexing
+        # Add filter to scope search to THIS PDF
+        # Add filter to scope search to THIS PDF
         pdf_scope_filter = {
-            "field": "pdf_path",  # Or potentially "source_path" depending on indexing metadata
+            "field": "pdf_path",
             "operator": "eq",
-            "value": self.path,  # Use the resolved path of this PDF instance
+            "value": self.path,
         }
         logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
         # Combine with existing filters in options (if any)
         if effective_options.filters:
-            logger.debug(
-                f"Combining PDF scope filter with existing filters: {effective_options.filters}"
-            )
-            # Assume filters are compatible with the underlying search service
-            # If existing filters aren't already in an AND block, wrap them
+            logger.debug(f"Combining PDF scope filter with existing filters")
             if (
                 isinstance(effective_options.filters, dict)
                 and effective_options.filters.get("operator") == "AND"
             ):
-                # Already an AND block, just append the condition
                 effective_options.filters["conditions"].append(pdf_scope_filter)
             elif isinstance(effective_options.filters, list):
-                # Assume list represents implicit AND conditions
                 effective_options.filters = {
                     "operator": "AND",
                     "conditions": effective_options.filters + [pdf_scope_filter],
                 }
-            elif isinstance(effective_options.filters, dict):  # Single filter dict
+            elif isinstance(effective_options.filters, dict):
                 effective_options.filters = {
                     "operator": "AND",
                     "conditions": [effective_options.filters, pdf_scope_filter],
                 }
             else:
                 logger.warning(
-                    f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
+                    f"Unsupported format for existing filters. Overwriting with PDF scope filter."
                 )
                 effective_options.filters = pdf_scope_filter
         else:
@@ -916,39 +942,33 @@ class PDF:
         logger.debug(f"Final filters for service search: {effective_options.filters}")
-        # --- 4. Call SearchService ---
         try:
-            # Call the service's search method (no collection_name needed)
             results = service.search(
                 query=query_input,
                 options=effective_options,
             )
-            logger.info(
-                f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
-            )
+            logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
             return results
         except FileNotFoundError as fnf:
-            logger.error(
-                f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
-            )
-            raise  # Re-raise specific error
+            logger.error(f"Search failed: Collection not found. Error: {fnf}")
+            raise
+            logger.error(f"Search failed: Collection not found. Error: {fnf}")
+            raise
         except Exception as e:
-            logger.error(
-                f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
-                exc_info=True,
-            )
-            raise RuntimeError(
-                f"Search within index failed for PDF '{self.path}'. See logs for details."
-            ) from e
+            logger.error(f"SearchService search failed: {e}")
+            raise RuntimeError(f"Search within index failed. See logs for details.") from e
+            logger.error(f"SearchService search failed: {e}")
+            raise RuntimeError(f"Search within index failed. See logs for details.") from e
     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
         """
-        Exports OCR results from this PDF into a correction task package (zip file).
+        Exports OCR results from this PDF into a correction task package.
+        Exports OCR results from this PDF into a correction task package.
         Args:
-            output_zip_path: The path to save the output zip file.
+            output_zip_path: The path to save the output zip file
+            output_zip_path: The path to save the output zip file
             **kwargs: Additional arguments passed to create_correction_task_package
-                      (e.g., image_render_scale, overwrite).
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
@@ -958,32 +978,41 @@ class PDF:
             logger.error(
                 "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
             )
-            # Or raise
         except Exception as e:
-            logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
-            raise  # Re-raise the exception from the utility function
+            logger.error(f"Failed to export correction task: {e}")
+            raise
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
         pages: Optional[Union[Iterable[int], range, slice]] = None,
-    ) -> "PDF":  # Return self for chaining
+        max_workers: Optional[int] = None,
+        progress_callback: Optional[Callable[[], None]] = None,
+    ) -> "PDF":
         """
-        Applies corrections to OCR-generated text elements using a callback function,
-        delegating the core work to the `Page.correct_ocr` method.
+        Applies corrections to OCR text elements using a callback function.
         Args:
-            correction_callback: A function that accepts a single argument (an element
-                                object) and returns `Optional[str]`. It returns the
-                                corrected text string if an update is needed, otherwise None.
+            correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
-                (default: all pages).
+            max_workers: Maximum number of threads to use for parallel execution
+            progress_callback: Optional callback function for progress updates
         Returns:
-            Self for method chaining.
+            Self for method chaining
         """
-        # Determine target pages
-        target_page_indices: List[int] = []
+        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -991,56 +1020,55 @@ class PDF:
         elif hasattr(pages, "__iter__"):
             try:
                 target_page_indices = [int(i) for i in pages]
-                # Validate indices
                 for idx in target_page_indices:
                     if not (0 <= idx < len(self._pages)):
                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
-                raise ValueError(
-                    f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
-                ) from e
+                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         if not target_page_indices:
             logger.warning("No pages selected for OCR correction.")
             return self
-        logger.info(
-            f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
-        )
+        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
-        # Iterate through target pages and call their correct_ocr method
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                page.correct_ocr(correction_callback=correction_callback)
+                page.correct_ocr(
+                    correction_callback=correction_callback,
+                    max_workers=max_workers,
+                    progress_callback=progress_callback,
+                )
             except Exception as e:
-                logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
-                # Optionally re-raise or just log and continue
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
-        logger.info(f"OCR correction process finished for requested pages.")
+        logger.info("OCR correction process finished.")
         return self
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
-            # Return 0 or raise error if not fully initialized? Let's return 0.
             return 0
         return len(self._pages)
-    def __getitem__(self, key) -> Union[Page, "PageCollection"]:  # Return PageCollection for slice
+    def __getitem__(self, key) -> Union["Page", "PageCollection"]:
         """Access pages by index or slice."""
-        # Check if self._pages has been initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not initialized yet.")
         if isinstance(key, slice):
-            # Return a PageCollection slice
             from natural_pdf.elements.collections import PageCollection
             return PageCollection(self._pages[key])
-        # Check index bounds before accessing
         if isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
@@ -1054,25 +1082,23 @@ class PDF:
         if hasattr(self, "_pdf") and self._pdf is not None:
             try:
                 self._pdf.close()
-                logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
+                logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
             except Exception as e:
                 logger.warning(f"Error closing pdfplumber object: {e}")
             finally:
                 self._pdf = None
-        # Clean up temporary file if it exists
         if hasattr(self, "_temp_file") and self._temp_file is not None:
             temp_file_path = None
             try:
                 if hasattr(self._temp_file, "name") and self._temp_file.name:
                     temp_file_path = self._temp_file.name
-                    if os.path.exists(temp_file_path):
+                    # Only unlink if it exists and _is_stream is False (meaning WE created it)
+                    if not self._is_stream and os.path.exists(temp_file_path):
                         os.unlink(temp_file_path)
                         logger.debug(f"Removed temporary PDF file: {temp_file_path}")
             except Exception as e:
-                logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
-            finally:
-                self._temp_file = None
+                logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
     def __enter__(self):
         """Context manager entry."""
@@ -1082,6 +1108,432 @@ class PDF:
         """Context manager exit."""
         self.close()
-    # --- Indexable Protocol Methods --- Needed for search/sync
     def get_id(self) -> str:
+        """Get unique identifier for this PDF."""
         return self.path
+    # --- Deskew Method --- #
+    def deskew(
+        self,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the
+        specified pages from the original PDF.
+        This method renders each selected page, detects and corrects skew using the 'deskew'
+        library, and then combines the resulting images into a new PDF using 'img2pdf'.
+        The new PDF object is returned directly.
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+        Args:
+            pages: Page indices/slice to include (0-based). If None, processes all pages.
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                                  already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                             already contains processed elements (text, OCR, regions) to
+                             prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                             during automatic detection (e.g., `max_angle`, `num_peaks`).
+        Returns:
+            A new PDF object representing the deskewed document.
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
+            ValueError: If `force_overwrite` is False and target pages contain elements.
+            FileNotFoundError: If the source PDF cannot be read (if file-based).
+            IOError: If creating the in-memory PDF fails.
+            RuntimeError: If rendering or deskewing individual pages fails.
+        """
+        if not DESKEW_AVAILABLE:
+            raise ImportError(
+                "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
+            )
+        target_pages = self._get_target_pages(pages)  # Use helper to resolve pages
+        # --- Safety Check --- #
+        if not force_overwrite:
+            for page in target_pages:
+                # Check if the element manager has been initialized and contains any elements
+                if (
+                    hasattr(page, "_element_mgr")
+                    and page._element_mgr
+                    and page._element_mgr.has_elements()
+                ):
+                    raise ValueError(
+                        f"Page {page.number} contains existing elements (text, OCR, etc.). "
+                        f"Deskewing creates an image-only PDF, discarding these elements. "
+                        f"Set force_overwrite=True to proceed."
+                    )
+        # --- Process Pages --- #
+        deskewed_images_bytes = []
+        logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
+        # Use tqdm via get_tqdm
+        for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
+            try:
+                # Use page.deskew to get the corrected PIL image
+                # Pass down resolutions and kwargs
+                deskewed_img = page.deskew(
+                    resolution=resolution,
+                    angle=None,  # Let page.deskew handle detection/caching
+                    detection_resolution=detection_resolution,
+                    **deskew_kwargs,
+                )
+                if not deskewed_img:
+                    logger.warning(
+                        f"Page {page.number}: Failed to generate deskewed image, skipping."
+                    )
+                    continue
+                # Convert image to bytes for img2pdf (use PNG for lossless quality)
+                with io.BytesIO() as buf:
+                    deskewed_img.save(buf, format="PNG")
+                    deskewed_images_bytes.append(buf.getvalue())
+            except Exception as e:
+                logger.error(
+                    f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
+                )
+                # Option: Raise a runtime error, or continue and skip the page?
+                # Raising makes the whole operation fail if one page fails.
+                raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
+        # --- Create PDF --- #
+        if not deskewed_images_bytes:
+            raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
+        logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
+        try:
+            # Use img2pdf to combine image bytes into PDF bytes
+            pdf_bytes = img2pdf.convert(deskewed_images_bytes)
+            # Wrap bytes in a stream
+            pdf_stream = io.BytesIO(pdf_bytes)
+            # Create a new PDF object from the stream using original config
+            logger.info("Creating new PDF object from deskewed stream...")
+            new_pdf = PDF(
+                pdf_stream,
+                reading_order=self._reading_order,
+                font_attrs=self._font_attrs,
+                keep_spaces=self._config.get("keep_spaces", True),
+            )
+            return new_pdf
+        except Exception as e:
+            logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
+            raise IOError("Failed to create deskewed PDF object from image stream.") from e
+    # --- End Deskew Method --- #
+    # --- Classification Methods --- #
+    def classify_pages(
+        self,
+        categories: List[str],
+        model: Optional[str] = None,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+        analysis_key: str = "classification",
+        using: Optional[str] = None,
+        **kwargs,
+    ) -> "PDF":
+        """
+        Classifies specified pages of the PDF.
+        Args:
+            categories: List of category names
+            model: Model identifier ('text', 'vision', or specific HF ID)
+            pages: Page indices, slice, or None for all pages
+            analysis_key: Key to store results in page's analyses dict
+            using: Processing mode ('text' or 'vision')
+            **kwargs: Additional arguments for the ClassificationManager
+        Returns:
+            Self for method chaining
+        """
+        if not categories:
+            raise ValueError("Categories list cannot be empty.")
+        try:
+            manager = self.get_manager("classification")
+        except (ValueError, RuntimeError) as e:
+            raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
+        if not manager or not manager.is_available():
+            try:
+                from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
+                if not _CLASSIFICATION_AVAILABLE:
+                    raise ImportError("Classification dependencies missing.")
+            except ImportError:
+                raise ImportError(
+                    "Classification dependencies missing. "
+                    'Install with: pip install "natural-pdf[classification]"'
+                )
+            raise ClassificationError("ClassificationManager not available.")
+        target_pages = []
+        if pages is None:
+            target_pages = self._pages
+        elif isinstance(pages, slice):
+            target_pages = self._pages[pages]
+        elif hasattr(pages, "__iter__"):
+            try:
+                target_pages = [self._pages[i] for i in pages]
+            except IndexError:
+                raise ValueError("Invalid page index provided.")
+            except TypeError:
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        if not target_pages:
+            logger.warning("No pages selected for classification.")
+            return self
+        inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+        logger.info(
+            f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
+        )
+        page_contents = []
+        pages_to_classify = []
+        logger.debug(f"Gathering content for {len(target_pages)} pages...")
+        for page in target_pages:
+            try:
+                content = page._get_classification_content(model_type=inferred_using, **kwargs)
+                page_contents.append(content)
+                pages_to_classify.append(page)
+            except ValueError as e:
+                logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
+            except Exception as e:
+                logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
+        if not page_contents:
+            logger.warning("No content could be gathered for batch classification.")
+            return self
+        logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
+        try:
+            batch_results = manager.classify_batch(
+                item_contents=page_contents,
+                categories=categories,
+                model_id=model,
+                using=inferred_using,
+                **kwargs,
+            )
+        except Exception as e:
+            logger.error(f"Batch classification failed: {e}")
+            raise ClassificationError(f"Batch classification failed: {e}") from e
+        if len(batch_results) != len(pages_to_classify):
+            logger.error(
+                f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
+            )
+            return self
+        logger.debug(
+            f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
+        )
+        for page, result_obj in zip(pages_to_classify, batch_results):
+            try:
+                if not hasattr(page, "analyses") or page.analyses is None:
+                    page.analyses = {}
+                page.analyses[analysis_key] = result_obj
+            except Exception as e:
+                logger.warning(
+                    f"Failed to store classification results for page {page.number}: {e}"
+                )
+        logger.info(f"Finished classifying PDF pages.")
+        return self
+    # --- End Classification Methods --- #
+    # --- Extraction Support --- #
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
+        """
+        Retrieves the content for the entire PDF.
+        Args:
+            using: 'text' or 'vision'
+            **kwargs: Additional arguments passed to extract_text or page.to_image
+        Returns:
+            str: Extracted text if using='text'
+            List[PIL.Image.Image]: List of page images if using='vision'
+            None: If content cannot be retrieved
+        """
+        if using == "text":
+            try:
+                layout = kwargs.pop("layout", True)
+                return self.extract_text(layout=layout, **kwargs)
+            except Exception as e:
+                logger.error(f"Error extracting text from PDF: {e}")
+                return None
+        elif using == "vision":
+            page_images = []
+            logger.info(f"Rendering {len(self.pages)} pages to images...")
+            resolution = kwargs.pop("resolution", 72)
+            include_highlights = kwargs.pop("include_highlights", False)
+            labels = kwargs.pop("labels", False)
+            try:
+                for page in tqdm(self.pages, desc="Rendering Pages"):
+                    img = page.to_image(
+                        resolution=resolution,
+                        include_highlights=include_highlights,
+                        labels=labels,
+                        **kwargs,
+                    )
+                    if img:
+                        page_images.append(img)
+                    else:
+                        logger.warning(f"Failed to render page {page.number}, skipping.")
+                if not page_images:
+                    logger.error("Failed to render any pages.")
+                    return None
+                return page_images
+            except Exception as e:
+                logger.error(f"Error rendering pages: {e}")
+                return None
+        else:
+            logger.error(f"Unsupported value for 'using': {using}")
+            return None
+    # --- End Extraction Support --- #
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the PDF.
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not hasattr(self, "_pages") or not self._pages:
+            logger.warning(f"No pages found in PDF {self.path}")
+            return []
+        all_data = []
+        for page in tqdm(self._pages, desc="Gathering page data", leave=False):
+            # Basic page information
+            page_data = {
+                "pdf_path": self.path,
+                "page_number": page.number,
+                "page_index": page.index,
+            }
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+            # Add analyses data
+            for key in analysis_keys:
+                if not hasattr(page, "analyses") or not page.analyses:
+                    raise ValueError(f"Page {page.number} does not have analyses data")
+                if key not in page.analyses:
+                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+                # Get the analysis result
+                analysis_result = page.analyses[key]
+                # If the result has a to_dict method, use it
+                if hasattr(analysis_result, "to_dict"):
+                    analysis_data = analysis_result.to_dict()
+                else:
+                    # Otherwise, use the result directly if it's dict-like
+                    try:
+                        analysis_data = dict(analysis_result)
+                    except (TypeError, ValueError):
+                        # Last resort: convert to string
+                        analysis_data = {"raw_result": str(analysis_result)}
+                # Add analysis data to page data with the key as prefix
+                for k, v in analysis_data.items():
+                    page_data[f"{key}.{k}"] = v
+            all_data.append(page_data)
+        return all_data
+    def _get_target_pages(
+        self, pages: Optional[Union[Iterable[int], range, slice]] = None
+    ) -> List["Page"]:
+        """
+        Helper method to get a list of Page objects based on the input pages.
+        Args:
+            pages: Page indices, slice, or None for all pages
+        Returns:
+            List of Page objects
+        """
+        if pages is None:
+            return self._pages
+        elif isinstance(pages, slice):
+            return self._pages[pages]
+        elif hasattr(pages, "__iter__"):
+            try:
+                return [self._pages[i] for i in pages]
+            except IndexError:
+                raise ValueError("Invalid page index provided in 'pages' iterable.")
+            except TypeError:
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl