PyPI - natural-pdf - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.6-py3-none-any.whl → 0.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/finetuning/index.md +176 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +411 -248
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/gemini.py +63 -47
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +326 -17
natural_pdf/core/element_manager.py +73 -4
natural_pdf/core/page.py +255 -83
natural_pdf/core/pdf.py +385 -367
natural_pdf/elements/base.py +1 -3
natural_pdf/elements/collections.py +279 -49
natural_pdf/elements/region.py +106 -21
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/__init__.py +16 -8
natural_pdf/ocr/engine.py +46 -30
natural_pdf/ocr/engine_easyocr.py +86 -42
natural_pdf/ocr/engine_paddle.py +39 -28
natural_pdf/ocr/engine_surya.py +32 -16
natural_pdf/ocr/ocr_factory.py +34 -23
natural_pdf/ocr/ocr_manager.py +98 -34
natural_pdf/ocr/ocr_options.py +38 -10
natural_pdf/ocr/utils.py +59 -33
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
natural_pdf/utils/debug.py +4 -2
natural_pdf/utils/identifiers.py +9 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +172 -105
natural_pdf/utils/text_extraction.py +96 -65
natural_pdf/utils/tqdm_utils.py +43 -0
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/core/pdf.py (changed)

@@ -1,11 +1,13 @@
-import copy  # Add import for deepcopy
+import copy
 import logging
 import os
 import re
 import tempfile
 import urllib.request
-from pathlib import Path  # Added Path
-from typing import (  # Added Iterable and TYPE_CHECKING
+import time
+import threading
+from pathlib import Path
+from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
@@ -17,29 +19,33 @@ from typing import (  # Added Iterable and TYPE_CHECKING
     Type,
     Union,
 )
-from pathlib import Path
+from natural_pdf.utils.tqdm_utils import get_tqdm
 import pdfplumber
 from PIL import Image
-from natural_pdf.analyzers.layout.layout_manager import (  # Import the new LayoutManager
-    LayoutManager,
-)
-from natural_pdf.core.highlighting_service import HighlightingService  # <-- Import the new service
+from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+from natural_pdf.core.highlighting_service import HighlightingService
 from natural_pdf.core.page import Page
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
-# Import the flag directly - this should always work
+from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.classification.manager import ClassificationError
+from natural_pdf.classification.results import ClassificationResult
+from natural_pdf.extraction.manager import StructuredDataManager
+from natural_pdf.utils.locks import pdf_render_lock
+from natural_pdf.elements.base import Element
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.extraction.mixin import ExtractionMixin
-# --- Add Search Service Imports (needed for new methods) ---
 try:
-    from typing import Any as TypingAny  # Import Any if not already
+    from typing import Any as TypingAny
-    from natural_pdf.search import TextSearchOptions  # Keep for ask default
+    from natural_pdf.search import TextSearchOptions
     from natural_pdf.search import (
         BaseSearchOptions,
         SearchOptions,
@@ -47,25 +53,24 @@ try:
         get_search_service,
     )
 except ImportError:
-    # Define dummies if needed for type hints within the class
     SearchServiceProtocol = object
     SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
     TypingAny = object
-    # Dummy factory needed for default arg in methods
     def get_search_service(**kwargs) -> SearchServiceProtocol:
         raise ImportError(
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
-# --- End Search Service Imports ---
-# Set up logger early
 logger = logging.getLogger("natural_pdf.core.pdf")
+tqdm = get_tqdm()
+DEFAULT_MANAGERS = {
+    "classification": ClassificationManager,
+    "structured_data": StructuredDataManager,
+}
-class PDF:
+class PDF(ExtractionMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.
@@ -86,35 +91,23 @@ class PDF:
         Args:
             path_or_url: Path to the PDF file or a URL to a PDF
             reading_order: Whether to use natural reading order
-            font_attrs: Font attributes to consider when grouping characters into words.
-                       Default: ['fontname', 'size'] (Group by font name and size)
-                       None: Only consider spatial relationships
-                       List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
-            keep_spaces: Whether to include spaces in word elements (default: True).
-                       True: Spaces are part of words, better for multi-word searching
-                       False: Break text at spaces, each word is separate (legacy behavior)
+            font_attrs: Font attributes for grouping characters into words
+            keep_spaces: Whether to include spaces in word elements
         """
-        # Check if the input is a URL
         is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
-        # Initialize path-related attributes
         self._original_path = path_or_url
         self._temp_file = None
-        self._resolved_path = None  # Store the actual path used by pdfplumber
+        self._resolved_path = None
         if is_url:
             logger.info(f"Downloading PDF from URL: {path_or_url}")
             try:
-                # Create a temporary file to store the downloaded PDF
                 self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
-                # Download the PDF
                 with urllib.request.urlopen(path_or_url) as response:
                     self._temp_file.write(response.read())
                     self._temp_file.flush()
                     self._temp_file.close()
-                # Use the temporary file path
                 self._resolved_path = self._temp_file.name
                 logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
             except Exception as e:
@@ -126,7 +119,6 @@ class PDF:
                 logger.error(f"Failed to download PDF from URL: {e}")
                 raise ValueError(f"Failed to download PDF from URL: {e}")
         else:
-            # Use the provided path directly
             self._resolved_path = path_or_url
         logger.info(f"Initializing PDF from {self._resolved_path}")
@@ -137,42 +129,68 @@ class PDF:
         try:
             self._pdf = pdfplumber.open(self._resolved_path)
         except Exception as e:
-            logger.error(
-                f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
-                exc_info=True,
-            )
-            # Clean up temp file if creation failed
+            logger.error(f"Failed to open PDF: {e}", exc_info=True)
             self.close()
             raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
-        self._path = self._resolved_path  # Keep original path too?
-        self.path = self._resolved_path  # Public attribute for the resolved path
-        self.source_path = self._original_path  # Public attribute for the user-provided path/URL
+        self._path = self._resolved_path
+        self.path = self._resolved_path
+        self.source_path = self._original_path
         self._reading_order = reading_order
         self._config = {"keep_spaces": keep_spaces}
+        self._font_attrs = font_attrs
-        self._font_attrs = font_attrs  # Store the font attribute configuration
-        # Initialize Managers and Services (conditionally available)
         self._ocr_manager = OCRManager() if OCRManager else None
         self._layout_manager = LayoutManager() if LayoutManager else None
         self.highlighter = HighlightingService(self)
+        self._classification_manager_instance = ClassificationManager()
+        self._manager_registry = {}
-        # Initialize pages last, passing necessary refs
         self._pages = [
             Page(p, parent=self, index=i, font_attrs=font_attrs)
             for i, p in enumerate(self._pdf.pages)
         ]
-        # Other state
         self._element_cache = {}
-        self._exclusions = []  # List to store exclusion functions/regions
-        self._regions = []  # List to store region functions/definitions
+        self._exclusions = []
+        self._regions = []
-        logger.info("Initialized HighlightingService.")
         logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
+        self._initialize_managers()
+        self._initialize_highlighter()
+    def _initialize_managers(self):
+        """Initialize manager instances based on DEFAULT_MANAGERS."""
+        self._managers = {}
+        for key, manager_class in DEFAULT_MANAGERS.items():
+            try:
+                self._managers[key] = manager_class()
+                logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
+            except Exception as e:
+                logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
+                self._managers[key] = None
+    def get_manager(self, key: str) -> Any:
+        """Retrieve a manager instance by its key."""
+        if key not in self._managers:
+            raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
+        manager_instance = self._managers.get(key)
+        if manager_instance is None:
+             manager_class = DEFAULT_MANAGERS.get(key)
+             if manager_class:
+                  raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
+             else:
+                  raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
+        return manager_instance
+    def _initialize_highlighter(self):
+        pass
     @property
     def metadata(self) -> Dict[str, Any]:
         """Access metadata as a dictionary."""
@@ -183,7 +201,6 @@ class PDF:
         """Access pages as a PageCollection object."""
         from natural_pdf.elements.collections import PageCollection
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         return PageCollection(self._pages)
@@ -195,12 +212,10 @@ class PDF:
         Returns:
             Self for method chaining
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         self._exclusions = []
-        # Also clear from pages
         for page in self._pages:
             page.clear_exclusions()
         return self
@@ -212,99 +227,75 @@ class PDF:
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
         Args:
-            exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             label: Optional label for this exclusion
         Returns:
             Self for method chaining
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # Store exclusion with its label at PDF level
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        # Apply this exclusion to all pages
         for page in self._pages:
-            # We pass the original function, Page.add_exclusion handles calling it
             page.add_exclusion(exclusion_func, label=label)
         return self
     def apply_ocr(
         self,
-        pages: Optional[Union[Iterable[int], range, slice]] = None,
         engine: Optional[str] = None,
-        # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None, # Min confidence threshold
+        min_confidence: Optional[float] = None,
         device: Optional[str] = None,
-        resolution: Optional[int] = None, # DPI for rendering before OCR
-        apply_exclusions: bool = True, # New parameter
+        resolution: Optional[int] = None,
+        apply_exclusions: bool = True,
         detect_only: bool = False,
-        # --- Engine-Specific Options --- Use 'options=' for this
-        options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
-        # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
+        replace: bool = True,
+        options: Optional[Any] = None,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
     ) -> "PDF":
         """
-        Applies OCR to specified pages (or all pages) of the PDF using batch processing.
-        This method renders the specified pages to images, sends them as a batch
-        to the OCRManager, and adds the resulting TextElements to each respective page.
+        Applies OCR to specified pages of the PDF using batch processing.
         Args:
-            pages: An iterable of 0-based page indices (list, range, tuple),
-                   a slice object, or None to process all pages.
-            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
-                    Uses manager's default ('easyocr') if None.
-            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
-                       **Must be codes understood by the specific selected engine.**
-                       No mapping is performed. Overrides manager/engine default.
-            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
-                            Overrides manager/engine default.
-            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
-                    Overrides manager/engine default.
-            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
-                        Affects input quality for OCR. Defaults to 150 if not set.
-            apply_exclusions: If True (default), render page image for OCR with
-                              excluded areas masked (whited out). If False, OCR
-                              the raw page image without masking exclusions.
-            detect_only: If True, only detect text bounding boxes, don't perform OCR.
-            options: An engine-specific options object (e.g., EasyOCROptions) or dict
-                     containing parameters specific to the chosen engine.
+            engine: Name of the OCR engine
+            languages: List of language codes
+            min_confidence: Minimum confidence threshold
+            device: Device to run OCR on
+            resolution: DPI resolution for page images
+            apply_exclusions: Whether to mask excluded areas
+            detect_only: If True, only detect text boxes
+            replace: Whether to replace existing OCR elements
+            options: Engine-specific options
+            pages: Page indices to process or None for all pages
         Returns:
-            Self for method chaining.
-        Raises:
-            ValueError: If page indices are invalid.
-            TypeError: If 'options' is not compatible with the engine.
-            RuntimeError: If the OCRManager or selected engine is not available.
+            Self for method chaining
         """
         if not self._ocr_manager:
             logger.error("OCRManager not available. Cannot apply OCR.")
-            # Or raise RuntimeError("OCRManager not initialized.")
             return self
-        # --- Determine Target Pages (unchanged) ---
-        target_pages: List[Page] = []
+        thread_id = threading.current_thread().name
+        logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
+        target_pages = []
         if pages is None:
             target_pages = self._pages
         elif isinstance(pages, slice):
             target_pages = self._pages[pages]
-        elif hasattr(pages, "__iter__"):  # Check if it's iterable (list, range, tuple, etc.)
+        elif hasattr(pages, "__iter__"):
             try:
                 target_pages = [self._pages[i] for i in pages]
             except IndexError:
                 raise ValueError("Invalid page index provided in 'pages' iterable.")
             except TypeError:
-                raise TypeError(
-                    "'pages' must be None, a slice, or an iterable of page indices (int)."
-                )
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         else:
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         if not target_pages:
             logger.warning("No pages selected for OCR processing.")
@@ -312,24 +303,20 @@ class PDF:
         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
-        # --- Determine Rendering Resolution ---
-        # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
-        final_resolution = resolution # Use direct arg if provided
-        if final_resolution is None:
-            final_resolution = getattr(self, "_config", {}).get("resolution", 150)
-        logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
-        # --- Render Images for Batch ---
-        images_pil: List[Image.Image] = []
-        page_image_map: List[Tuple[Page, Image.Image]] = []  # Store page and its image
-        logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
-        failed_page_num = "unknown"  # Keep track of potentially failing page
+        final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
+        logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
+        images_pil = []
+        page_image_map = []
+        logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
+        failed_page_num = "unknown"
+        render_start_time = time.monotonic()
         try:
-            for i, page in enumerate(target_pages):
-                failed_page_num = page.number  # Update current page number in case of error
+            for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
+                failed_page_num = page.number
                 logger.debug(f"  Rendering page {page.number} (index {page.index})...")
-                # Use the determined final_resolution and apply exclusions if requested
                 to_image_kwargs = {
                     "resolution": final_resolution,
                     "include_highlights": False,
@@ -338,66 +325,64 @@ class PDF:
                 img = page.to_image(**to_image_kwargs)
                 if img is None:
                     logger.error(f"  Failed to render page {page.number} to image.")
-                    # Decide how to handle: skip page, raise error? For now, skip.
-                    continue # Skip this page if rendering failed
+                    continue
                 images_pil.append(img)
-                page_image_map.append((page, img))  # Store pair
+                page_image_map.append((page, img))
         except Exception as e:
-            logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
+            logger.error(f"Failed to render pages for batch OCR: {e}")
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
+        render_end_time = time.monotonic()
+        logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
         if not images_pil or not page_image_map:
             logger.error("No images were successfully rendered for batch OCR.")
             return self
-        # --- Prepare Arguments for Manager ---
-        # Pass common args directly, engine-specific via options
         manager_args = {
             "images": images_pil,
             "engine": engine,
             "languages": languages,
-            "min_confidence": min_confidence, # Use the renamed parameter
+            "min_confidence": min_confidence,
             "device": device,
             "options": options,
             "detect_only": detect_only,
-            # Note: resolution is used for rendering, not passed to OCR manager directly
         }
-        # Filter out None values so manager can use its defaults
         manager_args = {k: v for k, v in manager_args.items() if v is not None}
-        # --- Call OCR Manager for Batch Processing ---
-        logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
+        ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
+        logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
+        ocr_start_time = time.monotonic()
         try:
-            # Manager's apply_ocr signature needs to accept common args directly
             batch_results = self._ocr_manager.apply_ocr(**manager_args)
             if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
-                logger.error(
-                    f"OCR Manager returned unexpected result format or length for batch processing. "
-                    f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
-                    f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
-                )
+                logger.error(f"OCR Manager returned unexpected result format or length.")
                 return self
             logger.info("OCR Manager batch processing complete.")
         except Exception as e:
-            logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
+            logger.error(f"Batch OCR processing failed: {e}")
             return self
+        ocr_end_time = time.monotonic()
+        logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
-        # --- Distribute Results and Add Elements to Pages (unchanged) ---
         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
         for i, (page, img) in enumerate(page_image_map):
             results_for_page = batch_results[i]
             if not isinstance(results_for_page, list):
-                logger.warning(
-                    f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
-                )
+                logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
                 continue
             logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
             try:
+                if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
+                    page._element_mgr.remove_ocr_elements()
                 img_scale_x = page.width / img.width if img.width > 0 else 1
                 img_scale_y = page.height / img.height if img.height > 0 else 1
                 elements = page._element_mgr.create_text_elements_from_ocr(
@@ -410,53 +395,39 @@ class PDF:
                 else:
                     logger.debug(f"  No valid TextElements created for page {page.number}.")
             except Exception as e:
-                logger.error(
-                    f"  Error adding OCR elements to page {page.number}: {e}", exc_info=True
-                )
+                logger.error(f"  Error adding OCR elements to page {page.number}: {e}")
-        logger.info(
-            f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
-        )
+        logger.info(f"Finished adding OCR results. Total elements added: {total_elements_added}")
         return self
     def add_region(
         self, region_func: Callable[["Page"], Optional[Region]], name: str = None
     ) -> "PDF":
         """
-        Add a region function to the PDF. This creates regions on all pages using the provided function.
+        Add a region function to the PDF.
         Args:
-            region_func: A function that takes a Page and returns a Region, or None.
+            region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region
         Returns:
             Self for method chaining
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # Store region with its name at PDF level
         region_data = (region_func, name)
         self._regions.append(region_data)
-        # Apply this region to all pages
         for page in self._pages:
             try:
-                # Call the function to get the region for this specific page
                 region_instance = region_func(page)
                 if region_instance and isinstance(region_instance, Region):
-                    # If a valid region is returned, add it to the page
                     page.add_region(region_instance, name=name, source="named")
                 elif region_instance is not None:
-                    logger.warning(
-                        f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
-                    )
+                    logger.warning(f"Region function did not return a valid Region for page {page.number}")
             except Exception as e:
-                logger.error(
-                    f"Error executing or adding region function for page {page.number}: {e}",
-                    exc_info=True,
-                )
+                logger.error(f"Error adding region for page {page.number}: {e}")
         return self
@@ -467,22 +438,19 @@ class PDF:
         Find the first element matching the selector.
         Args:
-            selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
+            selector: CSS-like selector string
+            apply_exclusions: Whether to exclude elements in exclusion regions
+            regex: Whether to use regex for text search
+            case: Whether to do case-sensitive text search
             **kwargs: Additional filter parameters
         Returns:
             Element object or None if not found
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         selector_obj = parse_selector(selector)
-        # Pass regex and case flags to selector function
         kwargs["regex"] = regex
         kwargs["case"] = case
@@ -498,22 +466,19 @@ class PDF:
         Find all elements matching the selector.
         Args:
-            selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
+            selector: CSS-like selector string
+            apply_exclusions: Whether to exclude elements in exclusion regions
+            regex: Whether to use regex for text search
+            case: Whether to do case-sensitive text search
             **kwargs: Additional filter parameters
         Returns:
             ElementCollection with matching elements
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         selector_obj = parse_selector(selector)
-        # Pass regex and case flags to selector function
         kwargs["regex"] = regex
         kwargs["case"] = case
@@ -530,8 +495,8 @@ class PDF:
         Args:
             selector_obj: Parsed selector dictionary
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            first_only: If True, stop searching after the first match is found.
+            apply_exclusions: Whether to exclude elements in exclusion regions
+            first_only: If True, stop searching after the first match is found
             **kwargs: Additional filter parameters
         Returns:
@@ -539,57 +504,45 @@ class PDF:
         """
         from natural_pdf.elements.collections import ElementCollection
-        # Determine page range to search
         page_indices = kwargs.get("pages", range(len(self._pages)))
         if isinstance(page_indices, int):
             page_indices = [page_indices]
         elif isinstance(page_indices, slice):
             page_indices = range(*page_indices.indices(len(self._pages)))
-        # Check for cross-page pseudo-classes (currently not supported)
         for pseudo in selector_obj.get("pseudo_classes", []):
             if pseudo.get("name") in ("spans", "continues"):
                 logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
                 return ElementCollection([])
-        # Regular case: collect elements from each page
         all_elements = []
         for page_idx in page_indices:
             if 0 <= page_idx < len(self._pages):
                 page = self._pages[page_idx]
-                # Pass first_only down to page._apply_selector
                 page_elements_collection = page._apply_selector(
                     selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
                 )
                 if page_elements_collection:
                     page_elements = page_elements_collection.elements
                     all_elements.extend(page_elements)
-                    # If we only need the first match overall, and we found one on this page, stop
                     if first_only and page_elements:
-                        break  # Stop iterating through pages
+                        break
             else:
                 logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
-        # Create a combined collection
         combined = ElementCollection(all_elements)
-        # Sort in document order if requested and not first_only (already sorted by page)
         if not first_only and kwargs.get("document_order", True):
-            # Check if elements have page, top, x0 before sorting
             if all(
                 hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
                 for el in combined.elements
             ):
                 combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
             else:
-                # Elements might be Regions without inherent sorting order yet
-                # Attempt sorting by page index if possible
                 try:
                     combined.sort(key=lambda el: el.page.index)
                 except AttributeError:
-                    logger.warning(
-                        "Cannot sort elements in document order: Missing required attributes (e.g., page)."
-                    )
+                    logger.warning("Cannot sort elements in document order: Missing required attributes.")
         return combined
@@ -606,24 +559,21 @@ class PDF:
         Args:
             selector: Optional selector to filter elements
-            preserve_whitespace: Whether to keep blank characters (default: True)
-            use_exclusions: Whether to apply exclusion regions (default: True)
-            debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
+            preserve_whitespace: Whether to keep blank characters
+            use_exclusions: Whether to apply exclusion regions
+            debug_exclusions: Whether to output detailed debugging for exclusions
             **kwargs: Additional extraction parameters
         Returns:
             Extracted text as string
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # If selector is provided, find elements first
         if selector:
             elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
             return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
-        # Otherwise extract from all pages
         if debug_exclusions:
             print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
             print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
@@ -644,25 +594,6 @@ class PDF:
         return "\n".join(texts)
-    def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
-        """
-        Shorthand for finding elements and extracting their text.
-        Args:
-            selector: CSS-like selector string
-            preserve_whitespace: Whether to keep blank characters (default: True)
-            **kwargs: Additional extraction parameters
-        Returns:
-            Extracted text from matching elements
-        """
-        # Ensure _pages is initialized
-        if not hasattr(self, "_pages"):
-            raise AttributeError("PDF pages not yet initialized.")
-        return self.extract_text(
-            selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
-        )  # apply_exclusions is handled by find_all in extract_text
     def extract_tables(
         self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
     ) -> List[Any]:
@@ -677,54 +608,43 @@ class PDF:
         Returns:
             List of extracted tables
         """
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        # TODO: Implement table extraction
         logger.warning("PDF.extract_tables is not fully implemented yet.")
         all_tables = []
         for page in self.pages:
-            # Assuming page.extract_tables(**kwargs) exists or is added
             if hasattr(page, "extract_tables"):
                 all_tables.extend(page.extract_tables(**kwargs))
             else:
                 logger.debug(f"Page {page.number} does not have extract_tables method.")
-        # Placeholder filtering
         if selector:
             logger.warning("Filtering extracted tables by selector is not implemented.")
-            # Would need to parse selector and filter the list `all_tables`
-        # Placeholder merging
         if merge_across_pages:
             logger.warning("Merging tables across pages is not implemented.")
-            # Would need logic to detect and merge related tables
         return all_tables
-    # --- New Method: save_searchable ---
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         Saves the PDF with an OCR text layer, making content searchable.
         Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
-        Note: OCR must have been applied to the pages beforehand
-              (e.g., using pdf.apply_ocr()).
         Args:
-            output_path: Path to save the searchable PDF.
-            dpi: Resolution for rendering and OCR overlay (default 300).
-            **kwargs: Additional keyword arguments passed to the exporter.
+            output_path: Path to save the searchable PDF
+            dpi: Resolution for rendering and OCR overlay
+            **kwargs: Additional keyword arguments passed to the exporter
         """
-        # Import moved here, assuming it's always available now
         from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
-        # Convert pathlib.Path to string if necessary
         output_path_str = str(output_path)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
         logger.info(f"Searchable PDF saved to: {output_path_str}")
-    # --- End New Method ---
     def ask(
         self,
         question: str,
@@ -746,27 +666,21 @@ class PDF:
             **kwargs: Additional parameters passed to the QA engine
         Returns:
-            A dictionary containing the answer, confidence, and other metadata.
-            Result will have an 'answer' key containing the answer text.
+            A dictionary containing the answer, confidence, and other metadata
         """
         from natural_pdf.qa import get_qa_engine
-        # Initialize or get QA engine
         qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
-        # Determine which pages to query
         if pages is None:
             target_pages = list(range(len(self.pages)))
         elif isinstance(pages, int):
-            # Single page
             target_pages = [pages]
         elif isinstance(pages, (list, range)):
-            # List or range of pages
             target_pages = pages
         else:
             raise ValueError(f"Invalid pages parameter: {pages}")
-        # Actually query each page and gather results
         results = []
         for page_idx in target_pages:
             if 0 <= page_idx < len(self.pages):
@@ -775,208 +689,148 @@ class PDF:
                     page=page, question=question, min_confidence=min_confidence, **kwargs
                 )
-                # Add to results if it found an answer
                 if page_result and page_result.get("found", False):
                     results.append(page_result)
-        # Sort results by confidence
         results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
-        # Return the best result, or a default result if none found
         if results:
             return results[0]
         else:
-            # Return a structure indicating no answer found
             return {
                 "answer": None,
                 "confidence": 0.0,
                 "found": False,
-                "page_num": None,  # Or maybe the pages searched?
+                "page_num": None,
                 "source_elements": [],
             }
     def search_within_index(
         self,
         query: Union[str, Path, Image.Image, Region],
-        search_service: SearchServiceProtocol,  # Now required
+        search_service: SearchServiceProtocol,
         options: Optional[SearchOptions] = None,
     ) -> List[Dict[str, Any]]:
         """
-        Finds relevant documents specifically originating from THIS PDF document
-        within a search index managed by the provided SearchService.
-        This method uses a pre-configured SearchService instance and adds
-        a filter to the search query to scope results only to pages from
-        this specific PDF object (based on its resolved path).
+        Finds relevant documents from this PDF within a search index.
         Args:
-            query: The search query (text, image path, PIL Image, Region).
-            search_service: A pre-configured SearchService instance pointing to the
-                            index where this PDF's content (or related content)
-                            is expected to be found.
-            options: Optional SearchOptions to configure the query (top_k, filters, etc.).
-                     Any existing filters in `options` will be combined with the
-                     PDF-scoping filter using an 'AND' condition.
+            query: The search query (text, image path, PIL Image, Region)
+            search_service: A pre-configured SearchService instance
+            options: Optional SearchOptions to configure the query
         Returns:
-            A list of result dictionaries, sorted by relevance, containing only
-            results originating from this PDF's pages.
+            A list of result dictionaries, sorted by relevance
         Raises:
-            ImportError: If search dependencies are not installed.
-            ValueError: If search_service is None.
-            TypeError: If search_service does not conform to the protocol.
-            FileNotFoundError: If the collection managed by the service does not exist.
-            RuntimeError: For other search failures.
+            ImportError: If search dependencies are not installed
+            ValueError: If search_service is None
+            TypeError: If search_service does not conform to the protocol
+            FileNotFoundError: If the collection managed by the service does not exist
+            RuntimeError: For other search failures
         """
         if not search_service:
             raise ValueError("A configured SearchServiceProtocol instance must be provided.")
-        # Optional stricter check:
-        # if not isinstance(search_service, SearchServiceProtocol):
-        #     raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
-        # Get collection name from service for logging
         collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
-        logger.info(
-            f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
-        )
+        logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
+        service = search_service
-        # --- 1. Get Search Service Instance --- (REMOVED - provided directly)
-        # service: SearchServiceProtocol
-        # if search_service:
-        #     service = search_service
-        # else:
-        #     logger.debug(f"Getting SearchService instance via factory (persist={persist}, collection={collection_name})...")
-        #     factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
-        #     # TODO: Pass embedding model from options/pdf config if needed?
-        #     service = get_search_service(**factory_args)
-        service = search_service  # Use validated provided service
-        # --- 2. Prepare Query and Options ---
         query_input = query
-        # Resolve options (use default TextSearch if none provided)
         effective_options = copy.deepcopy(options) if options is not None else TextSearchOptions()
-        # Handle Region query - extract text for now
         if isinstance(query, Region):
             logger.debug("Query is a Region object. Extracting text.")
             if not isinstance(effective_options, TextSearchOptions):
-                logger.warning(
-                    "Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
-                )
+                logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
             query_input = query.extract_text()
             if not query_input or query_input.isspace():
                 logger.error("Region has no extractable text for query.")
                 return []
-        # --- 3. Add Filter to Scope Search to THIS PDF ---
-        # Assume metadata field 'pdf_path' stores the resolved path used during indexing
+        # Add filter to scope search to THIS PDF
         pdf_scope_filter = {
-            "field": "pdf_path",  # Or potentially "source_path" depending on indexing metadata
+            "field": "pdf_path",
             "operator": "eq",
-            "value": self.path,  # Use the resolved path of this PDF instance
+            "value": self.path,
         }
         logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
         # Combine with existing filters in options (if any)
         if effective_options.filters:
-            logger.debug(
-                f"Combining PDF scope filter with existing filters: {effective_options.filters}"
-            )
-            # Assume filters are compatible with the underlying search service
-            # If existing filters aren't already in an AND block, wrap them
-            if (
-                isinstance(effective_options.filters, dict)
-                and effective_options.filters.get("operator") == "AND"
-            ):
-                # Already an AND block, just append the condition
+            logger.debug(f"Combining PDF scope filter with existing filters")
+            if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
                 effective_options.filters["conditions"].append(pdf_scope_filter)
             elif isinstance(effective_options.filters, list):
-                # Assume list represents implicit AND conditions
                 effective_options.filters = {
                     "operator": "AND",
                     "conditions": effective_options.filters + [pdf_scope_filter],
                 }
-            elif isinstance(effective_options.filters, dict):  # Single filter dict
+            elif isinstance(effective_options.filters, dict):
                 effective_options.filters = {
                     "operator": "AND",
                     "conditions": [effective_options.filters, pdf_scope_filter],
                 }
             else:
-                logger.warning(
-                    f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
-                )
+                logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
                 effective_options.filters = pdf_scope_filter
         else:
             effective_options.filters = pdf_scope_filter
         logger.debug(f"Final filters for service search: {effective_options.filters}")
-        # --- 4. Call SearchService ---
         try:
-            # Call the service's search method (no collection_name needed)
             results = service.search(
                 query=query_input,
                 options=effective_options,
             )
-            logger.info(
-                f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
-            )
+            logger.info(f"SearchService returned {len(results)} results from PDF '{self.path}'")
             return results
         except FileNotFoundError as fnf:
-            logger.error(
-                f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
-            )
-            raise  # Re-raise specific error
+            logger.error(f"Search failed: Collection not found. Error: {fnf}")
+            raise
         except Exception as e:
-            logger.error(
-                f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
-                exc_info=True,
-            )
-            raise RuntimeError(
-                f"Search within index failed for PDF '{self.path}'. See logs for details."
-            ) from e
+            logger.error(f"SearchService search failed: {e}")
+            raise RuntimeError(f"Search within index failed. See logs for details.") from e
     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
         """
-        Exports OCR results from this PDF into a correction task package (zip file).
+        Exports OCR results from this PDF into a correction task package.
         Args:
-            output_zip_path: The path to save the output zip file.
+            output_zip_path: The path to save the output zip file
             **kwargs: Additional arguments passed to create_correction_task_package
-                      (e.g., image_render_scale, overwrite).
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
         except ImportError:
             logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
-            # Or raise
         except Exception as e:
-            logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
-            raise # Re-raise the exception from the utility function
+            logger.error(f"Failed to export correction task: {e}")
+            raise
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
         pages: Optional[Union[Iterable[int], range, slice]] = None,
-    ) -> "PDF": # Return self for chaining
+        max_workers: Optional[int] = None,
+        progress_callback: Optional[Callable[[], None]] = None,
+    ) -> "PDF":
         """
-        Applies corrections to OCR-generated text elements using a callback function,
-        delegating the core work to the `Page.correct_ocr` method.
+        Applies corrections to OCR text elements using a callback function.
         Args:
-            correction_callback: A function that accepts a single argument (an element
-                                object) and returns `Optional[str]`. It returns the
-                                corrected text string if an update is needed, otherwise None.
+            correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
-                (default: all pages).
+            max_workers: Maximum number of threads to use for parallel execution
+            progress_callback: Optional callback function for progress updates
         Returns:
-            Self for method chaining.
+            Self for method chaining
         """
-        # Determine target pages
-        target_page_indices: List[int] = []
+        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -984,52 +838,49 @@ class PDF:
         elif hasattr(pages, "__iter__"):
             try:
                 target_page_indices = [int(i) for i in pages]
-                # Validate indices
                 for idx in target_page_indices:
                     if not (0 <= idx < len(self._pages)):
                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
-                raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
+                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         if not target_page_indices:
             logger.warning("No pages selected for OCR correction.")
             return self
-        logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
+        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
-        # Iterate through target pages and call their correct_ocr method
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                page.correct_ocr(correction_callback=correction_callback)
+                page.correct_ocr(
+                    correction_callback=correction_callback,
+                    max_workers=max_workers,
+                    progress_callback=progress_callback,
+                )
             except Exception as e:
-                logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
-                # Optionally re-raise or just log and continue
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
-        logger.info(f"OCR correction process finished for requested pages.")
+        logger.info("OCR correction process finished.")
         return self
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
-        # Ensure _pages is initialized
         if not hasattr(self, "_pages"):
-            # Return 0 or raise error if not fully initialized? Let's return 0.
             return 0
         return len(self._pages)
-    def __getitem__(self, key) -> Union[Page, "PageCollection"]:  # Return PageCollection for slice
+    def __getitem__(self, key) -> Union[Page, "PageCollection"]:
         """Access pages by index or slice."""
-        # Check if self._pages has been initialized
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not initialized yet.")
         if isinstance(key, slice):
-            # Return a PageCollection slice
             from natural_pdf.elements.collections import PageCollection
             return PageCollection(self._pages[key])
-        # Check index bounds before accessing
         if isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
@@ -1043,13 +894,12 @@ class PDF:
         if hasattr(self, "_pdf") and self._pdf is not None:
             try:
                 self._pdf.close()
-                logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
+                logger.debug(f"Closed pdfplumber PDF object for {self.source_path}")
             except Exception as e:
                 logger.warning(f"Error closing pdfplumber object: {e}")
             finally:
                 self._pdf = None
-        # Clean up temporary file if it exists
         if hasattr(self, "_temp_file") and self._temp_file is not None:
             temp_file_path = None
             try:
@@ -1059,7 +909,7 @@ class PDF:
                         os.unlink(temp_file_path)
                         logger.debug(f"Removed temporary PDF file: {temp_file_path}")
             except Exception as e:
-                logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
+                logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
             finally:
                 self._temp_file = None
@@ -1071,8 +921,176 @@ class PDF:
         """Context manager exit."""
         self.close()
-    # --- Indexable Protocol Methods --- Needed for search/sync
     def get_id(self) -> str:
+        """Get unique identifier for this PDF."""
         return self.path
+    # --- Classification Methods --- #
+    def classify_pages(
+        self,
+        categories: List[str],
+        model: Optional[str] = None,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+        analysis_key: str = "classification",
+        using: Optional[str] = None,
+        **kwargs,
+    ) -> "PDF":
+        """
+        Classifies specified pages of the PDF.
+        Args:
+            categories: List of category names
+            model: Model identifier ('text', 'vision', or specific HF ID)
+            pages: Page indices, slice, or None for all pages
+            analysis_key: Key to store results in page's analyses dict
+            using: Processing mode ('text' or 'vision')
+            **kwargs: Additional arguments for the ClassificationManager
+        Returns:
+            Self for method chaining
+        """
+        if not categories:
+            raise ValueError("Categories list cannot be empty.")
+        try:
+            manager = self.get_manager('classification')
+        except (ValueError, RuntimeError) as e:
+            raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
+        if not manager or not manager.is_available():
+            try:
+                from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
+                if not _CLASSIFICATION_AVAILABLE:
+                    raise ImportError("Classification dependencies missing.")
+            except ImportError:
+                raise ImportError(
+                    "Classification dependencies missing. "
+                    "Install with: pip install \"natural-pdf[classification]\""
+                )
+            raise ClassificationError("ClassificationManager not available.")
+        target_pages = []
+        if pages is None:
+            target_pages = self._pages
+        elif isinstance(pages, slice):
+            target_pages = self._pages[pages]
+        elif hasattr(pages, "__iter__"):
+            try:
+                target_pages = [self._pages[i] for i in pages]
+            except IndexError:
+                raise ValueError("Invalid page index provided.")
+            except TypeError:
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        if not target_pages:
+            logger.warning("No pages selected for classification.")
+            return self
+        inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+        logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
+        page_contents = []
+        pages_to_classify = []
+        logger.debug(f"Gathering content for {len(target_pages)} pages...")
+        for page in target_pages:
+            try:
+                content = page._get_classification_content(model_type=inferred_using, **kwargs)
+                page_contents.append(content)
+                pages_to_classify.append(page)
+            except ValueError as e:
+                logger.warning(f"Skipping page {page.number}: Cannot get content - {e}")
+            except Exception as e:
+                logger.warning(f"Skipping page {page.number}: Error getting content - {e}")
+        if not page_contents:
+            logger.warning("No content could be gathered for batch classification.")
+            return self
+        logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
+        try:
+            batch_results = manager.classify_batch(
+                item_contents=page_contents,
+                categories=categories,
+                model_id=model,
+                using=inferred_using,
+                **kwargs,
+            )
+        except Exception as e:
+            logger.error(f"Batch classification failed: {e}")
+            raise ClassificationError(f"Batch classification failed: {e}") from e
+        if len(batch_results) != len(pages_to_classify):
+            logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
+            return self
+        logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
+        for page, result_obj in zip(pages_to_classify, batch_results):
+            try:
+                if not hasattr(page, 'analyses') or page.analyses is None:
+                    page.analyses = {}
+                page.analyses[analysis_key] = result_obj
+            except Exception as e:
+                logger.warning(f"Failed to store classification results for page {page.number}: {e}")
+        logger.info(f"Finished classifying PDF pages.")
+        return self
+    # --- End Classification Methods --- #
+    # --- Extraction Support --- #
+    def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
+        """
+        Retrieves the content for the entire PDF.
+        Args:
+            using: 'text' or 'vision'
+            **kwargs: Additional arguments passed to extract_text or page.to_image
+        Returns:
+            str: Extracted text if using='text'
+            List[PIL.Image.Image]: List of page images if using='vision'
+            None: If content cannot be retrieved
+        """
+        if using == 'text':
+            try:
+                layout = kwargs.pop('layout', True)
+                return self.extract_text(layout=layout, **kwargs)
+            except Exception as e:
+                logger.error(f"Error extracting text from PDF: {e}")
+                return None
+        elif using == 'vision':
+            page_images = []
+            logger.info(f"Rendering {len(self.pages)} pages to images...")
+            resolution = kwargs.pop('resolution', 72)
+            include_highlights = kwargs.pop('include_highlights', False)
+            labels = kwargs.pop('labels', False)
+            try:
+                for page in tqdm(self.pages, desc="Rendering Pages"):
+                    img = page.to_image(
+                        resolution=resolution,
+                        include_highlights=include_highlights,
+                        labels=labels,
+                        **kwargs
+                    )
+                    if img:
+                        page_images.append(img)
+                    else:
+                        logger.warning(f"Failed to render page {page.number}, skipping.")
+                if not page_images:
+                    logger.error("Failed to render any pages.")
+                    return None
+                return page_images
+            except Exception as e:
+                logger.error(f"Error rendering pages: {e}")
+                return None
+        else:
+            logger.error(f"Unsupported value for 'using': {using}")
+            return None
+    # --- End Extraction Support --- #

natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.6py3-none-any.whl → 0.1.8py3-none-any.whl