PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

natural-pdf 0.1.8 (py3-none-any.whl) → 0.1.10 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +241 -158
natural_pdf/classification/mixin.py +52 -38
natural_pdf/classification/results.py +71 -45
natural_pdf/collections/mixins.py +85 -20
natural_pdf/collections/pdf_collection.py +245 -100
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +694 -195
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +610 -134
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
natural_pdf-0.1.10.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/pdf.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import copy
+import io
 import logging
 import os
 import re
 import tempfile
-import urllib.request
-import time
 import threading
+import time
+import urllib.request
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -18,38 +19,35 @@ from typing import (
     Tuple,
     Type,
     Union,
+    overload,
 )
-from natural_pdf.utils.tqdm_utils import get_tqdm
 import pdfplumber
 from PIL import Image
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+from natural_pdf.classification.manager import ClassificationError, ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
-from natural_pdf.core.page import Page
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
+from natural_pdf.export.mixin import ExportMixin
+from natural_pdf.extraction.manager import StructuredDataManager
+from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
-from natural_pdf.classification.manager import ClassificationManager
-from natural_pdf.classification.manager import ClassificationError
-from natural_pdf.classification.results import ClassificationResult
-from natural_pdf.extraction.manager import StructuredDataManager
 from natural_pdf.utils.locks import pdf_render_lock
-from natural_pdf.elements.base import Element
-from natural_pdf.classification.mixin import ClassificationMixin
-from natural_pdf.extraction.mixin import ExtractionMixin
+from natural_pdf.utils.tqdm_utils import get_tqdm
 try:
     from typing import Any as TypingAny
-    from natural_pdf.search import TextSearchOptions
     from natural_pdf.search import (
         BaseSearchOptions,
         SearchOptions,
         SearchServiceProtocol,
+        TextSearchOptions,
         get_search_service,
     )
 except ImportError:
@@ -62,6 +60,7 @@ except ImportError:
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
 logger = logging.getLogger("natural_pdf.core.pdf")
 tqdm = get_tqdm()
@@ -70,7 +69,22 @@ DEFAULT_MANAGERS = {
     "structured_data": StructuredDataManager,
 }
-class PDF(ExtractionMixin):
+# Deskew Imports (Conditional)
+import numpy as np
+from PIL import Image
+try:
+    import img2pdf
+    from deskew import determine_skew
+    DESKEW_AVAILABLE = True
+except ImportError:
+    DESKEW_AVAILABLE = False
+    img2pdf = None
+# End Deskew Imports
+class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.
@@ -80,7 +94,7 @@ class PDF(ExtractionMixin):
     def __init__(
         self,
-        path_or_url: str,
+        path_or_url_or_stream,
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
@@ -89,54 +103,72 @@ class PDF(ExtractionMixin):
         Initialize the enhanced PDF object.
         Args:
-            path_or_url: Path to the PDF file or a URL to a PDF
+            path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
         """
-        is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
-        self._original_path = path_or_url
+        self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
-        if is_url:
-            logger.info(f"Downloading PDF from URL: {path_or_url}")
-            try:
-                self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
-                with urllib.request.urlopen(path_or_url) as response:
-                    self._temp_file.write(response.read())
-                    self._temp_file.flush()
-                    self._temp_file.close()
-                self._resolved_path = self._temp_file.name
-                logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
-            except Exception as e:
-                if self._temp_file and hasattr(self._temp_file, "name"):
-                    try:
-                        os.unlink(self._temp_file.name)
-                    except:
-                        pass
-                logger.error(f"Failed to download PDF from URL: {e}")
-                raise ValueError(f"Failed to download PDF from URL: {e}")
+        self._is_stream = False
+        stream_to_open = None
+        if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
+            logger.info("Initializing PDF from in-memory stream.")
+            self._is_stream = True
+            self._resolved_path = None  # No resolved file path for streams
+            self.source_path = "<stream>"  # Identifier for source
+            self.path = self.source_path  # Use source identifier as path for streams
+            stream_to_open = path_or_url_or_stream
+        elif isinstance(path_or_url_or_stream, (str, Path)):
+            path_or_url = str(path_or_url_or_stream)
+            self.source_path = path_or_url  # Store original path/URL as source
+            is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
+            if is_url:
+                logger.info(f"Downloading PDF from URL: {path_or_url}")
+                try:
+                    # Use a context manager for the temporary file
+                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
+                        self._temp_file = temp_f  # Store reference if needed for cleanup
+                        with urllib.request.urlopen(path_or_url) as response:
+                            temp_f.write(response.read())
+                            temp_f.flush()
+                        self._resolved_path = temp_f.name
+                        logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
+                        stream_to_open = self._resolved_path
+                except Exception as e:
+                    if self._temp_file and hasattr(self._temp_file, "name"):
+                        try:
+                            os.unlink(self._temp_file.name)
+                        except:  # noqa E722
+                            pass
+                    logger.error(f"Failed to download PDF from URL: {e}")
+                    raise ValueError(f"Failed to download PDF from URL: {e}")
+            else:
+                self._resolved_path = str(Path(path_or_url).resolve())  # Resolve local paths
+                stream_to_open = self._resolved_path
+            self.path = self._resolved_path  # Use resolved path for file-based PDFs
         else:
-            self._resolved_path = path_or_url
+            raise TypeError(
+                f"Invalid input type: {type(path_or_url_or_stream)}. "
+                f"Expected path (str/Path), URL (str), or file-like object."
+            )
-        logger.info(f"Initializing PDF from {self._resolved_path}")
+        logger.info(f"Opening PDF source: {self.source_path}")
         logger.debug(
             f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
         )
         try:
-            self._pdf = pdfplumber.open(self._resolved_path)
+            self._pdf = pdfplumber.open(stream_to_open)
         except Exception as e:
             logger.error(f"Failed to open PDF: {e}", exc_info=True)
-            self.close()
-            raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
-        self._path = self._resolved_path
-        self.path = self._resolved_path
-        self.source_path = self._original_path
+            self.close()  # Attempt cleanup if opening fails
+            raise IOError(f"Failed to open PDF source: {self.source_path}") from e
+        # Store configuration used for initialization
         self._reading_order = reading_order
         self._config = {"keep_spaces": keep_spaces}
         self._font_attrs = font_attrs
@@ -144,9 +176,11 @@ class PDF(ExtractionMixin):
         self._ocr_manager = OCRManager() if OCRManager else None
         self._layout_manager = LayoutManager() if LayoutManager else None
         self.highlighter = HighlightingService(self)
-        self._classification_manager_instance = ClassificationManager()
+        # self._classification_manager_instance = ClassificationManager() # Removed this line
         self._manager_registry = {}
+        from natural_pdf.core.page import Page
         self._pages = [
             Page(p, parent=self, index=i, font_attrs=font_attrs)
             for i, p in enumerate(self._pdf.pages)
@@ -160,6 +194,7 @@ class PDF(ExtractionMixin):
         self._initialize_managers()
         self._initialize_highlighter()
+        self.analyses: Dict[str, Any] = {}
     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -175,16 +210,20 @@ class PDF(ExtractionMixin):
     def get_manager(self, key: str) -> Any:
         """Retrieve a manager instance by its key."""
         if key not in self._managers:
-            raise KeyError(f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}")
+            raise KeyError(
+                f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
+            )
         manager_instance = self._managers.get(key)
         if manager_instance is None:
-             manager_class = DEFAULT_MANAGERS.get(key)
-             if manager_class:
-                  raise RuntimeError(f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously.")
-             else:
-                  raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
+            manager_class = DEFAULT_MANAGERS.get(key)
+            if manager_class:
+                raise RuntimeError(
+                    f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
+                )
+            else:
+                raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
         return manager_instance
@@ -227,6 +266,7 @@ class PDF(ExtractionMixin):
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
         Args:
+            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             exclusion_func: A function that takes a Page and returns a Region to exclude, or None
             label: Optional label for this exclusion
@@ -259,11 +299,22 @@ class PDF(ExtractionMixin):
     ) -> "PDF":
         """
         Applies OCR to specified pages of the PDF using batch processing.
+        Applies OCR to specified pages of the PDF using batch processing.
         Args:
             engine: Name of the OCR engine
             languages: List of language codes
-            min_confidence: Minimum confidence threshold
+            min_confidence: Minimum confidence threshold
+            device: Device to run OCR on
+            resolution: DPI resolution for page images
+            apply_exclusions: Whether to mask excluded areas
+            detect_only: If True, only detect text boxes
+            replace: Whether to replace existing OCR elements
+            options: Engine-specific options
+            pages: Page indices to process or None for all pages
+            engine: Name of the OCR engine
+            languages: List of language codes
+            min_confidence: Minimum confidence threshold
             device: Device to run OCR on
             resolution: DPI resolution for page images
             apply_exclusions: Whether to mask excluded areas
@@ -274,6 +325,7 @@ class PDF(ExtractionMixin):
         Returns:
             Self for method chaining
+            Self for method chaining
         """
         if not self._ocr_manager:
             logger.error("OCRManager not available. Cannot apply OCR.")
@@ -281,7 +333,9 @@ class PDF(ExtractionMixin):
         thread_id = threading.current_thread().name
         logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
+        target_pages = []
         target_pages = []
         if pages is None:
             target_pages = self._pages
@@ -303,7 +357,7 @@ class PDF(ExtractionMixin):
         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
         final_resolution = resolution or getattr(self, "_config", {}).get("resolution", 150)
         logger.debug(f"Using OCR image resolution: {final_resolution} DPI")
@@ -312,7 +366,7 @@ class PDF(ExtractionMixin):
         logger.info(f"[{thread_id}] Rendering {len(target_pages)} pages...")
         failed_page_num = "unknown"
         render_start_time = time.monotonic()
         try:
             for i, page in enumerate(tqdm(target_pages, desc="Rendering pages", leave=False)):
                 failed_page_num = page.number
@@ -326,14 +380,21 @@ class PDF(ExtractionMixin):
                 if img is None:
                     logger.error(f"  Failed to render page {page.number} to image.")
                     continue
+                    continue
                 images_pil.append(img)
                 page_image_map.append((page, img))
         except Exception as e:
+            logger.error(f"Failed to render pages for batch OCR: {e}")
             logger.error(f"Failed to render pages for batch OCR: {e}")
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
         render_end_time = time.monotonic()
-        logger.debug(f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)")
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )
+        logger.debug(
+            f"[{thread_id}] Finished rendering {len(images_pil)} images (Duration: {render_end_time - render_start_time:.2f}s)"
+        )
         if not images_pil or not page_image_map:
             logger.error("No images were successfully rendered for batch OCR.")
@@ -344,16 +405,18 @@ class PDF(ExtractionMixin):
             "engine": engine,
             "languages": languages,
             "min_confidence": min_confidence,
+            "min_confidence": min_confidence,
             "device": device,
             "options": options,
             "detect_only": detect_only,
         }
         manager_args = {k: v for k, v in manager_args.items() if v is not None}
-        ocr_call_args = {k:v for k,v in manager_args.items() if k!='images'}
+        ocr_call_args = {k: v for k, v in manager_args.items() if k != "images"}
+        logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
         logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
         ocr_start_time = time.monotonic()
         try:
             batch_results = self._ocr_manager.apply_ocr(**manager_args)
@@ -365,24 +428,28 @@ class PDF(ExtractionMixin):
         except Exception as e:
             logger.error(f"Batch OCR processing failed: {e}")
             return self
         ocr_end_time = time.monotonic()
-        logger.debug(f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)")
+        logger.debug(
+            f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
+        )
         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
         for i, (page, img) in enumerate(page_image_map):
             results_for_page = batch_results[i]
             if not isinstance(results_for_page, list):
-                logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
+                logger.warning(
+                    f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
+                )
                 continue
             logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
             try:
                 if manager_args.get("replace", True) and hasattr(page, "_element_mgr"):
                     page._element_mgr.remove_ocr_elements()
                 img_scale_x = page.width / img.width if img.width > 0 else 1
                 img_scale_y = page.height / img.height if img.height > 0 else 1
                 elements = page._element_mgr.create_text_elements_from_ocr(
@@ -407,6 +474,7 @@ class PDF(ExtractionMixin):
         Add a region function to the PDF.
         Args:
+            region_func: A function that takes a Page and returns a Region, or None
             region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region
@@ -425,126 +493,194 @@ class PDF(ExtractionMixin):
                 if region_instance and isinstance(region_instance, Region):
                     page.add_region(region_instance, name=name, source="named")
                 elif region_instance is not None:
-                    logger.warning(f"Region function did not return a valid Region for page {page.number}")
+                    logger.warning(
+                        f"Region function did not return a valid Region for page {page.number}"
+                    )
             except Exception as e:
                 logger.error(f"Error adding region for page {page.number}: {e}")
         return self
+    @overload
     def find(
-        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> Optional[Any]:
         """
-        Find the first element matching the selector.
+        Find the first element matching the selector OR text content across all pages.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string
-            apply_exclusions: Whether to exclude elements in exclusion regions
-            regex: Whether to use regex for text search
-            case: Whether to do case-sensitive text search
-            **kwargs: Additional filter parameters
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
         Returns:
-            Element object or None if not found
+            Element object or None if not found.
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
-        selector_obj = parse_selector(selector)
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")
+        selector_obj = parse_selector(effective_selector)
         kwargs["regex"] = regex
         kwargs["case"] = case
-        results = self._apply_selector(
-            selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
-        )
-        return results.first if results else None
+        # Search page by page
+        for page in self.pages:
+            # Note: _apply_selector is on Page, so we call find directly here
+            # We pass the constructed/validated effective_selector
+            element = page.find(
+                selector=effective_selector,  # Use the processed selector
+                apply_exclusions=apply_exclusions,
+                regex=regex,  # Pass down flags
+                case=case,
+                **kwargs,
+            )
+            if element:
+                return element
+        return None  # Not found on any page
+    @overload
     def find_all(
-        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
-    ) -> ElementCollection:
-        """
-        Find all elements matching the selector.
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
-        Args:
-            selector: CSS-like selector string
-            apply_exclusions: Whether to exclude elements in exclusion regions
-            regex: Whether to use regex for text search
-            case: Whether to do case-sensitive text search
-            **kwargs: Additional filter parameters
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
-        Returns:
-            ElementCollection with matching elements
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        if not hasattr(self, "_pages"):
-            raise AttributeError("PDF pages not yet initialized.")
-        selector_obj = parse_selector(selector)
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-        results = self._apply_selector(
-            selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
-        )
-        return results
+        Find all elements matching the selector OR text content across all pages.
-    def _apply_selector(
-        self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
-    ) -> ElementCollection:
-        """
-        Apply selector to PDF elements across all pages.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector_obj: Parsed selector dictionary
-            apply_exclusions: Whether to exclude elements in exclusion regions
-            first_only: If True, stop searching after the first match is found
-            **kwargs: Additional filter parameters
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
         Returns:
-            ElementCollection of matching elements
+            ElementCollection with matching elements.
         """
-        from natural_pdf.elements.collections import ElementCollection
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
-        page_indices = kwargs.get("pages", range(len(self._pages)))
-        if isinstance(page_indices, int):
-            page_indices = [page_indices]
-        elif isinstance(page_indices, slice):
-            page_indices = range(*page_indices.indices(len(self._pages)))
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            raise ValueError("Internal error: No selector or text provided.")
-        for pseudo in selector_obj.get("pseudo_classes", []):
-            if pseudo.get("name") in ("spans", "continues"):
-                logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
-                return ElementCollection([])
+        # Instead of parsing here, let each page parse and apply
+        # This avoids parsing the same selector multiple times if not needed
+        # selector_obj = parse_selector(effective_selector)
-        all_elements = []
-        for page_idx in page_indices:
-            if 0 <= page_idx < len(self._pages):
-                page = self._pages[page_idx]
-                page_elements_collection = page._apply_selector(
-                    selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
-                )
-                if page_elements_collection:
-                    page_elements = page_elements_collection.elements
-                    all_elements.extend(page_elements)
-                    if first_only and page_elements:
-                        break
-            else:
-                logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
+        # kwargs["regex"] = regex # Removed: Already passed explicitly
+        # kwargs["case"] = case   # Removed: Already passed explicitly
-        combined = ElementCollection(all_elements)
+        all_elements = []
+        for page in self.pages:
+            # Call page.find_all with the effective selector and flags
+            page_elements = page.find_all(
+                selector=effective_selector,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
+            if page_elements:
+                all_elements.extend(page_elements.elements)
-        if not first_only and kwargs.get("document_order", True):
-            if all(
-                hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
-                for el in combined.elements
-            ):
-                combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
-            else:
-                try:
-                    combined.sort(key=lambda el: el.page.index)
-                except AttributeError:
-                    logger.warning("Cannot sort elements in document order: Missing required attributes.")
+        from natural_pdf.elements.collections import ElementCollection
-        return combined
+        return ElementCollection(all_elements)
     def extract_text(
         self,
@@ -562,6 +698,9 @@ class PDF(ExtractionMixin):
             preserve_whitespace: Whether to keep blank characters
             use_exclusions: Whether to apply exclusion regions
             debug_exclusions: Whether to output detailed debugging for exclusions
+            preserve_whitespace: Whether to keep blank characters
+            use_exclusions: Whether to apply exclusion regions
+            debug_exclusions: Whether to output detailed debugging for exclusions
             **kwargs: Additional extraction parameters
         Returns:
@@ -610,22 +749,22 @@ class PDF(ExtractionMixin):
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
         logger.warning("PDF.extract_tables is not fully implemented yet.")
         all_tables = []
         for page in self.pages:
             if hasattr(page, "extract_tables"):
                 all_tables.extend(page.extract_tables(**kwargs))
             else:
                 logger.debug(f"Page {page.number} does not have extract_tables method.")
         if selector:
             logger.warning("Filtering extracted tables by selector is not implemented.")
         if merge_across_pages:
             logger.warning("Merging tables across pages is not implemented.")
         return all_tables
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -638,6 +777,9 @@ class PDF(ExtractionMixin):
             output_path: Path to save the searchable PDF
             dpi: Resolution for rendering and OCR overlay
             **kwargs: Additional keyword arguments passed to the exporter
+            output_path: Path to save the searchable PDF
+            dpi: Resolution for rendering and OCR overlay
+            **kwargs: Additional keyword arguments passed to the exporter
         """
         from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
@@ -667,6 +809,7 @@ class PDF(ExtractionMixin):
         Returns:
             A dictionary containing the answer, confidence, and other metadata
+            A dictionary containing the answer, confidence, and other metadata
         """
         from natural_pdf.qa import get_qa_engine
@@ -713,14 +856,19 @@ class PDF(ExtractionMixin):
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant documents from this PDF within a search index.
+        Finds relevant documents from this PDF within a search index.
         Args:
             query: The search query (text, image path, PIL Image, Region)
             search_service: A pre-configured SearchService instance
             options: Optional SearchOptions to configure the query
+            query: The search query (text, image path, PIL Image, Region)
+            search_service: A pre-configured SearchService instance
+            options: Optional SearchOptions to configure the query
         Returns:
             A list of result dictionaries, sorted by relevance
+            A list of result dictionaries, sorted by relevance
         Raises:
             ImportError: If search dependencies are not installed
@@ -728,12 +876,19 @@ class PDF(ExtractionMixin):
             TypeError: If search_service does not conform to the protocol
             FileNotFoundError: If the collection managed by the service does not exist
             RuntimeError: For other search failures
+            ImportError: If search dependencies are not installed
+            ValueError: If search_service is None
+            TypeError: If search_service does not conform to the protocol
+            FileNotFoundError: If the collection managed by the service does not exist
+            RuntimeError: For other search failures
         """
         if not search_service:
             raise ValueError("A configured SearchServiceProtocol instance must be provided.")
         collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
-        logger.info(f"Searching within index '{collection_name}' for content from PDF '{self.path}'")
+        logger.info(
+            f"Searching within index '{collection_name}' for content from PDF '{self.path}'"
+        )
         service = search_service
@@ -743,12 +898,15 @@ class PDF(ExtractionMixin):
         if isinstance(query, Region):
             logger.debug("Query is a Region object. Extracting text.")
             if not isinstance(effective_options, TextSearchOptions):
-                logger.warning("Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction.")
+                logger.warning(
+                    "Querying with Region image requires MultiModalSearchOptions. Falling back to text extraction."
+                )
             query_input = query.extract_text()
             if not query_input or query_input.isspace():
                 logger.error("Region has no extractable text for query.")
                 return []
+        # Add filter to scope search to THIS PDF
         # Add filter to scope search to THIS PDF
         pdf_scope_filter = {
             "field": "pdf_path",
@@ -760,7 +918,10 @@ class PDF(ExtractionMixin):
         # Combine with existing filters in options (if any)
         if effective_options.filters:
             logger.debug(f"Combining PDF scope filter with existing filters")
-            if isinstance(effective_options.filters, dict) and effective_options.filters.get("operator") == "AND":
+            if (
+                isinstance(effective_options.filters, dict)
+                and effective_options.filters.get("operator") == "AND"
+            ):
                 effective_options.filters["conditions"].append(pdf_scope_filter)
             elif isinstance(effective_options.filters, list):
                 effective_options.filters = {
@@ -773,7 +934,9 @@ class PDF(ExtractionMixin):
                     "conditions": [effective_options.filters, pdf_scope_filter],
                 }
             else:
-                logger.warning(f"Unsupported format for existing filters. Overwriting with PDF scope filter.")
+                logger.warning(
+                    f"Unsupported format for existing filters. Overwriting with PDF scope filter."
+                )
                 effective_options.filters = pdf_scope_filter
         else:
             effective_options.filters = pdf_scope_filter
@@ -790,26 +953,40 @@ class PDF(ExtractionMixin):
         except FileNotFoundError as fnf:
             logger.error(f"Search failed: Collection not found. Error: {fnf}")
             raise
+            logger.error(f"Search failed: Collection not found. Error: {fnf}")
+            raise
         except Exception as e:
             logger.error(f"SearchService search failed: {e}")
             raise RuntimeError(f"Search within index failed. See logs for details.") from e
+            logger.error(f"SearchService search failed: {e}")
+            raise RuntimeError(f"Search within index failed. See logs for details.") from e
     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
         """
         Exports OCR results from this PDF into a correction task package.
+        Exports OCR results from this PDF into a correction task package.
         Args:
+            output_zip_path: The path to save the output zip file
             output_zip_path: The path to save the output zip file
             **kwargs: Additional arguments passed to create_correction_task_package
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
         except ImportError:
-            logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
         except Exception as e:
             logger.error(f"Failed to export correction task: {e}")
             raise
+            logger.error(f"Failed to export correction task: {e}")
+            raise
     def correct_ocr(
         self,
@@ -820,17 +997,23 @@ class PDF(ExtractionMixin):
     ) -> "PDF":
         """
         Applies corrections to OCR text elements using a callback function.
+        Applies corrections to OCR text elements using a callback function.
         Args:
+            correction_callback: Function that takes an element and returns corrected text or None
             correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
             max_workers: Maximum number of threads to use for parallel execution
             progress_callback: Optional callback function for progress updates
+            max_workers: Maximum number of threads to use for parallel execution
+            progress_callback: Optional callback function for progress updates
         Returns:
             Self for method chaining
+            Self for method chaining
         """
         target_page_indices = []
+        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -843,14 +1026,17 @@ class PDF(ExtractionMixin):
                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
                 raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
+                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         if not target_page_indices:
             logger.warning("No pages selected for OCR correction.")
             return self
         logger.info(f"Starting OCR correction for pages: {target_page_indices}")
+        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
@@ -862,7 +1048,9 @@ class PDF(ExtractionMixin):
                 )
             except Exception as e:
                 logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
+        logger.info("OCR correction process finished.")
         logger.info("OCR correction process finished.")
         return self
@@ -872,15 +1060,16 @@ class PDF(ExtractionMixin):
             return 0
         return len(self._pages)
-    def __getitem__(self, key) -> Union[Page, "PageCollection"]:
+    def __getitem__(self, key) -> Union["Page", "PageCollection"]:
         """Access pages by index or slice."""
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not initialized yet.")
         if isinstance(key, slice):
             from natural_pdf.elements.collections import PageCollection
             return PageCollection(self._pages[key])
         if isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
@@ -905,13 +1094,12 @@ class PDF(ExtractionMixin):
             try:
                 if hasattr(self._temp_file, "name") and self._temp_file.name:
                     temp_file_path = self._temp_file.name
-                    if os.path.exists(temp_file_path):
+                    # Only unlink if it exists and _is_stream is False (meaning WE created it)
+                    if not self._is_stream and os.path.exists(temp_file_path):
                         os.unlink(temp_file_path)
                         logger.debug(f"Removed temporary PDF file: {temp_file_path}")
             except Exception as e:
                 logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
-            finally:
-                self._temp_file = None
     def __enter__(self):
         """Context manager entry."""
@@ -922,14 +1110,141 @@ class PDF(ExtractionMixin):
         self.close()
     def get_id(self) -> str:
+        """Get unique identifier for this PDF."""
         """Get unique identifier for this PDF."""
         return self.path
+    # --- Deskew Method --- #
+    def deskew(
+        self,
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the
+        specified pages from the original PDF.
+        This method renders each selected page, detects and corrects skew using the 'deskew'
+        library, and then combines the resulting images into a new PDF using 'img2pdf'.
+        The new PDF object is returned directly.
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+        Args:
+            pages: Page indices/slice to include (0-based). If None, processes all pages.
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                                  already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                             already contains processed elements (text, OCR, regions) to
+                             prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                             during automatic detection (e.g., `max_angle`, `num_peaks`).
+        Returns:
+            A new PDF object representing the deskewed document.
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed.
+            ValueError: If `force_overwrite` is False and target pages contain elements.
+            FileNotFoundError: If the source PDF cannot be read (if file-based).
+            IOError: If creating the in-memory PDF fails.
+            RuntimeError: If rendering or deskewing individual pages fails.
+        """
+        if not DESKEW_AVAILABLE:
+            raise ImportError(
+                "Deskew/img2pdf libraries missing. Install with: pip install natural-pdf[deskew]"
+            )
+        target_pages = self._get_target_pages(pages)  # Use helper to resolve pages
+        # --- Safety Check --- #
+        if not force_overwrite:
+            for page in target_pages:
+                # Check if the element manager has been initialized and contains any elements
+                if (
+                    hasattr(page, "_element_mgr")
+                    and page._element_mgr
+                    and page._element_mgr.has_elements()
+                ):
+                    raise ValueError(
+                        f"Page {page.number} contains existing elements (text, OCR, etc.). "
+                        f"Deskewing creates an image-only PDF, discarding these elements. "
+                        f"Set force_overwrite=True to proceed."
+                    )
+        # --- Process Pages --- #
+        deskewed_images_bytes = []
+        logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
+        # Use tqdm via get_tqdm
+        for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
+            try:
+                # Use page.deskew to get the corrected PIL image
+                # Pass down resolutions and kwargs
+                deskewed_img = page.deskew(
+                    resolution=resolution,
+                    angle=None,  # Let page.deskew handle detection/caching
+                    detection_resolution=detection_resolution,
+                    **deskew_kwargs,
+                )
+                if not deskewed_img:
+                    logger.warning(
+                        f"Page {page.number}: Failed to generate deskewed image, skipping."
+                    )
+                    continue
+                # Convert image to bytes for img2pdf (use PNG for lossless quality)
+                with io.BytesIO() as buf:
+                    deskewed_img.save(buf, format="PNG")
+                    deskewed_images_bytes.append(buf.getvalue())
+            except Exception as e:
+                logger.error(
+                    f"Page {page.number}: Failed during deskewing process: {e}", exc_info=True
+                )
+                # Option: Raise a runtime error, or continue and skip the page?
+                # Raising makes the whole operation fail if one page fails.
+                raise RuntimeError(f"Failed to process page {page.number} during deskewing.") from e
+        # --- Create PDF --- #
+        if not deskewed_images_bytes:
+            raise RuntimeError("No pages were successfully processed to create the deskewed PDF.")
+        logger.info(f"Combining {len(deskewed_images_bytes)} deskewed images into in-memory PDF...")
+        try:
+            # Use img2pdf to combine image bytes into PDF bytes
+            pdf_bytes = img2pdf.convert(deskewed_images_bytes)
+            # Wrap bytes in a stream
+            pdf_stream = io.BytesIO(pdf_bytes)
+            # Create a new PDF object from the stream using original config
+            logger.info("Creating new PDF object from deskewed stream...")
+            new_pdf = PDF(
+                pdf_stream,
+                reading_order=self._reading_order,
+                font_attrs=self._font_attrs,
+                keep_spaces=self._config.get("keep_spaces", True),
+            )
+            return new_pdf
+        except Exception as e:
+            logger.error(f"Failed to create in-memory PDF using img2pdf/PDF init: {e}")
+            raise IOError("Failed to create deskewed PDF object from image stream.") from e
+    # --- End Deskew Method --- #
     # --- Classification Methods --- #
     def classify_pages(
         self,
-        categories: List[str],
+        labels: List[str],
         model: Optional[str] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         analysis_key: str = "classification",
@@ -940,7 +1255,7 @@ class PDF(ExtractionMixin):
         Classifies specified pages of the PDF.
         Args:
-            categories: List of category names
+            labels: List of category names
             model: Model identifier ('text', 'vision', or specific HF ID)
             pages: Page indices, slice, or None for all pages
             analysis_key: Key to store results in page's analyses dict
@@ -950,23 +1265,24 @@ class PDF(ExtractionMixin):
         Returns:
             Self for method chaining
         """
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
         try:
-            manager = self.get_manager('classification')
+            manager = self.get_manager("classification")
         except (ValueError, RuntimeError) as e:
             raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
         if not manager or not manager.is_available():
             try:
                 from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
                 if not _CLASSIFICATION_AVAILABLE:
                     raise ImportError("Classification dependencies missing.")
             except ImportError:
                 raise ImportError(
                     "Classification dependencies missing. "
-                    "Install with: pip install \"natural-pdf[classification]\""
+                    'Install with: pip install "natural-pdf[classification]"'
                 )
             raise ClassificationError("ClassificationManager not available.")
@@ -990,12 +1306,14 @@ class PDF(ExtractionMixin):
             return self
         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
-        logger.info(f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})")
+        logger.info(
+            f"Classifying {len(target_pages)} pages using model '{model or '(default)'}' (mode: {inferred_using})"
+        )
         page_contents = []
         pages_to_classify = []
         logger.debug(f"Gathering content for {len(target_pages)} pages...")
         for page in target_pages:
             try:
                 content = page._get_classification_content(model_type=inferred_using, **kwargs)
@@ -1009,13 +1327,13 @@ class PDF(ExtractionMixin):
         if not page_contents:
             logger.warning("No content could be gathered for batch classification.")
             return self
         logger.debug(f"Gathered content for {len(pages_to_classify)} pages.")
         try:
             batch_results = manager.classify_batch(
                 item_contents=page_contents,
-                categories=categories,
+                labels=labels,
                 model_id=model,
                 using=inferred_using,
                 **kwargs,
@@ -1025,17 +1343,23 @@ class PDF(ExtractionMixin):
             raise ClassificationError(f"Batch classification failed: {e}") from e
         if len(batch_results) != len(pages_to_classify):
-            logger.error(f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})")
+            logger.error(
+                f"Mismatch between number of results ({len(batch_results)}) and pages ({len(pages_to_classify)})"
+            )
             return self
-        logger.debug(f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'...")
+        logger.debug(
+            f"Distributing {len(batch_results)} results to pages under key '{analysis_key}'..."
+        )
         for page, result_obj in zip(pages_to_classify, batch_results):
             try:
-                if not hasattr(page, 'analyses') or page.analyses is None:
+                if not hasattr(page, "analyses") or page.analyses is None:
                     page.analyses = {}
                 page.analyses[analysis_key] = result_obj
             except Exception as e:
-                logger.warning(f"Failed to store classification results for page {page.number}: {e}")
+                logger.warning(
+                    f"Failed to store classification results for page {page.number}: {e}"
+                )
         logger.info(f"Finished classifying PDF pages.")
         return self
@@ -1043,7 +1367,7 @@ class PDF(ExtractionMixin):
     # --- End Classification Methods --- #
     # --- Extraction Support --- #
-    def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
         """
         Retrieves the content for the entire PDF.
@@ -1056,28 +1380,28 @@ class PDF(ExtractionMixin):
             List[PIL.Image.Image]: List of page images if using='vision'
             None: If content cannot be retrieved
         """
-        if using == 'text':
+        if using == "text":
             try:
-                layout = kwargs.pop('layout', True)
+                layout = kwargs.pop("layout", True)
                 return self.extract_text(layout=layout, **kwargs)
             except Exception as e:
                 logger.error(f"Error extracting text from PDF: {e}")
                 return None
-        elif using == 'vision':
+        elif using == "vision":
             page_images = []
             logger.info(f"Rendering {len(self.pages)} pages to images...")
-            resolution = kwargs.pop('resolution', 72)
-            include_highlights = kwargs.pop('include_highlights', False)
-            labels = kwargs.pop('labels', False)
+            resolution = kwargs.pop("resolution", 72)
+            include_highlights = kwargs.pop("include_highlights", False)
+            labels = kwargs.pop("labels", False)
             try:
                 for page in tqdm(self.pages, desc="Rendering Pages"):
                     img = page.to_image(
                         resolution=resolution,
                         include_highlights=include_highlights,
                         labels=labels,
-                        **kwargs
+                        **kwargs,
                     )
                     if img:
                         page_images.append(img)
@@ -1093,4 +1417,179 @@ class PDF(ExtractionMixin):
         else:
             logger.error(f"Unsupported value for 'using': {using}")
             return None
     # --- End Extraction Support --- #
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the PDF.
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not hasattr(self, "_pages") or not self._pages:
+            logger.warning(f"No pages found in PDF {self.path}")
+            return []
+        all_data = []
+        for page in tqdm(self._pages, desc="Gathering page data", leave=False):
+            # Basic page information
+            page_data = {
+                "pdf_path": self.path,
+                "page_number": page.number,
+                "page_index": page.index,
+            }
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    image_filename = f"pdf_{Path(self.path).stem}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+            # Add analyses data
+            for key in analysis_keys:
+                if not hasattr(page, "analyses") or not page.analyses:
+                    raise ValueError(f"Page {page.number} does not have analyses data")
+                if key not in page.analyses:
+                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+                # Get the analysis result
+                analysis_result = page.analyses[key]
+                # If the result has a to_dict method, use it
+                if hasattr(analysis_result, "to_dict"):
+                    analysis_data = analysis_result.to_dict()
+                else:
+                    # Otherwise, use the result directly if it's dict-like
+                    try:
+                        analysis_data = dict(analysis_result)
+                    except (TypeError, ValueError):
+                        # Last resort: convert to string
+                        analysis_data = {"raw_result": str(analysis_result)}
+                # Add analysis data to page data with the key as prefix
+                for k, v in analysis_data.items():
+                    page_data[f"{key}.{k}"] = v
+            all_data.append(page_data)
+        return all_data
+    def _get_target_pages(
+        self, pages: Optional[Union[Iterable[int], range, slice]] = None
+    ) -> List["Page"]:
+        """
+        Helper method to get a list of Page objects based on the input pages.
+        Args:
+            pages: Page indices, slice, or None for all pages
+        Returns:
+            List of Page objects
+        """
+        if pages is None:
+            return self._pages
+        elif isinstance(pages, slice):
+            return self._pages[pages]
+        elif hasattr(pages, "__iter__"):
+            try:
+                return [self._pages[i] for i in pages]
+            except IndexError:
+                raise ValueError("Invalid page index provided in 'pages' iterable.")
+            except TypeError:
+                raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+    # --- Classification Mixin Implementation --- #
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Returns the ClassificationManager instance for this PDF."""
+        try:
+            return self.get_manager("classification")
+        except (KeyError, RuntimeError) as e:
+            raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
+        """
+        Provides the content for classifying the entire PDF.
+        Args:
+            model_type: 'text' or 'vision'.
+            **kwargs: Additional arguments (e.g., for text extraction or image rendering).
+        Returns:
+            Extracted text (str) or the first page's image (PIL.Image).
+        Raises:
+            ValueError: If model_type is 'vision' and PDF has != 1 page,
+                      or if model_type is unsupported, or if content cannot be generated.
+        """
+        if model_type == "text":
+            try:
+                # Extract text from the whole document
+                text = self.extract_text(**kwargs)  # Pass relevant kwargs
+                if not text or text.isspace():
+                    raise ValueError("PDF contains no extractable text for classification.")
+                return text
+            except Exception as e:
+                logger.error(f"Error extracting text for PDF classification: {e}")
+                raise ValueError("Failed to extract text for classification.") from e
+        elif model_type == "vision":
+            if len(self.pages) == 1:
+                # Use the single page's content method
+                try:
+                    return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
+                except Exception as e:
+                    logger.error(f"Error getting image from single page for classification: {e}")
+                    raise ValueError("Failed to get image from single page.") from e
+            elif len(self.pages) == 0:
+                raise ValueError("Cannot classify empty PDF using vision model.")
+            else:
+                raise ValueError(
+                    f"Vision classification for a PDF object is only supported for single-page PDFs. "
+                    f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
+                )
+        else:
+            raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
+    # --- End Classification Mixin Implementation ---

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

natural-pdf 0.1.8py3-none-any.whl → 0.1.10py3-none-any.whl