PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.8 (py3-none-any.whl) → 0.1.9 (py3-none-any.whl)

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (134)

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +230 -151
natural_pdf/classification/mixin.py +49 -35
natural_pdf/classification/results.py +64 -46
natural_pdf/collections/mixins.py +68 -20
natural_pdf/collections/pdf_collection.py +177 -64
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +633 -190
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +503 -131
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import base64
+import concurrent.futures  # Added import
 import hashlib
 import io
 import json
@@ -6,19 +7,30 @@ import logging
 import os
 import re
 import tempfile
-import time # Import time
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
-import concurrent.futures # Added import
-from tqdm.auto import tqdm # Added tqdm import
 import threading
+import time  # Import time
+from pathlib import Path
+from typing import (  # Added overload
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
 import pdfplumber
 from PIL import Image, ImageDraw
+from tqdm.auto import tqdm  # Added tqdm import
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
+from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock  # Import from utils instead
+from natural_pdf.utils.visualization import render_plain_page
 if TYPE_CHECKING:
     import pdfplumber
@@ -31,6 +43,8 @@ if TYPE_CHECKING:
 # New Imports
 import itertools
+# Deskew Imports (Conditional)
+import numpy as np
 from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -39,27 +53,35 @@ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.analyzers.layout.layout_options import LayoutOptions
 from natural_pdf.analyzers.text_options import TextStyleOptions
 from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
+from natural_pdf.classification.manager import ClassificationManager  # For type hint
+# --- Classification Imports --- #
+from natural_pdf.classification.mixin import ClassificationMixin  # Import classification mixin
 from natural_pdf.core.element_manager import ElementManager
+from natural_pdf.elements.base import Element  # Import base element
 from natural_pdf.elements.text import TextElement
+from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr import OCRManager, OCROptions
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
+from natural_pdf.qa import DocumentQA, get_qa_engine
+from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 from natural_pdf.widgets import InteractiveViewerWidget
 from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
-from natural_pdf.qa import DocumentQA, get_qa_engine
-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
-# --- Classification Imports --- #
-from natural_pdf.classification.mixin import ClassificationMixin
-from natural_pdf.classification.manager import ClassificationManager # For type hint
 # --- End Classification Imports --- #
-from natural_pdf.utils.locks import pdf_render_lock # Import the lock
-from natural_pdf.elements.base import Element # Import base element
-from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
-from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
+try:
+    from deskew import determine_skew
+    DESKEW_AVAILABLE = True
+except ImportError:
+    DESKEW_AVAILABLE = False
+    determine_skew = None
+# End Deskew Imports
 logger = logging.getLogger(__name__)
@@ -87,6 +109,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._index = index
         self._text_styles = None  # Lazy-loaded text style analyzer results
         self._exclusions = []  # List to store exclusion functions/regions
+        self._skew_angle: Optional[float] = None  # Stores detected skew angle
         # --- ADDED --- Metadata store for mixins
         self.metadata: Dict[str, Any] = {}
@@ -436,25 +459,79 @@ class Page(ClassificationMixin, ExtractionMixin):
         return filtered_elements
-    def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]: ...
+    def find(
+        self,
+        selector: Optional[str] = None,  # Now optional
+        *,  # Force subsequent args to be keyword-only
+        text: Optional[str] = None,  # New text parameter
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[Any]:
         """
-        Find first element on this page matching selector.
+        Find first element on this page matching selector OR text content.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
-            **kwargs: Additional filter parameters
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
         Returns:
-            Element object or None if not found
-        """
-        from natural_pdf.selectors.parser import parse_selector
+            Element object or None if not found.
+        """
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            # Escape quotes within the text for the selector string
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            # Default to 'text:contains(...)'
+            effective_selector = f'text:contains("{escaped_text}")'
+            # Note: regex/case handled by kwargs passed down
+            logger.debug(
+                f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            # Should be unreachable due to checks above
+            raise ValueError("Internal error: No selector or text provided.")
-        selector_obj = parse_selector(selector)
+        selector_obj = parse_selector(effective_selector)
-        # Pass regex and case flags to selector function
+        # Pass regex and case flags to selector function via kwargs
         kwargs["regex"] = regex
         kwargs["case"] = case
@@ -474,27 +551,80 @@ class Page(ClassificationMixin, ExtractionMixin):
         else:
             return None
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    @overload
     def find_all(
-        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    def find_all(
+        self,
+        selector: Optional[str] = None,  # Now optional
+        *,  # Force subsequent args to be keyword-only
+        text: Optional[str] = None,  # New text parameter
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> "ElementCollection":
         """
-        Find all elements on this page matching selector.
+        Find all elements on this page matching selector OR text content.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
-            **kwargs: Additional filter parameters
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
         Returns:
-            ElementCollection with matching elements
-        """
-        from natural_pdf.selectors.parser import parse_selector
+            ElementCollection with matching elements.
+        """
+        from natural_pdf.elements.collections import ElementCollection  # Import here for type hint
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text', not both.")
+        if selector is None and text is None:
+            raise ValueError("Provide either 'selector' or 'text'.")
+        # Construct selector if 'text' is provided
+        effective_selector = ""
+        if text is not None:
+            # Escape quotes within the text for the selector string
+            escaped_text = text.replace('"', '\\"').replace("'", "\\'")
+            # Default to 'text:contains(...)'
+            effective_selector = f'text:contains("{escaped_text}")'
+            logger.debug(
+                f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
+            )
+        elif selector is not None:
+            effective_selector = selector
+        else:
+            # Should be unreachable due to checks above
+            raise ValueError("Internal error: No selector or text provided.")
-        selector_obj = parse_selector(selector)
+        selector_obj = parse_selector(effective_selector)
-        # Pass regex and case flags to selector function
+        # Pass regex and case flags to selector function via kwargs
         kwargs["regex"] = regex
         kwargs["case"] = case
@@ -1282,18 +1412,22 @@ class Page(ClassificationMixin, ExtractionMixin):
         image = None
         render_resolution = resolution if resolution is not None else scale * 72
         thread_id = threading.current_thread().name
-        logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
+        logger.debug(
+            f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
+        )
         lock_wait_start = time.monotonic()
         try:
             # Acquire the global PDF rendering lock
             with pdf_render_lock:
                 lock_acquired_time = time.monotonic()
-                logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
+                logger.debug(
+                    f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
+                )
                 if include_highlights:
                     # Delegate rendering to the central service
                     image = self._highlighter.render_page(
                         page_index=self.index,
-                        scale=scale,  # Note: scale is used by highlighter internally for drawing
+                        scale=scale,
                         labels=labels,
                         legend_position=legend_position,
                         render_ocr=render_ocr,
@@ -1301,28 +1435,15 @@ class Page(ClassificationMixin, ExtractionMixin):
                         **kwargs,
                     )
                 else:
-                    # Get the base page image directly from pdfplumber if no highlights needed
-                    # Use the underlying pdfplumber page object
-                    img_object = self._page.to_image(resolution=render_resolution, **kwargs)
-                    # Access the PIL image directly (assuming pdfplumber structure)
-                    image = (
-                        img_object.annotated
-                        if hasattr(img_object, "annotated")
-                        else img_object._repr_png_()
-                    )
-                    if isinstance(image, bytes):  # Handle cases where it returns bytes
-                        from io import BytesIO
-                        image = Image.open(BytesIO(image)).convert(
-                            "RGB"
-                        )  # Convert to RGB for consistency
+                    image = render_plain_page(self, render_resolution)
         except Exception as e:
             logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
             return None  # Return None on error
         finally:
             render_end_time = time.monotonic()
-            logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
+            logger.debug(
+                f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
+            )
         if image is None:
             return None
@@ -1445,7 +1566,9 @@ class Page(ClassificationMixin, ExtractionMixin):
         # Remove existing OCR elements if replace is True
         if replace and hasattr(self, "_element_mgr"):
-            logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
+            logger.info(
+                f"Page {self.number}: Removing existing OCR elements before applying new OCR."
+            )
             self._element_mgr.remove_ocr_elements()
         logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
@@ -1513,7 +1636,9 @@ class Page(ClassificationMixin, ExtractionMixin):
             with pdf_render_lock:
                 image = self.to_image(resolution=final_resolution, include_highlights=False)
                 if not image:
-                    logger.error(f"  Failed to render page {self.number} to image for OCR extraction.")
+                    logger.error(
+                        f"  Failed to render page {self.number} to image for OCR extraction."
+                    )
                     return []
                 logger.debug(f"  Rendered image size: {image.width}x{image.height}")
         except Exception as e:
@@ -1585,6 +1710,11 @@ class Page(ClassificationMixin, ExtractionMixin):
         logger.info(f"  Created {len(temp_elements)} TextElements from OCR (extract only).")
         return temp_elements
+    @property
+    def size(self) -> Tuple[float, float]:
+        """Get the size of the page in points."""
+        return (self._page.width, self._page.height)
     @property
     def layout_analyzer(self) -> LayoutAnalyzer:
         """Get or create the layout analyzer for this page."""
@@ -1604,6 +1734,8 @@ class Page(ClassificationMixin, ExtractionMixin):
         exclude_classes: Optional[List[str]] = None,
         device: Optional[str] = None,
         existing: str = "replace",
+        model_name: Optional[str] = None,
+        client: Optional[Any] = None,  # Add client parameter
     ) -> ElementCollection[Region]:
         """
         Analyze the page layout using the configured LayoutManager.
@@ -1629,6 +1761,8 @@ class Page(ClassificationMixin, ExtractionMixin):
             exclude_classes=exclude_classes,
             device=device,
             existing=existing,
+            model_name=model_name,
+            client=client,  # Pass client down
         )
         # Retrieve the detected regions from the element manager
@@ -1699,14 +1833,24 @@ class Page(ClassificationMixin, ExtractionMixin):
             )
             return None
+    def split(self, divider, **kwargs) -> "ElementCollection[Region]":
+        """
+        Divides the page into sections based on the provided divider elements.
+        """
+        sections = self.get_sections(start_elements=divider, **kwargs)
+        top = self.region(0, 0, self.width, sections[0].top)
+        sections.append(top)
+        return sections
     def get_sections(
         self,
         start_elements=None,
         end_elements=None,
-        boundary_inclusion="both",
+        boundary_inclusion="start",
         y_threshold=5.0,
         bounding_box=None,
-    ) -> "ElementCollection[Region]":  # Updated type hint
+    ) -> "ElementCollection[Region]":
         """
         Get sections of a page defined by start/end elements.
         Uses the page-level implementation.
@@ -2068,7 +2212,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self,
         correction_callback: Callable[[Any], Optional[str]],
         max_workers: Optional[int] = None,
-        progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
+        progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
         """
         Applies corrections to OCR-generated text elements on this page
@@ -2096,7 +2240,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         target_elements_collection = self.find_all(
             selector="text[source=ocr]", apply_exclusions=False
         )
-        target_elements = target_elements_collection.elements # Get the list
+        target_elements = target_elements_collection.elements  # Get the list
         if not target_elements:
             logger.info(f"Page {self.number}: No OCR elements found to correct.")
@@ -2109,22 +2253,24 @@ class Page(ClassificationMixin, ExtractionMixin):
         # Define the task to be run by the worker thread or sequentially
         def _process_element_task(element):
             try:
-                current_text = getattr(element, 'text', None)
+                current_text = getattr(element, "text", None)
                 # Call the user-provided callback
                 corrected_text = correction_callback(element)
                 # Validate result type
                 if corrected_text is not None and not isinstance(corrected_text, str):
-                    logger.warning(f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update.")
-                    return element, None, None # Treat as no correction
+                    logger.warning(
+                        f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
+                    )
+                    return element, None, None  # Treat as no correction
                 return element, corrected_text, None  # Return element, result, no error
             except Exception as e:
                 logger.error(
                     f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
-                    exc_info=False # Keep log concise
+                    exc_info=False,  # Keep log concise
                 )
-                return element, None, e # Return element, no result, error
+                return element, None, e  # Return element, no result, error
             finally:
                 # --- Call progress callback here --- #
                 if progress_callback:
@@ -2132,16 +2278,24 @@ class Page(ClassificationMixin, ExtractionMixin):
                         progress_callback()
                     except Exception as cb_e:
                         # Log error in callback itself, but don't stop processing
-                        logger.error(f"Page {self.number}: Error executing progress_callback: {cb_e}", exc_info=False)
+                        logger.error(
+                            f"Page {self.number}: Error executing progress_callback: {cb_e}",
+                            exc_info=False,
+                        )
         # Choose execution strategy based on max_workers
         if max_workers is not None and max_workers > 1:
             # --- Parallel execution --- #
-            logger.info(f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers.")
+            logger.info(
+                f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
+            )
             futures = []
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                 # Submit all tasks
-                future_to_element = {executor.submit(_process_element_task, element): element for element in target_elements}
+                future_to_element = {
+                    executor.submit(_process_element_task, element): element
+                    for element in target_elements
+                }
                 # Process results as they complete (progress_callback called by worker)
                 for future in concurrent.futures.as_completed(future_to_element):
@@ -2153,14 +2307,17 @@ class Page(ClassificationMixin, ExtractionMixin):
                             # Error already logged in worker
                         elif corrected_text is not None:
                             # Apply correction if text changed
-                            current_text = getattr(element, 'text', None)
+                            current_text = getattr(element, "text", None)
                             if corrected_text != current_text:
                                 element.text = corrected_text
                                 updated_count += 1
                     except Exception as exc:
                         # Catch errors from future.result() itself
-                        element = future_to_element[future] # Find original element
-                        logger.error(f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}", exc_info=True)
+                        element = future_to_element[future]  # Find original element
+                        logger.error(
+                            f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
+                            exc_info=True,
+                        )
                         error_count += 1
                         # Note: progress_callback was already called in the worker's finally block
@@ -2168,65 +2325,230 @@ class Page(ClassificationMixin, ExtractionMixin):
             # --- Sequential execution --- #
             logger.info(f"Page {self.number}: Running OCR correction sequentially.")
             for element in target_elements:
-                 # Call the task function directly (it handles progress_callback)
-                 processed_count += 1
-                 _element, corrected_text, error = _process_element_task(element)
-                 if error:
-                     error_count += 1
-                 elif corrected_text is not None:
-                     # Apply correction if text changed
-                     current_text = getattr(_element, 'text', None)
-                     if corrected_text != current_text:
-                         _element.text = corrected_text
-                         updated_count += 1
+                # Call the task function directly (it handles progress_callback)
+                processed_count += 1
+                _element, corrected_text, error = _process_element_task(element)
+                if error:
+                    error_count += 1
+                elif corrected_text is not None:
+                    # Apply correction if text changed
+                    current_text = getattr(_element, "text", None)
+                    if corrected_text != current_text:
+                        _element.text = corrected_text
+                        updated_count += 1
         logger.info(
-             f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
+            f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
         )
-        return self # Return self for chaining
+        return self  # Return self for chaining
     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
+        """Return the classification manager obtained from the parent PDF.
+        Looks the manager up via `self.pdf.get_manager("classification")`.
+        Raises:
+            AttributeError: If this page has no `pdf` attribute, the PDF has no
+                `get_manager` attribute, or `get_manager` raises ValueError,
+                RuntimeError or AttributeError (re-raised as AttributeError with
+                the original exception chained).
+        """
-        if not hasattr(self, 'pdf') or not hasattr(self.pdf, 'get_manager'):
-             raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
+        if not hasattr(self, "pdf") or not hasattr(self.pdf, "get_manager"):
+            raise AttributeError(
+                "ClassificationManager cannot be accessed: Parent PDF or get_manager method missing."
+            )
         try:
-             # Use the PDF's manager registry accessor
-             return self.pdf.get_manager('classification')
+            # Use the PDF's manager registry accessor
+            return self.pdf.get_manager("classification")
         except (ValueError, RuntimeError, AttributeError) as e:
             # Wrap potential errors from get_manager for clarity
             raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
-    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
-        if model_type == 'text':
-            text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
+    def _get_classification_content(
+        self, model_type: str, **kwargs
+    ) -> Union[str, "Image"]:  # Use "Image" for lazy import
+        """Build the classifier input for this page.
+        Args:
+            model_type: "text" returns the page text extracted with
+                `layout=False, use_exclusions=False`; "vision" returns a rendered
+                page image without highlights, labels or exclusion masking.
+            **kwargs: Only `resolution` is read (vision mode, default 150).
+        Returns:
+            The extracted text for "text", or the rendered image for "vision".
+        Raises:
+            ValueError: If the text is empty or whitespace-only, if the image
+                render returns None, or if `model_type` is neither "text" nor
+                "vision".
+        """
+        if model_type == "text":
+            text_content = self.extract_text(
+                layout=False, use_exclusions=False
+            )  # Simple join, ignore exclusions for classification
             if not text_content or text_content.isspace():
                 raise ValueError("Cannot classify page with 'text' model: No text content found.")
             return text_content
-        elif model_type == 'vision':
+        elif model_type == "vision":
             # Get resolution from manager/kwargs if possible, else default
+            # NOTE(review): `manager` is assigned but never read below; the call
+            # itself can still raise AttributeError when no manager is available.
             manager = self._get_classification_manager()
             default_resolution = 150
             # Access kwargs passed to classify method if needed
-            resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
+            # NOTE(review): `kwargs` is a declared parameter, so the
+            # `"kwargs" in locals()` test is always True and the else branch
+            # is never taken.
+            resolution = (
+                kwargs.get("resolution", default_resolution)
+                if "kwargs" in locals()
+                else default_resolution
+            )
             # Use to_image, ensuring no highlights interfere
             img = self.to_image(
                 resolution=resolution,
                 include_highlights=False,
                 labels=False,
-                exclusions=None # Don't mask exclusions for classification input image
+                exclusions=None,  # Don't mask exclusions for classification input image
             )
             if img is None:
-                raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
+                raise ValueError(
+                    "Cannot classify page with 'vision' model: Failed to render image."
+                )
             return img
         else:
             raise ValueError(f"Unsupported model_type for classification: {model_type}")
     def _get_metadata_storage(self) -> Dict[str, Any]:
+        """Return this page's `metadata` dict, creating an empty one if it is missing or None."""
         # Ensure metadata exists
-        if not hasattr(self, 'metadata') or self.metadata is None:
+        if not hasattr(self, "metadata") or self.metadata is None:
             self.metadata = {}
         return self.metadata
     # --- Content Extraction ---
+    # --- Skew Detection and Correction --- #
+    @property
+    def skew_angle(self) -> Optional[float]:
+        """Get the detected skew angle for this page (if calculated).
+        Reads the stored `_skew_angle` value without running detection;
+        `detect_skew_angle` is the method that assigns it.
+        """
+        return self._skew_angle
+    def detect_skew_angle(
+        self,
+        resolution: int = 72,
+        grayscale: bool = True,
+        force_recalculate: bool = False,
+        **deskew_kwargs,
+    ) -> Optional[float]:
+        """
+        Detects the skew angle of the page image and stores it.
+        The result is stored in `_skew_angle`. A stored non-None angle is returned
+        as-is on later calls, regardless of the arguments passed, unless
+        `force_recalculate` is True.
+        Args:
+            resolution: DPI resolution for rendering the page image for detection.
+            grayscale: Whether to convert the image to grayscale before detection.
+            force_recalculate: If True, recalculate even if an angle exists.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                             (e.g., `max_angle`, `num_peaks`).
+        Returns:
+            The detected skew angle in degrees, or None if detection failed.
+            Errors raised while rendering or detecting are logged as warnings
+            and reported as None rather than propagated.
+        Raises:
+            ImportError: If the 'deskew' library is not installed.
+        """
+        if not DESKEW_AVAILABLE:
+            raise ImportError(
+                "Deskew library not found. Install with: pip install natural-pdf[deskew]"
+            )
+        # A stored None (nothing detected yet, or a previous failure) falls
+        # through this check and triggers a fresh detection.
+        if self._skew_angle is not None and not force_recalculate:
+            logger.debug(f"Page {self.number}: Returning cached skew angle: {self._skew_angle:.2f}")
+            return self._skew_angle
+        logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
+        try:
+            # Render the page at the specified detection resolution
+            img = self.to_image(resolution=resolution, include_highlights=False)
+            if not img:
+                logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
+                self._skew_angle = None
+                return None
+            # Convert to numpy array
+            img_np = np.array(img)
+            # Convert to grayscale if needed
+            if grayscale:
+                if len(img_np.shape) == 3 and img_np.shape[2] >= 3:
+                    # Average the first three channels into one uint8 channel;
+                    # any further channel (e.g. alpha) is ignored.
+                    gray_np = np.mean(img_np[:, :, :3], axis=2).astype(np.uint8)
+                elif len(img_np.shape) == 2:
+                    gray_np = img_np  # Already grayscale
+                else:
+                    logger.warning(
+                        f"Page {self.number}: Unexpected image shape {img_np.shape} for grayscale conversion."
+                    )
+                    gray_np = img_np  # Try using it anyway
+            else:
+                gray_np = img_np  # Use original if grayscale=False
+            # Determine skew angle using the deskew library
+            angle = determine_skew(gray_np, **deskew_kwargs)
+            # Store the result (whatever determine_skew returned) for later calls.
+            self._skew_angle = angle
+            logger.debug(f"Page {self.number}: Detected skew angle = {angle}")
+            return angle
+        except Exception as e:
+            # Best-effort: log the failure, clear the stored angle, return None.
+            logger.warning(f"Page {self.number}: Failed during skew detection: {e}", exc_info=True)
+            self._skew_angle = None
+            return None
+    def deskew(
+        self,
+        resolution: int = 300,
+        angle: Optional[float] = None,
+        detection_resolution: int = 72,
+        **deskew_kwargs,
+    ) -> Optional[Image.Image]:
+        """
+        Creates and returns a deskewed PIL image of the page.
+        If `angle` is not provided, it will first try to detect the skew angle
+        using `detect_skew_angle` (or use the cached angle if available).
+        The page is rendered at `resolution` and rotated only when the angle's
+        magnitude exceeds 0.05 degrees; otherwise the unrotated render is returned.
+        Args:
+            resolution: DPI resolution for the output deskewed image.
+            angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
+            detection_resolution: DPI resolution used for detection if `angle` is None.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                             if automatic detection is performed.
+        Returns:
+            A deskewed PIL.Image.Image object, or None if rendering/rotation fails.
+        Raises:
+            ImportError: If the 'deskew' library is not installed.
+        """
+        if not DESKEW_AVAILABLE:
+            raise ImportError(
+                "Deskew library not found. Install with: pip install natural-pdf[deskew]"
+            )
+        # Determine the angle to use
+        rotation_angle = angle
+        if rotation_angle is None:
+            # Detect angle (or use cached) if not explicitly provided
+            # `force_recalculate` is not passed, so a previously stored angle is
+            # reused even if `detection_resolution` or `deskew_kwargs` differ.
+            rotation_angle = self.detect_skew_angle(
+                resolution=detection_resolution, **deskew_kwargs
+            )
+        logger.debug(
+            f"Page {self.number}: Preparing to deskew (output resolution={resolution} DPI). Using angle: {rotation_angle}"
+        )
+        try:
+            # Render the original page at the desired output resolution
+            img = self.to_image(resolution=resolution, include_highlights=False)
+            if not img:
+                logger.error(f"Page {self.number}: Failed to render image for deskewing.")
+                return None
+            # Rotate if a significant angle was found/provided
+            # (a None angle, i.e. failed detection, also skips rotation)
+            if rotation_angle is not None and abs(rotation_angle) > 0.05:
+                logger.debug(f"Page {self.number}: Rotating by {rotation_angle:.2f} degrees.")
+                # Determine fill color based on image mode
+                # NOTE(review): every mode other than "RGB" gets the scalar 255 —
+                # confirm this yields white for multi-band modes such as "RGBA".
+                fill = (255, 255, 255) if img.mode == "RGB" else 255  # White background
+                # Rotate the image using PIL
+                rotated_img = img.rotate(
+                    rotation_angle,  # deskew provides angle, PIL rotates counter-clockwise
+                    resample=Image.Resampling.BILINEAR,
+                    expand=True,  # Expand image to fit rotated content
+                    fillcolor=fill,
+                )
+                return rotated_img
+            else:
+                logger.debug(
+                    f"Page {self.number}: No significant rotation needed (angle={rotation_angle}). Returning original render."
+                )
+                return img  # Return the original rendered image if no rotation needed
+        except Exception as e:
+            # Best-effort: rendering/rotation errors are logged and reported as None.
+            logger.error(
+                f"Page {self.number}: Error during deskewing image generation: {e}", exc_info=True
+            )
+            return None
+    # --- End Skew Detection and Correction --- #

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl