PyPI - natural-pdf - Versions diffs - 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

natural_pdf/__init__.py +24 -40
natural_pdf/classification/manager.py +26 -22
natural_pdf/classification/mixin.py +7 -7
natural_pdf/classification/results.py +17 -9
natural_pdf/collections/mixins.py +17 -0
natural_pdf/collections/pdf_collection.py +78 -46
natural_pdf/core/page.py +17 -17
natural_pdf/core/pdf.py +192 -18
natural_pdf/elements/collections.py +307 -3
natural_pdf/elements/region.py +2 -3
natural_pdf/exporters/hocr.py +540 -0
natural_pdf/exporters/hocr_font.py +142 -0
natural_pdf/exporters/original_pdf.py +130 -0
natural_pdf/exporters/searchable_pdf.py +3 -3
natural_pdf/ocr/engine_surya.py +1 -1
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +20 -17
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -40,10 +40,10 @@ if TYPE_CHECKING:
     from natural_pdf.elements.base import Element
     from natural_pdf.elements.collections import ElementCollection
-# New Imports
+# # New Imports
 import itertools
-# Deskew Imports (Conditional)
+# # Deskew Imports (Conditional)
 import numpy as np
 from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
 from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
 from natural_pdf.classification.manager import ClassificationManager  # For type hint
-# --- Classification Imports --- #
+# # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin  # Import classification mixin
 from natural_pdf.core.element_manager import ElementManager
 from natural_pdf.elements.base import Element  # Import base element
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
-# Import new utils
+# # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 from natural_pdf.widgets import InteractiveViewerWidget
 from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def add_exclusion(
         self,
-        exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
+        exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
         label: Optional[str] = None,
     ) -> "Page":
         """
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
+    def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
         """
         Add a region to the page.
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
+    def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
         """
         Add multiple regions to the page.
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
+    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
         """
         Get all exclusion regions for this page.
         Assumes self._exclusions contains tuples of (callable/Region, label).
@@ -1349,7 +1349,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._highlighter.clear_page(self.index)
         return self
-    def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
+    def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
         """
         Analyze text elements by style, adding attributes directly to elements.
@@ -1520,7 +1520,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
-    ) -> List[TextElement]:
+    ) -> List["TextElement"]:
         """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
         logger.warning(
             "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
@@ -1532,7 +1532,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def apply_ocr(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        options: Optional["OCROptions"] = None,
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
@@ -1597,12 +1597,12 @@ class Page(ClassificationMixin, ExtractionMixin):
     def extract_ocr_elements(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        options: Optional["OCROptions"] = None,
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
         resolution: Optional[int] = None,
-    ) -> List[TextElement]:
+    ) -> List["TextElement"]:
         """
         Extract text elements using OCR *without* adding them to the page's elements.
         Uses the shared OCRManager instance.
@@ -1716,7 +1716,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return (self._page.width, self._page.height)
     @property
-    def layout_analyzer(self) -> LayoutAnalyzer:
+    def layout_analyzer(self) -> "LayoutAnalyzer":
         """Get or create the layout analyzer for this page."""
         if self._layout_analyzer is None:
             if not self._layout_manager:
@@ -1728,7 +1728,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def analyze_layout(
         self,
         engine: Optional[str] = None,
-        options: Optional[LayoutOptions] = None,
+        options: Optional["LayoutOptions"] = None,
         confidence: Optional[float] = None,
         classes: Optional[List[str]] = None,
         exclude_classes: Optional[List[str]] = None,
@@ -1736,7 +1736,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         existing: str = "replace",
         model_name: Optional[str] = None,
         client: Optional[Any] = None,  # Add client parameter
-    ) -> ElementCollection[Region]:
+    ) -> "ElementCollection[Region]":
         """
         Analyze the page layout using the configured LayoutManager.
         Adds detected Region objects to the page's element manager.
@@ -1813,7 +1813,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def get_section_between(
         self, start_element=None, end_element=None, boundary_inclusion="both"
-    ) -> Optional[Region]:  # Return Optional
+    ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
         """

natural_pdf/core/pdf.py CHANGED Viewed

@@ -60,6 +60,14 @@ except ImportError:
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
+try:
+    from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+except ImportError:
+    create_searchable_pdf = None
+try:
+    from natural_pdf.exporters.original_pdf import create_original_pdf
+except ImportError:
+    create_original_pdf = None
 logger = logging.getLogger("natural_pdf.core.pdf")
 tqdm = get_tqdm()
@@ -84,7 +92,7 @@ except ImportError:
 # End Deskew Imports
-class PDF(ExtractionMixin, ExportMixin):
+class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.
@@ -194,6 +202,7 @@ class PDF(ExtractionMixin, ExportMixin):
         self._initialize_managers()
         self._initialize_highlighter()
+        self.analyses: Dict[str, Any] = {}
     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -259,7 +268,7 @@ class PDF(ExtractionMixin, ExportMixin):
         return self
     def add_exclusion(
-        self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
+        self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
     ) -> "PDF":
         """
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -467,7 +476,7 @@ class PDF(ExtractionMixin, ExportMixin):
         return self
     def add_region(
-        self, region_func: Callable[["Page"], Optional[Region]], name: str = None
+        self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
     ) -> "PDF":
         """
         Add a region function to the PDF.
@@ -768,23 +777,133 @@ class PDF(ExtractionMixin, ExportMixin):
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
+        DEPRECATED: Use save_pdf(..., ocr=True) instead.
         Saves the PDF with an OCR text layer, making content searchable.
-        Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
+        Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
         Args:
             output_path: Path to save the searchable PDF
             dpi: Resolution for rendering and OCR overlay
             **kwargs: Additional keyword arguments passed to the exporter
-            output_path: Path to save the searchable PDF
-            dpi: Resolution for rendering and OCR overlay
-            **kwargs: Additional keyword arguments passed to the exporter
         """
-        from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+        logger.warning(
+            "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
+        )
+        if create_searchable_pdf is None:
+             raise ImportError(
+                 "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
+                 "Install with: pip install \"natural-pdf[ocr-export]\""
+             )
         output_path_str = str(output_path)
+        # Call the exporter directly, passing self (the PDF instance)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
-        logger.info(f"Searchable PDF saved to: {output_path_str}")
+        # Logger info is handled within the exporter now
+        # logger.info(f"Searchable PDF saved to: {output_path_str}")
+    def save_pdf(
+        self,
+        output_path: Union[str, Path],
+        ocr: bool = False,
+        original: bool = False,
+        dpi: int = 300,
+    ):
+        """
+        Saves the PDF object (all its pages) to a new file.
+        Choose one saving mode:
+        - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
+          Text generated during the natural-pdf session becomes searchable,
+          but original vector content is lost. Requires 'ocr-export' extras.
+        - `original=True`: Saves a copy of the original PDF file this object represents.
+          Any OCR results or analyses from the natural-pdf session are NOT included.
+          If the PDF was opened from an in-memory buffer, this mode may not be suitable.
+          Requires 'ocr-export' extras.
+        Args:
+            output_path: Path to save the new PDF file.
+            ocr: If True, save as a searchable, image-based PDF using OCR data.
+            original: If True, save the original source PDF content.
+            dpi: Resolution (dots per inch) used only when ocr=True.
+        Raises:
+            ValueError: If the PDF has no pages, if neither or both 'ocr'
+                        and 'original' are True.
+            ImportError: If required libraries are not installed for the chosen mode.
+            RuntimeError: If an unexpected error occurs during saving.
+        """
+        if not self.pages:
+            raise ValueError("Cannot save an empty PDF object.")
+        if not (ocr ^ original):  # XOR: exactly one must be true
+            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
+        output_path_obj = Path(output_path)
+        output_path_str = str(output_path_obj)
+        if ocr:
+            if create_searchable_pdf is None:
+                raise ImportError(
+                    "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
+                    "Install with: pip install \"natural-pdf[ocr-export]\""
+                )
+            # Optional: Add warning about vector data loss similar to PageCollection
+            has_vector_elements = False
+            for page in self.pages:
+                if (hasattr(page, 'rects') and page.rects or
+                    hasattr(page, 'lines') and page.lines or
+                    hasattr(page, 'curves') and page.curves or
+                    (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
+                    (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
+                    has_vector_elements = True
+                    break
+            if has_vector_elements:
+                 logger.warning(
+                     "Warning: Saving with ocr=True creates an image-based PDF. "
+                     "Original vector elements (rects, lines, non-OCR text/chars) "
+                     "will not be preserved in the output file."
+                 )
+            logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
+            try:
+                # Delegate to the searchable PDF exporter, passing self (PDF instance)
+                create_searchable_pdf(self, output_path_str, dpi=dpi)
+            except Exception as e:
+                 raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
+        elif original:
+            if create_original_pdf is None:
+                raise ImportError(
+                    "Saving with original=True requires 'pikepdf'. "
+                    "Install with: pip install \"natural-pdf[ocr-export]\""
+                )
+             # Optional: Add warning about losing OCR data similar to PageCollection
+            has_ocr_elements = False
+            for page in self.pages:
+                 if hasattr(page, 'find_all'):
+                     ocr_text_elements = page.find_all("text[source=ocr]")
+                     if ocr_text_elements:
+                         has_ocr_elements = True
+                         break
+                 elif hasattr(page, 'words'): # Fallback
+                     if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
+                          has_ocr_elements = True
+                          break
+            if has_ocr_elements:
+                logger.warning(
+                    "Warning: Saving with original=True preserves original page content. "
+                    "OCR text generated in this session will not be included in the saved file."
+                )
+            logger.info(f"Saving original PDF content to: {output_path_str}")
+            try:
+                 # Delegate to the original PDF exporter, passing self (PDF instance)
+                 create_original_pdf(self, output_path_str)
+            except Exception as e:
+                 # Re-raise exception from exporter
+                 raise e
     def ask(
         self,
@@ -849,9 +968,9 @@ class PDF(ExtractionMixin, ExportMixin):
     def search_within_index(
         self,
-        query: Union[str, Path, Image.Image, Region],
-        search_service: SearchServiceProtocol,
-        options: Optional[SearchOptions] = None,
+        query: Union[str, Path, Image.Image, "Region"],
+        search_service: "SearchServiceProtocol",
+        options: Optional["SearchOptions"] = None,
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant documents from this PDF within a search index.
@@ -1243,7 +1362,7 @@ class PDF(ExtractionMixin, ExportMixin):
     def classify_pages(
         self,
-        categories: List[str],
+        labels: List[str],
         model: Optional[str] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         analysis_key: str = "classification",
@@ -1254,7 +1373,7 @@ class PDF(ExtractionMixin, ExportMixin):
         Classifies specified pages of the PDF.
         Args:
-            categories: List of category names
+            labels: List of category names
             model: Model identifier ('text', 'vision', or specific HF ID)
             pages: Page indices, slice, or None for all pages
             analysis_key: Key to store results in page's analyses dict
@@ -1264,8 +1383,8 @@ class PDF(ExtractionMixin, ExportMixin):
         Returns:
             Self for method chaining
         """
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
         try:
             manager = self.get_manager("classification")
@@ -1332,7 +1451,7 @@ class PDF(ExtractionMixin, ExportMixin):
         try:
             batch_results = manager.classify_batch(
                 item_contents=page_contents,
-                categories=categories,
+                labels=labels,
                 model_id=model,
                 using=inferred_using,
                 **kwargs,
@@ -1537,3 +1656,58 @@ class PDF(ExtractionMixin, ExportMixin):
                 raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+    # --- Classification Mixin Implementation --- #
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Returns the ClassificationManager instance for this PDF."""
+        try:
+            return self.get_manager("classification")
+        except (KeyError, RuntimeError) as e:
+            raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
+        """
+        Provides the content for classifying the entire PDF.
+        Args:
+            model_type: 'text' or 'vision'.
+            **kwargs: Additional arguments (e.g., for text extraction or image rendering).
+        Returns:
+            Extracted text (str) or the first page's image (PIL.Image).
+        Raises:
+            ValueError: If model_type is 'vision' and PDF has != 1 page,
+                      or if model_type is unsupported, or if content cannot be generated.
+        """
+        if model_type == "text":
+            try:
+                # Extract text from the whole document
+                text = self.extract_text(**kwargs)  # Pass relevant kwargs
+                if not text or text.isspace():
+                    raise ValueError("PDF contains no extractable text for classification.")
+                return text
+            except Exception as e:
+                logger.error(f"Error extracting text for PDF classification: {e}")
+                raise ValueError("Failed to extract text for classification.") from e
+        elif model_type == "vision":
+            if len(self.pages) == 1:
+                # Use the single page's content method
+                try:
+                    return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
+                except Exception as e:
+                    logger.error(f"Error getting image from single page for classification: {e}")
+                    raise ValueError("Failed to get image from single page.") from e
+            elif len(self.pages) == 0:
+                raise ValueError("Cannot classify empty PDF using vision model.")
+            else:
+                raise ValueError(
+                    f"Vision classification for a PDF object is only supported for single-page PDFs. "
+                    f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
+                )
+        else:
+            raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
+    # --- End Classification Mixin Implementation ---

natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl