PyPI - natural-pdf - Versions diffs - 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (15) hide show

natural_pdf/__init__.py +24 -40
natural_pdf/core/page.py +17 -17
natural_pdf/core/pdf.py +130 -12
natural_pdf/elements/collections.py +229 -29
natural_pdf/elements/region.py +2 -3
natural_pdf/exporters/hocr.py +540 -0
natural_pdf/exporters/hocr_font.py +142 -0
natural_pdf/exporters/original_pdf.py +130 -0
natural_pdf/exporters/searchable_pdf.py +3 -3
natural_pdf/ocr/engine_surya.py +1 -1
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +15 -12
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -37,72 +37,56 @@ def configure_logging(level=logging.INFO, handler=None):
     logger.propagate = False
+# Version
+__version__ = "0.1.1"
+# Core imports
+from natural_pdf.collections.pdf_collection import PDFCollection
 from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
-# Import QA module if available
-try:
-    from natural_pdf.qa import DocumentQA, get_qa_engine
-    HAS_QA = True
-except ImportError:
-    HAS_QA = False
-__version__ = "0.1.1"
-__all__ = [
-    "PDF",
-    "PDFCollection",
-    "Page",
-    "Region",
-    "ElementCollection",
-    "TextSearchOptions",
-    "MultiModalSearchOptions",
-    "BaseSearchOptions",
-    "configure_logging",
-]
-if HAS_QA:
-    __all__.extend(["DocumentQA", "get_qa_engine"])
-from .collections.pdf_collection import PDFCollection
-# Core classes
-from .core.pdf import PDF
-from .elements.region import Region
+ElementCollection = None
 # Search options (if extras installed)
 try:
-    from .search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+    from natural_pdf.search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
 except ImportError:
     # Define dummy classes if extras not installed, so imports don't break
     # but using them will raise the ImportError from check_haystack_availability
-    class TextSearchOptions:
+    class BaseSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
-    class MultiModalSearchOptions:
+    class TextSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
-    class BaseSearchOptions:
+    class MultiModalSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
-# Expose logging setup? (Optional)
-# from . import logging_config
-# logging_config.setup_logging()
+# Import QA module if available
+try:
+    from natural_pdf.qa import DocumentQA, get_qa_engine
+    HAS_QA = True
+except ImportError:
+    HAS_QA = False
 # Explicitly define what gets imported with 'from natural_pdf import *'
 __all__ = [
     "PDF",
     "PDFCollection",
+    "Page",
     "Region",
-    "TextSearchOptions",  # Include search options
+    "ElementCollection",
+    "TextSearchOptions",
     "MultiModalSearchOptions",
     "BaseSearchOptions",
+    "configure_logging",
 ]
+# Add QA components to __all__ if available
+if HAS_QA:
+    __all__.extend(["DocumentQA", "get_qa_engine"])

natural_pdf/core/page.py CHANGED Viewed

@@ -40,10 +40,10 @@ if TYPE_CHECKING:
     from natural_pdf.elements.base import Element
     from natural_pdf.elements.collections import ElementCollection
-# New Imports
+# # New Imports
 import itertools
-# Deskew Imports (Conditional)
+# # Deskew Imports (Conditional)
 import numpy as np
 from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
 from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
 from natural_pdf.classification.manager import ClassificationManager  # For type hint
-# --- Classification Imports --- #
+# # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin  # Import classification mixin
 from natural_pdf.core.element_manager import ElementManager
 from natural_pdf.elements.base import Element  # Import base element
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
-# Import new utils
+# # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 from natural_pdf.widgets import InteractiveViewerWidget
 from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def add_exclusion(
         self,
-        exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
+        exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
         label: Optional[str] = None,
     ) -> "Page":
         """
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
+    def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
         """
         Add a region to the page.
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
+    def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
         """
         Add multiple regions to the page.
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
+    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
         """
         Get all exclusion regions for this page.
         Assumes self._exclusions contains tuples of (callable/Region, label).
@@ -1349,7 +1349,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._highlighter.clear_page(self.index)
         return self
-    def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
+    def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
         """
         Analyze text elements by style, adding attributes directly to elements.
@@ -1520,7 +1520,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
-    ) -> List[TextElement]:
+    ) -> List["TextElement"]:
         """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
         logger.warning(
             "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
@@ -1532,7 +1532,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def apply_ocr(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        options: Optional["OCROptions"] = None,
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
@@ -1597,12 +1597,12 @@ class Page(ClassificationMixin, ExtractionMixin):
     def extract_ocr_elements(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        options: Optional["OCROptions"] = None,
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
         resolution: Optional[int] = None,
-    ) -> List[TextElement]:
+    ) -> List["TextElement"]:
         """
         Extract text elements using OCR *without* adding them to the page's elements.
         Uses the shared OCRManager instance.
@@ -1716,7 +1716,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return (self._page.width, self._page.height)
     @property
-    def layout_analyzer(self) -> LayoutAnalyzer:
+    def layout_analyzer(self) -> "LayoutAnalyzer":
         """Get or create the layout analyzer for this page."""
         if self._layout_analyzer is None:
             if not self._layout_manager:
@@ -1728,7 +1728,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def analyze_layout(
         self,
         engine: Optional[str] = None,
-        options: Optional[LayoutOptions] = None,
+        options: Optional["LayoutOptions"] = None,
         confidence: Optional[float] = None,
         classes: Optional[List[str]] = None,
         exclude_classes: Optional[List[str]] = None,
@@ -1736,7 +1736,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         existing: str = "replace",
         model_name: Optional[str] = None,
         client: Optional[Any] = None,  # Add client parameter
-    ) -> ElementCollection[Region]:
+    ) -> "ElementCollection[Region]":
         """
         Analyze the page layout using the configured LayoutManager.
         Adds detected Region objects to the page's element manager.
@@ -1813,7 +1813,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def get_section_between(
         self, start_element=None, end_element=None, boundary_inclusion="both"
-    ) -> Optional[Region]:  # Return Optional
+    ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
         """

natural_pdf/core/pdf.py CHANGED Viewed

@@ -60,6 +60,14 @@ except ImportError:
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
+try:
+    from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+except ImportError:
+    create_searchable_pdf = None
+try:
+    from natural_pdf.exporters.original_pdf import create_original_pdf
+except ImportError:
+    create_original_pdf = None
 logger = logging.getLogger("natural_pdf.core.pdf")
 tqdm = get_tqdm()
@@ -260,7 +268,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         return self
     def add_exclusion(
-        self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
+        self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
     ) -> "PDF":
         """
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -468,7 +476,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         return self
     def add_region(
-        self, region_func: Callable[["Page"], Optional[Region]], name: str = None
+        self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
     ) -> "PDF":
         """
         Add a region function to the PDF.
@@ -769,23 +777,133 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
+        DEPRECATED: Use save_pdf(..., ocr=True) instead.
         Saves the PDF with an OCR text layer, making content searchable.
-        Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
+        Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
         Args:
             output_path: Path to save the searchable PDF
             dpi: Resolution for rendering and OCR overlay
             **kwargs: Additional keyword arguments passed to the exporter
-            output_path: Path to save the searchable PDF
-            dpi: Resolution for rendering and OCR overlay
-            **kwargs: Additional keyword arguments passed to the exporter
         """
-        from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+        logger.warning(
+            "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
+        )
+        if create_searchable_pdf is None:
+             raise ImportError(
+                 "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
+                 "Install with: pip install \"natural-pdf[ocr-export]\""
+             )
         output_path_str = str(output_path)
+        # Call the exporter directly, passing self (the PDF instance)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
-        logger.info(f"Searchable PDF saved to: {output_path_str}")
+        # Logger info is handled within the exporter now
+        # logger.info(f"Searchable PDF saved to: {output_path_str}")
+    def save_pdf(
+        self,
+        output_path: Union[str, Path],
+        ocr: bool = False,
+        original: bool = False,
+        dpi: int = 300,
+    ):
+        """
+        Saves the PDF object (all its pages) to a new file.
+        Choose one saving mode:
+        - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
+          Text generated during the natural-pdf session becomes searchable,
+          but original vector content is lost. Requires 'ocr-export' extras.
+        - `original=True`: Saves a copy of the original PDF file this object represents.
+          Any OCR results or analyses from the natural-pdf session are NOT included.
+          If the PDF was opened from an in-memory buffer, this mode may not be suitable.
+          Requires 'ocr-export' extras.
+        Args:
+            output_path: Path to save the new PDF file.
+            ocr: If True, save as a searchable, image-based PDF using OCR data.
+            original: If True, save the original source PDF content.
+            dpi: Resolution (dots per inch) used only when ocr=True.
+        Raises:
+            ValueError: If the PDF has no pages, if neither or both 'ocr'
+                        and 'original' are True.
+            ImportError: If required libraries are not installed for the chosen mode.
+            RuntimeError: If an unexpected error occurs during saving.
+        """
+        if not self.pages:
+            raise ValueError("Cannot save an empty PDF object.")
+        if not (ocr ^ original):  # XOR: exactly one must be true
+            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
+        output_path_obj = Path(output_path)
+        output_path_str = str(output_path_obj)
+        if ocr:
+            if create_searchable_pdf is None:
+                raise ImportError(
+                    "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
+                    "Install with: pip install \"natural-pdf[ocr-export]\""
+                )
+            # Optional: Add warning about vector data loss similar to PageCollection
+            has_vector_elements = False
+            for page in self.pages:
+                if (hasattr(page, 'rects') and page.rects or
+                    hasattr(page, 'lines') and page.lines or
+                    hasattr(page, 'curves') and page.curves or
+                    (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
+                    (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
+                    has_vector_elements = True
+                    break
+            if has_vector_elements:
+                 logger.warning(
+                     "Warning: Saving with ocr=True creates an image-based PDF. "
+                     "Original vector elements (rects, lines, non-OCR text/chars) "
+                     "will not be preserved in the output file."
+                 )
+            logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
+            try:
+                # Delegate to the searchable PDF exporter, passing self (PDF instance)
+                create_searchable_pdf(self, output_path_str, dpi=dpi)
+            except Exception as e:
+                 raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
+        elif original:
+            if create_original_pdf is None:
+                raise ImportError(
+                    "Saving with original=True requires 'pikepdf'. "
+                    "Install with: pip install \"natural-pdf[ocr-export]\""
+                )
+             # Optional: Add warning about losing OCR data similar to PageCollection
+            has_ocr_elements = False
+            for page in self.pages:
+                 if hasattr(page, 'find_all'):
+                     ocr_text_elements = page.find_all("text[source=ocr]")
+                     if ocr_text_elements:
+                         has_ocr_elements = True
+                         break
+                 elif hasattr(page, 'words'): # Fallback
+                     if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
+                          has_ocr_elements = True
+                          break
+            if has_ocr_elements:
+                logger.warning(
+                    "Warning: Saving with original=True preserves original page content. "
+                    "OCR text generated in this session will not be included in the saved file."
+                )
+            logger.info(f"Saving original PDF content to: {output_path_str}")
+            try:
+                 # Delegate to the original PDF exporter, passing self (PDF instance)
+                 create_original_pdf(self, output_path_str)
+            except Exception as e:
+                 # Re-raise exception from exporter
+                 raise e
     def ask(
         self,
@@ -850,9 +968,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     def search_within_index(
         self,
-        query: Union[str, Path, Image.Image, Region],
-        search_service: SearchServiceProtocol,
-        options: Optional[SearchOptions] = None,
+        query: Union[str, Path, Image.Image, "Region"],
+        search_service: "SearchServiceProtocol",
+        options: Optional["SearchOptions"] = None,
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant documents from this PDF within a search index.

natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl