PyPI - natural-pdf - Versions diffs - 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl - Mend

natural-pdf 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

natural_pdf/classification/manager.py +38 -13
natural_pdf/core/page.py +2 -1
natural_pdf/core/pdf.py +141 -32
natural_pdf/describe/__init__.py +21 -0
natural_pdf/describe/base.py +457 -0
natural_pdf/describe/elements.py +411 -0
natural_pdf/describe/mixin.py +84 -0
natural_pdf/describe/summary.py +186 -0
natural_pdf/elements/base.py +2 -1
natural_pdf/elements/collections.py +11 -1
natural_pdf/elements/region.py +4 -1
natural_pdf/exporters/__init__.py +12 -1
natural_pdf/exporters/hocr.py +9 -8
natural_pdf/exporters/original_pdf.py +31 -2
natural_pdf/ocr/engine_surya.py +1 -2
natural_pdf/ocr/ocr_manager.py +21 -4
natural_pdf/search/__init__.py +20 -3
natural_pdf/search/lancedb_search_service.py +13 -5
natural_pdf/search/numpy_search_service.py +13 -3
{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/METADATA +16 -16
{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/RECORD +24 -19
{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/top_level.txt +0 -0

natural_pdf/elements/collections.py CHANGED Viewed

@@ -30,6 +30,7 @@ from tqdm.auto import tqdm
 from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
 from natural_pdf.classification.manager import ClassificationManager
 from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
 from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.base import Element
@@ -71,7 +72,14 @@ P = TypeVar("P", bound="Page")
 class ElementCollection(
-    Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+    Generic[T],
+    ApplyMixin,
+    ExportMixin,
+    ClassificationMixin,
+    DirectionalCollectionMixin,
+    DescribeMixin,
+    InspectMixin,
+    MutableSequence,
 ):
     """
     Collection of PDF elements with batch operations.
@@ -1795,6 +1803,8 @@ class ElementCollection(
         )
 class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
     """
     Represents a collection of Page objects, often from a single PDF document.

natural_pdf/elements/region.py CHANGED Viewed

@@ -15,6 +15,7 @@ from natural_pdf.classification.manager import ClassificationManager  # Keep for
 # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.describe.mixin import DescribeMixin
 from natural_pdf.elements.base import DirectionalMixin
 from natural_pdf.elements.text import TextElement  # ADDED IMPORT
 from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
@@ -49,7 +50,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
-class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
+class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
     """
     Represents a rectangular region on a page.
     """
@@ -2962,3 +2963,5 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 )
         return text_element

natural_pdf/exporters/__init__.py CHANGED Viewed

@@ -1,4 +1,15 @@
 from .base import FinetuneExporter
-from .paddleocr import PaddleOCRRecognitionExporter
+# Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
+def _get_paddleocr_exporter():
+    """Lazy import for PaddleOCRRecognitionExporter."""
+    from .paddleocr import PaddleOCRRecognitionExporter
+    return PaddleOCRRecognitionExporter
+# Make PaddleOCRRecognitionExporter available through attribute access
+def __getattr__(name):
+    if name == "PaddleOCRRecognitionExporter":
+        return _get_paddleocr_exporter()
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]

natural_pdf/exporters/hocr.py CHANGED Viewed

@@ -16,6 +16,7 @@ from dataclasses import dataclass
 from itertools import pairwise
 from math import atan, pi
 from pathlib import Path
+from typing import Optional, Union
 from xml.etree import ElementTree
 from pikepdf import Matrix, Name, Rectangle
@@ -94,12 +95,12 @@ class HocrTransform:
     def __init__(
         self,
         *,
-        hocr_filename: str | Path,
+        hocr_filename: Union[str, Path],
         dpi: float,
         debug: bool = False,
         fontname: Name = Name("/f-0-0"),
         font: Font = GlyphlessFont(),
-        debug_render_options: DebugRenderOptions | None = None,
+        debug_render_options: Optional[DebugRenderOptions] = None,
     ):
         """Initialize the HocrTransform object."""
         if debug:
@@ -144,7 +145,7 @@ class HocrTransform:
         return text
     @classmethod
-    def element_coordinates(cls, element: Element) -> Rectangle | None:
+    def element_coordinates(cls, element: Element) -> Optional[Rectangle]:
         """Get coordinates of the bounding box around an element."""
         matches = cls.box_pattern.search(element.attrib.get("title", ""))
         if not matches:
@@ -172,7 +173,7 @@ class HocrTransform:
             return 0.0
         return float(matches.group(1))
-    def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
+    def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
         xpath = f".//{self.xmlns}{html_tag}"
         if html_class:
             xpath += f"[@class='{html_class}']"
@@ -187,7 +188,7 @@ class HocrTransform:
         self,
         *,
         out_filename: Path,
-        image_filename: Path | None = None,
+        image_filename: Optional[Path] = None,
         invisible_text: bool = True,
     ) -> None:
         """Creates a PDF file with an image superimposed on top of the text.
@@ -291,7 +292,7 @@ class HocrTransform:
     def _do_line(
         self,
         canvas: Canvas,
-        line: Element | None,
+        line: Optional[Element],
         elemclass: str,
         invisible_text: bool,
         text_direction: TextDirection,
@@ -387,8 +388,8 @@ class HocrTransform:
         line_matrix: Matrix,
         text: Text,
         fontsize: float,
-        elem: Element | None,
-        next_elem: Element | None,
+        elem: Optional[Element],
+        next_elem: Optional[Element],
         text_direction: TextDirection,
         inject_word_breaks: bool,
     ):

natural_pdf/exporters/original_pdf.py CHANGED Viewed

@@ -4,6 +4,8 @@ Module for exporting original PDF pages without modification.
 import logging
 import os
+import io
+import urllib.request
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Set, Union
@@ -69,8 +71,11 @@ def create_original_pdf(
     # Verify all pages come from the same PDF and get path
     first_page_pdf_path = None
+    first_page_pdf_obj = None
     if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
-        first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)
+        src_pdf = pages_to_extract[0].pdf
+        first_page_pdf_path = getattr(src_pdf, "path", None)
+        first_page_pdf_obj = src_pdf
     if not first_page_pdf_path:
         raise ValueError(
@@ -93,7 +98,28 @@ def create_original_pdf(
     )
     try:
-        with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc:
+        # Prefer opening via filesystem path when it exists locally
+        if first_page_pdf_path and os.path.exists(first_page_pdf_path):
+            source_handle = pikepdf.Pdf.open(first_page_pdf_path)
+        else:
+            # Fallback: attempt to open from in-memory bytes stored on PDF object
+            if first_page_pdf_obj is not None and hasattr(first_page_pdf_obj, "_original_bytes") and first_page_pdf_obj._original_bytes:
+                source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
+            else:
+                # Attempt to download bytes directly if path looks like URL
+                if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(("http://", "https://")):
+                    try:
+                        with urllib.request.urlopen(first_page_pdf_path) as resp:
+                            data = resp.read()
+                        source_handle = pikepdf.Pdf.open(io.BytesIO(data))
+                    except Exception as dl_err:
+                        raise FileNotFoundError(
+                            f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
+                        )
+                else:
+                    raise FileNotFoundError(f"Source PDF bytes not available for {first_page_pdf_path}")
+        with source_handle as source_pikepdf_doc:
             target_pikepdf_doc = pikepdf.Pdf.new()
             for page_index in sorted_indices:
@@ -113,6 +139,9 @@ def create_original_pdf(
                 f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
             )
+    except FileNotFoundError as e:
+        logger.error(str(e))
+        raise RuntimeError(f"Failed to save original pages PDF: {e}")
     except pikepdf.PasswordError:
         logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
         raise RuntimeError(

natural_pdf/ocr/engine_surya.py CHANGED Viewed

@@ -27,7 +27,6 @@ class SuryaOCREngine(OCREngine):
         if not self.is_available():
             raise ImportError("Surya OCR library is not installed or available.")
-        # Store languages for use in _process_single_image
         self._langs = languages
         from surya.detection import DetectionPredictor
@@ -63,7 +62,6 @@ class SuryaOCREngine(OCREngine):
         if not self._recognition_predictor or not self._detection_predictor:
             raise RuntimeError("Surya predictors are not initialized.")
-        # Store languages instance variable during initialization to use here
         langs = (
             [self._langs]  # Send all languages together in one list per image
             if hasattr(self, "_langs")
@@ -75,6 +73,7 @@ class SuryaOCREngine(OCREngine):
             results = self._detection_predictor(images=[image])
         else:
             results = self._recognition_predictor(
+                langs=langs,
                 images=[image],
                 det_predictor=self._detection_predictor,
             )

natural_pdf/ocr/ocr_manager.py CHANGED Viewed

@@ -11,7 +11,8 @@ from PIL import Image
 from .engine import OCREngine
 from .engine_doctr import DoctrOCREngine
 from .engine_easyocr import EasyOCREngine
-from .engine_paddle import PaddleOCREngine
+# Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
+# from .engine_paddle import PaddleOCREngine
 from .engine_surya import SuryaOCREngine
 from .ocr_options import (
     BaseOCROptions,
@@ -28,10 +29,16 @@ logger = logging.getLogger(__name__)
 class OCRManager:
     """Manages OCR engine selection, configuration, and execution."""
+    @staticmethod
+    def _get_paddle_engine_class():
+        """Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
+        from .engine_paddle import PaddleOCREngine
+        return PaddleOCREngine
     # Registry mapping engine names to classes and default options
     ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
         "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
-        "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
+        "paddle": {"class": lambda: OCRManager._get_paddle_engine_class(), "options_class": PaddleOCROptions},
         "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
         "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
         # Add other engines here
@@ -76,7 +83,12 @@ class OCRManager:
             logger.info(
                 f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
             )
-            engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
+            engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
+            # Handle lazy loading - if it's a lambda function, call it to get the actual class
+            if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
+                engine_class = engine_class_or_factory()
+            else:
+                engine_class = engine_class_or_factory
             start_time = time.monotonic()  # Optional: time initialization
             try:
                 engine_instance = engine_class()  # Instantiate first
@@ -277,7 +289,12 @@ class OCRManager:
         for name, registry_entry in self.ENGINE_REGISTRY.items():
             try:
                 # Temporarily instantiate to check availability without caching
-                engine_class = registry_entry["class"]
+                engine_class_or_factory = registry_entry["class"]
+                # Handle lazy loading - if it's a lambda function, call it to get the actual class
+                if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
+                    engine_class = engine_class_or_factory()
+                else:
+                    engine_class = engine_class_or_factory
                 if engine_class().is_available():
                     available.append(name)
             except Exception as e:

natural_pdf/search/__init__.py CHANGED Viewed

@@ -18,7 +18,8 @@ SEARCH_DEPENDENCIES_AVAILABLE = False
 try:
     import numpy as np
-    import sentence_transformers
+    # Lazy import for sentence_transformers to avoid heavy loading at module level
+    # import sentence_transformers
     # Basic search dependencies are available
     SEARCH_DEPENDENCIES_AVAILABLE = True
@@ -46,12 +47,28 @@ except ImportError:
 logger = logging.getLogger(__name__)
+def _check_sentence_transformers():
+    """Lazy check for sentence_transformers availability."""
+    try:
+        import sentence_transformers
+        return True
+    except ImportError:
+        return False
 def check_search_availability():
     """Check if required search dependencies are available."""
     if not SEARCH_DEPENDENCIES_AVAILABLE:
         raise ImportError(
-            "Search functionality requires 'sentence-transformers' and NumPy. "
-            "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
+            "Search functionality requires 'lancedb' and pyarrow. "
+            "Install with: pip install natural-pdf[search] (or pip install lancedb pyarrow)"
+        )
+    # Lazy check for sentence_transformers when actually needed
+    if not _check_sentence_transformers():
+        raise ImportError(
+            "Search functionality requires 'sentence-transformers'. "
+            "Install with: pip install sentence-transformers"
         )

natural_pdf/search/lancedb_search_service.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import logging
+import os
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
 import lancedb
 import pyarrow as pa
-from sentence_transformers import SentenceTransformer
+# Lazy import for SentenceTransformer to avoid heavy loading at module level
+# from sentence_transformers import SentenceTransformer
 from .search_options import BaseSearchOptions
 from .search_service_protocol import (
@@ -17,8 +19,14 @@ from .search_service_protocol import (
 logger = logging.getLogger(__name__)
-DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-DEFAULT_LANCEDB_PERSIST_PATH = "./natural_pdf_lancedb_index"
+DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+DEFAULT_LANCEDB_PERSIST_PATH = "./lancedb_data"
+def _get_sentence_transformer(model_name: str):
+    """Lazy import and instantiation of SentenceTransformer."""
+    from sentence_transformers import SentenceTransformer
+    return SentenceTransformer(model_name)
 class LanceDBSearchService(SearchServiceProtocol):
@@ -41,7 +49,7 @@ class LanceDBSearchService(SearchServiceProtocol):
         self._db = None
         self._table = None
-        self.embedding_model = SentenceTransformer(self._embedding_model_name)
+        self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
         test_embedding = self.embedding_model.encode("test")
         self._embedding_dims = len(test_embedding)

natural_pdf/search/numpy_search_service.py CHANGED Viewed

@@ -1,21 +1,31 @@
 import json
 import logging
+import os
+import tempfile
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Union
 import numpy as np
-from sentence_transformers import SentenceTransformer
+# Lazy import for SentenceTransformer to avoid heavy loading at module level
+# from sentence_transformers import SentenceTransformer
 from .search_options import BaseSearchOptions
 from .search_service_protocol import (
     Indexable,
     IndexConfigurationError,
+    SearchResult,
     SearchServiceProtocol,
 )
 logger = logging.getLogger(__name__)
-DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+def _get_sentence_transformer(model_name: str):
+    """Lazy import and instantiation of SentenceTransformer."""
+    from sentence_transformers import SentenceTransformer
+    return SentenceTransformer(model_name)
 class NumpySearchService(SearchServiceProtocol):
@@ -38,7 +48,7 @@ class NumpySearchService(SearchServiceProtocol):
         self.collection_name = collection_name
         self._embedding_model_name = embedding_model_name
-        self.embedding_model = SentenceTransformer(self._embedding_model_name)
+        self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
         self._embedding_dims = len(self.embedding_model.encode("test"))
         # Simple in-memory storage

{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.16
+Version: 0.1.18
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: pandas
 Requires-Dist: pdfplumber
 Requires-Dist: colormath2
 Requires-Dist: pillow
@@ -20,14 +21,15 @@ Requires-Dist: urllib3
 Requires-Dist: tqdm
 Requires-Dist: pydantic
 Requires-Dist: jenkspy
-Requires-Dist: pikepdf>=9.7.0
+Requires-Dist: pikepdf
 Requires-Dist: scipy
 Requires-Dist: torch
 Requires-Dist: torchvision
-Requires-Dist: transformers[sentencepiece]<=4.34.1
+Requires-Dist: transformers[sentencepiece]
 Requires-Dist: huggingface_hub>=0.29.3
 Requires-Dist: sentence-transformers
 Requires-Dist: timm
+Requires-Dist: ipywidgets>=7.0.0
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
 Requires-Dist: pytest-xdist; extra == "test"
@@ -39,7 +41,6 @@ Provides-Extra: favorites
 Requires-Dist: natural-pdf[deskew]; extra == "favorites"
 Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
 Requires-Dist: natural-pdf[search]; extra == "favorites"
-Requires-Dist: ipywidgets; extra == "favorites"
 Requires-Dist: surya-ocr; extra == "favorites"
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
@@ -61,23 +62,22 @@ Requires-Dist: setuptools; extra == "dev"
 Provides-Extra: deskew
 Requires-Dist: deskew>=1.5; extra == "deskew"
 Requires-Dist: img2pdf; extra == "deskew"
-Provides-Extra: addons
-Requires-Dist: surya-ocr; extra == "addons"
-Requires-Dist: doclayout_yolo; extra == "addons"
-Requires-Dist: paddlepaddle>=3.0.0; extra == "addons"
-Requires-Dist: paddleocr>=3.0.0; extra == "addons"
-Requires-Dist: ipywidgets>=7.0.0; extra == "addons"
-Requires-Dist: easyocr; extra == "addons"
-Requires-Dist: surya-ocr; extra == "addons"
-Requires-Dist: doclayout_yolo; extra == "addons"
-Requires-Dist: python-doctr[torch]; extra == "addons"
-Requires-Dist: docling; extra == "addons"
 Provides-Extra: all
 Requires-Dist: natural-pdf[ocr-export]; extra == "all"
 Requires-Dist: natural-pdf[deskew]; extra == "all"
 Requires-Dist: natural-pdf[test]; extra == "all"
 Requires-Dist: natural-pdf[search]; extra == "all"
-Requires-Dist: natural-pdf[addons]; extra == "all"
+Requires-Dist: natural-pdf[extras]; extra == "all"
+Requires-Dist: natural-pdf[favorites]; extra == "all"
+Provides-Extra: paddle
+Requires-Dist: paddlepaddle>=3.0.0; extra == "paddle"
+Requires-Dist: paddleocr>=3.0.1; extra == "paddle"
+Requires-Dist: paddlex>=3.0.1; extra == "paddle"
+Provides-Extra: extras
+Requires-Dist: surya-ocr; extra == "extras"
+Requires-Dist: doclayout_yolo; extra == "extras"
+Requires-Dist: easyocr; extra == "extras"
+Requires-Dist: natural-pdf[paddle]; extra == "extras"
 Provides-Extra: ocr-export
 Requires-Dist: pikepdf; extra == "ocr-export"
 Provides-Extra: export-extras

{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/RECORD RENAMED Viewed

@@ -17,7 +17,7 @@ natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuK
 natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
 natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
 natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
-natural_pdf/classification/manager.py,sha256=-rdZzGP_JK4RDDxIEgdY8_gHRNS0cNHhpOSodjxbd84,17853
+natural_pdf/classification/manager.py,sha256=pzuTP-34W9N3im1ZFhCfQpOu37VSHEx4JHoHNxyy6o0,18894
 natural_pdf/classification/mixin.py,sha256=_XtoqCMqj1nxZYskIV2RbVYiVVcEWzFwae4s5vpzC74,6566
 natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
 natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
@@ -25,21 +25,26 @@ natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
 natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
-natural_pdf/core/page.py,sha256=ciwBf-SoI431SJjp2VRfLxdtqgO2L6p044kXXjlNtjo,118231
-natural_pdf/core/pdf.py,sha256=bAoGPiKIrFaebLwULMT-9VkHQ_wkE_zNl4hlbMLk-2w,69325
+natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
+natural_pdf/core/pdf.py,sha256=yBvb1iGw9gwVPJ3Rm1EBaZ8_g60TuW_Elhg2EOcJMzc,73871
+natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
+natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
+natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
+natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
+natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
-natural_pdf/elements/base.py,sha256=tEyCInUc6wxbUtnXVaBa21Qpr591Sgu4yi7tKxWb-3U,39607
-natural_pdf/elements/collections.py,sha256=_lWL-W-RKlYikkGJU66dskGCZ8-7WfMyUx2G0IgjhlQ,121965
+natural_pdf/elements/base.py,sha256=IlAeyzV66xMrxVx9U3ocGPekzGUBJgKkAiJ5kpvCSAg,39675
+natural_pdf/elements/collections.py,sha256=vgVZsVC3xxRF2S5KW7L0JKa-NSUFnqURk50NtvlwbcM,122113
 natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
-natural_pdf/elements/region.py,sha256=nCXyI0vq9-MIQ4Zk90q5Nn-U6gDGv22NY6ime6qG1MY,123330
+natural_pdf/elements/region.py,sha256=hBklYKcXJWyxayu9todYQOZ-d9KVDtqeV-CIt9IcSn8,123400
 natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
-natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
+natural_pdf/exporters/__init__.py,sha256=XG0ckcKHgG7IVma75syORUme6wEItUvDA46aCZzGqrU,639
 natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
-natural_pdf/exporters/hocr.py,sha256=MOb5sTxe-GlMSOtmqp3p4SY_ZigwOtmd4sj_zMRCIQY,19907
+natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
 natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
-natural_pdf/exporters/original_pdf.py,sha256=zsZPg_lUoEerKIzzoEw-qGdM5XBg_LZhFJeVKnCUp4o,5054
+natural_pdf/exporters/original_pdf.py,sha256=dtvC4er6TWOfqq-n24Pejw3mlAuPd8IVyihggJtcf0s,6634
 natural_pdf/exporters/paddleocr.py,sha256=IAG2p9YeImYcsIvb6a_L5mMrKarvaMaDvRrvdlY6bX4,19489
 natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
 natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -58,16 +63,16 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
 natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
 natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
 natural_pdf/ocr/engine_paddle.py,sha256=ZUtyjso_UjjAPnJt5ac-AtOpR6PfOhO76iOyjngGzr0,16198
-natural_pdf/ocr/engine_surya.py,sha256=Qc3geQQzJ1-9WS1aho38jfvd7yxbYOUVeIpzpapHLRg,5159
+natural_pdf/ocr/engine_surya.py,sha256=PNjvpsHnBghAoa-df52HEyvXzfNI-gTFgKvs2LxHgKo,5051
 natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
-natural_pdf/ocr/ocr_manager.py,sha256=O-wSx50k9pcf0M8N_5nKVefS55r6tMJWRF8KjktA8ts,13664
+natural_pdf/ocr/ocr_manager.py,sha256=M1GRAThzWl5iMkQJ41j84G6cJ7XruQD_HoPPzWf7nUk,14742
 natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
 natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
 natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
 natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
-natural_pdf/search/__init__.py,sha256=RHP1E-5m3hhLXz__g7EvZihBJjPTDtUYh_bZr_NwDo0,3724
-natural_pdf/search/lancedb_search_service.py,sha256=kgm-nYXjPQBkEkWE0gkdpL4V53xm_CEX4rZ5KBpxgfM,14190
-natural_pdf/search/numpy_search_service.py,sha256=5zkkZds-Dcp8PsrvTJdyW15fS1ffHDLVjeiXTGWoRsY,10006
+natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
+natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
+natural_pdf/search/numpy_search_service.py,sha256=MoPBlyHTDqah1IrwBzyglEyiXlF4wqaU_5mml_ngvGc,10328
 natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
 natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
 natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
@@ -85,8 +90,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
-natural_pdf-0.1.16.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-0.1.16.dist-info/METADATA,sha256=ncvnNI_PubS4q4v29OKp5UXyanEZNVWqsCanu-xGCOA,6753
-natural_pdf-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.1.16.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
-natural_pdf-0.1.16.dist-info/RECORD,,
+natural_pdf-0.1.18.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.18.dist-info/METADATA,sha256=aU8IC02yZuy1aUrHhtDCHEp5igjwaUGP1NDnFDsOTL8,6684
+natural_pdf-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.18.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.18.dist-info/RECORD,,

{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

natural-pdf 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl