natural-pdf 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/classification/manager.py +38 -13
- natural_pdf/core/pdf.py +141 -32
- natural_pdf/exporters/__init__.py +12 -1
- natural_pdf/exporters/hocr.py +9 -8
- natural_pdf/exporters/original_pdf.py +31 -2
- natural_pdf/ocr/engine_surya.py +1 -2
- natural_pdf/ocr/ocr_manager.py +21 -4
- natural_pdf/search/__init__.py +20 -3
- natural_pdf/search/lancedb_search_service.py +13 -5
- natural_pdf/search/numpy_search_service.py +13 -3
- {natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/METADATA +16 -16
- {natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/RECORD +15 -15
- {natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/top_level.txt +0 -0
natural_pdf/classification/manager.py
CHANGED
@@ -5,25 +5,41 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from PIL import Image
 
+# Lazy imports for heavy dependencies to avoid loading at module level
 # Use try-except for robustness if dependencies are missing
-
+_CLASSIFICATION_AVAILABLE = None
+
+def _check_classification_dependencies():
+    """Lazy check for classification dependencies."""
+    global _CLASSIFICATION_AVAILABLE
+    if _CLASSIFICATION_AVAILABLE is None:
+        try:
+            import torch
+            import transformers
+            _CLASSIFICATION_AVAILABLE = True
+        except ImportError:
+            _CLASSIFICATION_AVAILABLE = False
+    return _CLASSIFICATION_AVAILABLE
+
+def _get_torch():
+    """Lazy import for torch."""
     import torch
+    return torch
+
+def _get_transformers_components():
+    """Lazy import for transformers components."""
     from transformers import (
         AutoModelForSequenceClassification,
         AutoModelForZeroShotImageClassification,
         AutoTokenizer,
         pipeline,
     )
-
-
-
-
-
-
-    AutoTokenizer = object
-    AutoModelForZeroShotImageClassification = object
-    AutoModelForSequenceClassification = object
-    torch = None
+    return {
+        'AutoModelForSequenceClassification': AutoModelForSequenceClassification,
+        'AutoModelForZeroShotImageClassification': AutoModelForZeroShotImageClassification,
+        'AutoTokenizer': AutoTokenizer,
+        'pipeline': pipeline,
+    }
 
 from tqdm.auto import tqdm
 
@@ -41,6 +57,11 @@ _PIPELINE_CACHE: Dict[str, "Pipeline"] = {}
 _TOKENIZER_CACHE: Dict[str, Any] = {}
 _MODEL_CACHE: Dict[str, Any] = {}
 
+# Export the availability check function for external use
+def is_classification_available() -> bool:
+    """Check if classification dependencies are available."""
+    return _check_classification_dependencies()
+
 
 class ClassificationError(Exception):
     """Custom exception for classification errors."""
@@ -66,7 +87,7 @@ class ClassificationManager:
         model_mapping: Optional dictionary mapping aliases ('text', 'vision') to model IDs.
         default_device: Default device ('cpu', 'cuda') if not specified in classify calls.
         """
-        if not
+        if not _check_classification_dependencies():
             raise ImportError(
                 "Classification dependencies missing. "
                 'Install with: pip install "natural-pdf[core-ml]"'
@@ -81,7 +102,7 @@ class ClassificationManager:
 
     def is_available(self) -> bool:
         """Check if required dependencies are installed."""
-        return
+        return _check_classification_dependencies()
 
     def _get_pipeline(self, model_id: str, using: str) -> "Pipeline":
        """Get or create a classification pipeline."""
@@ -92,6 +113,10 @@ class ClassificationManager:
         )
         start_time = time.time()
         try:
+            # Lazy import transformers components
+            transformers_components = _get_transformers_components()
+            pipeline = transformers_components['pipeline']
+
             task = (
                 "zero-shot-classification"
                 if using == "text"
natural_pdf/core/pdf.py
CHANGED
@@ -25,9 +25,10 @@ from typing import (
 import pdfplumber
 from PIL import Image
 from tqdm.auto import tqdm
+import weakref
 
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
-from natural_pdf.classification.manager import ClassificationError
+from natural_pdf.classification.manager import ClassificationError
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
@@ -72,8 +73,13 @@ except ImportError:
 
 logger = logging.getLogger("natural_pdf.core.pdf")
 
+def _get_classification_manager_class():
+    """Lazy import for ClassificationManager."""
+    from natural_pdf.classification.manager import ClassificationManager
+    return ClassificationManager
+
 DEFAULT_MANAGERS = {
-    "classification":
+    "classification": _get_classification_manager_class,
     "structured_data": StructuredDataManager,
 }
 
@@ -91,6 +97,62 @@ except ImportError:
     img2pdf = None
 # End Deskew Imports
 
+# --- Lazy Page List Helper --- #
+from collections.abc import Sequence
+
+class _LazyPageList(Sequence):
+    """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
+
+    The sequence holds `None` placeholders until an index is accessed, at which point
+    a real `Page` object is created, cached, and returned. Slices and iteration are
+    also supported and will materialise pages on demand.
+    """
+
+    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
+        self._parent_pdf = parent_pdf
+        self._plumber_pdf = plumber_pdf
+        self._font_attrs = font_attrs
+        # One slot per pdfplumber page – initially all None
+        self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
+
+    # Internal helper -----------------------------------------------------
+    def _create_page(self, index: int) -> "Page":
+        cached = self._cache[index]
+        if cached is None:
+            # Import here to avoid circular import problems
+            from natural_pdf.core.page import Page
+
+            plumber_page = self._plumber_pdf.pages[index]
+            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
+            self._cache[index] = cached
+        return cached
+
+    # Sequence protocol ---------------------------------------------------
+    def __len__(self) -> int:
+        return len(self._cache)
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            # Materialise pages for slice lazily as well
+            indices = range(*key.indices(len(self)))
+            return [self._create_page(i) for i in indices]
+        elif isinstance(key, int):
+            if key < 0:
+                key += len(self)
+            if key < 0 or key >= len(self):
+                raise IndexError("Page index out of range")
+            return self._create_page(key)
+        else:
+            raise TypeError("Page indices must be integers or slices")
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self._create_page(i)
+
+    def __repr__(self) -> str:  # pragma: no cover
+        return f"<_LazyPageList(len={len(self)})>"
+
+# --- End Lazy Page List Helper --- #
 
 class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
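Because `_LazyPageList` implements the `Sequence` ABC, code that treats `pdf.pages` as a list keeps working unchanged; only the touched indices pay the `Page` construction cost. A hedged usage sketch (file name illustrative, assuming the top-level `PDF` export):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")      # no Page objects built yet: the cache is all None
page_one = pdf.pages[0]      # materialises exactly one Page
tail = pdf.pages[-2:]        # slicing materialises just the sliced indices
print(len(pdf.pages))        # length is known up front from pdfplumber
```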
@@ -129,6 +191,15 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             self.source_path = "<stream>"  # Identifier for source
             self.path = self.source_path  # Use source identifier as path for streams
             stream_to_open = path_or_url_or_stream
+            try:
+                if hasattr(path_or_url_or_stream, "read"):
+                    # If caller provided an in-memory binary stream, capture bytes for potential re-export
+                    current_pos = path_or_url_or_stream.tell()
+                    path_or_url_or_stream.seek(0)
+                    self._original_bytes = path_or_url_or_stream.read()
+                    path_or_url_or_stream.seek(current_pos)
+            except Exception:
+                pass
         elif isinstance(path_or_url_or_stream, (str, Path)):
             path_or_url = str(path_or_url_or_stream)
             self.source_path = path_or_url  # Store original path/URL as source
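Capturing the caller's stream for later re-export has to leave the stream exactly where the caller left it, hence the tell/seek/read/seek dance. The same idea in isolation:

```python
import io

def snapshot(stream) -> bytes:
    """Copy all bytes out of a seekable stream without disturbing its position."""
    pos = stream.tell()
    stream.seek(0)
    data = stream.read()
    stream.seek(pos)
    return data

buf = io.BytesIO(b"%PDF-1.7 hello")
buf.read(4)                      # caller already consumed some bytes
assert snapshot(buf).startswith(b"%PDF")
assert buf.tell() == 4           # caller's position is untouched
```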
@@ -137,21 +208,15 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             if is_url:
                 logger.info(f"Downloading PDF from URL: {path_or_url}")
                 try:
-
-
-
-
-
-
-
-
-                    stream_to_open = self._resolved_path
+                    with urllib.request.urlopen(path_or_url) as response:
+                        data = response.read()
+                    # Load directly into an in-memory buffer — no temp file needed
+                    buffer = io.BytesIO(data)
+                    buffer.seek(0)
+                    self._temp_file = None  # No on-disk temp file
+                    self._resolved_path = path_or_url  # For repr / get_id purposes
+                    stream_to_open = buffer  # pdfplumber accepts file-like objects
                 except Exception as e:
-                    if self._temp_file and hasattr(self._temp_file, "name"):
-                        try:
-                            os.unlink(self._temp_file.name)
-                        except:  # noqa E722
-                            pass
                     logger.error(f"Failed to download PDF from URL: {e}")
                     raise ValueError(f"Failed to download PDF from URL: {e}")
         else:
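The rewritten download path never touches the filesystem: the response body goes into an `io.BytesIO`, which pdfplumber accepts in place of a path. A standalone sketch (URL illustrative):

```python
import io
import urllib.request

import pdfplumber

url = "https://example.com/sample.pdf"   # illustrative URL
with urllib.request.urlopen(url) as response:
    buffer = io.BytesIO(response.read())

with pdfplumber.open(buffer) as pdf:     # file-like object, no temp file on disk
    print(len(pdf.pages))
```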
@@ -187,12 +252,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         # self._classification_manager_instance = ClassificationManager() # Removed this line
         self._manager_registry = {}
 
-
-
-        self._pages = [
-            Page(p, parent=self, index=i, font_attrs=font_attrs)
-            for i, p in enumerate(self._pdf.pages)
-        ]
+        # Lazily instantiate pages only when accessed
+        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
 
         self._element_cache = {}
         self._exclusions = []
@@ -204,15 +265,45 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._initialize_highlighter()
         self.analyses: Dict[str, Any] = {}
 
+        # --- Automatic cleanup when object is garbage-collected ---
+        self._finalizer = weakref.finalize(
+            self,
+            PDF._finalize_cleanup,
+            self._pdf,
+            getattr(self, "_temp_file", None),
+            getattr(self, "_is_stream", False),
+        )
+
     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
         self._managers = {}
-        for key,
+        for key, manager_class_or_factory in DEFAULT_MANAGERS.items():
             try:
-
-
+                # Resolve the entry in DEFAULT_MANAGERS which can be:
+                # 1. A class -> instantiate directly
+                # 2. A factory (callable) returning a class -> call then instantiate
+                # 3. A factory returning a **ready instance** -> use as-is
+
+                resolved = manager_class_or_factory
+
+                # If we have a callable that is *not* a class, call it to obtain the real target
+                # (This is the lazy-import factory case.)
+                if not isinstance(resolved, type) and callable(resolved):
+                    resolved = resolved()
+
+                # At this point `resolved` is either a class or an already-created instance
+                if isinstance(resolved, type):
+                    instance = resolved()  # Instantiate class
+                    self._managers[key] = instance
+                    logger.debug(f"Initialized manager for key '{key}': {resolved.__name__}")
+                else:
+                    # Assume factory already returned an instance
+                    self._managers[key] = resolved
+                    logger.debug(
+                        f"Initialized manager instance for key '{key}': {type(resolved).__name__} (factory-provided instance)"
+                    )
             except Exception as e:
-                logger.error(f"Failed to initialize manager {
+                logger.error(f"Failed to initialize manager for key '{key}': {e}")
                 self._managers[key] = None
 
     def get_manager(self, key: str) -> Any:
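The registry resolution accepts three shapes: a class, a factory returning a class, and a factory returning a ready instance. The dispatch reduces to a few lines (standalone sketch, names illustrative):

```python
from typing import Any, Callable, Union

def resolve(entry: Union[type, Callable[[], Any], Any]) -> Any:
    """Class -> instantiate; factory -> call it, then instantiate if it gave a class."""
    if not isinstance(entry, type) and callable(entry):
        entry = entry()                     # lazy-import factory case
    return entry() if isinstance(entry, type) else entry
```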
@@ -1220,6 +1311,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         except Exception as e:
             logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
 
+        # Cancels the weakref finalizer so we don't double-clean
+        if hasattr(self, "_finalizer") and self._finalizer.alive:
+            self._finalizer()
+
     def __enter__(self):
         """Context manager entry."""
         return self
@@ -1404,12 +1499,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
 
         if not manager or not manager.is_available():
-
-
-
-            if not _CLASSIFICATION_AVAILABLE:
-                raise ImportError("Classification dependencies missing.")
-            except ImportError:
+            from natural_pdf.classification.manager import is_classification_available
+
+            if not is_classification_available():
                 raise ImportError(
                     "Classification dependencies missing. "
                     'Install with: pip install "natural-pdf[core-ml]"'
@@ -1723,3 +1815,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
 
     # --- End Classification Mixin Implementation ---
+
+    # Static helper for weakref.finalize to avoid capturing 'self'
+    @staticmethod
+    def _finalize_cleanup(plumber_pdf, temp_file_obj, is_stream):
+        try:
+            if plumber_pdf is not None:
+                plumber_pdf.close()
+        except Exception:
+            pass
+
+        if temp_file_obj and not is_stream:
+            try:
+                path = temp_file_obj.name if hasattr(temp_file_obj, "name") else None
+                if path and os.path.exists(path):
+                    os.unlink(path)
+            except Exception:
+                pass
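Two details make this finalizer safe: `weakref.finalize` runs at most once whether triggered by garbage collection or by an explicit close, and the callback is a `@staticmethod` taking the raw resources, so it holds no reference back to `self` (a bound method would keep the object alive and the finalizer would never fire). The pattern in isolation (illustrative class, not the natural-pdf API):

```python
import weakref

class Holder:
    def __init__(self, handle):
        self._handle = handle
        # Pass the handle, not self: capturing self would pin the object alive.
        self._finalizer = weakref.finalize(self, Holder._cleanup, handle)

    @staticmethod
    def _cleanup(handle):
        handle.close()

    def close(self):
        if self._finalizer.alive:   # .alive is False once it has run
            self._finalizer()       # calling it runs cleanup exactly once
```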
natural_pdf/exporters/__init__.py
CHANGED
@@ -1,4 +1,15 @@
 from .base import FinetuneExporter
-
+
+# Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
+def _get_paddleocr_exporter():
+    """Lazy import for PaddleOCRRecognitionExporter."""
+    from .paddleocr import PaddleOCRRecognitionExporter
+    return PaddleOCRRecognitionExporter
+
+# Make PaddleOCRRecognitionExporter available through attribute access
+def __getattr__(name):
+    if name == "PaddleOCRRecognitionExporter":
+        return _get_paddleocr_exporter()
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
 __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
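This relies on the module-level `__getattr__` hook from PEP 562 (Python 3.7+): the name stays listed in `__all__`, but the paddle-backed import only happens when the attribute is first accessed. A minimal sketch of the hook on its own (module and class names illustrative):

```python
# mypackage/__init__.py
def __getattr__(name):
    # Invoked only when normal module attribute lookup fails
    if name == "HeavyThing":
        from ._heavy import HeavyThing   # deferred, expensive import
        return HeavyThing
    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
```

With this in place, `import mypackage` stays cheap; the cost is paid on the first `mypackage.HeavyThing`.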
natural_pdf/exporters/hocr.py
CHANGED
@@ -16,6 +16,7 @@ from dataclasses import dataclass
 from itertools import pairwise
 from math import atan, pi
 from pathlib import Path
+from typing import Optional, Union
 from xml.etree import ElementTree
 
 from pikepdf import Matrix, Name, Rectangle
@@ -94,12 +95,12 @@ class HocrTransform:
     def __init__(
         self,
         *,
-        hocr_filename: str
+        hocr_filename: Union[str, Path],
         dpi: float,
         debug: bool = False,
         fontname: Name = Name("/f-0-0"),
         font: Font = GlyphlessFont(),
-        debug_render_options: DebugRenderOptions
+        debug_render_options: Optional[DebugRenderOptions] = None,
     ):
         """Initialize the HocrTransform object."""
         if debug:
@@ -144,7 +145,7 @@ class HocrTransform:
         return text
 
     @classmethod
-    def element_coordinates(cls, element: Element) -> Rectangle
+    def element_coordinates(cls, element: Element) -> Optional[Rectangle]:
         """Get coordinates of the bounding box around an element."""
         matches = cls.box_pattern.search(element.attrib.get("title", ""))
         if not matches:
@@ -172,7 +173,7 @@ class HocrTransform:
             return 0.0
         return float(matches.group(1))
 
-    def _child_xpath(self, html_tag: str, html_class: str
+    def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
         xpath = f".//{self.xmlns}{html_tag}"
         if html_class:
             xpath += f"[@class='{html_class}']"
@@ -187,7 +188,7 @@ class HocrTransform:
         self,
         *,
         out_filename: Path,
-        image_filename: Path
+        image_filename: Optional[Path] = None,
         invisible_text: bool = True,
     ) -> None:
         """Creates a PDF file with an image superimposed on top of the text.
@@ -291,7 +292,7 @@ class HocrTransform:
     def _do_line(
         self,
         canvas: Canvas,
-        line: Element
+        line: Optional[Element],
         elemclass: str,
         invisible_text: bool,
         text_direction: TextDirection,
@@ -387,8 +388,8 @@ class HocrTransform:
         line_matrix: Matrix,
         text: Text,
         fontsize: float,
-        elem: Element
-        next_elem: Element
+        elem: Optional[Element],
+        next_elem: Optional[Element],
         text_direction: TextDirection,
         inject_word_breaks: bool,
     ):
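All of the hocr.py edits are the same typing repair: parameters that default to `None` (or return values that may be `None`) now say so with `Optional[...]`. The before/after in miniature (illustrative function, not the hocr.py API):

```python
from pathlib import Path
from typing import Optional

# Before: the annotation claims Path even though callers may omit the argument
#   def render(out: Path, image: Path = None) -> None: ...   # flagged by strict checkers

# After: the annotation admits the None default
def render(out: Path, image: Optional[Path] = None) -> None:
    if image is not None:
        print(f"compositing {image} over {out}")
```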
natural_pdf/exporters/original_pdf.py
CHANGED
@@ -4,6 +4,8 @@ Module for exporting original PDF pages without modification.
 
 import logging
 import os
+import io
+import urllib.request
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Set, Union
 
@@ -69,8 +71,11 @@ def create_original_pdf(
 
     # Verify all pages come from the same PDF and get path
     first_page_pdf_path = None
+    first_page_pdf_obj = None
     if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
-
+        src_pdf = pages_to_extract[0].pdf
+        first_page_pdf_path = getattr(src_pdf, "path", None)
+        first_page_pdf_obj = src_pdf
 
     if not first_page_pdf_path:
         raise ValueError(
@@ -93,7 +98,28 @@ def create_original_pdf(
     )
 
     try:
-
+        # Prefer opening via filesystem path when it exists locally
+        if first_page_pdf_path and os.path.exists(first_page_pdf_path):
+            source_handle = pikepdf.Pdf.open(first_page_pdf_path)
+        else:
+            # Fallback: attempt to open from in-memory bytes stored on PDF object
+            if first_page_pdf_obj is not None and hasattr(first_page_pdf_obj, "_original_bytes") and first_page_pdf_obj._original_bytes:
+                source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
+            else:
+                # Attempt to download bytes directly if path looks like URL
+                if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(("http://", "https://")):
+                    try:
+                        with urllib.request.urlopen(first_page_pdf_path) as resp:
+                            data = resp.read()
+                        source_handle = pikepdf.Pdf.open(io.BytesIO(data))
+                    except Exception as dl_err:
+                        raise FileNotFoundError(
+                            f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
+                        )
+                else:
+                    raise FileNotFoundError(f"Source PDF bytes not available for {first_page_pdf_path}")
+
+        with source_handle as source_pikepdf_doc:
             target_pikepdf_doc = pikepdf.Pdf.new()
 
             for page_index in sorted_indices:
@@ -113,6 +139,9 @@ def create_original_pdf(
             f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
         )
 
+    except FileNotFoundError as e:
+        logger.error(str(e))
+        raise RuntimeError(f"Failed to save original pages PDF: {e}")
     except pikepdf.PasswordError:
         logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
         raise RuntimeError(
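The fallback chain hinges on `pikepdf.Pdf.open()` accepting a file-like object, so bytes cached at load time (`_original_bytes`) or re-downloaded from a URL can substitute for a missing local file. The in-memory branch in isolation (file names illustrative):

```python
import io

import pikepdf

with open("sample.pdf", "rb") as f:
    pdf_bytes = f.read()                      # stand-in for PDF._original_bytes

with pikepdf.Pdf.open(io.BytesIO(pdf_bytes)) as src:
    out = pikepdf.Pdf.new()
    out.pages.append(src.pages[0])            # copy only the first page
    out.save("first_page.pdf")
```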
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -27,7 +27,6 @@ class SuryaOCREngine(OCREngine):
         if not self.is_available():
             raise ImportError("Surya OCR library is not installed or available.")
 
-        # Store languages for use in _process_single_image
         self._langs = languages
 
         from surya.detection import DetectionPredictor
@@ -63,7 +62,6 @@ class SuryaOCREngine(OCREngine):
         if not self._recognition_predictor or not self._detection_predictor:
             raise RuntimeError("Surya predictors are not initialized.")
 
-        # Store languages instance variable during initialization to use here
         langs = (
             [self._langs]  # Send all languages together in one list per image
             if hasattr(self, "_langs")
@@ -75,6 +73,7 @@ class SuryaOCREngine(OCREngine):
             results = self._detection_predictor(images=[image])
         else:
             results = self._recognition_predictor(
+                langs=langs,
                 images=[image],
                 det_predictor=self._detection_predictor,
             )
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -11,7 +11,8 @@ from PIL import Image
 from .engine import OCREngine
 from .engine_doctr import DoctrOCREngine
 from .engine_easyocr import EasyOCREngine
-
+# Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
+# from .engine_paddle import PaddleOCREngine
 from .engine_surya import SuryaOCREngine
 from .ocr_options import (
     BaseOCROptions,
@@ -28,10 +29,16 @@ logger = logging.getLogger(__name__)
 class OCRManager:
     """Manages OCR engine selection, configuration, and execution."""
 
+    @staticmethod
+    def _get_paddle_engine_class():
+        """Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
+        from .engine_paddle import PaddleOCREngine
+        return PaddleOCREngine
+
     # Registry mapping engine names to classes and default options
     ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
         "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
-        "paddle": {"class":
+        "paddle": {"class": lambda: OCRManager._get_paddle_engine_class(), "options_class": PaddleOCROptions},
         "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
         "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
         # Add other engines here
@@ -76,7 +83,12 @@ class OCRManager:
             logger.info(
                 f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
             )
-
+            engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
+            # Handle lazy loading - if it's a lambda function, call it to get the actual class
+            if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
+                engine_class = engine_class_or_factory()
+            else:
+                engine_class = engine_class_or_factory
             start_time = time.monotonic()  # Optional: time initialization
             try:
                 engine_instance = engine_class()  # Instantiate first
@@ -277,7 +289,12 @@ class OCRManager:
         for name, registry_entry in self.ENGINE_REGISTRY.items():
             try:
                 # Temporarily instantiate to check availability without caching
-
+                engine_class_or_factory = registry_entry["class"]
+                # Handle lazy loading - if it's a lambda function, call it to get the actual class
+                if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
+                    engine_class = engine_class_or_factory()
+                else:
+                    engine_class = engine_class_or_factory
                 if engine_class().is_available():
                     available.append(name)
             except Exception as e:
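Both call sites detect the lazy entry by checking for `__name__ == '<lambda>'`, which works because every eager registry value is a class. A reduced sketch; note this test would miss a named factory function, so it is tied to the lambda used above:

```python
registry = {
    "eager": dict,                                            # plain class
    "lazy": lambda: __import__("collections").OrderedDict,    # deferred import
}

def resolve(name: str) -> type:
    entry = registry[name]
    if callable(entry) and getattr(entry, "__name__", "") == "<lambda>":
        return entry()          # call the factory to obtain the real class
    return entry

assert resolve("eager") is dict
assert resolve("lazy").__name__ == "OrderedDict"
```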
natural_pdf/search/__init__.py
CHANGED
@@ -18,7 +18,8 @@ SEARCH_DEPENDENCIES_AVAILABLE = False
 
 try:
     import numpy as np
-    import sentence_transformers
+    # Lazy import for sentence_transformers to avoid heavy loading at module level
+    # import sentence_transformers
 
     # Basic search dependencies are available
     SEARCH_DEPENDENCIES_AVAILABLE = True
@@ -46,12 +47,28 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
+def _check_sentence_transformers():
+    """Lazy check for sentence_transformers availability."""
+    try:
+        import sentence_transformers
+        return True
+    except ImportError:
+        return False
+
+
 def check_search_availability():
     """Check if required search dependencies are available."""
     if not SEARCH_DEPENDENCIES_AVAILABLE:
         raise ImportError(
-            "Search functionality requires '
-            "Install with: pip install natural-pdf[search] (or pip install
+            "Search functionality requires 'lancedb' and pyarrow. "
+            "Install with: pip install natural-pdf[search] (or pip install lancedb pyarrow)"
+        )
+
+    # Lazy check for sentence_transformers when actually needed
+    if not _check_sentence_transformers():
+        raise ImportError(
+            "Search functionality requires 'sentence-transformers'. "
+            "Install with: pip install sentence-transformers"
         )
 
 
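`check_search_availability()` is now a two-stage gate: lancedb/pyarrow are confirmed from the import-time probe, while sentence-transformers is only probed when search is actually used. A compact variant of the same gate; the `importlib.util.find_spec` probe shown here is a cheaper substitute (it locates a module without executing it) and is not what the diff itself uses:

```python
import importlib.util

def _has(name: str) -> bool:
    return importlib.util.find_spec(name) is not None

def check_search_availability() -> None:
    if not (_has("lancedb") and _has("pyarrow")):
        raise ImportError("Install with: pip install natural-pdf[search]")
    if not _has("sentence_transformers"):
        raise ImportError("Install with: pip install sentence-transformers")
```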
natural_pdf/search/lancedb_search_service.py
CHANGED
@@ -1,12 +1,14 @@
 import logging
+import os
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
 
 import lancedb
 import pyarrow as pa
-
+# Lazy import for SentenceTransformer to avoid heavy loading at module level
+# from sentence_transformers import SentenceTransformer
 
 from .search_options import BaseSearchOptions
 from .search_service_protocol import (
@@ -17,8 +19,14 @@ from .search_service_protocol import (
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_EMBEDDING_MODEL = "
-DEFAULT_LANCEDB_PERSIST_PATH = "./
+DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+DEFAULT_LANCEDB_PERSIST_PATH = "./lancedb_data"
+
+
+def _get_sentence_transformer(model_name: str):
+    """Lazy import and instantiation of SentenceTransformer."""
+    from sentence_transformers import SentenceTransformer
+    return SentenceTransformer(model_name)
 
 
 class LanceDBSearchService(SearchServiceProtocol):
@@ -41,7 +49,7 @@ class LanceDBSearchService(SearchServiceProtocol):
         self._db = None
         self._table = None
 
-        self.embedding_model =
+        self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
         test_embedding = self.embedding_model.encode("test")
         self._embedding_dims = len(test_embedding)
 
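Both search services now obtain their embedder through a tiny factory, so sentence-transformers (and torch beneath it) loads only when a service is constructed, and the embedding width is discovered by encoding a probe string. A sketch of the factory in use (the default model really is 384-dimensional):

```python
def _get_sentence_transformer(model_name: str):
    """Defer the sentence-transformers import until a model is actually needed."""
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer(model_name)

model = _get_sentence_transformer("all-MiniLM-L6-v2")
dims = len(model.encode("test"))   # 384 for all-MiniLM-L6-v2
```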
natural_pdf/search/numpy_search_service.py
CHANGED
@@ -1,21 +1,31 @@
 import json
 import logging
+import os
+import tempfile
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 import numpy as np
-
+# Lazy import for SentenceTransformer to avoid heavy loading at module level
+# from sentence_transformers import SentenceTransformer
 
 from .search_options import BaseSearchOptions
 from .search_service_protocol import (
     Indexable,
     IndexConfigurationError,
+    SearchResult,
     SearchServiceProtocol,
 )
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_EMBEDDING_MODEL = "
+DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+
+
+def _get_sentence_transformer(model_name: str):
+    """Lazy import and instantiation of SentenceTransformer."""
+    from sentence_transformers import SentenceTransformer
+    return SentenceTransformer(model_name)
 
 
 class NumpySearchService(SearchServiceProtocol):
@@ -38,7 +48,7 @@ class NumpySearchService(SearchServiceProtocol):
 
         self.collection_name = collection_name
         self._embedding_model_name = embedding_model_name
-        self.embedding_model =
+        self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
         self._embedding_dims = len(self.embedding_model.encode("test"))
 
         # Simple in-memory storage
{natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.17
+Version: 0.1.18
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: pandas
 Requires-Dist: pdfplumber
 Requires-Dist: colormath2
 Requires-Dist: pillow
@@ -20,14 +21,15 @@ Requires-Dist: urllib3
 Requires-Dist: tqdm
 Requires-Dist: pydantic
 Requires-Dist: jenkspy
-Requires-Dist: pikepdf
+Requires-Dist: pikepdf
 Requires-Dist: scipy
 Requires-Dist: torch
 Requires-Dist: torchvision
-Requires-Dist: transformers[sentencepiece]
+Requires-Dist: transformers[sentencepiece]
 Requires-Dist: huggingface_hub>=0.29.3
 Requires-Dist: sentence-transformers
 Requires-Dist: timm
+Requires-Dist: ipywidgets>=7.0.0
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
 Requires-Dist: pytest-xdist; extra == "test"
@@ -39,7 +41,6 @@ Provides-Extra: favorites
 Requires-Dist: natural-pdf[deskew]; extra == "favorites"
 Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
 Requires-Dist: natural-pdf[search]; extra == "favorites"
-Requires-Dist: ipywidgets; extra == "favorites"
 Requires-Dist: surya-ocr; extra == "favorites"
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
@@ -61,23 +62,22 @@ Requires-Dist: setuptools; extra == "dev"
 Provides-Extra: deskew
 Requires-Dist: deskew>=1.5; extra == "deskew"
 Requires-Dist: img2pdf; extra == "deskew"
-Provides-Extra: addons
-Requires-Dist: surya-ocr; extra == "addons"
-Requires-Dist: doclayout_yolo; extra == "addons"
-Requires-Dist: paddlepaddle>=3.0.0; extra == "addons"
-Requires-Dist: paddleocr>=3.0.0; extra == "addons"
-Requires-Dist: ipywidgets>=7.0.0; extra == "addons"
-Requires-Dist: easyocr; extra == "addons"
-Requires-Dist: surya-ocr; extra == "addons"
-Requires-Dist: doclayout_yolo; extra == "addons"
-Requires-Dist: python-doctr[torch]; extra == "addons"
-Requires-Dist: docling; extra == "addons"
 Provides-Extra: all
 Requires-Dist: natural-pdf[ocr-export]; extra == "all"
 Requires-Dist: natural-pdf[deskew]; extra == "all"
 Requires-Dist: natural-pdf[test]; extra == "all"
 Requires-Dist: natural-pdf[search]; extra == "all"
-Requires-Dist: natural-pdf[
+Requires-Dist: natural-pdf[extras]; extra == "all"
+Requires-Dist: natural-pdf[favorites]; extra == "all"
+Provides-Extra: paddle
+Requires-Dist: paddlepaddle>=3.0.0; extra == "paddle"
+Requires-Dist: paddleocr>=3.0.1; extra == "paddle"
+Requires-Dist: paddlex>=3.0.1; extra == "paddle"
+Provides-Extra: extras
+Requires-Dist: surya-ocr; extra == "extras"
+Requires-Dist: doclayout_yolo; extra == "extras"
+Requires-Dist: easyocr; extra == "extras"
+Requires-Dist: natural-pdf[paddle]; extra == "extras"
 Provides-Extra: ocr-export
 Requires-Dist: pikepdf; extra == "ocr-export"
 Provides-Extra: export-extras
{natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/RECORD
CHANGED
@@ -17,7 +17,7 @@ natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuK
 natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
 natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
 natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
-natural_pdf/classification/manager.py,sha256
+natural_pdf/classification/manager.py,sha256=pzuTP-34W9N3im1ZFhCfQpOu37VSHEx4JHoHNxyy6o0,18894
 natural_pdf/classification/mixin.py,sha256=_XtoqCMqj1nxZYskIV2RbVYiVVcEWzFwae4s5vpzC74,6566
 natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
 natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
@@ -26,7 +26,7 @@ natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,
 natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
 natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
 natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
-natural_pdf/core/pdf.py,sha256=
+natural_pdf/core/pdf.py,sha256=yBvb1iGw9gwVPJ3Rm1EBaZ8_g60TuW_Elhg2EOcJMzc,73871
 natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
 natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
 natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
@@ -40,11 +40,11 @@ natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,
 natural_pdf/elements/region.py,sha256=hBklYKcXJWyxayu9todYQOZ-d9KVDtqeV-CIt9IcSn8,123400
 natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
-natural_pdf/exporters/__init__.py,sha256=
+natural_pdf/exporters/__init__.py,sha256=XG0ckcKHgG7IVma75syORUme6wEItUvDA46aCZzGqrU,639
 natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
-natural_pdf/exporters/hocr.py,sha256=
+natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
 natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
-natural_pdf/exporters/original_pdf.py,sha256=
+natural_pdf/exporters/original_pdf.py,sha256=dtvC4er6TWOfqq-n24Pejw3mlAuPd8IVyihggJtcf0s,6634
 natural_pdf/exporters/paddleocr.py,sha256=IAG2p9YeImYcsIvb6a_L5mMrKarvaMaDvRrvdlY6bX4,19489
 natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
 natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -63,16 +63,16 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
 natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
 natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
 natural_pdf/ocr/engine_paddle.py,sha256=ZUtyjso_UjjAPnJt5ac-AtOpR6PfOhO76iOyjngGzr0,16198
-natural_pdf/ocr/engine_surya.py,sha256=
+natural_pdf/ocr/engine_surya.py,sha256=PNjvpsHnBghAoa-df52HEyvXzfNI-gTFgKvs2LxHgKo,5051
 natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
-natural_pdf/ocr/ocr_manager.py,sha256=
+natural_pdf/ocr/ocr_manager.py,sha256=M1GRAThzWl5iMkQJ41j84G6cJ7XruQD_HoPPzWf7nUk,14742
 natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
 natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
 natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
 natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
-natural_pdf/search/__init__.py,sha256=
-natural_pdf/search/lancedb_search_service.py,sha256=
-natural_pdf/search/numpy_search_service.py,sha256=
+natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
+natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
+natural_pdf/search/numpy_search_service.py,sha256=MoPBlyHTDqah1IrwBzyglEyiXlF4wqaU_5mml_ngvGc,10328
 natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
 natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
 natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
@@ -90,8 +90,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
+natural_pdf-0.1.18.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.18.dist-info/METADATA,sha256=aU8IC02yZuy1aUrHhtDCHEp5igjwaUGP1NDnFDsOTL8,6684
+natural_pdf-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.18.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.18.dist-info/RECORD,,
{natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/WHEEL
File without changes
{natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/licenses/LICENSE
File without changes
{natural_pdf-0.1.17.dist-info → natural_pdf-0.1.18.dist-info}/top_level.txt
File without changes