PyPI - natural-pdf - Versions diffs - 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77)

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +188 -82
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +132 -16
natural_pdf/core/pdf.py +486 -71
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +238 -111
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.32.dist-info/RECORD +0 -118
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/elements/text.py CHANGED Viewed

@@ -32,11 +32,11 @@ class TextElement(Element):
             obj["object_type"] = "text"
         super().__init__(obj, page)
         # Memory optimization: Store character indices instead of full dictionaries
         # This reduces memory usage by ~50% by avoiding character data duplication
         self._char_indices = obj.pop("_char_indices", [])
         # Backward compatibility: Keep _char_dicts for existing code
         # But prefer _char_indices when available to save memory
         self._char_dicts = obj.pop("_char_dicts", [])
@@ -44,20 +44,20 @@ class TextElement(Element):
     @property
     def chars(self):
         """Get constituent character elements efficiently.
         Uses character indices when available to avoid memory duplication,
         falls back to _char_dicts for backward compatibility.
         """
         if self._char_indices:
             # Memory-efficient approach: access characters by index
-            if hasattr(self.page, '_element_mgr'):
-                char_elements = self.page._element_mgr.get_elements('chars')
+            if hasattr(self.page, "_element_mgr"):
+                char_elements = self.page._element_mgr.get_elements("chars")
                 return [char_elements[i] for i in self._char_indices if i < len(char_elements)]
         # Backward compatibility: convert _char_dicts to TextElement objects
         if self._char_dicts:
             return [TextElement(char_dict, self.page) for char_dict in self._char_dicts]
         return []
     @property
@@ -75,12 +75,12 @@ class TextElement(Element):
         try:
             # If using memory-efficient character indices, update the referenced chars
             if hasattr(self, "_char_indices") and self._char_indices:
-                if hasattr(self.page, '_element_mgr'):
-                    char_elements = self.page._element_mgr.get_elements('chars')
+                if hasattr(self.page, "_element_mgr"):
+                    char_elements = self.page._element_mgr.get_elements("chars")
                     for idx, char_idx in enumerate(self._char_indices):
                         if char_idx < len(char_elements) and idx < len(value):
                             char_elements[char_idx].text = value[idx]
             # Legacy _char_dicts synchronization for backward compatibility
             elif hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
                 if not self._char_dicts:
@@ -121,6 +121,7 @@ class TextElement(Element):
         except Exception as sync_err:  # pragma: no cover
             # Keep failures silent but logged; better to have outdated chars than crash.
             import logging
             logger = logging.getLogger(__name__)
             logger.debug(f"TextElement: Failed to sync char data after text update: {sync_err}")
@@ -379,7 +380,9 @@ class TextElement(Element):
     @property
     def underline(self) -> bool:
         """True if element is underlined."""
-        return bool(self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline"))
+        return bool(
+            self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline")
+        )
     # -----------------------------
     #  Highlight decoration
@@ -397,7 +400,9 @@ class TextElement(Element):
     @property
     def highlight_color(self):
         """Return RGB(A) tuple of highlight colour if stored."""
-        return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get("highlight_color")
+        return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get(
+            "highlight_color"
+        )
     def __repr__(self) -> str:
         """String representation of the text element."""
@@ -489,6 +494,7 @@ class TextElement(Element):
         try:
             from bidi.algorithm import get_display  # type: ignore
             from natural_pdf.utils.bidi_mirror import mirror_brackets
             # Convert from logical order to visual order

natural_pdf/exporters/__init__.py CHANGED Viewed

@@ -1,16 +1,19 @@
 from .base import FinetuneExporter
 # Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
 def _get_paddleocr_exporter():
     """Lazy import for PaddleOCRRecognitionExporter."""
     from .paddleocr import PaddleOCRRecognitionExporter
     return PaddleOCRRecognitionExporter
 # Make PaddleOCRRecognitionExporter available through attribute access
 def __getattr__(name):
     if name == "PaddleOCRRecognitionExporter":
         return _get_paddleocr_exporter()
     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]

natural_pdf/exporters/original_pdf.py CHANGED Viewed

@@ -2,9 +2,9 @@
 Module for exporting original PDF pages without modification.
 """
+import io
 import logging
 import os
-import io
 import urllib.request
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Set, Union
@@ -103,11 +103,17 @@ def create_original_pdf(
             source_handle = pikepdf.Pdf.open(first_page_pdf_path)
         else:
             # Fallback: attempt to open from in-memory bytes stored on PDF object
-            if first_page_pdf_obj is not None and hasattr(first_page_pdf_obj, "_original_bytes") and first_page_pdf_obj._original_bytes:
+            if (
+                first_page_pdf_obj is not None
+                and hasattr(first_page_pdf_obj, "_original_bytes")
+                and first_page_pdf_obj._original_bytes
+            ):
                 source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
             else:
                 # Attempt to download bytes directly if path looks like URL
-                if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(("http://", "https://")):
+                if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(
+                    ("http://", "https://")
+                ):
                     try:
                         with urllib.request.urlopen(first_page_pdf_path) as resp:
                             data = resp.read()
@@ -117,7 +123,9 @@ def create_original_pdf(
                             f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
                         )
                 else:
-                    raise FileNotFoundError(f"Source PDF bytes not available for {first_page_pdf_path}")
+                    raise FileNotFoundError(
+                        f"Source PDF bytes not available for {first_page_pdf_path}"
+                    )
         with source_handle as source_pikepdf_doc:
             target_pikepdf_doc = pikepdf.Pdf.new()

natural_pdf/extraction/mixin.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional, Type, Sequence
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
 from pydantic import BaseModel, Field, create_model
@@ -16,9 +16,54 @@ DEFAULT_STRUCTURED_KEY = "structured"  # Define default key
 class ExtractionMixin(ABC):
-    """
-    Mixin class providing structured data extraction capabilities to elements.
-    Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
+    """Mixin class providing structured data extraction capabilities to elements.
+    This mixin adds AI-powered structured data extraction functionality to pages,
+    regions, and elements, enabling extraction of specific data fields using
+    Pydantic schemas and large language models. It supports both text-based and
+    vision-based extraction modes.
+    The mixin integrates with the StructuredDataManager to handle LLM interactions
+    and provides schema validation using Pydantic models. Extracted data is
+    automatically validated against the provided schema and stored with
+    confidence metrics and metadata.
+    Extraction modes:
+    - Text-based: Uses extracted text content for LLM processing
+    - Vision-based: Uses rendered images for multimodal LLM analysis
+    - Automatic: Selects best mode based on content and model capabilities
+    Host class requirements:
+    - Must implement extract_text(**kwargs) -> str
+    - Must implement to_image(**kwargs) -> PIL.Image
+    - Must have access to StructuredDataManager (usually via parent PDF)
+    Example:
+        ```python
+        from pydantic import BaseModel
+        class InvoiceData(BaseModel):
+            invoice_number: str
+            total_amount: float
+            due_date: str
+            vendor_name: str
+        pdf = npdf.PDF("invoice.pdf")
+        page = pdf.pages[0]
+        # Extract structured data
+        invoice = page.extract_structured_data(InvoiceData)
+        print(f"Invoice {invoice.data.invoice_number}: ${invoice.data.total_amount}")
+        # Region-specific extraction
+        header_region = page.find('text:contains("Invoice")').above()
+        header_data = header_region.extract_structured_data(InvoiceData)
+        ```
+    Note:
+        Structured extraction requires a compatible LLM to be configured in the
+        StructuredDataManager. Results include confidence scores and validation
+        metadata for quality assessment.
     """
     def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
@@ -386,10 +431,13 @@ class ExtractionMixin(ABC):
         question_map = question_map or {}
         try:
-            from natural_pdf.qa.document_qa import get_qa_engine
-            from natural_pdf.extraction.result import StructuredDataResult
-            from pydantic import Field as _Field, create_model
             import re
+            from pydantic import Field as _Field
+            from pydantic import create_model
+            from natural_pdf.extraction.result import StructuredDataResult
+            from natural_pdf.qa.document_qa import get_qa_engine
         except ImportError as exc:
             raise RuntimeError(
                 "Document-QA dependencies missing. Install with `pip install natural-pdf[ai]`."
@@ -424,7 +472,9 @@ class ExtractionMixin(ABC):
                 question = question_map[display_name]
             else:
                 description = None
-                if hasattr(field_obj, "field_info") and hasattr(field_obj.field_info, "description"):
+                if hasattr(field_obj, "field_info") and hasattr(
+                    field_obj.field_info, "description"
+                ):
                     description = field_obj.field_info.description
                 elif hasattr(field_obj, "description"):
                     description = field_obj.description
@@ -529,7 +579,11 @@ class ExtractionMixin(ABC):
             pdf_instance = self
         elif hasattr(self, "pdf") and hasattr(self.pdf, "get_manager"):
             pdf_instance = self.pdf
-        elif hasattr(self, "page") and hasattr(self.page, "pdf") and hasattr(self.page.pdf, "get_manager"):
+        elif (
+            hasattr(self, "page")
+            and hasattr(self.page, "pdf")
+            and hasattr(self.page.pdf, "get_manager")
+        ):
             pdf_instance = self.page.pdf
         else:
             raise RuntimeError("Cannot access PDF manager to perform LLM extraction.")
@@ -542,7 +596,9 @@ class ExtractionMixin(ABC):
         layout_for_text = kwargs.pop("layout", True)
         content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)
-        if content is None or (using == "text" and isinstance(content, str) and not content.strip()):
+        if content is None or (
+            using == "text" and isinstance(content, str) and not content.strip()
+        ):
             result = StructuredDataResult(
                 data=None,
                 success=False,

natural_pdf/extraction/result.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from typing import Any, Generic, Optional, TypeVar
 from collections.abc import Mapping
+from typing import Any, Generic, Optional, TypeVar
 from pydantic import BaseModel, Field

natural_pdf/flows/flow.py CHANGED Viewed

@@ -14,10 +14,69 @@ logger = logging.getLogger(__name__)
 class Flow:
-    """
-    Defines a logical flow or sequence of physical Page or Region objects,
-    specifying their arrangement and alignment to enable operations that
-    span across these segments as if they were a continuous area.
+    """Defines a logical flow or sequence of physical Page or Region objects.
+    A Flow represents a continuous logical document structure that spans across
+    multiple pages or regions, enabling operations on content that flows across
+    boundaries. This is essential for handling multi-page tables, articles that
+    span columns, or any content that requires reading order across segments.
+    Flows specify arrangement (vertical/horizontal) and alignment rules to create
+    a unified coordinate system for element extraction and text processing. They
+    enable natural-pdf to treat fragmented content as a single continuous area
+    for analysis and extraction operations.
+    The Flow system is particularly useful for:
+    - Multi-page tables that break across page boundaries
+    - Multi-column articles with complex reading order
+    - Forms that span multiple pages
+    - Any content requiring logical continuation across segments
+    Attributes:
+        segments: List of Page or Region objects in flow order.
+        arrangement: Primary flow direction ('vertical' or 'horizontal').
+        alignment: Cross-axis alignment for segments of different sizes.
+        segment_gap: Virtual gap between segments in PDF points.
+    Example:
+        Multi-page table flow:
+        ```python
+        pdf = npdf.PDF("multi_page_table.pdf")
+        # Create flow for table spanning pages 2-4
+        table_flow = Flow(
+            segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
+            arrangement='vertical',
+            alignment='left',
+            segment_gap=10.0
+        )
+        # Extract table as if it were continuous
+        table_data = table_flow.extract_table()
+        text_content = table_flow.get_text()
+        ```
+        Multi-column article flow:
+        ```python
+        page = pdf.pages[0]
+        left_column = page.region(0, 0, 300, page.height)
+        right_column = page.region(320, 0, page.width, page.height)
+        # Create horizontal flow for columns
+        article_flow = Flow(
+            segments=[left_column, right_column],
+            arrangement='horizontal',
+            alignment='top'
+        )
+        # Read in proper order
+        article_text = article_flow.get_text()
+        ```
+    Note:
+        Flows create virtual coordinate systems that map element positions across
+        segments, enabling spatial navigation and element selection to work
+        seamlessly across boundaries.
     """
     def __init__(

natural_pdf/flows/region.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Callable
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 from pdfplumber.utils.geometry import objects_to_bbox  # For calculating combined bbox
@@ -133,7 +133,7 @@ class FlowRegion:
         # This is a simplification; true layout-aware joining would be more complex.
         joiner = (
             "\n" if self.flow.arrangement == "vertical" else " "
-        )  # TODO: Make this smarter, consider segment_gap
+        )  # TODO: Consider flow.segment_gap for proportional spacing between segments
         extracted = joiner.join(t for t in texts if t)
         if apply_exclusions:  # Only cache if standard exclusion behavior
@@ -258,7 +258,7 @@ class FlowRegion:
         """
         Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
         If multiple pages are involved, they are stacked into a single image.
         Args:
             resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
             labels: Whether to include a legend for highlights.
@@ -270,7 +270,7 @@ class FlowRegion:
             stack_gap: Gap in pixels between stacked pages.
             stack_background_color: RGB background color for the stacked image.
             **kwargs: Additional arguments passed to the underlying rendering methods.
         Returns:
             PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
         """

natural_pdf/ocr/engine.py CHANGED Viewed

@@ -12,7 +12,38 @@ logger = logging.getLogger(__name__)
 class TextRegion:
-    """Standard representation of an OCR text region."""
+    """Standard representation of an OCR text region.
+    TextRegion provides a standardized format for representing text detected by
+    OCR engines, regardless of the underlying engine implementation. This ensures
+    consistent interfaces across different OCR backends (EasyOCR, Surya, PaddleOCR, etc.).
+    The class handles coordinate normalization and provides utilities for converting
+    between different coordinate formats (bounding boxes vs. polygons).
+    Attributes:
+        bbox: Bounding box coordinates as (x0, y0, x1, y1) tuple.
+        text: The recognized text content.
+        confidence: Confidence score from 0.0 (low) to 1.0 (high).
+        source: Source identifier, typically "ocr" or engine name.
+    Example:
+        ```python
+        # Create from bounding box
+        region = TextRegion(
+            bbox=(100, 200, 300, 250),
+            text="Hello World",
+            confidence=0.95
+        )
+        # Create from polygon coordinates
+        polygon = [[100, 200], [300, 200], [300, 250], [100, 250]]
+        region = TextRegion.from_polygon(polygon, "Hello World", 0.95)
+        # Convert to dictionary for processing
+        data = region.to_dict()
+        ```
+    """
     def __init__(
         self,
@@ -54,7 +85,57 @@ class TextRegion:
 class OCREngine(ABC):
-    """Abstract Base Class for OCR engines."""
+    """Abstract base class for OCR engines.
+    This class defines the standard interface that all OCR engines must implement
+    in natural-pdf. It provides a consistent API for text recognition regardless
+    of the underlying OCR technology (EasyOCR, Surya, PaddleOCR, DocTR, etc.).
+    The base class handles common functionality like model caching, parameter
+    validation, and result standardization, while concrete implementations
+    provide engine-specific processing logic.
+    Subclasses must implement:
+    - process_single_image(): Core OCR processing for a single image
+    - is_available(): Check if the engine dependencies are installed
+    - get_supported_languages(): Return list of supported language codes
+    Class Attributes:
+        DEFAULT_MIN_CONFIDENCE: Default confidence threshold (0.2).
+        DEFAULT_LANGUAGES: Default language list (["en"]).
+        DEFAULT_DEVICE: Default processing device ("cpu").
+    Attributes:
+        logger: Logger instance for the specific engine.
+        _model: Cached model instance for the engine.
+        _initialized: Whether the engine has been initialized.
+        _reader_cache: Cache for initialized models/readers.
+    Example:
+        Implementing a custom OCR engine:
+        ```python
+        class MyOCREngine(OCREngine):
+            @classmethod
+            def is_available(cls) -> bool:
+                try:
+                    import my_ocr_library
+                    return True
+                except ImportError:
+                    return False
+            def process_single_image(self, image, languages, min_confidence,
+                                   device, detect_only, options):
+                # Implement OCR processing
+                return text_regions
+        ```
+        Using an OCR engine:
+        ```python
+        if EasyOCREngine.is_available():
+            engine = EasyOCREngine()
+            results = engine.process_image(image, languages=['en', 'es'])
+        ```
+    """
     # Default values as class constants
     DEFAULT_MIN_CONFIDENCE = 0.2

natural_pdf/ocr/engine_paddle.py CHANGED Viewed

@@ -11,6 +11,7 @@ from .ocr_options import BaseOCROptions, PaddleOCROptions
 logger = logging.getLogger(__name__)
 class PaddleOCREngine(OCREngine):
     """PaddleOCR engine implementation."""
@@ -147,8 +148,8 @@ class PaddleOCREngine(OCREngine):
         # --- RESTORE: Language/version support check logic ---
         user_specified_model = (
-            getattr(paddle_options, "text_recognition_model_name", None) is not None or
-            getattr(paddle_options, "text_detection_model_name", None) is not None
+            getattr(paddle_options, "text_recognition_model_name", None) is not None
+            or getattr(paddle_options, "text_detection_model_name", None) is not None
         )
         if user_specified_model and user_ocr_version:
             if primary_lang not in self.SUPPORT_MATRIX.get(user_ocr_version, set()):
@@ -169,7 +170,7 @@ class PaddleOCREngine(OCREngine):
                     user_ocr_version,
                 )
                 final_ocr_version = None  # Reset to find a compatible version
         # If no version was specified or the specified one was incompatible, find the best fit.
         if not final_ocr_version:
             found_compatible = False
@@ -269,7 +270,6 @@ class PaddleOCREngine(OCREngine):
                 if value is not None:
                     ocr_config[arg] = value
         try:
             # The new API uses PaddleOCR as a pipeline object.
             self._model = paddleocr.PaddleOCR(**ocr_config)
@@ -350,7 +350,7 @@ class PaddleOCREngine(OCREngine):
                     # This code converts any numpy array to a list before passing to _standardize_bbox,
                     # which handles both rectangle and polygon formats robustly.
                     box = rec_boxes[i]
-                    if hasattr(box, 'tolist'):
+                    if hasattr(box, "tolist"):
                         box = box.tolist()
                     bbox = self._standardize_bbox(box)
                     if detect_only:

natural_pdf/ocr/ocr_factory.py CHANGED Viewed

@@ -32,7 +32,8 @@ class OCRFactory:
                 return SuryaOCREngine(**kwargs)
             except ImportError:
                 raise ImportError(
-                    "Surya engine requires additional dependencies. " "Install with: npdf install surya"
+                    "Surya engine requires additional dependencies. "
+                    "Install with: npdf install surya"
                 )
         elif engine_type == "easyocr":
             try:

natural_pdf/ocr/ocr_manager.py CHANGED Viewed

@@ -11,6 +11,7 @@ from PIL import Image
 from .engine import OCREngine
 from .engine_doctr import DoctrOCREngine
 from .engine_easyocr import EasyOCREngine
 # Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
 # from .engine_paddle import PaddleOCREngine
 from .engine_surya import SuryaOCREngine
@@ -33,12 +34,16 @@ class OCRManager:
     def _get_paddle_engine_class():
         """Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
         from .engine_paddle import PaddleOCREngine
         return PaddleOCREngine
     # Registry mapping engine names to classes and default options
     ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
         "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
-        "paddle": {"class": lambda: OCRManager._get_paddle_engine_class(), "options_class": PaddleOCROptions},
+        "paddle": {
+            "class": lambda: OCRManager._get_paddle_engine_class(),
+            "options_class": PaddleOCROptions,
+        },
         "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
         "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
         # Add other engines here
@@ -85,7 +90,10 @@ class OCRManager:
             )
             engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
             # Handle lazy loading - if it's a lambda function, call it to get the actual class
-            if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
+            if (
+                callable(engine_class_or_factory)
+                and getattr(engine_class_or_factory, "__name__", "") == "<lambda>"
+            ):
                 engine_class = engine_class_or_factory()
             else:
                 engine_class = engine_class_or_factory
@@ -283,7 +291,10 @@ class OCRManager:
                 # Temporarily instantiate to check availability without caching
                 engine_class_or_factory = registry_entry["class"]
                 # Handle lazy loading - if it's a lambda function, call it to get the actual class
-                if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
+                if (
+                    callable(engine_class_or_factory)
+                    and getattr(engine_class_or_factory, "__name__", "") == "<lambda>"
+                ):
                     engine_class = engine_class_or_factory()
                 else:
                     engine_class = engine_class_or_factory
@@ -299,49 +310,49 @@ class OCRManager:
     def cleanup_engine(self, engine_name: Optional[str] = None) -> int:
         """
         Cleanup OCR engine instances to free memory.
         Args:
             engine_name: Specific engine to cleanup, or None to cleanup all engines
         Returns:
             Number of engines cleaned up
         """
         cleaned_count = 0
         if engine_name:
             # Cleanup specific engine
             engine_name = engine_name.lower()
             if engine_name in self._engine_instances:
                 engine = self._engine_instances.pop(engine_name)
-                if hasattr(engine, 'cleanup'):
+                if hasattr(engine, "cleanup"):
                     try:
                         engine.cleanup()
                     except Exception as e:
                         logger.debug(f"Engine {engine_name} cleanup method failed: {e}")
                 # Clear associated locks
                 self._engine_locks.pop(engine_name, None)
                 self._engine_inference_locks.pop(engine_name, None)
                 logger.info(f"Cleaned up OCR engine: {engine_name}")
                 cleaned_count = 1
         else:
             # Cleanup all engines
             for name, engine in list(self._engine_instances.items()):
-                if hasattr(engine, 'cleanup'):
+                if hasattr(engine, "cleanup"):
                     try:
                         engine.cleanup()
                     except Exception as e:
                         logger.debug(f"Engine {name} cleanup method failed: {e}")
             # Clear all caches
             engine_count = len(self._engine_instances)
             self._engine_instances.clear()
             self._engine_locks.clear()
             self._engine_inference_locks.clear()
             if engine_count > 0:
                 logger.info(f"Cleaned up {engine_count} OCR engines")
             cleaned_count = engine_count
         return cleaned_count

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl