PyPI - natural-pdf - Versions diffs - 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl - Mend

natural-pdf 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

natural_pdf/analyzers/shape_detection_mixin.py +3 -3
natural_pdf/classification/manager.py +1 -1
natural_pdf/classification/mixin.py +35 -14
natural_pdf/classification/results.py +16 -1
natural_pdf/cli.py +1 -0
natural_pdf/core/highlighting_service.py +23 -0
natural_pdf/core/page.py +16 -0
natural_pdf/core/pdf.py +24 -4
natural_pdf/elements/base.py +79 -1
natural_pdf/elements/collections.py +23 -1
natural_pdf/elements/region.py +54 -148
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +2 -2
natural_pdf/extraction/mixin.py +295 -11
natural_pdf/extraction/result.py +28 -1
natural_pdf/flows/region.py +1 -1
natural_pdf/ocr/engine_surya.py +25 -5
natural_pdf/qa/__init__.py +2 -1
natural_pdf/qa/document_qa.py +33 -37
natural_pdf/qa/qa_result.py +55 -0
natural_pdf/selectors/parser.py +22 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.23.dist-info}/METADATA +21 -13
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.23.dist-info}/RECORD +27 -26
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.23.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.23.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.23.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.23.dist-info}/top_level.txt +0 -0

natural_pdf/analyzers/shape_detection_mixin.py CHANGED Viewed

@@ -63,7 +63,7 @@ class ShapeDetectionMixin:
                 logger.debug(f"Shape detection on Region: {self}")
                 page_obj = self._page
                 pil_image = self.to_image(
-                    resolution=resolution, crop_only=True, include_highlights=False
+                    resolution=resolution, crop=True, include_highlights=False
                 )
                 if pil_image:  # Ensure pil_image is not None before accessing attributes
                     origin_offset_pdf = (self.x0, self.top)
@@ -681,7 +681,7 @@ class ShapeDetectionMixin:
         if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
             if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
                 pil_image_for_dims = self.to_image(
-                    resolution=resolution, crop_only=True, include_highlights=False
+                    resolution=resolution, crop=True, include_highlights=False
                 )
             else:
                 pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)
@@ -1204,7 +1204,7 @@ class ShapeDetectionMixin:
         if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
             if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
                 pil_image_for_dims = self.to_image(
-                    resolution=resolution, crop_only=True, include_highlights=False
+                    resolution=resolution, crop=True, include_highlights=False
                 )
             else:
                 pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)

natural_pdf/classification/manager.py CHANGED Viewed

@@ -90,7 +90,7 @@ class ClassificationManager:
         if not _check_classification_dependencies():
             raise ImportError(
                 "Classification dependencies missing. "
-                'Install with: pip install "natural-pdf[core-ml]"'
+                'Install with: pip install "natural-pdf[ai]"'
             )
         self.pipelines: Dict[Tuple[str, str], "Pipeline"] = (

natural_pdf/classification/mixin.py CHANGED Viewed

@@ -2,6 +2,7 @@ import logging
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 from PIL import Image
+import warnings
 from .results import ClassificationResult
@@ -74,32 +75,52 @@ class ClassificationMixin:
         try:
             manager = self._get_classification_manager()
-            # Determine the effective model ID and engine type
+            # ------------------------------------------------------------
+            # Resolve engine ('text' vs 'vision')
+            # ------------------------------------------------------------
+            engine: Optional[str] = using  # rename for clarity
+            content = None  # will hold final content
+            if engine is None:
+                # Try text first
+                try:
+                    tentative_text = self._get_classification_content("text", **kwargs)
+                    if tentative_text and not (isinstance(tentative_text, str) and tentative_text.isspace()):
+                        engine = "text"
+                        content = tentative_text
+                    else:
+                        raise ValueError("Empty text")
+                except Exception:
+                    warnings.warn(
+                        "No text found for classification; falling back to vision model. "
+                        "Pass using='vision' explicitly to silence this message.",
+                        UserWarning,
+                    )
+                    engine = "vision"
+            # If engine determined but content not yet retrieved, get it now
+            if content is None:
+                content = self._get_classification_content(model_type=engine, **kwargs)
+            # ------------------------------------------------------------
+            # Determine model ID default based on engine
+            # ------------------------------------------------------------
             effective_model_id = model
-            inferred_using = manager.infer_using(
-                model if model else manager.DEFAULT_TEXT_MODEL, using
-            )
-            # If model was not provided, use the manager's default for the inferred engine type
             if effective_model_id is None:
                 effective_model_id = (
-                    manager.DEFAULT_TEXT_MODEL
-                    if inferred_using == "text"
-                    else manager.DEFAULT_VISION_MODEL
+                    manager.DEFAULT_TEXT_MODEL if engine == "text" else manager.DEFAULT_VISION_MODEL
                 )
                 logger.debug(
-                    f"No model provided, using default for mode '{inferred_using}': '{effective_model_id}'"
+                    f"No model provided, using default for mode '{engine}': '{effective_model_id}'"
                 )
-            # Get content based on the *final* determined engine type
-            content = self._get_classification_content(model_type=inferred_using, **kwargs)
             # Manager now returns a ClassificationResult object
             result_obj: ClassificationResult = manager.classify_item(
                 item_content=content,
                 labels=labels,
                 model_id=effective_model_id,
-                using=inferred_using,
+                using=engine,
                 min_confidence=min_confidence,
                 multi_label=multi_label,
                 **kwargs,

natural_pdf/classification/results.py CHANGED Viewed

@@ -3,6 +3,7 @@ import logging
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Dict, List, Optional
+from collections.abc import Mapping
 logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class CategoryScore:
 @dataclass
-class ClassificationResult:
+class ClassificationResult(Mapping):
     """Results from a classification operation."""
     category: Optional[str]  # Can be None if scores are empty
@@ -86,3 +87,17 @@ class ClassificationResult:
     def __repr__(self) -> str:
         return f"<ClassificationResult category='{self.category}' score={self.score:.3f} model='{self.model_id}'>"
+    def __iter__(self):
+        """Iterate over mapping keys (linked to ``to_dict`` so it stays in sync)."""
+        return iter(self.to_dict())
+    def __getitem__(self, key):
+        """Dictionary-style access to attributes."""
+        try:
+            return self.to_dict()[key]
+        except KeyError as exc:
+            raise KeyError(key) from exc
+    def __len__(self):
+        return len(self.to_dict())

natural_pdf/cli.py CHANGED Viewed

@@ -21,6 +21,7 @@ INSTALL_RECIPES: Dict[str, list[str]] = {
     "deskew": [f"{__package__.split('.')[0]}[deskew]"],
     "search": [f"{__package__.split('.')[0]}[search]"],
     "easyocr": ["easyocr"],
+    "ai": [f"{__package__.split('.')[0]}[ai]"],
 }

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -727,6 +727,7 @@ class HighlightingService:
         legend_position: str = "right",
         render_ocr: bool = False,
         resolution: Optional[float] = None,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         **kwargs,
     ) -> Optional[Image.Image]:
         """
@@ -741,6 +742,9 @@ class HighlightingService:
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text.
             resolution: Resolution for base page image rendering if width/height not used.
+            crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
+                space to crop the output image to, before legends or other overlays are
+                applied. If None, no cropping is performed.
             **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
         Returns:
@@ -855,6 +859,25 @@ class HighlightingService:
             )
             rendered_image = renderer.render()
+            # --- Optional Cropping BEFORE legend addition ---
+            if crop_bbox is not None:
+                cb_x0, cb_top, cb_x1, cb_bottom = crop_bbox
+                # Convert to pixel coordinates using actual scales
+                left_px = int(cb_x0 * actual_scale_x) - 2
+                top_px = int(cb_top * actual_scale_y) - 2
+                right_px = int(cb_x1 * actual_scale_x) + 2
+                bottom_px = int(cb_bottom * actual_scale_y) + 2
+                # Safeguard coordinates within bounds
+                left_px = max(0, min(left_px, rendered_image.width - 1))
+                top_px = max(0, min(top_px, rendered_image.height - 1))
+                right_px = max(left_px + 1, min(right_px, rendered_image.width))
+                bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
+                rendered_image = rendered_image.crop(
+                    (left_px, top_px, right_px, bottom_px)
+                )
             legend = None
             if labels:
                 preview_labels = {h.label: h.color for h in preview_highlights if h.label}

natural_pdf/core/page.py CHANGED Viewed

@@ -2808,3 +2808,19 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             return None
     # --- End Skew Detection and Correction --- #
+    # ------------------------------------------------------------------
+    # Unified analysis storage (maps to metadata["analysis"])
+    # ------------------------------------------------------------------
+    @property
+    def analyses(self) -> Dict[str, Any]:
+        if not hasattr(self, "metadata") or self.metadata is None:
+            self.metadata = {}
+        return self.metadata.setdefault("analysis", {})
+    @analyses.setter
+    def analyses(self, value: Dict[str, Any]):
+        if not hasattr(self, "metadata") or self.metadata is None:
+            self.metadata = {}
+        self.metadata["analysis"] = value

natural_pdf/core/pdf.py CHANGED Viewed

@@ -263,7 +263,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._initialize_managers()
         self._initialize_highlighter()
-        self.analyses: Dict[str, Any] = {}
+        # Analysis results accessed via self.analyses property (see below)
         # --- Automatic cleanup when object is garbage-collected ---
         self._finalizer = weakref.finalize(
@@ -1490,7 +1490,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             if not is_classification_available():
                 raise ImportError(
                     "Classification dependencies missing. "
-                    'Install with: pip install "natural-pdf[core-ml]"'
+                    'Install with: pip install "natural-pdf[ai]"'
                 )
             raise ClassificationError("ClassificationManager not available.")
@@ -1802,6 +1802,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     # --- End Classification Mixin Implementation ---
+    # ------------------------------------------------------------------
+    # Unified analysis storage (maps to metadata["analysis"])
+    # ------------------------------------------------------------------
+    @property
+    def analyses(self) -> Dict[str, Any]:
+        if not hasattr(self, "metadata") or self.metadata is None:
+            # For PDF, metadata property returns self._pdf.metadata which may be None
+            self._pdf.metadata = self._pdf.metadata or {}
+        if self.metadata is None:
+            # Fallback safeguard
+            self._pdf.metadata = {}
+        return self.metadata.setdefault("analysis", {})  # type: ignore[attr-defined]
+    @analyses.setter
+    def analyses(self, value: Dict[str, Any]):
+        if not hasattr(self, "metadata") or self.metadata is None:
+            self._pdf.metadata = self._pdf.metadata or {}
+        self.metadata["analysis"] = value  # type: ignore[attr-defined]
     # Static helper for weakref.finalize to avoid capturing 'self'
     @staticmethod
     def _finalize_cleanup(plumber_pdf, temp_file_obj, is_stream):
@@ -1816,5 +1836,5 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
                 path = temp_file_obj.name if hasattr(temp_file_obj, "name") else None
                 if path and os.path.exists(path):
                     os.unlink(path)
-            except Exception:
-                pass
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary file '{path}': {e}")

natural_pdf/elements/base.py CHANGED Viewed

@@ -9,11 +9,13 @@ from PIL import Image
 # Import selector parsing functions
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 from natural_pdf.describe.mixin import DescribeMixin
+from natural_pdf.classification.mixin import ClassificationMixin
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.collections import ElementCollection
     from natural_pdf.elements.region import Region
+    from natural_pdf.classification.manager import ClassificationManager  # noqa: F401
 def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
@@ -413,7 +415,7 @@ class DirectionalMixin:
         return new_region
-class Element(DirectionalMixin, DescribeMixin):
+class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
     """
     Base class for all PDF elements.
@@ -432,6 +434,10 @@ class Element(DirectionalMixin, DescribeMixin):
         self._obj = obj
         self._page = page
+        # Containers for per-element metadata and analysis results (e.g., classification)
+        self.metadata: Dict[str, Any] = {}
+        # Access analysis results via self.analyses property (see below)
     @property
     def type(self) -> str:
         """Element type."""
@@ -850,6 +856,7 @@ class Element(DirectionalMixin, DescribeMixin):
         color: Optional[Union[Tuple, str]] = "red",  # Default color for single element
         label: Optional[str] = None,
         width: Optional[int] = None,  # Add width parameter
+        crop: bool = False,  # NEW: Crop to element bounds before legend
     ) -> Optional["Image.Image"]:
         """
         Show the page with only this element highlighted temporarily.
@@ -861,6 +868,8 @@ class Element(DirectionalMixin, DescribeMixin):
             color: Color to highlight this element (default: red)
             label: Optional label for this element in the legend
             width: Optional width for the output image in pixels
+            crop: If True, crop the rendered image to this element's
+                        bounding box before legends/overlays are added.
         Returns:
             PIL Image of the page with only this element highlighted, or None if error.
@@ -887,6 +896,9 @@ class Element(DirectionalMixin, DescribeMixin):
             "use_color_cycling": False,  # Explicitly false for single preview
         }
+        # Determine crop bbox
+        crop_bbox = self.bbox if crop else None
         # Check if we actually got geometry data
         if temp_highlight_data["bbox"] is None and temp_highlight_data["polygon"] is None:
             logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
@@ -901,6 +913,7 @@ class Element(DirectionalMixin, DescribeMixin):
                 width=width,  # Pass the width parameter
                 labels=labels,
                 legend_position=legend_position,
+                crop_bbox=crop_bbox,
             )
         except Exception as e:
             logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
@@ -1070,3 +1083,68 @@ class Element(DirectionalMixin, DescribeMixin):
             case=case,
             **kwargs,
         )
+    # ------------------------------------------------------------------
+    # ClassificationMixin requirements
+    # ------------------------------------------------------------------
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Access the shared ClassificationManager via the parent PDF."""
+        if (
+            not hasattr(self, "page")
+            or not hasattr(self.page, "pdf")
+            or not hasattr(self.page.pdf, "get_manager")
+        ):
+            raise AttributeError(
+                "ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing."
+            )
+        return self.page.pdf.get_manager("classification")
+    def _get_classification_content(self, model_type: str, **kwargs):  # type: ignore[override]
+        """Return either text or an image, depending on model_type (text|vision)."""
+        if model_type == "text":
+            text_content = self.extract_text(layout=False)  # type: ignore[arg-type]
+            if not text_content or text_content.isspace():
+                raise ValueError(
+                    "Cannot classify element with 'text' model: No text content found."
+                )
+            return text_content
+        elif model_type == "vision":
+            # Delegate to Region implementation via a temporary expand()
+            resolution = kwargs.get("resolution", 150)
+            from natural_pdf.elements.region import Region  # Local import to avoid cycles
+            return self.expand().to_image(
+                resolution=resolution,
+                include_highlights=False,
+                crop=True,
+            )
+        else:
+            raise ValueError(f"Unsupported model_type for classification: {model_type}")
+    # ------------------------------------------------------------------
+    # Lightweight to_image proxy (vision models, previews, etc.)
+    # ------------------------------------------------------------------
+    def to_image(self, *args, **kwargs):  # type: ignore[override]
+        """Generate an image of this element by delegating to a temporary Region."""
+        return self.expand().to_image(*args, **kwargs)
+    # ------------------------------------------------------------------
+    # Unified analysis storage (maps to metadata["analysis"])
+    # ------------------------------------------------------------------
+    @property
+    def analyses(self) -> Dict[str, Any]:
+        """Dictionary holding model-generated analysis objects (classification, extraction, …)."""
+        if not hasattr(self, "metadata") or self.metadata is None:
+            self.metadata = {}
+        return self.metadata.setdefault("analysis", {})
+    @analyses.setter
+    def analyses(self, value: Dict[str, Any]):
+        if not hasattr(self, "metadata") or self.metadata is None:
+            self.metadata = {}
+        self.metadata["analysis"] = value

natural_pdf/elements/collections.py CHANGED Viewed

@@ -852,6 +852,7 @@ class ElementCollection(
         render_ocr: bool = False,
         width: Optional[int] = None,  # Add width parameter
         page: Optional[Any] = None,  # NEW: Optional page parameter for empty collections
+        crop: bool = False,  # NEW: If True, crop output to element bounds
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -875,6 +876,9 @@ class ElementCollection(
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
             width: Optional width for the output image in pixels.
+            crop: If True, crop the resulting image to the tight bounding box
+                        containing all elements in the collection. The elements are
+                        still highlighted first, then the image is cropped.
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
@@ -931,7 +935,23 @@ class ElementCollection(
         # 2. Call render_preview on the HighlightingService
         try:
-            return service.render_preview(
+            # Calculate crop bounding box in PDF coordinates if crop is requested
+            crop_bbox = None
+            if crop:
+                try:
+                    crop_bbox = (
+                        min(el.x0 for el in self._elements),
+                        min(el.top for el in self._elements),
+                        max(el.x1 for el in self._elements),
+                        max(el.bottom for el in self._elements),
+                    )
+                except Exception as bbox_err:
+                    logger.error(
+                        f"Error determining crop bbox for collection show: {bbox_err}",
+                        exc_info=True,
+                    )
+            img = service.render_preview(
                 page_index=page.index,
                 temporary_highlights=highlight_data_list,
                 scale=scale,
@@ -939,7 +959,9 @@ class ElementCollection(
                 labels=labels,  # Use 'labels'
                 legend_position=legend_position,
                 render_ocr=render_ocr,
+                crop_bbox=crop_bbox,
             )
+            return img
         except Exception as e:
             logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
             return None

natural-pdf 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl

natural-pdf 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl