PyPI - natural-pdf - Versions diffs - 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

natural_pdf/__init__.py +31 -0
natural_pdf/analyzers/layout/gemini.py +137 -162
natural_pdf/analyzers/layout/layout_manager.py +9 -5
natural_pdf/analyzers/layout/layout_options.py +77 -7
natural_pdf/analyzers/layout/paddle.py +318 -165
natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
natural_pdf/analyzers/shape_detection_mixin.py +770 -405
natural_pdf/classification/mixin.py +2 -8
natural_pdf/collections/pdf_collection.py +25 -30
natural_pdf/core/highlighting_service.py +47 -32
natural_pdf/core/page.py +119 -76
natural_pdf/core/pdf.py +19 -22
natural_pdf/describe/__init__.py +21 -0
natural_pdf/describe/base.py +457 -0
natural_pdf/describe/elements.py +411 -0
natural_pdf/describe/mixin.py +84 -0
natural_pdf/describe/summary.py +186 -0
natural_pdf/elements/base.py +11 -10
natural_pdf/elements/collections.py +116 -51
natural_pdf/elements/region.py +204 -127
natural_pdf/exporters/paddleocr.py +38 -13
natural_pdf/flows/__init__.py +3 -3
natural_pdf/flows/collections.py +303 -132
natural_pdf/flows/element.py +277 -132
natural_pdf/flows/flow.py +33 -16
natural_pdf/flows/region.py +142 -79
natural_pdf/ocr/engine_doctr.py +37 -4
natural_pdf/ocr/engine_easyocr.py +23 -3
natural_pdf/ocr/engine_paddle.py +281 -30
natural_pdf/ocr/engine_surya.py +8 -3
natural_pdf/ocr/ocr_manager.py +75 -76
natural_pdf/ocr/ocr_options.py +52 -87
natural_pdf/search/__init__.py +25 -12
natural_pdf/search/lancedb_search_service.py +91 -54
natural_pdf/search/numpy_search_service.py +86 -65
natural_pdf/search/searchable_mixin.py +2 -2
natural_pdf/selectors/parser.py +125 -81
natural_pdf/widgets/__init__.py +1 -1
natural_pdf/widgets/viewer.py +205 -449
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0

natural_pdf/elements/base.py CHANGED Viewed

@@ -8,6 +8,7 @@ from PIL import Image
 # Import selector parsing functions
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.describe.mixin import DescribeMixin
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
@@ -18,34 +19,34 @@ if TYPE_CHECKING:
 def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
     """
     Extract bounding box coordinates from any object that has bbox properties.
     Args:
         obj: Object that might have bbox coordinates (Element, Region, etc.)
     Returns:
         Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
     """
     # Try bbox property first (most common)
-    if hasattr(obj, 'bbox') and obj.bbox is not None:
+    if hasattr(obj, "bbox") and obj.bbox is not None:
         bbox = obj.bbox
         if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
             return tuple(float(coord) for coord in bbox)
     # Try individual coordinate properties
-    if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
+    if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
         try:
             return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
         except (ValueError, TypeError):
             pass
     # If object is a dict with bbox keys
     if isinstance(obj, dict):
-        if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
+        if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
             try:
-                return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
+                return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
             except (ValueError, TypeError):
                 pass
     return None
@@ -412,7 +413,7 @@ class DirectionalMixin:
         return new_region
-class Element(DirectionalMixin):
+class Element(DirectionalMixin, DescribeMixin):
     """
     Base class for all PDF elements.

natural_pdf/elements/collections.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 from collections.abc import MutableSequence
 from pathlib import Path
@@ -18,7 +19,6 @@ from typing import (
     Union,
     overload,
 )
-import hashlib
 from pdfplumber.utils.geometry import objects_to_bbox
@@ -27,8 +27,10 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
 from PIL import Image, ImageDraw, ImageFont
 from tqdm.auto import tqdm
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
 from natural_pdf.classification.manager import ClassificationManager
 from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
 from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.base import Element
@@ -38,8 +40,6 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
-from tqdm.auto import tqdm
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -65,14 +65,21 @@ if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
-    from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
+    from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
 class ElementCollection(
-    Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+    Generic[T],
+    ApplyMixin,
+    ExportMixin,
+    ClassificationMixin,
+    DirectionalCollectionMixin,
+    DescribeMixin,
+    InspectMixin,
+    MutableSequence,
 ):
     """
     Collection of PDF elements with batch operations.
@@ -844,6 +851,7 @@ class ElementCollection(
         legend_position: str = "right",
         render_ocr: bool = False,
         width: Optional[int] = None,  # Add width parameter
+        page: Optional[Any] = None,  # NEW: Optional page parameter for empty collections
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -1590,13 +1598,13 @@ class ElementCollection(
     def to_text_elements(
         self,
-        text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
         source_label: str = "derived_from_region",
         object_type: str = "word",
         default_font_size: float = 10.0,
         default_font_name: str = "RegionContent",
         confidence: Optional[float] = None,
-        add_to_page: bool = False # Default is False
+        add_to_page: bool = False,  # Default is False
     ) -> "ElementCollection[TextElement]":
         """
         Converts each Region in this collection to a TextElement.
@@ -1610,95 +1618,150 @@ class ElementCollection(
             default_font_size: Placeholder font size.
             default_font_name: Placeholder font name.
             confidence: Confidence score.
-            add_to_page: If True (default is False), also adds the created
+            add_to_page: If True (default is False), also adds the created
                          TextElements to their respective page's element manager.
         Returns:
             A new ElementCollection containing the created TextElement objects.
         """
-        from natural_pdf.elements.region import Region # Local import for type checking if needed or to resolve circularity
-        from natural_pdf.elements.text import TextElement # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
+        from natural_pdf.elements.region import (  # Local import for type checking if needed or to resolve circularity
+            Region,
+        )
+        from natural_pdf.elements.text import (  # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
+            TextElement,
+        )
         new_text_elements: List["TextElement"] = []
-        if not self.elements: # Accesses self._elements via property
+        if not self.elements:  # Accesses self._elements via property
             return ElementCollection([])
         page_context_for_adding: Optional["Page"] = None
         if add_to_page:
             # Try to determine a consistent page context if adding elements
             first_valid_region_with_page = next(
-                (el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
-                None
+                (
+                    el
+                    for el in self.elements
+                    if isinstance(el, Region) and hasattr(el, "page") and el.page is not None
+                ),
+                None,
             )
             if first_valid_region_with_page:
                 page_context_for_adding = first_valid_region_with_page.page
             else:
-                logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
-                add_to_page = False # Disable adding if no valid page context can be determined
+                logger.warning(
+                    "Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None."
+                )
+                add_to_page = False  # Disable adding if no valid page context can be determined
-        for element in self.elements: # Accesses self._elements via property/iterator
+        for element in self.elements:  # Accesses self._elements via property/iterator
             if isinstance(element, Region):
                 text_el = element.to_text_element(
-                    text_content=text_content_func,
+                    text_content=text_content_func,
                     source_label=source_label,
                     object_type=object_type,
                     default_font_size=default_font_size,
                     default_font_name=default_font_name,
-                    confidence=confidence
+                    confidence=confidence,
                 )
                 new_text_elements.append(text_el)
                 if add_to_page:
-                    if not hasattr(text_el, 'page') or text_el.page is None:
-                        logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
+                    if not hasattr(text_el, "page") or text_el.page is None:
+                        logger.warning(
+                            f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page."
+                        )
                         continue
                     if page_context_for_adding and text_el.page == page_context_for_adding:
-                        if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
-                            add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
-                            page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
+                        if (
+                            hasattr(page_context_for_adding, "_element_mgr")
+                            and page_context_for_adding._element_mgr is not None
+                        ):
+                            add_as_type = (
+                                "words"
+                                if object_type == "word"
+                                else "chars" if object_type == "char" else object_type
+                            )
+                            page_context_for_adding._element_mgr.add_element(
+                                text_el, element_type=add_as_type
+                            )
                         else:
-                            page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
-                            logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
+                            page_num_str = (
+                                str(page_context_for_adding.page_number)
+                                if hasattr(page_context_for_adding, "page_number")
+                                else "N/A"
+                            )
+                            logger.error(
+                                f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement."
+                            )
                     elif page_context_for_adding and text_el.page != page_context_for_adding:
-                        current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
-                        context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
-                        logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
-                                       f"not added as it's different from collection's inferred page context {context_page_num_str}.")
-                    elif not page_context_for_adding:
-                        logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
+                        current_page_num_str = (
+                            str(text_el.page.page_number)
+                            if hasattr(text_el.page, "page_number")
+                            else "Unknown"
+                        )
+                        context_page_num_str = (
+                            str(page_context_for_adding.page_number)
+                            if hasattr(page_context_for_adding, "page_number")
+                            else "N/A"
+                        )
+                        logger.warning(
+                            f"TextElement for region {element.bbox} from page {current_page_num_str} "
+                            f"not added as it's different from collection's inferred page context {context_page_num_str}."
+                        )
+                    elif not page_context_for_adding:
+                        logger.warning(
+                            f"TextElement for region {element.bbox} created, but no page context was determined for adding."
+                        )
             else:
                 logger.warning(f"Skipping element {type(element)}, not a Region.")
         if add_to_page and page_context_for_adding:
-            page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
-            logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
-        elif add_to_page and not page_context_for_adding:
-             logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
-        else: # add_to_page is False
+            page_num_str = (
+                str(page_context_for_adding.page_number)
+                if hasattr(page_context_for_adding, "page_number")
+                else "N/A"
+            )
+            logger.info(
+                f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}."
+            )
+        elif add_to_page and not page_context_for_adding:
+            logger.info(
+                f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent."
+            )
+        else:  # add_to_page is False
             logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
         return ElementCollection(new_text_elements)
-    def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
+    def trim(
+        self,
+        padding: int = 1,
+        threshold: float = 0.95,
+        resolution: float = 150,
+        show_progress: bool = True,
+    ) -> "ElementCollection":
         """
         Trim visual whitespace from each region in the collection.
         Applies the trim() method to each element in the collection,
         returning a new collection with the trimmed regions.
         Args:
             padding: Number of pixels to keep as padding after trimming (default: 1)
             threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
             resolution: Resolution for image rendering in DPI (default: 150)
             show_progress: Whether to show a progress bar for the trimming operation
         Returns:
             New ElementCollection with trimmed regions
         """
         return self.apply(
-            lambda element: element.trim(padding=padding, threshold=threshold, resolution=resolution),
-            show_progress=show_progress
+            lambda element: element.trim(
+                padding=padding, threshold=threshold, resolution=resolution
+            ),
+            show_progress=show_progress,
         )
     def clip(
@@ -1711,27 +1774,27 @@ class ElementCollection(
     ) -> "ElementCollection":
         """
         Clip each element in the collection to the specified bounds.
         This method applies the clip operation to each individual element,
         returning a new collection with the clipped elements.
         Args:
             obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
             left: Optional left boundary (x0) to clip to
-            top: Optional top boundary to clip to
+            top: Optional top boundary to clip to
             right: Optional right boundary (x1) to clip to
             bottom: Optional bottom boundary to clip to
         Returns:
             New ElementCollection containing the clipped elements
         Examples:
             # Clip each element to another region's bounds
             clipped_elements = collection.clip(container_region)
             # Clip each element to specific coordinates
             clipped_elements = collection.clip(left=100, right=400)
             # Mix object bounds with specific overrides
             clipped_elements = collection.clip(obj=container, bottom=page.height/2)
         """
@@ -1740,6 +1803,8 @@ class ElementCollection(
         )
 class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
     """
     Represents a collection of Page objects, often from a single PDF document.

natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl