PyPI - natural-pdf - Versions diffs - 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

natural_pdf/analyzers/guides.py +159 -3
natural_pdf/collections/mixins.py +16 -3
natural_pdf/core/highlighting_service.py +33 -9
natural_pdf/core/page.py +138 -7
natural_pdf/core/page_collection.py +51 -14
natural_pdf/core/page_groupby.py +229 -0
natural_pdf/core/render_spec.py +62 -4
natural_pdf/elements/base.py +102 -20
natural_pdf/elements/element_collection.py +11 -10
natural_pdf/elements/region.py +21 -21
natural_pdf/elements/text.py +5 -0
natural_pdf/extraction/manager.py +8 -14
natural_pdf/extraction/mixin.py +35 -21
natural_pdf/selectors/parser.py +2 -2
natural_pdf/tables/result.py +37 -0
{natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/METADATA +2 -2
{natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/RECORD +23 -22
optimization/performance_analysis.py +1 -1
tools/bad_pdf_eval/analyser.py +1 -1
{natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/top_level.txt +0 -0

natural_pdf/core/page_collection.py CHANGED Viewed

@@ -259,7 +259,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         self,
         *,
         text: str,
-        contains: str = "all",
+        overlap: str = "full",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -271,7 +271,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         self,
         selector: str,
         *,
-        contains: str = "all",
+        overlap: str = "full",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -283,7 +283,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
-        contains: str = "all",
+        overlap: str = "full",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -297,9 +297,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
-            contains: How to determine if elements are inside: 'all' (fully inside),
-                     'any' (any overlap), or 'center' (center point inside).
-                     (default: "all")
+            overlap: How to determine if elements overlap: 'full' (fully inside),
+                     'partial' (any overlap), or 'center' (center point inside).
+                     (default: "full")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -313,7 +313,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
             element = page.find(
                 selector=selector,
                 text=text,
-                contains=contains,
+                overlap=overlap,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
@@ -328,7 +328,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         self,
         *,
         text: str,
-        contains: str = "all",
+        overlap: str = "full",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -340,7 +340,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         self,
         selector: str,
         *,
-        contains: str = "all",
+        overlap: str = "full",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -352,7 +352,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
-        contains: str = "all",
+        overlap: str = "full",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -366,9 +366,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
-            contains: How to determine if elements are inside: 'all' (fully inside),
-                     'any' (any overlap), or 'center' (center point inside).
-                     (default: "all")
+            overlap: How to determine if elements overlap: 'full' (fully inside),
+                     'partial' (any overlap), or 'center' (center point inside).
+                     (default: "full")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -383,7 +383,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
             elements = page.find_all(
                 selector=selector,
                 text=text,
-                contains=contains,
+                overlap=overlap,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
@@ -1247,3 +1247,40 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         from natural_pdf.core.highlighting_service import HighlightContext
         return HighlightContext(self, show_on_exit=show)
+    def groupby(self, by: Union[str, Callable], *, show_progress: bool = True) -> "PageGroupBy":
+        """
+        Group pages by selector text or callable result.
+        Args:
+            by: CSS selector string or callable function
+            show_progress: Whether to show progress bar during computation (default: True)
+        Returns:
+            PageGroupBy object supporting iteration and dict-like access
+        Examples:
+            # Group by header text
+            for title, pages in pdf.pages.groupby('text[size=16]'):
+                print(f"Section: {title}")
+            # Group by callable
+            for city, pages in pdf.pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text()):
+                process_city_pages(pages)
+            # Quick exploration with indexing
+            grouped = pdf.pages.groupby('text[size=16]')
+            grouped.info()                    # Show all groups
+            first_section = grouped[0]        # First group
+            last_section = grouped[-1]       # Last group
+            # Dict-like access by name
+            madison_pages = grouped.get('CITY OF MADISON')
+            madison_pages = grouped['CITY OF MADISON']  # Alternative
+            # Disable progress bar for small collections
+            grouped = pdf.pages.groupby('text[size=16]', show_progress=False)
+        """
+        from natural_pdf.core.page_groupby import PageGroupBy
+        return PageGroupBy(self, by, show_progress=show_progress)

natural_pdf/core/page_groupby.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""
+PageGroupBy class for grouping pages by selector text or callable results.
+"""
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+from tqdm.auto import tqdm
+if TYPE_CHECKING:
+    from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
+class PageGroupBy:
+    """
+    A groupby object for PageCollections that supports both iteration and dict-like access.
+    This class provides pandas-like groupby functionality for natural-pdf PageCollections.
+    Pages are grouped by the result of applying a selector string or callable function.
+    Supports:
+    - Direct iteration: for key, pages in grouped:
+    - Dict-like access: grouped.get(key), grouped.get_group(key)
+    - Batch operations: grouped.apply(func)
+    """
+    def __init__(
+        self,
+        page_collection: "PageCollection",
+        by: Union[str, Callable],
+        *,
+        show_progress: bool = True,
+    ):
+        """
+        Initialize the PageGroupBy object.
+        Args:
+            page_collection: The PageCollection to group
+            by: CSS selector string or callable function for grouping
+            show_progress: Whether to show progress bar during computation (default: True)
+        """
+        self.page_collection = page_collection
+        self.by = by
+        self.show_progress = show_progress
+        self._groups: Optional[Dict[Any, "PageCollection"]] = None
+    def _compute_groups(self) -> Dict[Any, "PageCollection"]:
+        """
+        Compute the groups by applying the selector/callable to each page.
+        Returns:
+            Dictionary mapping group keys to PageCollection objects
+        """
+        if self._groups is not None:
+            return self._groups
+        groups = defaultdict(list)
+        # Setup progress bar if enabled and collection is large enough
+        pages_iterator = self.page_collection.pages
+        total_pages = len(self.page_collection)
+        if self.show_progress and total_pages > 1:  # Show progress for more than 1 page
+            desc = f"Grouping by {'selector' if isinstance(self.by, str) else 'function'}"
+            pages_iterator = tqdm(pages_iterator, desc=desc, unit="pages", total=total_pages)
+        for page in pages_iterator:
+            if callable(self.by):
+                # Apply callable function
+                key = self.by(page)
+            else:
+                # Apply selector string
+                element = page.find(self.by)
+                if element:
+                    key = element.extract_text()
+                else:
+                    key = None
+            groups[key].append(page)
+        # Convert lists to PageCollections
+        from natural_pdf.core.page_collection import PageCollection
+        self._groups = {key: PageCollection(pages) for key, pages in groups.items()}
+        return self._groups
+    def __iter__(self) -> Iterator[Tuple[Any, "PageCollection"]]:
+        """
+        Support direct iteration: for key, pages in grouped:
+        Yields:
+            Tuples of (group_key, PageCollection)
+        """
+        groups = self._compute_groups()
+        return iter(groups.items())
+    def get(
+        self, key: Any, default: Optional["PageCollection"] = None
+    ) -> Optional["PageCollection"]:
+        """
+        Dict-like access to get a specific group.
+        Args:
+            key: The group key to look up
+            default: Value to return if key is not found
+        Returns:
+            PageCollection for the group, or default if not found
+        """
+        groups = self._compute_groups()
+        return groups.get(key, default)
+    def get_group(self, key: Any) -> "PageCollection":
+        """
+        Pandas-style access to get a specific group.
+        Args:
+            key: The group key to look up
+        Returns:
+            PageCollection for the group
+        Raises:
+            KeyError: If the group key is not found
+        """
+        groups = self._compute_groups()
+        if key not in groups:
+            raise KeyError(f"Group key '{key}' not found")
+        return groups[key]
+    def keys(self) -> List[Any]:
+        """
+        Get all group keys.
+        Returns:
+            List of all group keys
+        """
+        groups = self._compute_groups()
+        return list(groups.keys())
+    def __getitem__(self, index: Union[int, Any]) -> "PageCollection":
+        """
+        Access groups by index or key.
+        Args:
+            index: Integer index (0-based) or group key
+        Returns:
+            PageCollection for the specified group
+        Examples:
+            grouped = pages.groupby('text[size=16]')
+            # Access by index (useful for quick exploration)
+            first_group = grouped[0]        # First group by order
+            second_group = grouped[1]       # Second group
+            last_group = grouped[-1]        # Last group
+            # Access by key (same as .get_group())
+            madison = grouped['CITY OF MADISON']
+        """
+        groups = self._compute_groups()
+        if isinstance(index, int):
+            # Access by integer index
+            keys_list = list(groups.keys())
+            original_index = index  # Keep original for error message
+            if index < 0:
+                index = len(keys_list) + index  # Support negative indexing
+            if not (0 <= index < len(keys_list)):
+                raise IndexError(f"Group index {original_index} out of range")
+            key = keys_list[index]
+            return groups[key]
+        else:
+            # Access by key (same as get_group)
+            if index not in groups:
+                raise KeyError(f"Group key '{index}' not found")
+            return groups[index]
+    def apply(self, func: Callable[["PageCollection"], Any]) -> Dict[Any, Any]:
+        """
+        Apply a function to each group.
+        Args:
+            func: Function to apply to each PageCollection group
+        Returns:
+            Dictionary mapping group keys to function results
+        """
+        groups = self._compute_groups()
+        return {key: func(pages) for key, pages in groups.items()}
+    def show(self, **kwargs):
+        """
+        Show each group separately with headers.
+        Args:
+            **kwargs: Arguments passed to each group's show() method
+        """
+        groups = self._compute_groups()
+        for key, pages in groups.items():
+            print(f"\n--- Group: {key} ({len(pages)} pages) ---")
+            pages.show(**kwargs)
+    def __len__(self) -> int:
+        """Return the number of groups."""
+        groups = self._compute_groups()
+        return len(groups)
+    def info(self) -> None:
+        """
+        Print information about all groups.
+        Useful for quick exploration of group structure.
+        """
+        groups = self._compute_groups()
+        print(f"PageGroupBy with {len(groups)} groups:")
+        print("-" * 40)
+        for i, (key, pages) in enumerate(groups.items()):
+            key_display = f"'{key}'" if key is not None else "None"
+            print(f"[{i}] {key_display}: {len(pages)} pages")
+    def __repr__(self) -> str:
+        """String representation showing group count."""
+        groups = self._compute_groups()
+        return f"<PageGroupBy(groups={len(groups)})>"

natural_pdf/core/render_spec.py CHANGED Viewed

@@ -92,6 +92,50 @@ class Visualizable:
     _get_render_specs() to gain full image generation capabilities.
     """
+    def highlight(self, *elements, **kwargs):
+        """
+        Convenience method for highlighting elements in Jupyter/Colab.
+        This method creates a highlight context, adds the elements, and returns
+        the resulting image. It's designed for simple one-liner usage in notebooks.
+        Args:
+            *elements: Elements or element collections to highlight
+            **kwargs: Additional parameters passed to show()
+        Returns:
+            PIL Image with highlights
+        Example:
+            # Simple one-liner highlighting
+            page.highlight(left, mid, right)
+            # With custom colors
+            page.highlight(
+                (tables, 'blue'),
+                (headers, 'red'),
+                (footers, 'green')
+            )
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext
+        # Create context and add elements
+        ctx = HighlightContext(self, show_on_exit=False)
+        for element in elements:
+            if isinstance(element, tuple) and len(element) == 2:
+                # Element with color: (element, color)
+                ctx.add(element[0], color=element[1])
+            elif isinstance(element, tuple) and len(element) == 3:
+                # Element with color and label: (element, color, label)
+                ctx.add(element[0], color=element[1], label=element[2])
+            else:
+                # Just element
+                ctx.add(element)
+        # Return the image directly
+        return ctx.show(**kwargs)
     def _get_render_specs(
         self, mode: Literal["show", "render"] = "show", **kwargs
     ) -> List[RenderSpec]:
@@ -146,10 +190,11 @@ class Visualizable:
         legend_position: str = "right",
         annotate: Optional[Union[str, List[str]]] = None,
         # Layout options for multi-page/region
-        layout: Literal["stack", "grid", "single"] = "stack",
+        layout: Optional[Literal["stack", "grid", "single"]] = None,
         stack_direction: Literal["vertical", "horizontal"] = "vertical",
         gap: int = 5,
-        columns: Optional[int] = None,  # For grid layout
+        columns: Optional[int] = 6,  # For grid layout, defaults to 6 columns
+        limit: Optional[int] = 30,  # Max pages to show (default 30)
         # Cropping options
         crop: Union[bool, Literal["content"]] = False,
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
@@ -169,10 +214,11 @@ class Visualizable:
             highlights: Additional highlight groups to show
             legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
             annotate: Attribute name(s) to display on highlights (string or list)
-            layout: How to arrange multiple pages/regions
+            layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
             stack_direction: Direction for stack layout
             gap: Pixels between stacked images
-            columns: Number of columns for grid layout
+            columns: Number of columns for grid layout (defaults to 6)
+            limit: Maximum number of pages to display (default 30, None for all)
             crop: Whether to crop (True, False, or 'content' for bbox of elements)
             crop_bbox: Explicit crop bounds
             **kwargs: Additional parameters passed to rendering
@@ -184,6 +230,10 @@ class Visualizable:
         if isinstance(annotate, str):
             annotate = [annotate]
+        # Pass limit as max_pages to _get_render_specs
+        if limit is not None:
+            kwargs["max_pages"] = limit
         specs = self._get_render_specs(
             mode="show",
             color=color,
@@ -198,6 +248,14 @@ class Visualizable:
             logger.warning(f"{self.__class__.__name__}.show() generated no render specs")
             return None
+        # Determine default layout based on content and parameters
+        if layout is None:
+            # For PDFs and multi-page collections, default to grid with 6 columns
+            if len(specs) > 1:
+                layout = "grid"
+            else:
+                layout = "single"
         highlighter = self._get_highlighter()
         return highlighter.unified_render(
             specs=specs,

natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl