PyPI - natural-pdf - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

natural-pdf 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

natural_pdf/analyzers/guides.py +159 -3
natural_pdf/core/highlighting_service.py +8 -8
natural_pdf/core/page.py +135 -4
natural_pdf/core/page_collection.py +37 -0
natural_pdf/core/page_groupby.py +229 -0
natural_pdf/core/render_spec.py +18 -4
natural_pdf/elements/base.py +54 -6
natural_pdf/elements/element_collection.py +1 -0
natural_pdf/elements/region.py +2 -2
natural_pdf/elements/text.py +5 -0
natural_pdf/extraction/manager.py +8 -14
natural_pdf/extraction/mixin.py +35 -21
natural_pdf/selectors/parser.py +2 -2
natural_pdf/tables/result.py +37 -0
{natural_pdf-0.2.0.dist-info → natural_pdf-0.2.2.dist-info}/METADATA +2 -2
{natural_pdf-0.2.0.dist-info → natural_pdf-0.2.2.dist-info}/RECORD +22 -21
{natural_pdf-0.2.0.dist-info → natural_pdf-0.2.2.dist-info}/top_level.txt +0 -2
optimization/performance_analysis.py +1 -1
tools/bad_pdf_eval/analyser.py +1 -1
{natural_pdf-0.2.0.dist-info → natural_pdf-0.2.2.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.0.dist-info → natural_pdf-0.2.2.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.0.dist-info → natural_pdf-0.2.2.dist-info}/licenses/LICENSE +0 -0

natural_pdf/analyzers/guides.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import json
 import logging
 from collections import UserList
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
 import numpy as np
 from PIL import Image, ImageDraw
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
     from natural_pdf.elements.element_collection import ElementCollection
     from natural_pdf.elements.region import Region
     from natural_pdf.flows.region import FlowRegion
+    from natural_pdf.tables.result import TableResult
 logger = logging.getLogger(__name__)
@@ -131,6 +132,15 @@ class GuidesList(UserList):
         self._parent = parent_guides
         self._axis = axis
+    def __getitem__(self, i):
+        """Override to handle slicing properly."""
+        if isinstance(i, slice):
+            # Return a new GuidesList with the sliced data
+            return self.__class__(self._parent, self._axis, self.data[i])
+        else:
+            # For single index, return the value directly
+            return self.data[i]
     def from_content(
         self,
         markers: Union[str, List[str], "ElementCollection", None],
@@ -140,6 +150,7 @@ class GuidesList(UserList):
         tolerance: float = 5,
         *,
         append: bool = False,
+        apply_exclusions: bool = True,
     ) -> "Guides":
         """
         Create guides from content markers and add to this axis.
@@ -154,6 +165,7 @@ class GuidesList(UserList):
             align: How to align guides relative to found elements
             outer: Whether to add outer boundary guides
             tolerance: Tolerance for snapping to element edges
+            apply_exclusions: Whether to apply exclusion zones when searching for text
         Returns:
             Parent Guides object for chaining
@@ -178,6 +190,7 @@ class GuidesList(UserList):
                     align=align,
                     outer=outer,
                     tolerance=tolerance,
+                    apply_exclusions=apply_exclusions,
                 )
                 # Collect guides from this region
@@ -260,6 +273,7 @@ class GuidesList(UserList):
             align=align,
             outer=outer,
             tolerance=tolerance,
+            apply_exclusions=apply_exclusions,
         )
         # Replace or append based on parameter
@@ -1398,6 +1412,7 @@ class Guides:
         align: Literal["left", "right", "center", "between"] = "left",
         outer: bool = True,
         tolerance: float = 5,
+        apply_exclusions: bool = True,
     ) -> "Guides":
         """
         Create guides based on text content positions.
@@ -1413,6 +1428,7 @@ class Guides:
             align: Where to place guides relative to found text
             outer: Whether to add guides at the boundaries
             tolerance: Maximum distance to search for text
+            apply_exclusions: Whether to apply exclusion zones when searching for text
         Returns:
             New Guides object aligned to text content
@@ -1431,6 +1447,7 @@ class Guides:
                     align=align,
                     outer=outer,
                     tolerance=tolerance,
+                    apply_exclusions=apply_exclusions,
                 )
                 # Store in flow guides
@@ -1469,7 +1486,7 @@ class Guides:
         # Find each marker and determine guide position
         for marker in marker_texts:
             if hasattr(obj, "find"):
-                element = obj.find(f'text:contains("{marker}")')
+                element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
                 if element:
                     if axis == "vertical":
                         if align == "left":
@@ -1498,7 +1515,9 @@ class Guides:
             marker_bounds = []
             for marker in marker_texts:
                 if hasattr(obj, "find"):
-                    element = obj.find(f'text:contains("{marker}")')
+                    element = obj.find(
+                        f'text:contains("{marker}")', apply_exclusions=apply_exclusions
+                    )
                     if element:
                         if axis == "vertical":
                             marker_bounds.append((element.x0, element.x1))
@@ -3285,6 +3304,7 @@ class Guides:
         align: Literal["left", "right", "center", "between"] = "left",
         outer: bool = True,
         tolerance: float = 5,
+        apply_exclusions: bool = True,
     ) -> "Guides":
         """
         Instance method: Add guides from content, allowing chaining.
@@ -3301,6 +3321,7 @@ class Guides:
             align: How to align guides relative to found elements
             outer: Whether to add outer boundary guides
             tolerance: Tolerance for snapping to element edges
+            apply_exclusions: Whether to apply exclusion zones when searching for text
         Returns:
             Self for method chaining
@@ -3318,6 +3339,7 @@ class Guides:
             align=align,
             outer=outer,
             tolerance=tolerance,
+            apply_exclusions=apply_exclusions,
         )
         # Add the appropriate coordinates to this object
@@ -3421,6 +3443,140 @@ class Guides:
         return self
+    def extract_table(
+        self,
+        target: Optional[Union["Page", "Region"]] = None,
+        source: str = "guides_temp",
+        cell_padding: float = 0.5,
+        include_outer_boundaries: bool = False,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[Dict] = None,
+        cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        *,
+        multi_page: Literal["auto", True, False] = "auto",
+    ) -> "TableResult":
+        """
+        Extract table data directly from guides without leaving temporary regions.
+        This method:
+        1. Creates table structure using build_grid()
+        2. Extracts table data from the created table region
+        3. Cleans up all temporary regions
+        4. Returns the TableResult
+        Args:
+            target: Page or Region to create regions on (uses self.context if None)
+            source: Source label for temporary regions (will be cleaned up)
+            cell_padding: Internal padding for cell regions in points
+            include_outer_boundaries: Whether to add boundaries at edges if missing
+            method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
+            table_settings: Settings for pdfplumber table extraction
+            use_ocr: Whether to use OCR for text extraction
+            ocr_config: OCR configuration parameters
+            text_options: Dictionary of options for the 'text' method
+            cell_extraction_func: Optional callable for custom cell text extraction
+            show_progress: Controls progress bar for text method
+            content_filter: Content filtering function or patterns
+            multi_page: Controls multi-region table creation for FlowRegions
+        Returns:
+            TableResult: Extracted table data
+        Raises:
+            ValueError: If no table region is created from the guides
+        Example:
+            ```python
+            from natural_pdf.analyzers import Guides
+            # Create guides from detected lines
+            guides = Guides.from_lines(page, source_label="detected")
+            # Extract table directly - no temporary regions left behind
+            table_data = guides.extract_table()
+            # Convert to pandas DataFrame
+            df = table_data.to_df()
+            ```
+        """
+        target_obj = target or self.context
+        if not target_obj:
+            raise ValueError("No target object available. Provide target parameter or context.")
+        # Get the page for cleanup later
+        if hasattr(target_obj, "x0") and hasattr(target_obj, "top"):  # Region
+            page = target_obj._page
+            element_manager = page._element_mgr
+        elif hasattr(target_obj, "_element_mgr"):  # Page
+            page = target_obj
+            element_manager = page._element_mgr
+        else:
+            raise ValueError(f"Target object {target_obj} is not a Page or Region")
+        try:
+            # Step 1: Build grid structure (creates temporary regions)
+            grid_result = self.build_grid(
+                target=target_obj,
+                source=source,
+                cell_padding=cell_padding,
+                include_outer_boundaries=include_outer_boundaries,
+                multi_page=multi_page,
+            )
+            # Step 2: Get the table region and extract table data
+            table_region = grid_result["regions"]["table"]
+            if table_region is None:
+                raise ValueError(
+                    "No table region was created from the guides. Check that you have both vertical and horizontal guides."
+                )
+            # Handle multi-page case where table_region might be a list
+            if isinstance(table_region, list):
+                if not table_region:
+                    raise ValueError("No table regions were created from the guides.")
+                # Use the first table region for extraction
+                table_region = table_region[0]
+            # Step 3: Extract table data using the region's extract_table method
+            table_result = table_region.extract_table(
+                method=method,
+                table_settings=table_settings,
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                text_options=text_options,
+                cell_extraction_func=cell_extraction_func,
+                show_progress=show_progress,
+                content_filter=content_filter,
+            )
+            return table_result
+        finally:
+            # Step 4: Clean up all temporary regions created by build_grid
+            # This ensures no regions are left behind regardless of success/failure
+            try:
+                regions_to_remove = [
+                    r
+                    for r in element_manager.regions
+                    if getattr(r, "source", None) == source
+                    and getattr(r, "region_type", None)
+                    in {"table", "table_row", "table_column", "table_cell"}
+                ]
+                for region in regions_to_remove:
+                    element_manager.remove_element(region, element_type="regions")
+                if regions_to_remove:
+                    logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")
+            except Exception as cleanup_err:
+                logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
     def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
         """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
         if not self.is_flow_region or len(self.context.constituent_regions) < 2:

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -689,7 +689,7 @@ class HighlightingService:
         logger.debug(f"Added highlight to page {page_index}: {highlight}")
         # --- Invalidate page-level image cache --------------------------------
-        # The Page.to_image method maintains an internal cache keyed by rendering
+        # The Page.render method maintains an internal cache keyed by rendering
         # parameters.  Because the cache key currently does **not** incorporate
         # any information about the highlights themselves, it can return stale
         # images after highlights are added or removed.  To ensure the next
@@ -700,11 +700,11 @@ class HighlightingService:
             if hasattr(page_obj, "_to_image_cache"):
                 page_obj._to_image_cache.clear()
                 logger.debug(
-                    f"Cleared cached to_image renders for page {page_index} after adding a highlight."
+                    f"Cleared cached render images for page {page_index} after adding a highlight."
                 )
         except Exception as cache_err:  # pragma: no cover – never fail highlight creation
             logger.warning(
-                f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
+                f"Failed to invalidate render cache for page {page_index}: {cache_err}",
                 exc_info=True,
             )
@@ -737,11 +737,11 @@ class HighlightingService:
             if hasattr(page_obj, "_to_image_cache"):
                 page_obj._to_image_cache.clear()
                 logger.debug(
-                    f"Cleared cached to_image renders for page {page_index} after removing highlights."
+                    f"Cleared cached render images for page {page_index} after removing highlights."
                 )
         except Exception as cache_err:  # pragma: no cover
             logger.warning(
-                f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
+                f"Failed to invalidate render cache for page {page_index}: {cache_err}",
                 exc_info=True,
             )
@@ -760,7 +760,7 @@ class HighlightingService:
         labels: bool = True,
         legend_position: str = "right",
         render_ocr: bool = False,
-        **kwargs,  # Pass other args to pdfplumber.page.to_image if needed
+        **kwargs,  # Pass other args to pdfplumber.page.to_image if needed (internal API)
     ) -> Optional[Image.Image]:
         """
         Renders a specific page with its highlights.
@@ -773,7 +773,7 @@ class HighlightingService:
             labels: Whether to include a legend for highlights.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text on the image.
-            kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
+            kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
         Returns:
             A PIL Image object of the rendered page, or None if rendering fails.
@@ -957,7 +957,7 @@ class HighlightingService:
             crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
                 space to crop the output image to, before legends or other overlays are
                 applied. If None, no cropping is performed.
-            **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
+            **kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
         Returns:
             PIL Image of the preview, or None if rendering fails.

natural_pdf/core/page.py CHANGED Viewed

@@ -341,6 +341,26 @@ class Page(
                     for elem in elements:
                         spec.add_highlight(element=elem, color=group_color, label=group_label)
+            # Handle exclusions visualization
+            exclusions_param = kwargs.get("exclusions")
+            if exclusions_param:
+                # Get exclusion regions
+                exclusion_regions = self._get_exclusion_regions(include_callable=True)
+                if exclusion_regions:
+                    # Determine color for exclusions
+                    exclusion_color = (
+                        exclusions_param if isinstance(exclusions_param, str) else "red"
+                    )
+                    # Add exclusion regions as highlights
+                    for region in exclusion_regions:
+                        spec.add_highlight(
+                            element=region,
+                            color=exclusion_color,
+                            label=f"Exclusion: {region.label or 'unnamed'}",
+                        )
         return [spec]
     @property
@@ -391,7 +411,9 @@ class Page(
     def add_exclusion(
         self,
-        exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
+        exclusion_func_or_region: Union[
+            Callable[["Page"], "Region"], "Region", List[Any], Tuple[Any, ...], Any
+        ],
         label: Optional[str] = None,
         method: str = "region",
     ) -> "Page":
@@ -401,7 +423,8 @@ class Page(
         Args:
             exclusion_func_or_region: Either a callable function returning a Region,
-                                      a Region object, or another object with a valid .bbox attribute.
+                                      a Region object, a list/tuple of regions or elements,
+                                      or another object with a valid .bbox attribute.
             label: Optional label for this exclusion (e.g., 'header', 'footer').
             method: Exclusion method - 'region' (exclude all elements in bounding box) or
                     'element' (exclude only the specific elements). Default: 'region'.
@@ -551,10 +574,53 @@ class Page(
                     raise TypeError(
                         f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
                     ) from e
+        elif isinstance(exclusion_func_or_region, (list, tuple)):
+            # Handle lists/tuples of regions or elements
+            if not exclusion_func_or_region:
+                logger.warning(f"Page {self.index}: Empty list provided for exclusion, ignoring.")
+                return self
+            if method == "element":
+                # Store each element directly
+                for item in exclusion_func_or_region:
+                    if hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
+                        self._exclusions.append((item, label, method))
+                        logger.debug(
+                            f"Page {self.index}: Added element exclusion from list -> {item}"
+                        )
+                    else:
+                        logger.warning(
+                            f"Page {self.index}: Skipping item without valid bbox in list: {item}"
+                        )
+            else:  # method == "region"
+                # Convert each item to a Region and add
+                for item in exclusion_func_or_region:
+                    try:
+                        if isinstance(item, Region):
+                            item.label = label
+                            self._exclusions.append((item, label, method))
+                            logger.debug(f"Page {self.index}: Added Region from list: {item}")
+                        elif hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
+                            bbox_coords = tuple(float(v) for v in item.bbox)
+                            region = Region(self, bbox_coords, label=label)
+                            self._exclusions.append((region, label, method))
+                            logger.debug(
+                                f"Page {self.index}: Added exclusion region from list item {bbox_coords}"
+                            )
+                        else:
+                            logger.warning(
+                                f"Page {self.index}: Skipping item without valid bbox in list: {item}"
+                            )
+                    except Exception as e:
+                        logger.error(
+                            f"Page {self.index}: Failed to convert list item to Region: {e}"
+                        )
+                        continue
+            return self
         else:
             # Reject invalid types
             raise TypeError(
-                f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
+                f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, list/tuple of regions/elements, or have a valid .bbox attribute."
             )
         # Append the stored data (tuple of object/callable, label, and method)
@@ -668,6 +734,46 @@ class Page(
                         regions.append(region_result)
                         if debug:
                             print(f"    ✓ Added region from callable '{label}': {region_result}")
+                    elif hasattr(region_result, "__iter__") and hasattr(region_result, "__len__"):
+                        # Handle ElementCollection or other iterables
+                        from natural_pdf.elements.element_collection import ElementCollection
+                        if isinstance(region_result, ElementCollection) or (
+                            hasattr(region_result, "__iter__") and region_result
+                        ):
+                            if debug:
+                                print(
+                                    f"    Converting {type(region_result)} with {len(region_result)} elements to regions..."
+                                )
+                            # Convert each element to a region
+                            for elem in region_result:
+                                try:
+                                    if hasattr(elem, "bbox") and len(elem.bbox) == 4:
+                                        bbox_coords = tuple(float(v) for v in elem.bbox)
+                                        region = Region(self, bbox_coords, label=label)
+                                        regions.append(region)
+                                        if debug:
+                                            print(
+                                                f"      ✓ Added region from element: {bbox_coords}"
+                                            )
+                                    else:
+                                        if debug:
+                                            print(
+                                                f"      ✗ Skipping element without valid bbox: {elem}"
+                                            )
+                                except Exception as e:
+                                    if debug:
+                                        print(f"      ✗ Failed to convert element to region: {e}")
+                                    continue
+                            if debug and len(region_result) > 0:
+                                print(
+                                    f"    ✓ Converted {len(region_result)} elements from callable '{label}'"
+                                )
+                        else:
+                            if debug:
+                                print(f"    ✗ Empty iterable returned from callable '{label}'")
                     elif region_result:
                         logger.warning(
                             f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
@@ -1013,6 +1119,22 @@ class Page(
                         "Cannot sort elements in reading order: Missing required attributes (top, x0)."
                     )
+            # Handle collection-level pseudo-classes (:first, :last) for OR selectors
+            # Note: We only apply :first/:last if they appear in any of the sub-selectors
+            has_first = False
+            has_last = False
+            for sub_selector in selector_obj.get("selectors", []):
+                for pseudo in sub_selector.get("pseudo_classes", []):
+                    if pseudo.get("name") == "first":
+                        has_first = True
+                    elif pseudo.get("name") == "last":
+                        has_last = True
+            if has_first:
+                matching_elements = matching_elements[:1] if matching_elements else []
+            elif has_last:
+                matching_elements = matching_elements[-1:] if matching_elements else []
             # Return result collection
             return ElementCollection(matching_elements)
@@ -1134,6 +1256,15 @@ class Page(
                     "Cannot sort elements in reading order: Missing required attributes (top, x0)."
                 )
+        # Handle collection-level pseudo-classes (:first, :last)
+        for pseudo in selector_obj.get("pseudo_classes", []):
+            name = pseudo.get("name")
+            if name == "first":
+                matching_elements = matching_elements[:1] if matching_elements else []
+            elif name == "last":
+                matching_elements = matching_elements[-1:] if matching_elements else []
         # Create result collection - exclusions are handled by the calling methods (find, find_all)
         result = ElementCollection(matching_elements)
@@ -1944,7 +2075,7 @@ class Page(
             render_ocr: Whether to render OCR text.
             include_highlights: Whether to render highlights.
             resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
-            **kwargs: Additional args for pdfplumber's to_image.
+            **kwargs: Additional args for pdfplumber's internal to_image.
         Returns:
             Self for method chaining.

natural_pdf/core/page_collection.py CHANGED Viewed

@@ -1247,3 +1247,40 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         from natural_pdf.core.highlighting_service import HighlightContext
         return HighlightContext(self, show_on_exit=show)
+    def groupby(self, by: Union[str, Callable], *, show_progress: bool = True) -> "PageGroupBy":
+        """
+        Group pages by selector text or callable result.
+        Args:
+            by: CSS selector string or callable function
+            show_progress: Whether to show progress bar during computation (default: True)
+        Returns:
+            PageGroupBy object supporting iteration and dict-like access
+        Examples:
+            # Group by header text
+            for title, pages in pdf.pages.groupby('text[size=16]'):
+                print(f"Section: {title}")
+            # Group by callable
+            for city, pages in pdf.pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text()):
+                process_city_pages(pages)
+            # Quick exploration with indexing
+            grouped = pdf.pages.groupby('text[size=16]')
+            grouped.info()                    # Show all groups
+            first_section = grouped[0]        # First group
+            last_section = grouped[-1]       # Last group
+            # Dict-like access by name
+            madison_pages = grouped.get('CITY OF MADISON')
+            madison_pages = grouped['CITY OF MADISON']  # Alternative
+            # Disable progress bar for small collections
+            grouped = pdf.pages.groupby('text[size=16]', show_progress=False)
+        """
+        from natural_pdf.core.page_groupby import PageGroupBy
+        return PageGroupBy(self, by, show_progress=show_progress)

natural-pdf 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

natural-pdf 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl