natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +21 -21
- natural_pdf/core/pdf.py +77 -24
- natural_pdf/elements/collections.py +164 -40
- natural_pdf/elements/region.py +90 -40
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/elements/collections.py
CHANGED
@@ -11,6 +11,7 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -40,6 +41,7 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -66,6 +68,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
     from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
+    from natural_pdf.flows.flow import Flow
 
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
@@ -1416,7 +1419,7 @@ class ElementCollection(
 
     def correct_ocr(
         self,
-
+        transform: Callable[[Any], Optional[str]],
         max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
@@ -1425,10 +1428,10 @@ class ElementCollection(
         in parallel if `max_workers` is specified.
 
         Iterates through elements currently in the collection. If an element's
-        'source' attribute starts with 'ocr', it calls the `
+        'source' attribute starts with 'ocr', it calls the `transform`
         for that element, passing the element itself.
 
-        The `
+        The `transform` should contain the logic to:
         1. Determine if the element needs correction.
         2. Perform the correction (e.g., call an LLM).
         3. Return the new text (`str`) or `None`.
@@ -1438,8 +1441,8 @@ class ElementCollection(
         Elements without a source starting with 'ocr' are skipped.
 
         Args:
-
-
+            transform: A function accepting an element and returning
+                `Optional[str]` (new text or None).
             max_workers: The maximum number of worker threads to use for parallel
                 correction on each page. If None, defaults are used.
 
@@ -1449,7 +1452,7 @@ class ElementCollection(
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
             elements=self._elements,
-            correction_callback=
+            correction_callback=transform,
             caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
             max_workers=max_workers,
         )
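In 0.1.40 the callback parameter of `ElementCollection.correct_ocr` is named `transform` (0.1.38 expected `correction_callback`). A minimal usage sketch; the file name and the clean-up rule are illustrative, not part of the release:

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical input file
page = pdf.pages[0]
page.apply_ocr()  # populate text elements whose source starts with 'ocr'

def fix_common_ocr_errors(element):
    """Return corrected text, or None to leave the element unchanged."""
    text = element.text or ""
    cleaned = text.replace("|", "I")
    return cleaned if cleaned != text else None

# New keyword name in 0.1.40; returns the collection for chaining.
page.find_all("text[source=ocr]").correct_ocr(transform=fix_common_ocr_errors, max_workers=4)
```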
@@ -2045,7 +2048,7 @@ class ElementCollection(
     # ------------------------------------------------------------------
 
 
-class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
     """
     Represents a collection of Page objects, often from a single PDF document.
     Provides methods for batch operations on these pages.
@@ -2363,22 +2366,24 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         return ElementCollection(all_elements)
 
-    def
+    def update_text(
         self,
-
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
-        Applies corrections to
+        Applies corrections to text elements across all pages
         in this collection using a user-provided callback function, executed
         in parallel if `max_workers` is specified.
 
-        This method delegates to the parent PDF's `
+        This method delegates to the parent PDF's `update_text` method,
         targeting all pages within this collection.
 
         Args:
-
-
+            transform: A function that accepts a single argument (an element
+                object) and returns `Optional[str]` (new text or None).
+            selector: The attribute name to update. Default is 'text'.
             max_workers: The maximum number of worker threads to use for parallel
                 correction on each page. If None, defaults are used.
 
@@ -2387,10 +2392,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
-                or the parent PDF lacks the `
+                or the parent PDF lacks the `update_text` method.
         """
         if not self.pages:
-            logger.warning("Cannot
+            logger.warning("Cannot update text for an empty PageCollection.")
            # Return self even if empty to maintain chaining consistency
            return self
 
@@ -2398,24 +2403,25 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         parent_pdf = self.pages[0]._parent
         if (
             not parent_pdf
-            or not hasattr(parent_pdf, "
-            or not callable(parent_pdf.
+            or not hasattr(parent_pdf, "update_text")
+            or not callable(parent_pdf.update_text)
         ):
             raise RuntimeError(
-                "Parent PDF reference not found or parent PDF lacks the required '
+                "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
             )
 
         page_indices = self._get_page_indices()
         logger.info(
-            f"PageCollection: Delegating
+            f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
         )
 
         # Delegate the call to the parent PDF object for the relevant pages
         # Pass the max_workers parameter down
-        parent_pdf.
-
+        parent_pdf.update_text(
+            transform=transform,
             pages=page_indices,
-
+            selector=selector,
+            max_workers=max_workers,
         )
 
         return self
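A sketch of the new `PageCollection.update_text` call, which forwards `transform`, `selector`, and `max_workers` to the parent PDF's `update_text`; the document name and the normalisation rule are illustrative:

```python
from typing import Optional

import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical input file

def normalize_whitespace(element) -> Optional[str]:
    """Collapse runs of whitespace; return None to leave the element unchanged."""
    if not element.text:
        return None
    collapsed = " ".join(element.text.split())
    return collapsed if collapsed != element.text else None

# Delegates to parent_pdf.update_text(transform=..., pages=..., selector=..., max_workers=...)
pdf.pages[0:5].update_text(transform=normalize_whitespace, selector="text", max_workers=2)
```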
@@ -2431,13 +2437,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Extract sections from a page collection based on start/end elements.
 
         Args:
-            start_elements: Elements or selector string that mark the start of sections
-            end_elements: Elements or selector string that mark the end of sections
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
             boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
 
         Returns:
             List of Region objects representing the extracted sections
+
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of page)
+            - With only end_elements: sections go from beginning of document/page to each end
+            - With both: sections go from each start to the corresponding end
         """
         # Find start and end elements across all pages
         if isinstance(start_elements, str):
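As the new docstring Note describes, `get_sections` now accepts either boundary on its own. A hedged sketch of the end-only case; the document and selector strings are illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("minutes.pdf")  # hypothetical document

# End-only: each section runs from the start of the document (or the previous
# end marker) down to the matching end marker.
sections = pdf.pages.get_sections(
    end_elements='text:contains("Adjourned")',
    boundary_inclusion="end",
)

# Start and end together, as before.
sections = pdf.pages.get_sections(
    start_elements='text:contains("Call to order")',
    end_elements='text:contains("Adjourned")',
)
```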
@@ -2446,8 +2458,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         if isinstance(end_elements, str):
             end_elements = self.find_all(end_elements).elements
 
-        # If no start elements, return empty list
-        if not start_elements:
+        # If no start elements and no end elements, return empty list
+        if not start_elements and not end_elements:
             return []
 
         # If there are page break boundaries, we'll need to add them
@@ -2482,6 +2494,26 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         # Sort by page index, then vertical position, then horizontal position
         all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
 
+        # If we only have end_elements (no start_elements), create implicit start elements
+        if not start_elements and end_elements:
+            from natural_pdf.elements.region import Region
+
+            start_elements = []
+
+            # Add implicit start at the beginning of the first page
+            first_page = self.pages[0]
+            first_start = Region(first_page, (0, 0, first_page.width, 1))
+            first_start.is_implicit_start = True
+            start_elements.append(first_start)
+
+            # For each end element (except the last), add an implicit start after it
+            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
+                # Create implicit start element right after this end element
+                implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
+                implicit_start.is_implicit_start = True
+                start_elements.append(implicit_start)
+
         # Mark section boundaries
         section_boundaries = []
 
@@ -2507,6 +2539,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                         "page_idx": element.page.index,
                     }
                 )
+            elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
+                # This is an implicit start element
+                section_boundaries.append(
+                    {
+                        "index": -2, # Special index for implicit starts
+                        "element": element,
+                        "type": "start",
+                        "page_idx": element.page.index,
+                    }
+                )
 
         # Add end element boundaries if provided
         if end_elements:
@@ -2533,12 +2575,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 )
 
         # Sort boundaries by page index, then by actual document position
-
-
-
-
-
-
+        def _sort_key(boundary):
+            """Sort boundaries by (page_idx, vertical_top, priority)."""
+            page_idx = boundary["page_idx"]
+            element = boundary["element"]
+
+            # Vertical position on the page
+            y_pos = getattr(element, "top", 0.0)
+
+            # Ensure starts come before ends at the same coordinate
+            priority = 0 if boundary["type"] == "start" else 1
+
+            return (page_idx, y_pos, priority)
+
+        section_boundaries.sort(key=_sort_key)
 
         # Generate sections
         sections = []
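The replacement `_sort_key` orders boundaries by page, then vertical position, with starts winning ties against ends at the same coordinate. A standalone illustration of that ordering with hand-made boundary dicts (not library code):

```python
def _sort_key(boundary):
    priority = 0 if boundary["type"] == "start" else 1
    return (boundary["page_idx"], getattr(boundary["element"], "top", 0.0), priority)

class Stub:
    def __init__(self, top):
        self.top = top

boundaries = [
    {"type": "end", "page_idx": 0, "element": Stub(120.0)},
    {"type": "start", "page_idx": 0, "element": Stub(120.0)},
    {"type": "start", "page_idx": 1, "element": Stub(40.0)},
]

boundaries.sort(key=_sort_key)
print([b["type"] for b in boundaries])  # ['start', 'end', 'start']: the start wins the tie at y=120
```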
@@ -2558,8 +2608,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             end_pg = end_el.page if end_el is not None else self.pages[-1]
 
             parts: list[Region] = []
-
-
+
+            # Use the actual top of the start element (for implicit starts this is
+            # the bottom of the previous end element) instead of forcing to 0.
+            start_top = start_el.top
+
+            # Slice of first page beginning at *start_top*
+            parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
 
             # Full middle pages
             for pg_idx in range(start_pg.index + 1, end_pg.index):
@@ -2597,9 +2652,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
             # If both elements are on the same page, use the page's get_section_between
             if start_element.page == end_element.page:
-
-
-
+                # For implicit start elements, create a region from the top of the page
+                if hasattr(start_element, "is_implicit_start"):
+                    from natural_pdf.elements.region import Region
+                    section = Region(
+                        start_element.page,
+                        (0, start_element.top, start_element.page.width, end_element.bottom)
+                    )
+                    section.start_element = start_element
+                    section.boundary_element_found = end_element
+                else:
+                    section = start_element.page.get_section_between(
+                        start_element, end_element, boundary_inclusion
+                    )
                 sections.append(section)
             else:
                 # Create FlowRegion spanning pages
@@ -2638,9 +2703,11 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 from natural_pdf.elements.region import Region
 
                 start_page = start_element.page
-
+
+                # Handle implicit start elements
+                start_top = start_element.top
                 region = Region(
-                    start_page, (0,
+                    start_page, (0, start_top, start_page.width, start_page.height)
                 )
                 region.start_element = start_element
                 sections.append(region)
@@ -2667,8 +2734,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             # With start_elements only, create a section to the end of the current page
             from natural_pdf.elements.region import Region
 
+            # Handle implicit start elements
+            start_top = start_element.top
             region = Region(
-                start_page, (0,
+                start_page, (0, start_top, start_page.width, start_page.height)
             )
             region.start_element = start_element
             sections.append(region)
@@ -3181,6 +3250,61 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
 
+    def to_flow(
+        self,
+        arrangement: Literal["vertical", "horizontal"] = "vertical",
+        alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
+        segment_gap: float = 0.0,
+    ) -> "Flow":
+        """
+        Convert this PageCollection to a Flow for cross-page operations.
+
+        This enables treating multiple pages as a continuous logical document
+        structure, useful for multi-page tables, articles spanning columns,
+        or any content requiring reading order across page boundaries.
+
+        Args:
+            arrangement: Primary flow direction ('vertical' or 'horizontal').
+                'vertical' stacks pages top-to-bottom (most common).
+                'horizontal' arranges pages left-to-right.
+            alignment: Cross-axis alignment for pages of different sizes:
+                For vertical: 'left'/'start', 'center', 'right'/'end'
+                For horizontal: 'top'/'start', 'center', 'bottom'/'end'
+            segment_gap: Virtual gap between pages in PDF points (default: 0.0).
+
+        Returns:
+            Flow object that can perform operations across all pages in sequence.
+
+        Example:
+            Multi-page table extraction:
+            ```python
+            pdf = npdf.PDF("multi_page_report.pdf")
+
+            # Create flow for pages 2-4 containing a table
+            table_flow = pdf.pages[1:4].to_flow()
+
+            # Extract table as if it were continuous
+            table_data = table_flow.extract_table()
+            df = table_data.df
+            ```
+
+            Cross-page element search:
+            ```python
+            # Find all headers across multiple pages
+            headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
+
+            # Analyze layout across pages
+            regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
+            ```
+        """
+        from natural_pdf.flows.flow import Flow
+        return Flow(
+            segments=self, # Flow constructor now handles PageCollection
+            arrangement=arrangement,
+            alignment=alignment,
+            segment_gap=segment_gap,
+        )
+
     # Alias .to_image() to .show() for convenience
     def show(
         self,
natural_pdf/elements/region.py
CHANGED
@@ -21,6 +21,7 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
 from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 
 # ------------------------------------------------------------------
 # Table utilities
@@ -56,7 +57,12 @@ logger = logging.getLogger(__name__)
 
 
 class Region(
-
+    TextMixin,
+    DirectionalMixin,
+    ClassificationMixin,
+    ExtractionMixin,
+    ShapeDetectionMixin,
+    DescribeMixin,
 ):
     """Represents a rectangular region on a page.
 
@@ -1610,8 +1616,47 @@ class Region(
         table_settings.setdefault("join_x_tolerance", join)
         table_settings.setdefault("join_y_tolerance", join)
 
-        #
-
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering, if any exclusions are
+        # defined on the parent Page. We create a lightweight
+        # pdfplumber.Page copy whose .chars list omits characters that
+        # fall inside any exclusion Region. Other object types are
+        # left untouched for now ("chars-only" strategy).
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+
+        if getattr(self.page, "_exclusions", None):
+            # Resolve exclusion Regions (callables already evaluated)
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+
+            def _keep_char(obj):
+                """Return True if pdfplumber obj should be kept."""
+                if obj.get("object_type") != "char":
+                    # Keep non-char objects unchanged – lattice grids etc.
+                    return True
+
+                # Compute character centre point
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+
+                # Reject if the centre lies inside ANY exclusion Region
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                # Fallback – if filtering fails, log and proceed unfiltered
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+
+        cropped = filtered_page.crop(self.bbox)
 
         # Extract all tables from the cropped area
         tables = cropped.extract_tables(table_settings)
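From the caller's side, this means characters inside page exclusions no longer leak into tables extracted from a region. A hedged sketch assuming the usual `add_exclusion` / `region` helpers; the file name, coordinates, and exclusion choice are illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("report_with_footers.pdf")  # hypothetical input
page = pdf.pages[0]

# Exclude a repeating footer band (exclusions may also be callables evaluated per page).
page.add_exclusion(page.region(left=0, top=page.height - 40, right=page.width, bottom=page.height))

# Table extraction scoped to a region now crops a char-filtered copy of the
# underlying pdfplumber page, so footer characters cannot end up in the cells.
table_region = page.region(left=0, top=100, right=page.width, bottom=page.height - 60)
rows = table_region.extract_table()
```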
@@ -1672,8 +1717,38 @@ class Region(
         if y_tol is not None:
             table_settings.setdefault("text_y_tolerance", y_tol)
 
-        #
-
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering (chars only) just like in
+        # _extract_tables_plumber so header/footer text does not appear
+        # in extracted tables.
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+
+        if getattr(self.page, "_exclusions", None):
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+
+            def _keep_char(obj):
+                if obj.get("object_type") != "char":
+                    return True
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+
+        # Now crop the (possibly filtered) page to the region bbox
+        cropped = filtered_page.crop(self.bbox)
 
         # Extract the single largest table from the cropped area
         table = cropped.extract_table(table_settings)
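The chars-only predicate used in both code paths, shown in isolation: every non-char object passes through, and a char is dropped only when its centre falls inside an exclusion box. Standalone illustration with hand-made objects, not the library's own test code:

```python
# Exclusion boxes as (x0, top, x1, bottom) tuples for the illustration.
exclusions = [(0.0, 750.0, 612.0, 792.0)]  # a footer band on a US Letter page

def keep(obj: dict) -> bool:
    if obj.get("object_type") != "char":
        return True  # lines/rects used for lattice detection pass through untouched
    cx = (obj["x0"] + obj["x1"]) / 2.0
    cy = (obj["top"] + obj["bottom"]) / 2.0
    return not any(x0 <= cx <= x1 and top <= cy <= bottom for x0, top, x1, bottom in exclusions)

body_char = {"object_type": "char", "x0": 100, "x1": 106, "top": 300, "bottom": 310}
footer_char = {"object_type": "char", "x0": 100, "x1": 106, "top": 760, "bottom": 770}
rect = {"object_type": "rect", "x0": 0, "x1": 612, "top": 0, "bottom": 792}

print(keep(body_char), keep(footer_char), keep(rect))  # True False True
```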
@@ -3007,45 +3082,20 @@ class Region(
         source_info = f" source='{self.source}'" if self.source else ""
         return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
 
-    def
+    def update_text(
         self,
-
-
-        ""
-
-
-
-        Finds text elements within this region whose 'source' attribute starts
-        with 'ocr' and calls the `correction_callback` for each, passing the
-        element itself.
-
-        The `correction_callback` should contain the logic to:
-        1. Determine if the element needs correction.
-        2. Perform the correction (e.g., call an LLM).
-        3. Return the new text (`str`) or `None`.
-
-        If the callback returns a string, the element's `.text` is updated.
-        Metadata updates (source, confidence, etc.) should happen within the callback.
-
-        Args:
-            correction_callback: A function accepting an element and returning
-                `Optional[str]` (new text or None).
+        transform: Callable[[Any], Optional[str]],
+        *,
+        selector: str = "text",
+        apply_exclusions: bool = False,
+    ) -> "Region":
+        """Apply *transform* to every text element matched by *selector* inside this region.
 
-
-
+        The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
+        override simply ensures the search is scoped to the region.
         """
-        # Find OCR elements specifically within this region
-        # Note: We typically want to correct even if the element falls in an excluded area
-        target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
 
-
-        _apply_ocr_correction_to_elements(
-            elements=target_elements, # Pass the ElementCollection directly
-            correction_callback=correction_callback,
-            caller_info=f"Region({self.bbox})", # Pass caller info
-        )
-
-        return self # Return self for chaining
+        return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)
 
     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
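`Region.update_text` now shares the `TextMixin` signature (`transform`, keyword-only `selector` and `apply_exclusions`). A usage sketch; the region coordinates and the transform are illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical input file
page = pdf.pages[0]

def strip_trailing_hyphens(element):
    """Drop a trailing hyphen left by OCR line breaks; return None to skip."""
    text = element.text or ""
    return text.rstrip("-") if text.endswith("-") else None

header_region = page.region(left=0, top=0, right=page.width, bottom=150)

# selector defaults to "text"; apply_exclusions=False keeps excluded text reachable,
# matching the old correct_ocr behaviour of fixing text even inside exclusions.
header_region.update_text(strip_trailing_hyphens, selector="text", apply_exclusions=False)
```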
natural_pdf/flows/element.py
CHANGED
@@ -73,6 +73,31 @@ class FlowElement:
         """Returns the physical page of the underlying element."""
         return getattr(self.physical_object, "page", None)
 
+    def __getattr__(self, name: str) -> Any:
+        """
+        Delegate unknown attribute access to the physical_object.
+
+        This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
+        from the physical element are accessible on the FlowElement wrapper.
+
+        Args:
+            name: The attribute name being accessed
+
+        Returns:
+            The attribute value from physical_object
+
+        Raises:
+            AttributeError: If the attribute doesn't exist on physical_object either
+        """
+        try:
+            return getattr(self.physical_object, name)
+        except AttributeError:
+            # Provide a helpful error message that mentions both FlowElement and physical_object
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{name}' "
+                f"(also not found on underlying {type(self.physical_object).__name__})"
+            )
+
     def _flow_direction(
         self,
         direction: str, # "above", "below", "left", "right"