PyPI - natural-pdf - Versions diffs - 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl - Mend

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

natural_pdf/__init__.py +6 -0
natural_pdf/core/page.py +90 -22
natural_pdf/core/pdf.py +183 -59
natural_pdf/elements/collections.py +202 -47
natural_pdf/elements/region.py +176 -56
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +702 -20
natural_pdf/flows/region.py +52 -4
natural_pdf/selectors/parser.py +34 -1
natural_pdf/text_mixin.py +97 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0

natural_pdf/elements/collections.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import hashlib
 import logging
-from collections.abc import MutableSequence
+from collections.abc import MutableSequence, Sequence
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -11,6 +11,7 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -40,6 +41,7 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -66,6 +68,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
     from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported
+    from natural_pdf.flows.flow import Flow
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
@@ -1416,7 +1419,7 @@ class ElementCollection(
     def correct_ocr(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
+        transform: Callable[[Any], Optional[str]],
         max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
@@ -1425,10 +1428,10 @@ class ElementCollection(
         in parallel if `max_workers` is specified.
         Iterates through elements currently in the collection. If an element's
-        'source' attribute starts with 'ocr', it calls the `correction_callback`
+        'source' attribute starts with 'ocr', it calls the `transform`
         for that element, passing the element itself.
-        The `correction_callback` should contain the logic to:
+        The `transform` should contain the logic to:
         1. Determine if the element needs correction.
         2. Perform the correction (e.g., call an LLM).
         3. Return the new text (`str`) or `None`.
@@ -1438,8 +1441,8 @@ class ElementCollection(
         Elements without a source starting with 'ocr' are skipped.
         Args:
-            correction_callback: A function accepting an element and returning
-                                 `Optional[str]` (new text or None).
+            transform: A function accepting an element and returning
+                       `Optional[str]` (new text or None).
             max_workers: The maximum number of worker threads to use for parallel
                          correction on each page. If None, defaults are used.
@@ -1449,7 +1452,7 @@ class ElementCollection(
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
             elements=self._elements,
-            correction_callback=correction_callback,
+            correction_callback=transform,
             caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
             max_workers=max_workers,
         )
@@ -2045,20 +2048,26 @@ class ElementCollection(
     # ------------------------------------------------------------------
-class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
     """
     Represents a collection of Page objects, often from a single PDF document.
     Provides methods for batch operations on these pages.
     """
-    def __init__(self, pages: List[P]):
+    def __init__(self, pages: Union[List[P], Sequence[P]]):
         """
         Initialize a page collection.
         Args:
-            pages: List of Page objects
+            pages: List or sequence of Page objects (can be lazy)
         """
-        self.pages = pages
+        # Store the sequence as-is to preserve lazy behavior
+        # Only convert to list if we need list-specific operations
+        if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
+            self.pages = pages
+        else:
+            # Fallback for non-sequence types
+            self.pages = list(pages)
     def __len__(self) -> int:
         """Return the number of pages in the collection."""
@@ -2078,6 +2087,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
+    def _get_items_for_apply(self) -> Iterator[P]:
+        """
+        Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
+        Returns an iterator that yields pages on-demand rather than materializing
+        all pages at once, maintaining the lazy loading behavior.
+        """
+        return iter(self.pages)
+    def _get_page_indices(self) -> List[int]:
+        """
+        Get page indices without forcing materialization of pages.
+        Returns:
+            List of page indices for the pages in this collection.
+        """
+        # Handle different types of page sequences efficiently
+        if hasattr(self.pages, '_indices'):
+            # If it's a _LazyPageList (or slice), get indices directly
+            return list(self.pages._indices)
+        else:
+            # Fallback: if pages are already materialized, get indices normally
+            # This will force materialization but only if pages aren't lazy
+            return [p.index for p in self.pages]
     def extract_text(
         self,
         keep_blank_chars: bool = True,
@@ -2172,7 +2206,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
         # Get the 0-based indices of the pages in this collection
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
@@ -2332,22 +2366,24 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         return ElementCollection(all_elements)
-    def correct_ocr(
+    def update_text(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
-        Applies corrections to OCR-generated text elements across all pages
+        Applies corrections to text elements across all pages
         in this collection using a user-provided callback function, executed
         in parallel if `max_workers` is specified.
-        This method delegates to the parent PDF's `correct_ocr` method,
+        This method delegates to the parent PDF's `update_text` method,
         targeting all pages within this collection.
         Args:
-            correction_callback: A function that accepts a single argument (an element
-                                 object) and returns `Optional[str]` (new text or None).
+            transform: A function that accepts a single argument (an element
+                       object) and returns `Optional[str]` (new text or None).
+            selector: The attribute name to update. Default is 'text'.
             max_workers: The maximum number of worker threads to use for parallel
                          correction on each page. If None, defaults are used.
@@ -2356,10 +2392,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
-                          or the parent PDF lacks the `correct_ocr` method.
+                          or the parent PDF lacks the `update_text` method.
         """
         if not self.pages:
-            logger.warning("Cannot correct OCR for an empty PageCollection.")
+            logger.warning("Cannot update text for an empty PageCollection.")
             # Return self even if empty to maintain chaining consistency
             return self
@@ -2367,24 +2403,25 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         parent_pdf = self.pages[0]._parent
         if (
             not parent_pdf
-            or not hasattr(parent_pdf, "correct_ocr")
-            or not callable(parent_pdf.correct_ocr)
+            or not hasattr(parent_pdf, "update_text")
+            or not callable(parent_pdf.update_text)
         ):
             raise RuntimeError(
-                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+                "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
             )
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(
-            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
+            f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
         )
         # Delegate the call to the parent PDF object for the relevant pages
         # Pass the max_workers parameter down
-        parent_pdf.correct_ocr(
-            correction_callback=correction_callback,
+        parent_pdf.update_text(
+            transform=transform,
             pages=page_indices,
-            max_workers=max_workers,  # Pass it here
+            selector=selector,
+            max_workers=max_workers,
         )
         return self
@@ -2400,13 +2437,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Extract sections from a page collection based on start/end elements.
         Args:
-            start_elements: Elements or selector string that mark the start of sections
-            end_elements: Elements or selector string that mark the end of sections
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
             boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
         Returns:
             List of Region objects representing the extracted sections
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of page)
+            - With only end_elements: sections go from beginning of document/page to each end
+            - With both: sections go from each start to the corresponding end
         """
         # Find start and end elements across all pages
         if isinstance(start_elements, str):
@@ -2415,8 +2458,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         if isinstance(end_elements, str):
             end_elements = self.find_all(end_elements).elements
-        # If no start elements, return empty list
-        if not start_elements:
+        # If no start elements and no end elements, return empty list
+        if not start_elements and not end_elements:
             return []
         # If there are page break boundaries, we'll need to add them
@@ -2451,6 +2494,26 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         # Sort by page index, then vertical position, then horizontal position
         all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
+        # If we only have end_elements (no start_elements), create implicit start elements
+        if not start_elements and end_elements:
+            from natural_pdf.elements.region import Region
+            start_elements = []
+            # Add implicit start at the beginning of the first page
+            first_page = self.pages[0]
+            first_start = Region(first_page, (0, 0, first_page.width, 1))
+            first_start.is_implicit_start = True
+            start_elements.append(first_start)
+            # For each end element (except the last), add an implicit start after it
+            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            for i, end_elem in enumerate(sorted_end_elements[:-1]):  # Exclude last end element
+                # Create implicit start element right after this end element
+                implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
+                implicit_start.is_implicit_start = True
+                start_elements.append(implicit_start)
         # Mark section boundaries
         section_boundaries = []
@@ -2476,6 +2539,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                         "page_idx": element.page.index,
                     }
                 )
+            elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
+                # This is an implicit start element
+                section_boundaries.append(
+                    {
+                        "index": -2,  # Special index for implicit starts
+                        "element": element,
+                        "type": "start",
+                        "page_idx": element.page.index,
+                    }
+                )
         # Add end element boundaries if provided
         if end_elements:
@@ -2502,12 +2575,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     )
         # Sort boundaries by page index, then by actual document position
-        section_boundaries.sort(
-            key=lambda x: (
-                x["page_idx"],
-                x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
-            )
-        )
+        def _sort_key(boundary):
+            """Sort boundaries by (page_idx, vertical_top, priority)."""
+            page_idx = boundary["page_idx"]
+            element = boundary["element"]
+            # Vertical position on the page
+            y_pos = getattr(element, "top", 0.0)
+            # Ensure starts come before ends at the same coordinate
+            priority = 0 if boundary["type"] == "start" else 1
+            return (page_idx, y_pos, priority)
+        section_boundaries.sort(key=_sort_key)
         # Generate sections
         sections = []
@@ -2527,8 +2608,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             end_pg = end_el.page if end_el is not None else self.pages[-1]
             parts: list[Region] = []
-            # Slice of first page
-            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+            # Use the actual top of the start element (for implicit starts this is
+            # the bottom of the previous end element) instead of forcing to 0.
+            start_top = start_el.top
+            # Slice of first page beginning at *start_top*
+            parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
             # Full middle pages
             for pg_idx in range(start_pg.index + 1, end_pg.index):
@@ -2566,9 +2652,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 # If both elements are on the same page, use the page's get_section_between
                 if start_element.page == end_element.page:
-                    section = start_element.page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
+                    # For implicit start elements, create a region from the top of the page
+                    if hasattr(start_element, "is_implicit_start"):
+                        from natural_pdf.elements.region import Region
+                        section = Region(
+                            start_element.page,
+                            (0, start_element.top, start_element.page.width, end_element.bottom)
+                        )
+                        section.start_element = start_element
+                        section.boundary_element_found = end_element
+                    else:
+                        section = start_element.page.get_section_between(
+                            start_element, end_element, boundary_inclusion
+                        )
                     sections.append(section)
                 else:
                     # Create FlowRegion spanning pages
@@ -2607,9 +2703,11 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     from natural_pdf.elements.region import Region
                     start_page = start_element.page
+                    # Handle implicit start elements
+                    start_top = start_element.top
                     region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
+                        start_page, (0, start_top, start_page.width, start_page.height)
                     )
                     region.start_element = start_element
                     sections.append(region)
@@ -2636,8 +2734,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
+                # Handle implicit start elements
+                start_top = start_element.top
                 region = Region(
-                    start_page, (0, start_element.top, start_page.width, start_page.height)
+                    start_page, (0, start_top, start_page.width, start_page.height)
                 )
                 region.start_element = start_element
                 sections.append(region)
@@ -2800,7 +2900,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             )
         # Get the 0-based indices of the pages in this collection
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(
             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
         )
@@ -3150,6 +3250,61 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
             # <--- END MODIFIED
+    def to_flow(
+        self,
+        arrangement: Literal["vertical", "horizontal"] = "vertical",
+        alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
+        segment_gap: float = 0.0,
+    ) -> "Flow":
+        """
+        Convert this PageCollection to a Flow for cross-page operations.
+        This enables treating multiple pages as a continuous logical document
+        structure, useful for multi-page tables, articles spanning columns,
+        or any content requiring reading order across page boundaries.
+        Args:
+            arrangement: Primary flow direction ('vertical' or 'horizontal').
+                        'vertical' stacks pages top-to-bottom (most common).
+                        'horizontal' arranges pages left-to-right.
+            alignment: Cross-axis alignment for pages of different sizes:
+                      For vertical: 'left'/'start', 'center', 'right'/'end'
+                      For horizontal: 'top'/'start', 'center', 'bottom'/'end'
+            segment_gap: Virtual gap between pages in PDF points (default: 0.0).
+        Returns:
+            Flow object that can perform operations across all pages in sequence.
+        Example:
+            Multi-page table extraction:
+            ```python
+            pdf = npdf.PDF("multi_page_report.pdf")
+            # Create flow for pages 2-4 containing a table
+            table_flow = pdf.pages[1:4].to_flow()
+            # Extract table as if it were continuous
+            table_data = table_flow.extract_table()
+            df = table_data.df
+            ```
+            Cross-page element search:
+            ```python
+            # Find all headers across multiple pages
+            headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
+            # Analyze layout across pages
+            regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
+            ```
+        """
+        from natural_pdf.flows.flow import Flow
+        return Flow(
+            segments=self,  # Flow constructor now handles PageCollection
+            arrangement=arrangement,
+            alignment=alignment,
+            segment_gap=segment_gap,
+        )
     # Alias .to_image() to .show() for convenience
     def show(
         self,

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl