PyPI - natural-pdf - Versions diffs - 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl - Mend

natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

natural_pdf/analyzers/guides.py +1053 -26
natural_pdf/core/page.py +274 -46
natural_pdf/core/pdf.py +116 -30
natural_pdf/elements/collections.py +48 -7
natural_pdf/elements/region.py +179 -17
natural_pdf/elements/text.py +36 -2
natural_pdf/flows/region.py +128 -26
natural_pdf/selectors/parser.py +24 -0
natural_pdf/utils/layout.py +26 -0
natural_pdf/utils/text_extraction.py +76 -1
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/METADATA +1 -1
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/RECORD +16 -15
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/top_level.txt +0 -0

natural_pdf/core/pdf.py CHANGED Viewed

@@ -103,6 +103,7 @@ except ImportError:
 from collections.abc import Sequence
 class _LazyPageList(Sequence):
     """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
@@ -121,6 +122,7 @@ class _LazyPageList(Sequence):
         _font_attrs: Font attributes to use when creating pages.
         _cache: List of cached Page objects (None until accessed).
         _load_text: Whether to load text layer when creating pages.
+        _indices: Optional range of indices this list represents (for slices).
     Example:
         ```python
@@ -130,7 +132,7 @@ class _LazyPageList(Sequence):
         last_page = pdf.pages[-1]  # Creates another Page object
         # Slicing works too
-        first_three = pdf.pages[0:3]  # Creates 3 Page objects
+        first_three = pdf.pages[0:3]  # Returns another lazy list
         # Iteration creates all pages
         for page in pdf.pages:  # Each page created as needed
@@ -139,30 +141,71 @@ class _LazyPageList(Sequence):
     """
     def __init__(
-        self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
+        self,
+        parent_pdf: "PDF",
+        plumber_pdf: "pdfplumber.PDF",
+        font_attrs=None,
+        load_text=True,
+        indices: Optional[List[int]] = None
     ):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
-        # One slot per pdfplumber page – initially all None
-        self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
         self._load_text = load_text
+        # If indices is provided, this is a sliced view
+        if indices is not None:
+            self._indices = indices
+            self._cache = [None] * len(indices)
+        else:
+            # Full PDF - one slot per pdfplumber page
+            self._indices = list(range(len(plumber_pdf.pages)))
+            self._cache = [None] * len(plumber_pdf.pages)
     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
+        """Create and cache a page at the given index within this list."""
         cached = self._cache[index]
         if cached is None:
             # Import here to avoid circular import problems
             from natural_pdf.core.page import Page
-            plumber_page = self._plumber_pdf.pages[index]
+            # Get the actual page index in the full PDF
+            actual_page_index = self._indices[index]
+            plumber_page = self._plumber_pdf.pages[actual_page_index]
             cached = Page(
                 plumber_page,
                 parent=self._parent_pdf,
-                index=index,
+                index=actual_page_index,
                 font_attrs=self._font_attrs,
                 load_text=self._load_text,
             )
+            # Apply any stored exclusions to the newly created page
+            if hasattr(self._parent_pdf, '_exclusions'):
+                for exclusion_data in self._parent_pdf._exclusions:
+                    exclusion_func, label = exclusion_data
+                    try:
+                        cached.add_exclusion(exclusion_func, label=label)
+                    except Exception as e:
+                        logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
+            # Apply any stored regions to the newly created page
+            if hasattr(self._parent_pdf, '_regions'):
+                for region_data in self._parent_pdf._regions:
+                    region_func, name = region_data
+                    try:
+                        region_instance = region_func(cached)
+                        if region_instance and hasattr(region_instance, '__class__'):
+                            # Check if it's a Region-like object (avoid importing Region here)
+                            cached.add_region(region_instance, name=name, source="named")
+                        elif region_instance is not None:
+                            logger.warning(
+                                f"Region function did not return a valid Region for page {cached.number}"
+                            )
+                    except Exception as e:
+                        logger.warning(f"Failed to apply region to page {cached.number}: {e}")
             self._cache[index] = cached
         return cached
@@ -172,9 +215,18 @@ class _LazyPageList(Sequence):
     def __getitem__(self, key):
         if isinstance(key, slice):
-            # Materialise pages for slice lazily as well
-            indices = range(*key.indices(len(self)))
-            return [self._create_page(i) for i in indices]
+            # Get the slice of our current indices
+            slice_indices = range(*key.indices(len(self)))
+            # Extract the actual page indices for this slice
+            actual_indices = [self._indices[i] for i in slice_indices]
+            # Return a new lazy list for the slice
+            return _LazyPageList(
+                self._parent_pdf,
+                self._plumber_pdf,
+                font_attrs=self._font_attrs,
+                load_text=self._load_text,
+                indices=actual_indices
+            )
         elif isinstance(key, int):
             if key < 0:
                 key += len(self)
@@ -556,12 +608,18 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")
         self._exclusions = []
-        for page in self._pages:
-            page.clear_exclusions()
+        # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only clear from existing pages
+                try:
+                    self._pages._cache[i].clear_exclusions()
+                except Exception as e:
+                    logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
         return self
     def add_exclusion(
-        self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
+        self, exclusion_func, label: str = None
     ) -> "PDF":
         """Add an exclusion function to the PDF.
@@ -607,11 +665,36 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
+        # ------------------------------------------------------------------
+        # Support selector strings and ElementCollection objects directly.
+        # Store exclusion and apply only to already-created pages.
+        # ------------------------------------------------------------------
+        from natural_pdf.elements.collections import ElementCollection  # local import
+        if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
+            # Store for bookkeeping and lazy application
+            self._exclusions.append((exclusion_func, label))
+            # Apply only to already-created (cached) pages to avoid forcing page creation
+            for i in range(len(self._pages)):
+                if self._pages._cache[i] is not None:  # Only apply to existing pages
+                    try:
+                        self._pages._cache[i].add_exclusion(exclusion_func, label=label)
+                    except Exception as e:
+                        logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+            return self
+        # Fallback to original callable / Region behaviour ------------------
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        for page in self._pages:
-            page.add_exclusion(exclusion_func, label=label)
+        # Apply only to already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only apply to existing pages
+                try:
+                    self._pages._cache[i].add_exclusion(exclusion_func, label=label)
+                except Exception as e:
+                    logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
         return self
@@ -853,7 +936,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         Add a region function to the PDF.
         Args:
-            region_func: A function that takes a Page and returns a Region, or None
             region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region
@@ -866,17 +948,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         region_data = (region_func, name)
         self._regions.append(region_data)
-        for page in self._pages:
-            try:
-                region_instance = region_func(page)
-                if region_instance and isinstance(region_instance, Region):
-                    page.add_region(region_instance, name=name, source="named")
-                elif region_instance is not None:
-                    logger.warning(
-                        f"Region function did not return a valid Region for page {page.number}"
-                    )
-            except Exception as e:
-                logger.error(f"Error adding region for page {page.number}: {e}")
+        # Apply only to already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only apply to existing pages
+                page = self._pages._cache[i]
+                try:
+                    region_instance = region_func(page)
+                    if region_instance and isinstance(region_instance, Region):
+                        page.add_region(region_instance, name=name, source="named")
+                    elif region_instance is not None:
+                        logger.warning(
+                            f"Region function did not return a valid Region for page {page.number}"
+                        )
+                except Exception as e:
+                    logger.error(f"Error adding region for page {page.number}: {e}")
         return self
@@ -1697,10 +1782,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         if isinstance(key, slice):
             from natural_pdf.elements.collections import PageCollection
-            return PageCollection(self._pages[key])
-        if isinstance(key, int):
+            # Use the lazy page list's slicing which returns another _LazyPageList
+            lazy_slice = self._pages[key]
+            # Wrap in PageCollection for compatibility
+            return PageCollection(lazy_slice)
+        elif isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
             else:

natural_pdf/elements/collections.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import hashlib
 import logging
-from collections.abc import MutableSequence
+from collections.abc import MutableSequence, Sequence
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -369,6 +369,7 @@ class ElementCollection(
         preserve_whitespace: bool = True,
         use_exclusions: bool = True,
         strip: Optional[bool] = None,
+        content_filter=None,
         **kwargs,
     ) -> str:
         """
@@ -379,6 +380,10 @@ class ElementCollection(
             preserve_whitespace: Deprecated. Use layout=False for simple joining.
             use_exclusions: Deprecated. Exclusions should be applied *before* creating
                           the collection or by filtering the collection itself.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Additional layout parameters passed directly to pdfplumber's
                       `chars_to_textmap` function ONLY if `layout=True` is passed.
                       See Page.extract_text docstring for common parameters.
@@ -412,6 +417,11 @@ class ElementCollection(
                 getattr(el, "text", "") for el in text_elements
             )  # Fallback to simple join of word text
+        # Apply content filtering if provided
+        if content_filter is not None:
+            from natural_pdf.utils.text_extraction import _apply_content_filter
+            all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
         # Check if layout is requested
         use_layout = kwargs.get("layout", False)
@@ -2041,14 +2051,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
     Provides methods for batch operations on these pages.
     """
-    def __init__(self, pages: List[P]):
+    def __init__(self, pages: Union[List[P], Sequence[P]]):
         """
         Initialize a page collection.
         Args:
-            pages: List of Page objects
+            pages: List or sequence of Page objects (can be lazy)
         """
-        self.pages = pages
+        # Store the sequence as-is to preserve lazy behavior
+        # Only convert to list if we need list-specific operations
+        if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
+            self.pages = pages
+        else:
+            # Fallback for non-sequence types
+            self.pages = list(pages)
     def __len__(self) -> int:
         """Return the number of pages in the collection."""
@@ -2068,6 +2084,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
+    def _get_items_for_apply(self) -> Iterator[P]:
+        """
+        Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
+        Returns an iterator that yields pages on-demand rather than materializing
+        all pages at once, maintaining the lazy loading behavior.
+        """
+        return iter(self.pages)
+    def _get_page_indices(self) -> List[int]:
+        """
+        Get page indices without forcing materialization of pages.
+        Returns:
+            List of page indices for the pages in this collection.
+        """
+        # Handle different types of page sequences efficiently
+        if hasattr(self.pages, '_indices'):
+            # If it's a _LazyPageList (or slice), get indices directly
+            return list(self.pages._indices)
+        else:
+            # Fallback: if pages are already materialized, get indices normally
+            # This will force materialization but only if pages aren't lazy
+            return [p.index for p in self.pages]
     def extract_text(
         self,
         keep_blank_chars: bool = True,
@@ -2162,7 +2203,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
         # Get the 0-based indices of the pages in this collection
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
@@ -2364,7 +2405,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
             )
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(
             f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
         )
@@ -2790,7 +2831,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             )
         # Get the 0-based indices of the pages in this collection
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(
             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
         )

natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl