PyPI - natural-pdf - Versions diffs - 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl - Mend

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

natural_pdf/__init__.py +6 -0
natural_pdf/core/page.py +90 -22
natural_pdf/core/pdf.py +183 -59
natural_pdf/elements/collections.py +202 -47
natural_pdf/elements/region.py +176 -56
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +702 -20
natural_pdf/flows/region.py +52 -4
natural_pdf/selectors/parser.py +34 -1
natural_pdf/text_mixin.py +97 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -76,6 +76,9 @@ from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
+from natural_pdf.flows.flow import Flow
+from natural_pdf.flows.region import FlowRegion
+from natural_pdf.analyzers.guides import Guides
 ElementCollection = None
@@ -116,6 +119,9 @@ __all__ = [
     "Page",
     "Region",
     "ElementCollection",
+    "Flow",
+    "FlowRegion",
+    "Guides",
     "TextSearchOptions",
     "MultiModalSearchOptions",
     "BaseSearchOptions",

natural_pdf/core/page.py CHANGED Viewed

@@ -64,7 +64,6 @@ from natural_pdf.core.element_manager import ElementManager
 from natural_pdf.describe.mixin import DescribeMixin  # Import describe mixin
 from natural_pdf.elements.base import Element  # Import base element
 from natural_pdf.elements.text import TextElement
-from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
@@ -76,8 +75,9 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
 # --- End Classification Imports --- #
-# --- End Shape Detection Mixin --- #
+# --- Text update mixin import --- #
+from natural_pdf.text_mixin import TextMixin
+from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 try:
@@ -92,7 +92,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
-class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
+class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
     """Enhanced Page wrapper built on top of pdfplumber.Page.
     This class provides a fluent interface for working with PDF pages,
@@ -1655,7 +1655,27 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                     table_settings.setdefault("join_x_tolerance", join)
                     table_settings.setdefault("join_y_tolerance", join)
-            return self._page.extract_tables(table_settings)
+            raw_tables = self._page.extract_tables(table_settings)
+            # Apply RTL text processing to all extracted tables
+            if raw_tables:
+                processed_tables = []
+                for table in raw_tables:
+                    processed_table = []
+                    for row in table:
+                        processed_row = []
+                        for cell in row:
+                            if cell is not None:
+                                # Apply RTL text processing to each cell
+                                rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                                processed_row.append(rtl_processed_cell)
+                            else:
+                                processed_row.append(cell)
+                        processed_table.append(processed_row)
+                    processed_tables.append(processed_table)
+                return processed_tables
+            return raw_tables
         else:
             raise ValueError(
                 f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
@@ -2866,25 +2886,25 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         logger.info(f"Searchable PDF saved to: {output_path_str}")
     # --- Added correct_ocr method ---
-    def correct_ocr(
+    def update_text(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
-        selector: Optional[str] = "text[source=ocr]",
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
         """
-        Applies corrections to OCR-generated text elements on this page
+        Applies corrections to text elements on this page
         using a user-provided callback function, potentially in parallel.
-        Finds text elements on this page whose 'source' attribute starts
-        with 'ocr' and calls the `correction_callback` for each, passing the
-        element itself. Updates the element's text if the callback returns
-        a new string.
+        Finds text elements on this page matching the *selector* argument and
+        calls the ``transform`` for each, passing the element itself.
+        Updates the element's text if the callback returns a new string.
         Args:
-            correction_callback: A function accepting an element and returning
-                                 `Optional[str]` (new text or None).
+            transform: A function accepting an element and returning
+                       `Optional[str]` (new text or None).
+            selector: CSS-like selector string to match text elements.
             max_workers: The maximum number of threads to use for parallel execution.
                          If None or 0 or 1, runs sequentially.
             progress_callback: Optional callback function to call after processing each element.
@@ -2893,21 +2913,21 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             Self for method chaining.
         """
         logger.info(
-            f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
+            f"Page {self.number}: Starting text update with callback '{transform.__name__}' (max_workers={max_workers}) and selector='{selector}'"
         )
         target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
         target_elements = target_elements_collection.elements  # Get the list
         if not target_elements:
-            logger.info(f"Page {self.number}: No OCR elements found to correct.")
+            logger.info(f"Page {self.number}: No text elements found to update.")
             return self
         element_pbar = None
         try:
             element_pbar = tqdm(
                 total=len(target_elements),
-                desc=f"Correcting OCR Page {self.number}",
+                desc=f"Updating text Page {self.number}",
                 unit="element",
                 leave=False,
             )
@@ -2921,7 +2941,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                 try:
                     current_text = getattr(element, "text", None)
                     # Call the user-provided callback
-                    corrected_text = correction_callback(element)
+                    corrected_text = transform(element)
                     # Validate result type
                     if corrected_text is not None and not isinstance(corrected_text, str):
@@ -2956,7 +2976,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             if max_workers is not None and max_workers > 1:
                 # --- Parallel execution --- #
                 logger.info(
-                    f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
+                    f"Page {self.number}: Running text update in parallel with {max_workers} workers."
                 )
                 futures = []
                 with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -2992,7 +3012,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             else:
                 # --- Sequential execution --- #
-                logger.info(f"Page {self.number}: Running OCR correction sequentially.")
+                logger.info(f"Page {self.number}: Running text update sequentially.")
                 for element in target_elements:
                     # Call the task function directly (it handles progress_callback)
                     processed_count += 1
@@ -3007,7 +3027,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                             updated_count += 1
             logger.info(
-                f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
+                f"Page {self.number}: Text update finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
             )
             return self  # Return self for chaining
@@ -3280,6 +3300,54 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         )
         return self
+    def _apply_rtl_processing_to_text(self, text: str) -> str:
+        """
+        Apply RTL (Right-to-Left) text processing to a string.
+        This converts visual order text (as stored in PDFs) to logical order
+        for proper display of Arabic, Hebrew, and other RTL scripts.
+        Args:
+            text: Input text string in visual order
+        Returns:
+            Text string in logical order
+        """
+        if not text or not text.strip():
+            return text
+        # Quick check for RTL characters - if none found, return as-is
+        import unicodedata
+        def _contains_rtl(s):
+            return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
+        if not _contains_rtl(text):
+            return text
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+            # Apply BiDi algorithm to convert from visual to logical order
+            # Process line by line to handle mixed content properly
+            processed_lines = []
+            for line in text.split("\n"):
+                if line.strip():
+                    # Determine base direction for this line
+                    base_dir = "R" if _contains_rtl(line) else "L"
+                    logical_line = get_display(line, base_dir=base_dir)
+                    # Apply bracket mirroring for correct logical order
+                    processed_lines.append(mirror_brackets(logical_line))
+                else:
+                    processed_lines.append(line)
+            return "\n".join(processed_lines)
+        except (ImportError, Exception):
+            # If bidi library is not available or fails, return original text
+            return text
     @property
     def lines(self) -> List[Any]:
         """Get all line elements on this page."""

natural_pdf/core/pdf.py CHANGED Viewed

@@ -39,6 +39,10 @@ from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock
+from natural_pdf.text_mixin import TextMixin
+if TYPE_CHECKING:
+    from natural_pdf.elements.collections import ElementCollection
 try:
     from typing import Any as TypingAny
@@ -103,6 +107,7 @@ except ImportError:
 from collections.abc import Sequence
 class _LazyPageList(Sequence):
     """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
@@ -121,6 +126,7 @@ class _LazyPageList(Sequence):
         _font_attrs: Font attributes to use when creating pages.
         _cache: List of cached Page objects (None until accessed).
         _load_text: Whether to load text layer when creating pages.
+        _indices: Optional range of indices this list represents (for slices).
     Example:
         ```python
@@ -130,7 +136,7 @@ class _LazyPageList(Sequence):
         last_page = pdf.pages[-1]  # Creates another Page object
         # Slicing works too
-        first_three = pdf.pages[0:3]  # Creates 3 Page objects
+        first_three = pdf.pages[0:3]  # Returns another lazy list
         # Iteration creates all pages
         for page in pdf.pages:  # Each page created as needed
@@ -139,30 +145,71 @@ class _LazyPageList(Sequence):
     """
     def __init__(
-        self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
+        self,
+        parent_pdf: "PDF",
+        plumber_pdf: "pdfplumber.PDF",
+        font_attrs=None,
+        load_text=True,
+        indices: Optional[List[int]] = None
     ):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
-        # One slot per pdfplumber page – initially all None
-        self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
         self._load_text = load_text
+        # If indices is provided, this is a sliced view
+        if indices is not None:
+            self._indices = indices
+            self._cache = [None] * len(indices)
+        else:
+            # Full PDF - one slot per pdfplumber page
+            self._indices = list(range(len(plumber_pdf.pages)))
+            self._cache = [None] * len(plumber_pdf.pages)
     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
+        """Create and cache a page at the given index within this list."""
         cached = self._cache[index]
         if cached is None:
             # Import here to avoid circular import problems
             from natural_pdf.core.page import Page
-            plumber_page = self._plumber_pdf.pages[index]
+            # Get the actual page index in the full PDF
+            actual_page_index = self._indices[index]
+            plumber_page = self._plumber_pdf.pages[actual_page_index]
             cached = Page(
                 plumber_page,
                 parent=self._parent_pdf,
-                index=index,
+                index=actual_page_index,
                 font_attrs=self._font_attrs,
                 load_text=self._load_text,
             )
+            # Apply any stored exclusions to the newly created page
+            if hasattr(self._parent_pdf, '_exclusions'):
+                for exclusion_data in self._parent_pdf._exclusions:
+                    exclusion_func, label = exclusion_data
+                    try:
+                        cached.add_exclusion(exclusion_func, label=label)
+                    except Exception as e:
+                        logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
+            # Apply any stored regions to the newly created page
+            if hasattr(self._parent_pdf, '_regions'):
+                for region_data in self._parent_pdf._regions:
+                    region_func, name = region_data
+                    try:
+                        region_instance = region_func(cached)
+                        if region_instance and hasattr(region_instance, '__class__'):
+                            # Check if it's a Region-like object (avoid importing Region here)
+                            cached.add_region(region_instance, name=name, source="named")
+                        elif region_instance is not None:
+                            logger.warning(
+                                f"Region function did not return a valid Region for page {cached.number}"
+                            )
+                    except Exception as e:
+                        logger.warning(f"Failed to apply region to page {cached.number}: {e}")
             self._cache[index] = cached
         return cached
@@ -172,9 +219,18 @@ class _LazyPageList(Sequence):
     def __getitem__(self, key):
         if isinstance(key, slice):
-            # Materialise pages for slice lazily as well
-            indices = range(*key.indices(len(self)))
-            return [self._create_page(i) for i in indices]
+            # Get the slice of our current indices
+            slice_indices = range(*key.indices(len(self)))
+            # Extract the actual page indices for this slice
+            actual_indices = [self._indices[i] for i in slice_indices]
+            # Return a new lazy list for the slice
+            return _LazyPageList(
+                self._parent_pdf,
+                self._plumber_pdf,
+                font_attrs=self._font_attrs,
+                load_text=self._load_text,
+                indices=actual_indices
+            )
         elif isinstance(key, int):
             if key < 0:
                 key += len(self)
@@ -195,7 +251,7 @@ class _LazyPageList(Sequence):
 # --- End Lazy Page List Helper --- #
-class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
+class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
     """Enhanced PDF wrapper built on top of pdfplumber.
     This class provides a fluent interface for working with PDF documents,
@@ -556,8 +612,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")
         self._exclusions = []
-        for page in self._pages:
-            page.clear_exclusions()
+        # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only clear from existing pages
+                try:
+                    self._pages._cache[i].clear_exclusions()
+                except Exception as e:
+                    logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
         return self
     def add_exclusion(
@@ -608,25 +670,35 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")
         # ------------------------------------------------------------------
-        # NEW: Support selector strings and ElementCollection objects directly.
-        # We simply forward the same object to each page's add_exclusion which
-        # now knows how to interpret these inputs.
+        # Support selector strings and ElementCollection objects directly.
+        # Store exclusion and apply only to already-created pages.
         # ------------------------------------------------------------------
         from natural_pdf.elements.collections import ElementCollection  # local import
         if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
-            # Store for bookkeeping
+            # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-            for page in self._pages:
-                page.add_exclusion(exclusion_func, label=label)
+            # Apply only to already-created (cached) pages to avoid forcing page creation
+            for i in range(len(self._pages)):
+                if self._pages._cache[i] is not None:  # Only apply to existing pages
+                    try:
+                        self._pages._cache[i].add_exclusion(exclusion_func, label=label)
+                    except Exception as e:
+                        logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
             return self
         # Fallback to original callable / Region behaviour ------------------
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        for page in self._pages:
-            page.add_exclusion(exclusion_func, label=label)
+        # Apply only to already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only apply to existing pages
+                try:
+                    self._pages._cache[i].add_exclusion(exclusion_func, label=label)
+                except Exception as e:
+                    logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
         return self
@@ -868,7 +940,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         Add a region function to the PDF.
         Args:
-            region_func: A function that takes a Page and returns a Region, or None
             region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region
@@ -881,17 +952,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         region_data = (region_func, name)
         self._regions.append(region_data)
-        for page in self._pages:
-            try:
-                region_instance = region_func(page)
-                if region_instance and isinstance(region_instance, Region):
-                    page.add_region(region_instance, name=name, source="named")
-                elif region_instance is not None:
-                    logger.warning(
-                        f"Region function did not return a valid Region for page {page.number}"
-                    )
-            except Exception as e:
-                logger.error(f"Error adding region for page {page.number}: {e}")
+        # Apply only to already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only apply to existing pages
+                page = self._pages._cache[i]
+                try:
+                    region_instance = region_func(page)
+                    if region_instance and isinstance(region_instance, Region):
+                        page.add_region(region_instance, name=name, source="named")
+                    elif region_instance is not None:
+                        logger.warning(
+                            f"Region function did not return a valid Region for page {page.number}"
+                        )
+                except Exception as e:
+                    logger.error(f"Error adding region for page {page.number}: {e}")
         return self
@@ -1159,6 +1233,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         return all_tables
+    def get_sections(
+        self,
+        start_elements=None,
+        end_elements=None,
+        new_section_on_page_break=False,
+        boundary_inclusion="both",
+    ) -> "ElementCollection":
+        """
+        Extract sections from the entire PDF based on start/end elements.
+        This method delegates to the PageCollection.get_sections() method,
+        providing a convenient way to extract document sections across all pages.
+        Args:
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
+            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
+            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
+        Returns:
+            ElementCollection of Region objects representing the extracted sections
+        Example:
+            Extract sections between headers:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            # Get sections between headers
+            sections = pdf.get_sections(
+                start_elements='text[size>14]:bold',
+                end_elements='text[size>14]:bold'
+            )
+            # Get sections that break at page boundaries
+            sections = pdf.get_sections(
+                start_elements='text:contains("Chapter")',
+                new_section_on_page_break=True
+            )
+            ```
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of document)
+            - With only end_elements: sections go from beginning of document to each end
+            - With both: sections go from each start to the corresponding end
+        """
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
+        return self.pages.get_sections(
+            start_elements=start_elements,
+            end_elements=end_elements,
+            new_section_on_page_break=new_section_on_page_break,
+            boundary_inclusion=boundary_inclusion,
+        )
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -1633,32 +1763,28 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             logger.error(f"Failed to export correction task: {e}")
             raise
-    def correct_ocr(
+    def update_text(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
+        transform: Callable[[Any], Optional[str]],
         pages: Optional[Union[Iterable[int], range, slice]] = None,
+        selector: str = "text",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,
     ) -> "PDF":
         """
-        Applies corrections to OCR text elements using a callback function.
-        Applies corrections to OCR text elements using a callback function.
+        Applies corrections to text elements using a callback function.
         Args:
-            correction_callback: Function that takes an element and returns corrected text or None
             correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
-            max_workers: Maximum number of threads to use for parallel execution
-            progress_callback: Optional callback function for progress updates
+            selector: Selector to apply corrections to (default: "text")
             max_workers: Maximum number of threads to use for parallel execution
             progress_callback: Optional callback function for progress updates
         Returns:
             Self for method chaining
-            Self for method chaining
         """
         target_page_indices = []
-        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -1671,32 +1797,29 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
                 raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
-                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         if not target_page_indices:
-            logger.warning("No pages selected for OCR correction.")
+            logger.warning("No pages selected for text update.")
             return self
-        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
-        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
+        logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                page.correct_ocr(
-                    correction_callback=correction_callback,
-                    max_workers=max_workers,
-                    progress_callback=progress_callback,
-                )
+                            page.update_text(
+                transform=transform,
+                selector=selector,
+                max_workers=max_workers,
+                progress_callback=progress_callback,
+            )
             except Exception as e:
-                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
-                logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
+                logger.error(f"Error during text update on page {page_idx}: {e}")
+                logger.error(f"Error during text update on page {page_idx}: {e}")
-        logger.info("OCR correction process finished.")
-        logger.info("OCR correction process finished.")
+        logger.info("Text update process finished.")
         return self
     def __len__(self) -> int:
@@ -1712,10 +1835,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         if isinstance(key, slice):
             from natural_pdf.elements.collections import PageCollection
-            return PageCollection(self._pages[key])
-        if isinstance(key, int):
+            # Use the lazy page list's slicing which returns another _LazyPageList
+            lazy_slice = self._pages[key]
+            # Wrap in PageCollection for compatibility
+            return PageCollection(lazy_slice)
+        elif isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
             else:

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl