PyPI - natural-pdf - Versions diffs - 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

natural-pdf 0.1.38-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

natural_pdf/__init__.py +11 -6
natural_pdf/analyzers/__init__.py +6 -1
natural_pdf/analyzers/guides.py +354 -258
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +18 -4
natural_pdf/analyzers/layout/paddle.py +11 -0
natural_pdf/analyzers/layout/surya.py +2 -3
natural_pdf/analyzers/shape_detection_mixin.py +25 -34
natural_pdf/analyzers/text_structure.py +2 -2
natural_pdf/classification/manager.py +1 -1
natural_pdf/collections/mixins.py +3 -2
natural_pdf/core/highlighting_service.py +743 -32
natural_pdf/core/page.py +252 -399
natural_pdf/core/page_collection.py +1249 -0
natural_pdf/core/pdf.py +231 -89
natural_pdf/{collections → core}/pdf_collection.py +18 -11
natural_pdf/core/render_spec.py +335 -0
natural_pdf/describe/base.py +1 -1
natural_pdf/elements/__init__.py +1 -0
natural_pdf/elements/base.py +108 -83
natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
natural_pdf/elements/line.py +0 -1
natural_pdf/elements/rect.py +0 -1
natural_pdf/elements/region.py +405 -280
natural_pdf/elements/text.py +9 -7
natural_pdf/exporters/base.py +2 -2
natural_pdf/exporters/original_pdf.py +1 -1
natural_pdf/exporters/paddleocr.py +2 -4
natural_pdf/exporters/searchable_pdf.py +3 -2
natural_pdf/extraction/mixin.py +1 -3
natural_pdf/flows/collections.py +1 -69
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +1658 -19
natural_pdf/flows/region.py +757 -263
natural_pdf/ocr/ocr_options.py +0 -2
natural_pdf/ocr/utils.py +2 -1
natural_pdf/qa/document_qa.py +21 -5
natural_pdf/search/search_service_protocol.py +1 -1
natural_pdf/selectors/parser.py +35 -2
natural_pdf/tables/result.py +35 -1
natural_pdf/text_mixin.py +101 -0
natural_pdf/utils/debug.py +2 -1
natural_pdf/utils/highlighting.py +1 -0
natural_pdf/utils/layout.py +2 -2
natural_pdf/utils/packaging.py +4 -3
natural_pdf/utils/text_extraction.py +15 -12
natural_pdf/utils/visualization.py +385 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
optimization/memory_comparison.py +1 -1
optimization/pdf_analyzer.py +2 -2
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -1,5 +1,16 @@
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
 from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
@@ -15,6 +26,9 @@ from natural_pdf.classification.manager import ClassificationManager  # Keep for
 # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin
+# Add Visualizable import
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
 from natural_pdf.describe.mixin import DescribeMixin
 from natural_pdf.elements.base import DirectionalMixin
 from natural_pdf.elements.text import TextElement  # ADDED IMPORT
@@ -26,11 +40,15 @@ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 # Table utilities
 # ------------------------------------------------------------------
 from natural_pdf.tables import TableResult
+from natural_pdf.text_mixin import TextMixin
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
+# Import viewer widget support
+from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
 # --- End Classification Imports --- #
@@ -42,7 +60,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element  # Added for type hint
-    from natural_pdf.elements.collections import ElementCollection
+    from natural_pdf.elements.element_collection import ElementCollection
     from natural_pdf.elements.text import TextElement
 # Import OCRManager conditionally to avoid circular imports
@@ -56,7 +74,13 @@ logger = logging.getLogger(__name__)
 class Region(
-    DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
+    TextMixin,
+    DirectionalMixin,
+    ClassificationMixin,
+    ExtractionMixin,
+    ShapeDetectionMixin,
+    DescribeMixin,
+    Visualizable,
 ):
     """Represents a rectangular region on a page.
@@ -193,6 +217,62 @@ class Region(
         self.text_content = None  # Direct text content (e.g., from Docling)
         self.associated_text_elements = []  # Native text elements that overlap with this region
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = True,  # Default to True for regions
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this region.
+        Args:
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting this region in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop to this region
+            crop_bbox: Explicit crop bounds (overrides region bounds)
+            **kwargs: Additional parameters
+        Returns:
+            List containing a single RenderSpec for this region's page
+        """
+        from typing import Literal
+        spec = RenderSpec(page=self.page)
+        # Handle cropping
+        if crop_bbox:
+            spec.crop_bbox = crop_bbox
+        elif crop:
+            # Crop to this region's bounds
+            spec.crop_bbox = self.bbox
+        # Add highlights in show mode
+        if mode == "show":
+            # Highlight this region
+            if color or mode == "show":  # Always highlight in show mode
+                spec.add_highlight(
+                    bbox=self.bbox,
+                    polygon=self.polygon if self.has_polygon else None,
+                    color=color or "blue",
+                    label=self.label or self.name or "Region",
+                )
+            # Add additional highlight groups if provided
+            if highlights:
+                for group in highlights:
+                    elements = group.get("elements", [])
+                    group_color = group.get("color", color)
+                    group_label = group.get("label")
+                    for elem in elements:
+                        spec.add_highlight(element=elem, color=group_color, label=group_label)
+        return [spec]
     def _direction(
         self,
         direction: str,
@@ -633,7 +713,7 @@ class Region(
         label: Optional[str] = None,
         color: Optional[Union[Tuple, str]] = None,
         use_color_cycling: bool = False,
-        include_attrs: Optional[List[str]] = None,
+        annotate: Optional[List[str]] = None,
         existing: str = "append",
     ) -> "Region":
         """
@@ -643,7 +723,7 @@ class Region(
             label: Optional label for the highlight
             color: Color tuple/string for the highlight, or None to use automatic color
             use_color_cycling: Force color cycling even with no label (default: False)
-            include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
+            annotate: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
             existing: How to handle existing highlights ('append' or 'replace').
         Returns:
@@ -659,7 +739,7 @@ class Region(
             "label": label,
             "use_color_cycling": use_color_cycling,
             "element": self,  # Pass the region itself so attributes can be accessed
-            "include_attrs": include_attrs,
+            "annotate": annotate,
             "existing": existing,
         }
@@ -673,178 +753,6 @@ class Region(
         return self
-    def to_image(
-        self,
-        resolution: Optional[float] = None,
-        crop: bool = False,
-        include_highlights: bool = True,
-        **kwargs,
-    ) -> "Image.Image":
-        """
-        Generate an image of just this region.
-        Args:
-            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
-            crop: If True, only crop the region without highlighting its boundaries
-            include_highlights: Whether to include existing highlights (default: True)
-            **kwargs: Additional parameters for page.to_image()
-        Returns:
-            PIL Image of just this region
-        """
-        # Apply global options as defaults
-        import natural_pdf
-        if resolution is None:
-            if natural_pdf.options.image.resolution is not None:
-                resolution = natural_pdf.options.image.resolution
-            else:
-                resolution = 144  # Default resolution when none specified
-        # Handle the case where user wants the cropped region to have a specific width
-        page_kwargs = kwargs.copy()
-        effective_resolution = resolution  # Start with the provided resolution
-        if crop and "width" in kwargs:
-            target_width = kwargs["width"]
-            # Calculate what resolution is needed to make the region crop have target_width
-            region_width_points = self.width  # Region width in PDF points
-            if region_width_points > 0:
-                # Calculate scale needed: target_width / region_width_points
-                required_scale = target_width / region_width_points
-                # Convert scale to resolution: scale * 72 DPI
-                effective_resolution = required_scale * 72.0
-                page_kwargs.pop("width")  # Remove width parameter to avoid conflicts
-                logger.debug(
-                    f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
-                )
-            else:
-                logger.warning(
-                    f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
-                )
-        # First get the full page image with highlights if requested
-        page_image = self._page.to_image(
-            resolution=effective_resolution,
-            include_highlights=include_highlights,
-            **page_kwargs,
-        )
-        # Calculate the actual scale factor used by the page image
-        if page_image.width > 0 and self._page.width > 0:
-            scale_factor = page_image.width / self._page.width
-        else:
-            # Fallback to resolution-based calculation if dimensions are invalid
-            scale_factor = resolution / 72.0
-        # Apply scaling to the coordinates
-        x0 = int(self.x0 * scale_factor)
-        top = int(self.top * scale_factor)
-        x1 = int(self.x1 * scale_factor)
-        bottom = int(self.bottom * scale_factor)
-        # Ensure coords are valid for cropping (left < right, top < bottom)
-        if x0 >= x1:
-            logger.warning(
-                f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
-            )
-            return None
-        if top >= bottom:
-            logger.warning(
-                f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
-            )
-            return None
-        # Crop the image to just this region
-        region_image = page_image.crop((x0, top, x1, bottom))
-        # If not crop, add a border to highlight the region boundaries
-        if not crop:
-            from PIL import ImageDraw
-            # Create a 1px border around the region
-            draw = ImageDraw.Draw(region_image)
-            draw.rectangle(
-                (0, 0, region_image.width - 1, region_image.height - 1),
-                outline=(255, 0, 0),
-                width=1,
-            )
-        return region_image
-    def show(
-        self,
-        resolution: Optional[float] = None,
-        labels: bool = True,
-        legend_position: str = "right",
-        # Add a default color for standalone show
-        color: Optional[Union[Tuple, str]] = "blue",
-        label: Optional[str] = None,
-        width: Optional[int] = None,  # Add width parameter
-        crop: bool = False,  # NEW: Crop output to region bounds before legend
-    ) -> "Image.Image":
-        """
-        Show the page with just this region highlighted temporarily.
-        Args:
-            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
-            labels: Whether to include a legend for labels
-            legend_position: Position of the legend
-            color: Color to highlight this region (default: blue)
-            label: Optional label for this region in the legend
-            width: Optional width for the output image in pixels
-            crop: If True, crop the rendered image to this region's
-                        bounding box (with a small margin handled inside
-                        HighlightingService) before legends/overlays are added.
-        Returns:
-            PIL Image of the page with only this region highlighted
-        """
-        # Apply global options as defaults
-        import natural_pdf
-        if resolution is None:
-            if natural_pdf.options.image.resolution is not None:
-                resolution = natural_pdf.options.image.resolution
-            else:
-                resolution = 144  # Default resolution when none specified
-        if not self._page:
-            raise ValueError("Region must be associated with a page to show.")
-        # Use the highlighting service via the page's property
-        service = self._page._highlighter
-        # Determine the label if not provided
-        display_label = (
-            label if label is not None else f"Region ({self.type})" if self.type else "Region"
-        )
-        # Prepare temporary highlight data for just this region
-        temp_highlight_data = {
-            "page_index": self._page.index,
-            "bbox": self.bbox,
-            "polygon": self.polygon if self.has_polygon else None,
-            "color": color,  # Use provided or default color
-            "label": display_label,
-            "use_color_cycling": False,  # Explicitly false for single preview
-        }
-        # Determine crop bbox if requested
-        crop_bbox = self.bbox if crop else None
-        # Use render_preview to show only this highlight
-        return service.render_preview(
-            page_index=self._page.index,
-            temporary_highlights=[temp_highlight_data],
-            resolution=resolution,
-            width=width,  # Pass the width parameter
-            labels=labels,
-            legend_position=legend_position,
-            crop_bbox=crop_bbox,
-        )
     def save(
         self,
         filename: str,
@@ -898,7 +806,7 @@ class Region(
             resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             crop: If True, only crop the region without highlighting its boundaries
             include_highlights: Whether to include existing highlights (default: True)
-            **kwargs: Additional parameters for page.to_image()
+            **kwargs: Additional parameters for rendering
         Returns:
             Self for method chaining
@@ -912,16 +820,23 @@ class Region(
             else:
                 resolution = 144  # Default resolution when none specified
-        # Get the region image
-        image = self.to_image(
-            resolution=resolution,
-            crop=crop,
-            include_highlights=include_highlights,
-            **kwargs,
-        )
+        # Use export() to save the image
+        if include_highlights:
+            # With highlights, use export() which includes them
+            self.export(
+                path=filename,
+                resolution=resolution,
+                crop=crop,
+                **kwargs,
+            )
+        else:
+            # Without highlights, use render() and save manually
+            image = self.render(resolution=resolution, crop=crop, **kwargs)
+            if image:
+                image.save(filename)
+            else:
+                logger.error(f"Failed to render region image for saving to {filename}")
-        # Save the image
-        image.save(filename)
         return self
     def trim(
@@ -982,7 +897,8 @@ class Region(
         )
         # Get the region image
-        image = work_region.to_image(resolution=resolution, crop=True, include_highlights=False)
+        # Use render() for clean image without highlights, with cropping
+        image = work_region.render(resolution=resolution, crop=True)
         if image is None:
             logger.warning(
@@ -1221,7 +1137,9 @@ class Region(
             # Filter to elements in this region
             return [e for e in page_elements if self._is_element_in_region(e)]
-    def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
+    def extract_text(
+        self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
+    ) -> str:
         """
         Extract text from this region, respecting page exclusions and using pdfplumber's
         layout engine (chars_to_textmap).
@@ -1293,7 +1211,7 @@ class Region(
         final_kwargs = kwargs.copy()
         if content_filter is not None:
             final_kwargs["content_filter"] = content_filter
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=self.bbox,  # Use region's bbox for context
@@ -1313,7 +1231,9 @@ class Region(
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         # --- NEW: Add tqdm control option --- #
         show_progress: bool = False,  # Controls progress bar for text method
-        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,  # NEW: Content filtering
+        content_filter: Optional[
+            Union[str, Callable[[str], bool], List[str]]
+        ] = None,  # NEW: Content filtering
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1373,7 +1293,11 @@ class Region(
                     logger.debug(
                         f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
                     )
-                    return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
+                    return TableResult(
+                        self._extract_table_from_cells(
+                            cell_regions_in_table, content_filter=content_filter
+                        )
+                    )
                 # --------------------------------------------------------------- #
@@ -1454,7 +1378,9 @@ class Region(
         # Use the selected method
         if effective_method == "tatr":
-            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
+            table_rows = self._extract_table_tatr(
+                use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
+            )
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
@@ -1610,8 +1536,47 @@ class Region(
             table_settings.setdefault("join_x_tolerance", join)
             table_settings.setdefault("join_y_tolerance", join)
-        # Create a crop of the page for this region
-        cropped = self.page._page.crop(self.bbox)
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering, if any exclusions are
+        # defined on the parent Page.  We create a lightweight
+        # pdfplumber.Page copy whose .chars list omits characters that
+        # fall inside any exclusion Region.  Other object types are
+        # left untouched for now ("chars-only" strategy).
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+        if getattr(self.page, "_exclusions", None):
+            # Resolve exclusion Regions (callables already evaluated)
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+            def _keep_char(obj):
+                """Return True if pdfplumber obj should be kept."""
+                if obj.get("object_type") != "char":
+                    # Keep non-char objects unchanged – lattice grids etc.
+                    return True
+                # Compute character centre point
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+                # Reject if the centre lies inside ANY exclusion Region
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                # Fallback – if filtering fails, log and proceed unfiltered
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+        cropped = filtered_page.crop(self.bbox)
         # Extract all tables from the cropped area
         tables = cropped.extract_tables(table_settings)
@@ -1672,8 +1637,38 @@ class Region(
             if y_tol is not None:
                 table_settings.setdefault("text_y_tolerance", y_tol)
-        # Create a crop of the page for this region
-        cropped = self.page._page.crop(self.bbox)
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering (chars only) just like in
+        # _extract_tables_plumber so header/footer text does not appear
+        # in extracted tables.
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+        if getattr(self.page, "_exclusions", None):
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+            def _keep_char(obj):
+                if obj.get("object_type") != "char":
+                    return True
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+        # Now crop the (possibly filtered) page to the region bbox
+        cropped = filtered_page.crop(self.bbox)
         # Extract the single largest table from the cropped area
         table = cropped.extract_table(table_settings)
@@ -1688,10 +1683,12 @@ class Region(
                     if cell is not None:
                         # Apply RTL text processing first
                         rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
                         # Then apply content filter if provided
                         if content_filter is not None:
-                            filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
+                            filtered_cell = self._apply_content_filter_to_text(
+                                rtl_processed_cell, content_filter
+                            )
                             processed_row.append(filtered_cell)
                         else:
                             processed_row.append(rtl_processed_cell)
@@ -1701,7 +1698,9 @@ class Region(
             return processed_table
         return []
-    def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
+    def _extract_table_tatr(
+        self, use_ocr=False, ocr_config=None, content_filter=None
+    ) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
@@ -2098,7 +2097,7 @@ class Region(
         Returns:
             ElementCollection with matching elements.
         """
-        from natural_pdf.elements.collections import ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
         if selector is not None and text is not None:
             raise ValueError("Provide either 'selector' or 'text', not both.")
@@ -2183,7 +2182,7 @@ class Region(
         ---------
         ```python
         def llm_ocr(region):
-            image = region.to_image(resolution=300, crop=True)
+            image = region.render(resolution=300, crop=True)
             return my_llm_client.ocr(image)
         region.apply_ocr(function=llm_ocr)
         ```
@@ -2293,9 +2292,8 @@ class Region(
         # Render the page region to an image using the determined resolution
         try:
-            region_image = self.to_image(
-                resolution=final_resolution, include_highlights=False, crop=True
-            )
+            # Use render() for clean image without highlights, with cropping
+            region_image = self.render(resolution=final_resolution, crop=True)
             if not region_image:
                 logger.error("Failed to render region to image for OCR.")
                 return self
@@ -2417,7 +2415,7 @@ class Region(
         Example:
             # Using with an LLM
             def ocr_with_llm(region):
-                image = region.to_image(resolution=300, crop=True)
+                image = region.render(resolution=300, crop=True)
                 # Call your LLM API here
                 return llm_client.ocr(image)
@@ -2425,7 +2423,7 @@ class Region(
             # Using with a custom OCR service
             def ocr_with_service(region):
-                img_bytes = region.to_image(crop=True).tobytes()
+                img_bytes = region.render(crop=True).tobytes()
                 response = ocr_service.process(img_bytes)
                 return response.text
@@ -2530,14 +2528,14 @@ class Region(
         return self
-    def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
+    def get_section_between(self, start_element=None, end_element=None, include_boundaries="both"):
         """
         Get a section between two elements within this region.
         Args:
             start_element: Element marking the start of the section
             end_element: Element marking the end of the section
-            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
         Returns:
             Region representing the section
@@ -2586,15 +2584,15 @@ class Region(
         start_element_for_bbox = start_element
         end_element_for_bbox = end_element
-        if boundary_inclusion == "none":
+        if include_boundaries == "none":
             start_idx += 1
             end_idx -= 1
             start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
             end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
-        elif boundary_inclusion == "start":
+        elif include_boundaries == "start":
             end_idx -= 1
             end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
-        elif boundary_inclusion == "end":
+        elif include_boundaries == "end":
             start_idx += 1
             start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
@@ -2627,7 +2625,7 @@ class Region(
         return section
     def get_sections(
-        self, start_elements=None, end_elements=None, boundary_inclusion="both"
+        self, start_elements=None, end_elements=None, include_boundaries="both"
     ) -> "ElementCollection[Region]":
         """
         Get sections within this region based on start/end elements.
@@ -2635,12 +2633,12 @@ class Region(
         Args:
             start_elements: Elements or selector string that mark the start of sections
             end_elements: Elements or selector string that mark the end of sections
-            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
         Returns:
             List of Region objects representing the extracted sections
         """
-        from natural_pdf.elements.collections import ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
         # Process string selectors to find elements WITHIN THIS REGION
         if isinstance(start_elements, str):
@@ -2714,7 +2712,7 @@ class Region(
                 start_element = current_start_boundary["element"]
                 end_element = boundary["element"]
                 # Use the helper, ensuring elements are from within the region
-                section = self.get_section_between(start_element, end_element, boundary_inclusion)
+                section = self.get_section_between(start_element, end_element, include_boundaries)
                 sections.append(section)
                 current_start_boundary = None  # Reset
@@ -2731,7 +2729,7 @@ class Region(
                 if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
                     end_element = all_elements_in_region[end_idx]
                     section = self.get_section_between(
-                        start_element, end_element, boundary_inclusion
+                        start_element, end_element, include_boundaries
                     )
                     sections.append(section)
                 # Else: Section started and ended by consecutive start elements? Create empty?
@@ -2745,7 +2743,7 @@ class Region(
             start_element = current_start_boundary["element"]
             # End at the last element within the region
             end_element = all_elements_in_region[-1]
-            section = self.get_section_between(start_element, end_element, boundary_inclusion)
+            section = self.get_section_between(start_element, end_element, include_boundaries)
             sections.append(section)
         return ElementCollection(sections)
@@ -3007,46 +3005,23 @@ class Region(
         source_info = f" source='{self.source}'" if self.source else ""
         return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
-    def correct_ocr(
+    def update_text(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
-    ) -> "Region":  # Return self for chaining
-        """
-        Applies corrections to OCR-generated text elements within this region
-        using a user-provided callback function.
-        Finds text elements within this region whose 'source' attribute starts
-        with 'ocr' and calls the `correction_callback` for each, passing the
-        element itself.
-        The `correction_callback` should contain the logic to:
-        1. Determine if the element needs correction.
-        2. Perform the correction (e.g., call an LLM).
-        3. Return the new text (`str`) or `None`.
-        If the callback returns a string, the element's `.text` is updated.
-        Metadata updates (source, confidence, etc.) should happen within the callback.
-        Args:
-            correction_callback: A function accepting an element and returning
-                                 `Optional[str]` (new text or None).
+        transform: Callable[[Any], Optional[str]],
+        *,
+        selector: str = "text",
+        apply_exclusions: bool = False,
+    ) -> "Region":
+        """Apply *transform* to every text element matched by *selector* inside this region.
-        Returns:
-            Self for method chaining.
+        The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
+        override simply ensures the search is scoped to the region.
         """
-        # Find OCR elements specifically within this region
-        # Note: We typically want to correct even if the element falls in an excluded area
-        target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
-        # Delegate to the utility function
-        _apply_ocr_correction_to_elements(
-            elements=target_elements,  # Pass the ElementCollection directly
-            correction_callback=correction_callback,
-            caller_info=f"Region({self.bbox})",  # Pass caller info
+        return TextMixin.update_text(
+            self, transform, selector=selector, apply_exclusions=apply_exclusions
         )
-        return self  # Return self for chaining
     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
         if (
@@ -3086,9 +3061,8 @@ class Region(
                 else default_resolution
             )
-            img = self.to_image(
+            img = self.render(
                 resolution=resolution,
-                include_highlights=False,  # No highlights for classification input
                 crop=True,  # Just the region content
             )
             if img is None:
@@ -3218,7 +3192,7 @@ class Region(
             An ElementCollection containing temporary Region objects for each detected cell,
             or an empty ElementCollection if no cells are found or an error occurs.
         """
-        from natural_pdf.elements.collections import ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
         # 1. Perform the analysis (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -3420,13 +3394,15 @@ class Region(
     # New helper: build table from pre-computed table_cell regions
     # ------------------------------------------------------------------
-    def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
+    def _extract_table_from_cells(
+        self, cell_regions: List["Region"], content_filter=None
+    ) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
         This assumes each cell Region has metadata.row_index / col_index as written by
         detect_table_structure_from_lines().  If these keys are missing we will
         fall back to sorting by geometry.
         Args:
             cell_regions: List of table cell Region objects to extract text from
             content_filter: Optional content filter to apply to cell text extraction
@@ -3460,7 +3436,9 @@ class Region(
                 try:
                     r_idx = int(cell.metadata.get("row_index"))
                     c_idx = int(cell.metadata.get("col_index"))
-                    text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
+                    text_val = cell.extract_text(
+                        layout=False, apply_exclusions=False, content_filter=content_filter
+                    ).strip()
                     table_grid[r_idx][c_idx] = text_val if text_val else None
                 except Exception as _err:
                     # Skip problematic cell
@@ -3507,7 +3485,9 @@ class Region(
             row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
             col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
-            text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
+            text_val = cell.extract_text(
+                layout=False, apply_exclusions=False, content_filter=content_filter
+            ).strip()
             table_grid[row_idx][col_idx] = text_val if text_val else None
         return table_grid
@@ -3515,32 +3495,33 @@ class Region(
     def _apply_rtl_processing_to_text(self, text: str) -> str:
         """
         Apply RTL (Right-to-Left) text processing to a string.
         This converts visual order text (as stored in PDFs) to logical order
         for proper display of Arabic, Hebrew, and other RTL scripts.
         Args:
             text: Input text string in visual order
         Returns:
             Text string in logical order
         """
         if not text or not text.strip():
             return text
         # Quick check for RTL characters - if none found, return as-is
         import unicodedata
         def _contains_rtl(s):
             return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
         if not _contains_rtl(text):
             return text
         try:
             from bidi.algorithm import get_display  # type: ignore
             from natural_pdf.utils.bidi_mirror import mirror_brackets
             # Apply BiDi algorithm to convert from visual to logical order
             # Process line by line to handle mixed content properly
             processed_lines = []
@@ -3553,9 +3534,9 @@ class Region(
                     processed_lines.append(mirror_brackets(logical_line))
                 else:
                     processed_lines.append(line)
             return "\n".join(processed_lines)
         except (ImportError, Exception):
             # If bidi library is not available or fails, return original text
             return text
@@ -3563,36 +3544,36 @@ class Region(
     def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
         """
         Apply content filter to a text string.
         Args:
             text: Input text string
             content_filter: Content filter (regex, callable, or list of regexes)
         Returns:
             Filtered text string
         """
         if not text or content_filter is None:
             return text
         import re
         if isinstance(content_filter, str):
             # Single regex pattern - remove matching parts
             try:
-                return re.sub(content_filter, '', text)
+                return re.sub(content_filter, "", text)
             except re.error:
                 return text  # Invalid regex, return original
         elif isinstance(content_filter, list):
             # List of regex patterns - remove parts matching ANY pattern
             try:
                 result = text
                 for pattern in content_filter:
-                    result = re.sub(pattern, '', result)
+                    result = re.sub(pattern, "", result)
                 return result
             except re.error:
                 return text  # Invalid regex, return original
         elif callable(content_filter):
             # Callable filter - apply to individual characters
             try:
@@ -3600,8 +3581,152 @@ class Region(
                 for char in text:
                     if content_filter(char):
                         filtered_chars.append(char)
-                return ''.join(filtered_chars)
+                return "".join(filtered_chars)
             except Exception:
                 return text  # Function error, return original
         return text
+    # ------------------------------------------------------------------
+    # Interactive Viewer Support
+    # ------------------------------------------------------------------
+    def viewer(
+        self,
+        *,
+        resolution: int = 150,
+        include_chars: bool = False,
+        include_attributes: Optional[List[str]] = None,
+    ) -> Optional["InteractiveViewerWidget"]:
+        """Create an interactive ipywidget viewer for **this specific region**.
+        The method renders the region to an image (cropped to the region bounds) and
+        overlays all elements that intersect the region (optionally excluding noisy
+        character-level elements).  The resulting widget offers the same zoom / pan
+        experience as :py:meth:`Page.viewer` but scoped to the region.
+        Parameters
+        ----------
+        resolution : int, default 150
+            Rendering resolution (DPI).  This should match the value used by the
+            page-level viewer so element scaling is accurate.
+        include_chars : bool, default False
+            Whether to include individual *char* elements in the overlay.  These
+            are often too dense for a meaningful visualisation so are skipped by
+            default.
+        include_attributes : list[str], optional
+            Additional element attributes to expose in the info panel (on top of
+            the default set used by the page viewer).
+        Returns
+        -------
+        InteractiveViewerWidget | None
+            The widget instance, or ``None`` if *ipywidgets* is not installed or
+            an error occurred during creation.
+        """
+        # ------------------------------------------------------------------
+        # Dependency / environment checks
+        # ------------------------------------------------------------------
+        if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
+            logger.error(
+                "Interactive viewer requires 'ipywidgets'. "
+                'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
+            )
+            return None
+        try:
+            # ------------------------------------------------------------------
+            # Render region image (cropped) and encode as data URI
+            # ------------------------------------------------------------------
+            import base64
+            from io import BytesIO
+            # Use unified render() with crop=True to obtain just the region
+            img = self.render(resolution=resolution, crop=True)
+            if img is None:
+                logger.error(f"Failed to render image for region {self.bbox} viewer.")
+                return None
+            buf = BytesIO()
+            img.save(buf, format="PNG")
+            img_str = base64.b64encode(buf.getvalue()).decode()
+            image_uri = f"data:image/png;base64,{img_str}"
+            # ------------------------------------------------------------------
+            # Prepare element overlay data (coordinates relative to region)
+            # ------------------------------------------------------------------
+            scale = resolution / 72.0  # Same convention as page viewer
+            # Gather elements intersecting the region
+            region_elements = self.get_elements(apply_exclusions=False)
+            # Optionally filter out chars
+            if not include_chars:
+                region_elements = [
+                    el for el in region_elements if str(getattr(el, "type", "")).lower() != "char"
+                ]
+            default_attrs = [
+                "text",
+                "fontname",
+                "size",
+                "bold",
+                "italic",
+                "color",
+                "linewidth",
+                "is_horizontal",
+                "is_vertical",
+                "source",
+                "confidence",
+                "label",
+                "model",
+                "upright",
+                "direction",
+            ]
+            if include_attributes:
+                default_attrs.extend([a for a in include_attributes if a not in default_attrs])
+            elements_json: List[dict] = []
+            for idx, el in enumerate(region_elements):
+                try:
+                    # Calculate coordinates relative to region bbox and apply scale
+                    x0 = (el.x0 - self.x0) * scale
+                    y0 = (el.top - self.top) * scale
+                    x1 = (el.x1 - self.x0) * scale
+                    y1 = (el.bottom - self.top) * scale
+                    elem_dict = {
+                        "id": idx,
+                        "type": getattr(el, "type", "unknown"),
+                        "x0": round(x0, 2),
+                        "y0": round(y0, 2),
+                        "x1": round(x1, 2),
+                        "y1": round(y1, 2),
+                        "width": round(x1 - x0, 2),
+                        "height": round(y1 - y0, 2),
+                    }
+                    # Add requested / default attributes
+                    for attr_name in default_attrs:
+                        if hasattr(el, attr_name):
+                            val = getattr(el, attr_name)
+                            # Ensure JSON serialisable
+                            if not isinstance(val, (str, int, float, bool, list, dict, type(None))):
+                                val = str(val)
+                            elem_dict[attr_name] = val
+                    elements_json.append(elem_dict)
+                except Exception as e:
+                    logger.warning(f"Error preparing element {idx} for region viewer: {e}")
+            viewer_data = {"page_image": image_uri, "elements": elements_json}
+            # ------------------------------------------------------------------
+            # Instantiate the widget directly using the prepared data
+            # ------------------------------------------------------------------
+            return InteractiveViewerWidget(pdf_data=viewer_data)
+        except Exception as e:
+            logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
+            return None

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

natural-pdf 0.1.38py3-none-any.whl → 0.2.0py3-none-any.whl