PyPI - natural-pdf - Versions diffs - 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl - Mend

natural-pdf 0.1.40 (py3-none-any.whl) → 0.2.1.dev0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

natural_pdf/__init__.py +6 -7
natural_pdf/analyzers/__init__.py +6 -1
natural_pdf/analyzers/guides.py +354 -258
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +18 -4
natural_pdf/analyzers/layout/paddle.py +11 -0
natural_pdf/analyzers/layout/surya.py +2 -3
natural_pdf/analyzers/shape_detection_mixin.py +25 -34
natural_pdf/analyzers/text_structure.py +2 -2
natural_pdf/classification/manager.py +1 -1
natural_pdf/collections/mixins.py +3 -2
natural_pdf/core/highlighting_service.py +743 -32
natural_pdf/core/page.py +236 -383
natural_pdf/core/page_collection.py +1249 -0
natural_pdf/core/pdf.py +172 -83
natural_pdf/{collections → core}/pdf_collection.py +18 -11
natural_pdf/core/render_spec.py +335 -0
natural_pdf/describe/base.py +1 -1
natural_pdf/elements/__init__.py +1 -0
natural_pdf/elements/base.py +108 -83
natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
natural_pdf/elements/line.py +0 -1
natural_pdf/elements/rect.py +0 -1
natural_pdf/elements/region.py +318 -243
natural_pdf/elements/text.py +9 -7
natural_pdf/exporters/base.py +2 -2
natural_pdf/exporters/original_pdf.py +1 -1
natural_pdf/exporters/paddleocr.py +2 -4
natural_pdf/exporters/searchable_pdf.py +3 -2
natural_pdf/extraction/mixin.py +1 -3
natural_pdf/flows/collections.py +1 -69
natural_pdf/flows/element.py +4 -4
natural_pdf/flows/flow.py +1200 -243
natural_pdf/flows/region.py +707 -261
natural_pdf/ocr/ocr_options.py +0 -2
natural_pdf/ocr/utils.py +2 -1
natural_pdf/qa/document_qa.py +21 -5
natural_pdf/search/search_service_protocol.py +1 -1
natural_pdf/selectors/parser.py +2 -2
natural_pdf/tables/result.py +35 -1
natural_pdf/text_mixin.py +7 -3
natural_pdf/utils/debug.py +2 -1
natural_pdf/utils/highlighting.py +1 -0
natural_pdf/utils/layout.py +2 -2
natural_pdf/utils/packaging.py +4 -3
natural_pdf/utils/text_extraction.py +15 -12
natural_pdf/utils/visualization.py +385 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
optimization/memory_comparison.py +1 -1
optimization/pdf_analyzer.py +2 -2
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/page.py (changed)

@@ -16,6 +16,7 @@ from typing import (  # Added overload
     Callable,
     Dict,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -26,7 +27,7 @@ import pdfplumber
 from PIL import Image, ImageDraw
 from tqdm.auto import tqdm  # Added tqdm import
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.element_collection import ElementCollection
 from natural_pdf.elements.region import Region
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock  # Import from utils instead
@@ -38,7 +39,6 @@ if TYPE_CHECKING:
     from natural_pdf.core.highlighting_service import HighlightingService
     from natural_pdf.core.pdf import PDF
     from natural_pdf.elements.base import Element
-    from natural_pdf.elements.collections import ElementCollection
 # # New Imports
 import itertools
@@ -61,12 +61,19 @@ from natural_pdf.classification.manager import ClassificationManager  # For type
 # # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin  # Import classification mixin
 from natural_pdf.core.element_manager import ElementManager
+# Add new import
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
 from natural_pdf.describe.mixin import DescribeMixin  # Import describe mixin
 from natural_pdf.elements.base import Element  # Import base element
 from natural_pdf.elements.text import TextElement
+from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
+# --- Text update mixin import --- #
+from natural_pdf.text_mixin import TextMixin
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # # Import new utils
@@ -75,10 +82,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
 # --- End Classification Imports --- #
-# --- Text update mixin import --- #
-from natural_pdf.text_mixin import TextMixin
-from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 try:
     from deskew import determine_skew
@@ -92,7 +95,14 @@ except ImportError:
 logger = logging.getLogger(__name__)
-class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
+class Page(
+    TextMixin,
+    ClassificationMixin,
+    ExtractionMixin,
+    ShapeDetectionMixin,
+    DescribeMixin,
+    Visualizable,
+):
     """Enhanced Page wrapper built on top of pdfplumber.Page.
     This class provides a fluent interface for working with PDF pages,
@@ -262,6 +272,77 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         self._load_elements()
         self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this page.
+        Args:
+            mode: Rendering mode - 'show' includes page highlights, 'render' is clean
+            color: Default color for highlights in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop the page
+            crop_bbox: Explicit crop bounds
+            **kwargs: Additional parameters
+        Returns:
+            List containing a single RenderSpec for this page
+        """
+        spec = RenderSpec(page=self)
+        # Handle cropping
+        if crop_bbox:
+            spec.crop_bbox = crop_bbox
+        elif crop == "content":
+            # Calculate content bounds from all elements
+            elements = self.get_elements(apply_exclusions=False)
+            if elements:
+                # Get bounding box of all elements
+                x_coords = []
+                y_coords = []
+                for elem in elements:
+                    if hasattr(elem, "bbox") and elem.bbox:
+                        x0, y0, x1, y1 = elem.bbox
+                        x_coords.extend([x0, x1])
+                        y_coords.extend([y0, y1])
+                if x_coords and y_coords:
+                    spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+        elif crop is True:
+            # Crop to full page (no-op, but included for consistency)
+            spec.crop_bbox = (0, 0, self.width, self.height)
+        # Add highlights in show mode
+        if mode == "show":
+            # Add page's persistent highlights if any
+            page_highlights = self._highlighter.get_highlights_for_page(self.index)
+            for highlight in page_highlights:
+                spec.add_highlight(
+                    bbox=highlight.bbox,
+                    polygon=highlight.polygon,
+                    color=highlight.color,
+                    label=highlight.label,
+                    element=None,  # Persistent highlights don't have element refs
+                )
+            # Add additional highlight groups if provided
+            if highlights:
+                for group in highlights:
+                    elements = group.get("elements", [])
+                    group_color = group.get("color", color)
+                    group_label = group.get("label")
+                    for elem in elements:
+                        spec.add_highlight(element=elem, color=group_color, label=group_label)
+        return [spec]
     @property
     def pdf(self) -> "PDF":
         """Provides public access to the parent PDF object."""
@@ -322,7 +403,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             exclusion_func_or_region: Either a callable function returning a Region,
                                       a Region object, or another object with a valid .bbox attribute.
             label: Optional label for this exclusion (e.g., 'header', 'footer').
-            method: Exclusion method - 'region' (exclude all elements in bounding box) or
+            method: Exclusion method - 'region' (exclude all elements in bounding box) or
                     'element' (exclude only the specific elements). Default: 'region'.
         Returns:
@@ -346,7 +427,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         # Likewise, if an ElementCollection is passed we iterate over its
         # elements and create Regions for each one.
         # ------------------------------------------------------------------
-        from natural_pdf.elements.collections import ElementCollection  # local import to avoid cycle
+        # Import ElementCollection from the new module path (old path removed)
+        from natural_pdf.elements.element_collection import ElementCollection
         # Selector string ---------------------------------------------------
         if isinstance(exclusion_func_or_region, str):
@@ -368,7 +450,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                 else:  # method == "region"
                     for el in matching_elements:
                         try:
-                            bbox_coords = (float(el.x0), float(el.top), float(el.x1), float(el.bottom))
+                            bbox_coords = (
+                                float(el.x0),
+                                float(el.top),
+                                float(el.x1),
+                                float(el.bottom),
+                            )
                             region = Region(self, bbox_coords, label=label)
                             # Store directly as a Region tuple so we don't recurse endlessly
                             self._exclusions.append((region, label, method))
@@ -376,9 +463,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                                 f"Page {self.index}: Added exclusion region from selector '{selector_str}' -> {bbox_coords}"
                             )
                         except Exception as e:
-                            logger.warning(
-                                f"Page {self.index}: Failed to create exclusion region from element {el}: {e}"
+                            # Re-raise so calling code/test sees the failure immediately
+                            logger.error(
+                                f"Page {self.index}: Failed to create exclusion region from element {el}: {e}",
+                                exc_info=False,
                             )
+                            raise
             return self  # Completed processing for selector input
         # ElementCollection -----------------------------------------------
@@ -406,9 +496,11 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                             f"Page {self.index}: Added exclusion region from ElementCollection element {bbox_coords}"
                         )
                     except Exception as e:
-                        logger.warning(
-                            f"Page {self.index}: Failed to convert ElementCollection element to Region: {e}"
+                        logger.error(
+                            f"Page {self.index}: Failed to convert ElementCollection element to Region: {e}",
+                            exc_info=False,
                         )
+                        raise
             return self  # Completed processing for ElementCollection input
         # ------------------------------------------------------------------
@@ -425,7 +517,11 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         elif isinstance(exclusion_func_or_region, Region):
             # Store Region objects directly, assigning the label
             exclusion_func_or_region.label = label  # Assign label
-            exclusion_data = (exclusion_func_or_region, label, method)  # Store as tuple for consistency
+            exclusion_data = (
+                exclusion_func_or_region,
+                label,
+                method,
+            )  # Store as tuple for consistency
             logger.debug(
                 f"Page {self.index}: Added Region exclusion '{label}' with method '{method}': {exclusion_func_or_region}"
             )
@@ -547,7 +643,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             else:
                 # New format: (exclusion_item, label, method)
                 exclusion_item, label, method = exclusion_data
             exclusion_label = label if label else f"exclusion {i}"
             # Process callable exclusion functions
@@ -609,7 +705,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
     ) -> List["Element"]:
         """
         Filters a list of elements, removing those based on exclusion rules.
-        Handles both region-based exclusions (exclude all in area) and
+        Handles both region-based exclusions (exclude all in area) and
         element-based exclusions (exclude only specific elements).
         Args:
@@ -633,7 +729,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         # Collect element-based exclusions
         excluded_elements = set()  # Use set for O(1) lookup
         for exclusion_data in self._exclusions:
             # Handle both old format (2-tuple) and new format (3-tuple)
             if len(exclusion_data) == 2:
@@ -641,15 +737,15 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                 method = "region"
             else:
                 exclusion_item, label, method = exclusion_data
             # Skip callables (already handled in _get_exclusion_regions)
             if callable(exclusion_item):
                 continue
             # Skip regions (already in exclusion_regions)
             if isinstance(exclusion_item, Region):
                 continue
             # Handle element-based exclusions
             if method == "element" and hasattr(exclusion_item, "bbox"):
                 excluded_elements.add(id(exclusion_item))
@@ -665,10 +761,10 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         filtered_elements = []
         region_excluded_count = 0
         element_excluded_count = 0
         for element in elements:
             exclude = False
             # Check element-based exclusions first (faster)
             if id(element) in excluded_elements:
                 exclude = True
@@ -685,7 +781,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                         if debug_exclusions:
                             print(f"    Element {element} excluded by region {region}")
                         break  # No need to check other regions for this element
             if not exclude:
                 filtered_elements.append(element)
@@ -837,7 +933,9 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         Returns:
             ElementCollection with matching elements.
         """
-        from natural_pdf.elements.collections import ElementCollection  # Import here for type hint
+        from natural_pdf.elements.element_collection import (  # Import here for type hint
+            ElementCollection,
+        )
         if selector is not None and text is not None:
             raise ValueError("Provide either 'selector' or 'text', not both.")
@@ -1324,7 +1422,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         return self._page.crop(bbox, **kwargs)
     def extract_text(
-        self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, content_filter=None, **kwargs
+        self,
+        preserve_whitespace=True,
+        use_exclusions=True,
+        debug_exclusions=False,
+        content_filter=None,
+        **kwargs,
     ) -> str:
         """
         Extract text from this page, respecting exclusions and using pdfplumber's
@@ -1363,11 +1466,15 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         # 2. Apply element-based exclusions if enabled
         if use_exclusions and self._exclusions:
-            # Filter word elements through _filter_elements_by_exclusions
+            # Filter word elements through _filter_elements_by_exclusions
             # This handles both element-based and region-based exclusions
-            word_elements = self._filter_elements_by_exclusions(word_elements, debug_exclusions=debug)
+            word_elements = self._filter_elements_by_exclusions(
+                word_elements, debug_exclusions=debug
+            )
             if debug:
-                logger.debug(f"Page {self.number}: {len(word_elements)} words remaining after exclusion filtering.")
+                logger.debug(
+                    f"Page {self.number}: {len(word_elements)} words remaining after exclusion filtering."
+                )
         # 3. Get region-based exclusions for spatial filtering
         apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
@@ -1375,7 +1482,9 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         if apply_exclusions_flag and self._exclusions:
             exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
             if debug:
-                logger.debug(f"Page {self.number}: Found {len(exclusion_regions)} region exclusions for spatial filtering.")
+                logger.debug(
+                    f"Page {self.number}: Found {len(exclusion_regions)} region exclusions for spatial filtering."
+                )
         elif debug:
             logger.debug(f"Page {self.number}: Not applying exclusions.")
@@ -1656,7 +1765,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                     table_settings.setdefault("join_y_tolerance", join)
             raw_tables = self._page.extract_tables(table_settings)
             # Apply RTL text processing to all extracted tables
             if raw_tables:
                 processed_tables = []
@@ -1674,7 +1783,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                         processed_table.append(processed_row)
                     processed_tables.append(processed_table)
                 return processed_tables
             return raw_tables
         else:
             raise ValueError(
@@ -1743,7 +1852,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         label: Optional[str] = None,
         use_color_cycling: bool = False,
         element: Optional[Any] = None,
-        include_attrs: Optional[List[str]] = None,
+        annotate: Optional[List[str]] = None,
         existing: str = "append",
     ) -> "Page":
         """
@@ -1756,7 +1865,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             label: Optional label for the highlight.
             use_color_cycling: If True and no label/color, use next cycle color.
             element: Optional original element being highlighted (for attribute extraction).
-            include_attrs: List of attribute names from 'element' to display.
+            annotate: List of attribute names from 'element' to display.
             existing: How to handle existing highlights ('append' or 'replace').
         Returns:
@@ -1770,7 +1879,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             label=label,
             use_color_cycling=use_color_cycling,
             element=element,
-            include_attrs=include_attrs,
+            annotate=annotate,
             existing=existing,
         )
         return self
@@ -1782,7 +1891,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         label: Optional[str] = None,
         use_color_cycling: bool = False,
         element: Optional[Any] = None,
-        include_attrs: Optional[List[str]] = None,
+        annotate: Optional[List[str]] = None,
         existing: str = "append",
     ) -> "Page":
         """
@@ -1795,7 +1904,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             label: Optional label for the highlight.
             use_color_cycling: If True and no label/color, use next cycle color.
             element: Optional original element being highlighted (for attribute extraction).
-            include_attrs: List of attribute names from 'element' to display.
+            annotate: List of attribute names from 'element' to display.
             existing: How to handle existing highlights ('append' or 'replace').
         Returns:
@@ -1808,41 +1917,11 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             label=label,
             use_color_cycling=use_color_cycling,
             element=element,
-            include_attrs=include_attrs,
+            annotate=annotate,
             existing=existing,
         )
         return self
-    def show(
-        self,
-        resolution: float = 144,
-        width: Optional[int] = None,
-        labels: bool = True,
-        legend_position: str = "right",
-        render_ocr: bool = False,
-    ) -> Optional[Image.Image]:
-        """
-        Generates and returns an image of the page with persistent highlights rendered.
-        Args:
-            resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
-            width: Optional width for the output image.
-            labels: Whether to include a legend for labels.
-            legend_position: Position of the legend.
-            render_ocr: Whether to render OCR text.
-        Returns:
-            PIL Image object of the page with highlights, or None if rendering fails.
-        """
-        return self.to_image(
-            resolution=resolution,
-            width=width,
-            labels=labels,
-            legend_position=legend_position,
-            render_ocr=render_ocr,
-            include_highlights=True,  # Ensure highlights are requested
-        )
     def save_image(
         self,
         filename: str,
@@ -1870,17 +1949,38 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         Returns:
             Self for method chaining.
         """
-        # Use to_image to generate and save the image
-        self.to_image(
-            path=filename,
-            width=width,
-            labels=labels,
-            legend_position=legend_position,
-            render_ocr=render_ocr,
-            include_highlights=include_highlights,
-            resolution=resolution,
-            **kwargs,
-        )
+        # Use export() to save the image
+        if include_highlights:
+            self.export(
+                path=filename,
+                resolution=resolution,
+                width=width,
+                labels=labels,
+                legend_position=legend_position,
+                render_ocr=render_ocr,
+                **kwargs,
+            )
+        else:
+            # For saving without highlights, use render() and save manually
+            img = self.render(resolution=resolution, **kwargs)
+            if img:
+                # Resize if width is specified
+                if width is not None and width > 0 and img.width > 0:
+                    aspect_ratio = img.height / img.width
+                    height = int(width * aspect_ratio)
+                    try:
+                        img = img.resize((width, height), Image.Resampling.LANCZOS)
+                    except Exception as e:
+                        logger.warning(f"Could not resize image: {e}")
+                # Save the image
+                try:
+                    if os.path.dirname(filename):
+                        os.makedirs(os.path.dirname(filename), exist_ok=True)
+                    img.save(filename)
+                except Exception as e:
+                    logger.error(f"Failed to save image to {filename}: {e}")
         return self
     def clear_highlights(self) -> "Page":
@@ -1923,280 +2023,6 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         # Return the collection of elements which now have style attributes
         return processed_elements_collection
-    def to_image(
-        self,
-        path: Optional[str] = None,
-        width: Optional[int] = None,
-        labels: bool = True,
-        legend_position: str = "right",
-        render_ocr: bool = False,
-        resolution: Optional[float] = None,
-        include_highlights: bool = True,
-        exclusions: Optional[str] = None,  # also accepted as an RGB/RGBA tuple below, despite the annotation
-        **kwargs,
-    ) -> Optional[Image.Image]:
-        """
-        Generate a PIL image of the page, using HighlightingService if needed.
-        Results (including failed renders, stored as None) are cached in
-        self._to_image_cache under a key built from every argument except `path`.
-        Args:
-            path: Optional path to save the image to. Saving happens after the
-                  cache lookup, so a cached image is written out as well.
-            width: Optional width for the output image. If None, the global
-                   natural_pdf.options.image.width is used.
-            labels: Whether to include a legend for highlights.
-            legend_position: Position of the legend.
-            render_ocr: Whether to render OCR text on highlights.
-            resolution: Resolution in DPI for base page image. If None, uses global setting or defaults to 144 DPI.
-            include_highlights: Whether to render highlights. When False the page
-                                is rendered with render_plain_page instead.
-            exclusions: Accepts one of the following:
-                        • None  – no masking (default)
-                        • "mask" – mask using solid white (back-compat)
-                        • CSS/HTML colour string (e.g. "red", "#ff0000", "#ff000080")
-                        • Tuple of RGB or RGBA values (ints 0-255 or floats 0-1)
-                        All excluded regions are filled with this colour.
-            **kwargs: Extra keyword arguments. They are forwarded to the
-                      highlighter's render_page only when include_highlights is
-                      True, and they always take part in the cache key.
-        Returns:
-            PIL Image of the page, or None if rendering fails.
-        """
-        # Apply global options as defaults, but allow explicit parameters to override
-        import natural_pdf
-        # Determine if this is likely a computational use (OCR, analysis, etc.)
-        # If resolution is explicitly provided but width is not, assume computational use
-        # and don't apply global display width settings
-        # NOTE(review): include_highlights is a named parameter of this method, so it can
-        # never arrive inside **kwargs; kwargs.get('include_highlights', True) always
-        # yields True here, which makes is_computational_use always False.
-        is_computational_use = (resolution is not None and width is None and
-                               kwargs.get('include_highlights', True) is False)
-        # Use global options if parameters are not explicitly set
-        if width is None and not is_computational_use:
-            width = natural_pdf.options.image.width
-        if resolution is None:
-            if natural_pdf.options.image.resolution is not None:
-                resolution = natural_pdf.options.image.resolution
-            else:
-                resolution = 144  # Default resolution when none specified
-        # 1. Create cache key (excluding path)
-        cache_key_parts = [
-            width,
-            labels,
-            legend_position,
-            render_ocr,
-            resolution,
-            include_highlights,
-            exclusions,
-        ]
-        # Convert kwargs to a stable, hashable representation
-        sorted_kwargs_list = []
-        for k, v in sorted(kwargs.items()):
-            if isinstance(v, list):
-                try:
-                    v = tuple(v)  # Convert lists to tuples
-                except TypeError:  # pragma: no cover
-                    # If list contains unhashable items, fall back to repr or skip
-                    # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
-                    logger.warning(
-                        f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
-                    )
-            sorted_kwargs_list.append((k, v))
-        cache_key_parts.append(tuple(sorted_kwargs_list))
-        try:
-            cache_key = tuple(cache_key_parts)
-        except TypeError as e:  # pragma: no cover
-            logger.warning(
-                f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
-            )
-            cache_key = None  # Fallback to not using cache for this call
-        image_to_return: Optional[Image.Image] = None
-        # 2. Check cache
-        if cache_key is not None and cache_key in self._to_image_cache:
-            # The cached object itself is handed back (no copy is made here)
-            image_to_return = self._to_image_cache[cache_key]
-            logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
-        else:
-            # --- This is the original logic to generate the image ---
-            rendered_image_component: Optional[Image.Image] = (
-                None  # Renamed from 'image' in original
-            )
-            render_resolution = resolution
-            thread_id = threading.current_thread().name
-            logger.debug(
-                f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
-            )
-            lock_wait_start = time.monotonic()
-            try:
-                # Acquire the global PDF rendering lock
-                with pdf_render_lock:
-                    lock_acquired_time = time.monotonic()
-                    logger.debug(
-                        f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
-                    )
-                    if include_highlights:
-                        # Delegate rendering to the central service
-                        rendered_image_component = self._highlighter.render_page(
-                            page_index=self.index,
-                            resolution=render_resolution,
-                            labels=labels,
-                            legend_position=legend_position,
-                            render_ocr=render_ocr,
-                            **kwargs,
-                        )
-                    else:
-                        # Plain render: kwargs are not forwarded on this path
-                        rendered_image_component = render_plain_page(self, render_resolution)
-            except Exception as e:
-                logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
-                # rendered_image_component remains None
-            finally:
-                # Runs on success and on failure alike, so the timing is always logged
-                render_end_time = time.monotonic()
-                logger.debug(
-                    f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
-                )
-            if rendered_image_component is None:
-                if cache_key is not None:
-                    self._to_image_cache[cache_key] = None  # Cache the failure
-                # Nothing can be saved on this path: the save below is guarded by an
-                # `is not None` check that is always False here, so at most the parent
-                # directory of `path` gets created.
-                if path:
-                    try:
-                        if os.path.dirname(path):
-                            os.makedirs(os.path.dirname(path), exist_ok=True)
-                        if rendered_image_component is not None:  # Should be None here
-                            rendered_image_component.save(path)  # This line won't be hit if None
-                        # else: logger.debug("Not saving None image") # Not strictly needed
-                    except Exception as save_error:  # pragma: no cover
-                        logger.error(f"Failed to save image to {path}: {save_error}")
-                return None
-            # --- Apply exclusion masking if requested ---
-            # This modifies 'rendered_image_component'
-            image_after_masking = rendered_image_component  # Start with the rendered image
-            # Determine if masking is requested and establish the fill colour
-            # Masking needs both an `exclusions` argument and a truthy self._exclusions
-            mask_requested = exclusions is not None and self._exclusions
-            mask_color: Union[str, Tuple[int, int, int, int]] = "white"  # default
-            if mask_requested:
-                if exclusions != "mask":
-                    # Attempt to parse custom colour input
-                    try:
-                        if isinstance(exclusions, tuple):
-                            # Handle RGB/RGBA tuples with ints 0-255 or floats 0-1
-                            processed = []
-                            all_float = all(isinstance(c, float) for c in exclusions)
-                            for i, c in enumerate(exclusions):
-                                if isinstance(c, float):
-                                    # Floats are scaled by 255 only when every component is a
-                                    # float or when this is the alpha slot (index 3);
-                                    # otherwise the float is just truncated to int.
-                                    val = int(c * 255) if all_float or i == 3 else int(c)
-                                else:
-                                    val = int(c)
-                                processed.append(max(0, min(255, val)))
-                            if len(processed) == 3:
-                                processed.append(255)  # add full alpha
-                            mask_color = tuple(processed)  # type: ignore[assignment]
-                        elif isinstance(exclusions, str):
-                            # Try using the optional 'colour' library for rich parsing
-                            try:
-                                from colour import Color  # type: ignore
-                                color_obj = Color(exclusions)
-                                mask_color = (
-                                    int(color_obj.red * 255),
-                                    int(color_obj.green * 255),
-                                    int(color_obj.blue * 255),
-                                    255,
-                                )
-                            except Exception:
-                                # Fallback: if parsing fails, treat as plain string accepted by PIL
-                                mask_color = exclusions  # e.g. "red"
-                        else:
-                            logger.warning(
-                                f"Unsupported exclusions colour spec: {exclusions!r}. Using white."
-                            )
-                    except Exception as colour_parse_err:  # pragma: no cover
-                        logger.warning(
-                            f"Failed to parse exclusions colour {exclusions!r}: {colour_parse_err}. Using white."
-                        )
-                try:
-                    # Ensure image is mutable (RGB or RGBA)
-                    if image_after_masking.mode not in ("RGB", "RGBA"):
-                        image_after_masking = image_after_masking.convert("RGB")
-                    exclusion_regions = self._get_exclusion_regions(
-                        include_callable=True, debug=False
-                    )
-                    if exclusion_regions:
-                        draw = ImageDraw.Draw(image_after_masking)
-                        # Scaling factor for converting PDF pts → image px
-                        # (masking runs before the optional resize, so the render
-                        # resolution is the right scale to use)
-                        img_scale = render_resolution / 72.0
-                        # Determine fill colour compatible with current mode
-                        def _mode_compatible(colour):
-                            if isinstance(colour, tuple) and image_after_masking.mode != "RGBA":
-                                return colour[:3]  # drop alpha for RGB images
-                            return colour
-                        fill_colour = _mode_compatible(mask_color)
-                        for region in exclusion_regions:
-                            img_x0 = region.x0 * img_scale
-                            img_top = region.top * img_scale
-                            img_x1 = region.x1 * img_scale
-                            img_bottom = region.bottom * img_scale
-                            # Clamp the rectangle to the image bounds
-                            img_coords = (
-                                max(0, img_x0),
-                                max(0, img_top),
-                                min(image_after_masking.width, img_x1),
-                                min(image_after_masking.height, img_bottom),
-                            )
-                            if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
-                                draw.rectangle(img_coords, fill=fill_colour)
-                            else:  # pragma: no cover
-                                logger.warning(
-                                    f"Skipping invalid exclusion rect for masking: {img_coords}"
-                                )
-                        del draw  # Release drawing context
-                except Exception as mask_error:  # pragma: no cover
-                    logger.error(
-                        f"Error applying exclusion mask to page {self.index}: {mask_error}",
-                        exc_info=True,
-                    )
-                    # Continue with potentially unmasked or partially masked image
-            # --- Resize the final image if width is provided ---
-            image_final_content = image_after_masking  # Start with image after masking
-            if width is not None and width > 0 and image_final_content.width > 0:
-                # Height is derived from the width so the aspect ratio is kept
-                aspect_ratio = image_final_content.height / image_final_content.width
-                height = int(width * aspect_ratio)
-                try:
-                    image_final_content = image_final_content.resize(
-                        (width, height), Image.Resampling.LANCZOS
-                    )
-                except Exception as resize_error:  # pragma: no cover
-                    logger.warning(f"Could not resize image: {resize_error}")
-                    # image_final_content remains the un-resized version if resize fails
-            # Store in cache
-            if cache_key is not None:
-                self._to_image_cache[cache_key] = image_final_content
-                logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
-            image_to_return = image_final_content
-        # --- End of cache miss block ---
-        # Save the image (either from cache or newly generated) if path is provided
-        if path and image_to_return:
-            try:
-                # Ensure directory exists
-                if os.path.dirname(path):  # Only call makedirs if there's a directory part
-                    os.makedirs(os.path.dirname(path), exist_ok=True)
-                image_to_return.save(path)
-                logger.debug(f"Saved page image to: {path}")
-            except Exception as save_error:  # pragma: no cover
-                # Save failures are logged only; the image is still returned
-                logger.error(f"Failed to save image to {path}: {save_error}")
-        return image_to_return
     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
     ) -> List["TextElement"]:
@@ -2309,7 +2135,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             # Get base image without highlights using the determined resolution
             # Use the global PDF rendering lock
             with pdf_render_lock:
-                image = self.to_image(resolution=final_resolution, include_highlights=False)
+                # Use render() for clean image without highlights
+                image = self.render(resolution=final_resolution)
                 if not image:
                     logger.error(
                         f"  Failed to render page {self.number} to image for OCR extraction."
@@ -2491,7 +2318,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         return self
     def get_section_between(
-        self, start_element=None, end_element=None, boundary_inclusion="both"
+        self, start_element=None, end_element=None, include_boundaries="both"
     ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
@@ -2504,7 +2331,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             return page_region.get_section_between(
                 start_element=start_element,
                 end_element=end_element,
-                boundary_inclusion=boundary_inclusion,
+                include_boundaries=include_boundaries,
             )
         except Exception as e:
             logger.error(
@@ -2526,7 +2353,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         self,
         start_elements=None,
         end_elements=None,
-        boundary_inclusion="start",
+        include_boundaries="start",
         y_threshold=5.0,
         bounding_box=None,
     ) -> "ElementCollection[Region]":
@@ -2567,8 +2394,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
             end_elements = []
         valid_inclusions = ["start", "end", "both", "none"]
-        if boundary_inclusion not in valid_inclusions:
-            raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")
+        if include_boundaries not in valid_inclusions:
+            raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
         if not start_elements:
             # Return an empty ElementCollection if no start elements
@@ -2600,12 +2427,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                     # Determine region boundaries
                     sec_top = (
                         current_start_element.top
-                        if boundary_inclusion in ["start", "both"]
+                        if include_boundaries in ["start", "both"]
                         else current_start_element.bottom
                     )
                     sec_bottom = (
                         end_boundary_el.top
-                        if boundary_inclusion not in ["end", "both"]
+                        if include_boundaries not in ["end", "both"]
                         else end_boundary_el.bottom
                     )
@@ -2627,12 +2454,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                 end_boundary_el = element
                 sec_top = (
                     current_start_element.top
-                    if boundary_inclusion in ["start", "both"]
+                    if include_boundaries in ["start", "both"]
                     else current_start_element.bottom
                 )
                 sec_bottom = (
                     end_boundary_el.bottom
-                    if boundary_inclusion in ["end", "both"]
+                    if include_boundaries in ["end", "both"]
                     else end_boundary_el.top
                 )
@@ -2652,7 +2479,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         if active_section_started:
             sec_top = (
                 current_start_element.top
-                if boundary_inclusion in ["start", "both"]
+                if include_boundaries in ["start", "both"]
                 else current_start_element.bottom
             )
             x0, _, x1, page_bottom = get_bounds()
@@ -3069,13 +2896,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                 else default_resolution
             )
-            # Use to_image, ensuring no highlights interfere
-            img = self.to_image(
-                resolution=resolution,
-                include_highlights=False,
-                labels=False,
-                exclusions=None,  # Don't mask exclusions for classification input image
-            )
+            # Use render() for clean image without highlights
+            img = self.render(resolution=resolution)
             if img is None:
                 raise ValueError(
                     "Cannot classify page with 'vision' model: Failed to render image."
@@ -3134,7 +2956,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
         try:
             # Render the page at the specified detection resolution
-            img = self.to_image(resolution=resolution, include_highlights=False)
+            # Use render() for clean image without highlights
+            img = self.render(resolution=resolution)
             if not img:
                 logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
                 self._skew_angle = None
@@ -3213,7 +3036,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
         try:
             # Render the original page at the desired output resolution
-            img = self.to_image(resolution=resolution, include_highlights=False)
+            # Use render() for clean image without highlights
+            img = self.render(resolution=resolution)
             if not img:
                 logger.error(f"Page {self.number}: Failed to render image for deskewing.")
                 return None
@@ -3303,32 +3127,33 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
     def _apply_rtl_processing_to_text(self, text: str) -> str:
         """
         Apply RTL (Right-to-Left) text processing to a string.
         This converts visual order text (as stored in PDFs) to logical order
         for proper display of Arabic, Hebrew, and other RTL scripts.
         Args:
             text: Input text string in visual order
         Returns:
             Text string in logical order
         """
         if not text or not text.strip():
             return text
         # Quick check for RTL characters - if none found, return as-is
         import unicodedata
         def _contains_rtl(s):
             return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
         if not _contains_rtl(text):
             return text
         try:
             from bidi.algorithm import get_display  # type: ignore
             from natural_pdf.utils.bidi_mirror import mirror_brackets
             # Apply BiDi algorithm to convert from visual to logical order
             # Process line by line to handle mixed content properly
             processed_lines = []
@@ -3341,9 +3166,9 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
                     processed_lines.append(mirror_brackets(logical_line))
                 else:
                     processed_lines.append(line)
             return "\n".join(processed_lines)
         except (ImportError, Exception):
             # If bidi library is not available or fails, return original text
             return text
@@ -3361,3 +3186,31 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
     def images(self) -> List[Any]:
         """Get all embedded raster images on this page.
         Returns:
             The ``images`` attribute of this page's element manager
             (``self._element_mgr``), returned as-is.
         """
         return self._element_mgr.images
+    def highlights(self, show: bool = False) -> "HighlightContext":
+        """
+        Create a highlight context for accumulating highlights on this page.
+        The returned HighlightContext is built from this page and is intended
+        to be used in a ``with`` block, which gives a clean syntax for showing
+        multiple highlight groups together.
+        Example:
+            with page.highlights() as h:
+                h.add(page.find_all('table'), label='tables', color='blue')
+                h.add(page.find_all('text:bold'), label='bold text', color='red')
+                h.show()
+        Or with automatic display:
+            with page.highlights(show=True) as h:
+                h.add(page.find_all('table'), label='tables')
+                h.add(page.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context
+        Args:
+            show: If True, automatically show highlights when exiting context.
+                Passed to HighlightContext as ``show_on_exit``.
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        # Imported at call time rather than at module level
+        # NOTE(review): presumably to avoid a circular import with highlighting_service; confirm
+        from natural_pdf.core.highlighting_service import HighlightContext
+        return HighlightContext(self, show_on_exit=show)

natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl