PyPI - natural-pdf - Versions diffs - 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

natural_pdf/__init__.py +11 -6
natural_pdf/analyzers/__init__.py +6 -1
natural_pdf/analyzers/guides.py +354 -258
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +18 -4
natural_pdf/analyzers/layout/paddle.py +11 -0
natural_pdf/analyzers/layout/surya.py +2 -3
natural_pdf/analyzers/shape_detection_mixin.py +25 -34
natural_pdf/analyzers/text_structure.py +2 -2
natural_pdf/classification/manager.py +1 -1
natural_pdf/collections/mixins.py +3 -2
natural_pdf/core/highlighting_service.py +743 -32
natural_pdf/core/page.py +252 -399
natural_pdf/core/page_collection.py +1249 -0
natural_pdf/core/pdf.py +231 -89
natural_pdf/{collections → core}/pdf_collection.py +18 -11
natural_pdf/core/render_spec.py +335 -0
natural_pdf/describe/base.py +1 -1
natural_pdf/elements/__init__.py +1 -0
natural_pdf/elements/base.py +108 -83
natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
natural_pdf/elements/line.py +0 -1
natural_pdf/elements/rect.py +0 -1
natural_pdf/elements/region.py +405 -280
natural_pdf/elements/text.py +9 -7
natural_pdf/exporters/base.py +2 -2
natural_pdf/exporters/original_pdf.py +1 -1
natural_pdf/exporters/paddleocr.py +2 -4
natural_pdf/exporters/searchable_pdf.py +3 -2
natural_pdf/extraction/mixin.py +1 -3
natural_pdf/flows/collections.py +1 -69
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +1658 -19
natural_pdf/flows/region.py +757 -263
natural_pdf/ocr/ocr_options.py +0 -2
natural_pdf/ocr/utils.py +2 -1
natural_pdf/qa/document_qa.py +21 -5
natural_pdf/search/search_service_protocol.py +1 -1
natural_pdf/selectors/parser.py +35 -2
natural_pdf/tables/result.py +35 -1
natural_pdf/text_mixin.py +101 -0
natural_pdf/utils/debug.py +2 -1
natural_pdf/utils/highlighting.py +1 -0
natural_pdf/utils/layout.py +2 -2
natural_pdf/utils/packaging.py +4 -3
natural_pdf/utils/text_extraction.py +15 -12
natural_pdf/utils/visualization.py +385 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
optimization/memory_comparison.py +1 -1
optimization/pdf_analyzer.py +2 -2
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0

natural_pdf/elements/{collections.py → element_collection.py} RENAMED Viewed

@@ -11,6 +11,7 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -32,6 +33,9 @@ from natural_pdf.classification.manager import ClassificationManager
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
 from natural_pdf.core.pdf import PDF
+# Add Visualizable import
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
 from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
 from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
@@ -40,6 +44,7 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -66,6 +71,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
     from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported
+    from natural_pdf.flows.flow import Flow
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
@@ -79,6 +85,7 @@ class ElementCollection(
     DirectionalCollectionMixin,
     DescribeMixin,
     InspectMixin,
+    Visualizable,
     MutableSequence,
 ):
     """Collection of PDF elements with batch operations.
@@ -168,13 +175,234 @@ class ElementCollection(
         """
         self._elements = elements or []
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        group_by: Optional[str] = None,
+        bins: Optional[Union[int, List[float]]] = None,
+        annotate: Optional[List[str]] = None,
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this element collection.
+        Args:
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Default color for highlights in show mode (or colormap name when using group_by)
+            highlights: Additional highlight groups to show
+            crop: Whether to crop to element bounds
+            crop_bbox: Explicit crop bounds
+            group_by: Attribute to group elements by for color mapping
+            bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
+            annotate: List of attribute names to display on highlights
+            **kwargs: Additional parameters
+        Returns:
+            List of RenderSpec objects, one per page with elements
+        """
+        if not self._elements:
+            return []
+        # Group elements by page
+        elements_by_page = {}
+        for elem in self._elements:
+            if hasattr(elem, "page"):
+                page = elem.page
+                if page not in elements_by_page:
+                    elements_by_page[page] = []
+                elements_by_page[page].append(elem)
+        if not elements_by_page:
+            return []
+        # Create RenderSpec for each page
+        specs = []
+        for page, page_elements in elements_by_page.items():
+            spec = RenderSpec(page=page)
+            # Handle cropping
+            if crop_bbox:
+                spec.crop_bbox = crop_bbox
+            elif crop == "content" or crop is True:
+                # Calculate bounds of elements on this page
+                x_coords = []
+                y_coords = []
+                for elem in page_elements:
+                    if hasattr(elem, "bbox") and elem.bbox:
+                        x0, y0, x1, y1 = elem.bbox
+                        x_coords.extend([x0, x1])
+                        y_coords.extend([y0, y1])
+                if x_coords and y_coords:
+                    spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+            # Add highlights in show mode
+            if mode == "show":
+                # Handle group_by parameter for quantitative/categorical grouping
+                if group_by is not None:
+                    # Use the improved highlighting logic from _prepare_highlight_data
+                    prepared_highlights = self._prepare_highlight_data(
+                        group_by=group_by, color=color, bins=bins, annotate=annotate, **kwargs
+                    )
+                    # Check if we have quantitative metadata to preserve
+                    quantitative_metadata = None
+                    for highlight_data in prepared_highlights:
+                        if (
+                            "quantitative_metadata" in highlight_data
+                            and highlight_data["quantitative_metadata"]
+                        ):
+                            quantitative_metadata = highlight_data["quantitative_metadata"]
+                            break
+                    # Add highlights from prepared data
+                    for highlight_data in prepared_highlights:
+                        # Only add elements from this page
+                        elem = highlight_data.get("element")
+                        if elem and hasattr(elem, "page") and elem.page == page:
+                            # Create the highlight dict manually to preserve quantitative metadata
+                            highlight_dict = {
+                                "element": elem,
+                                "color": highlight_data.get("color"),
+                                "label": highlight_data.get("label"),
+                            }
+                            # Add quantitative metadata to the first highlight
+                            if quantitative_metadata and not any(
+                                h.get("quantitative_metadata") for h in spec.highlights
+                            ):
+                                highlight_dict["quantitative_metadata"] = quantitative_metadata
+                            # Add annotate if provided in the prepared data
+                            if "annotate" in highlight_data:
+                                highlight_dict["annotate"] = highlight_data["annotate"]
+                            if "attributes_to_draw" in highlight_data:
+                                highlight_dict["attributes_to_draw"] = highlight_data[
+                                    "attributes_to_draw"
+                                ]
+                            # Extract geometry from element
+                            if (
+                                hasattr(elem, "polygon")
+                                and hasattr(elem, "has_polygon")
+                                and elem.has_polygon
+                            ):
+                                highlight_dict["polygon"] = elem.polygon
+                            elif hasattr(elem, "bbox"):
+                                highlight_dict["bbox"] = elem.bbox
+                            spec.highlights.append(highlight_dict)
+                else:
+                    # Default behavior when no group_by is specified
+                    # Determine if all elements are of the same type
+                    element_types = set(type(elem).__name__ for elem in page_elements)
+                    if len(element_types) == 1:
+                        # All elements are the same type - use a single label
+                        type_name = element_types.pop()
+                        # Generate a clean label from the type name
+                        base_name = (
+                            type_name.replace("Element", "").replace("Region", "")
+                            if type_name != "Region"
+                            else "Region"
+                        )
+                        # Handle special cases for common types
+                        if base_name == "Text":
+                            shared_label = "Text Elements"
+                        elif base_name == "table_cell" or (
+                            hasattr(page_elements[0], "region_type")
+                            and page_elements[0].region_type == "table_cell"
+                        ):
+                            shared_label = "Table Cells"
+                        elif base_name == "table":
+                            shared_label = "Tables"
+                        else:
+                            shared_label = f"{base_name} Elements" if base_name else "Elements"
+                        # Add all elements with the same label (no color cycling)
+                        for elem in page_elements:
+                            # Get element highlight params with annotate
+                            element_data = self._get_element_highlight_params(elem, annotate)
+                            if element_data:
+                                # Use add_highlight with basic params
+                                spec.add_highlight(
+                                    element=elem,
+                                    color=color,  # Use provided color or None
+                                    label=shared_label,
+                                )
+                                # Update last highlight with attributes if present
+                                if element_data.get("attributes_to_draw") and spec.highlights:
+                                    spec.highlights[-1]["attributes_to_draw"] = element_data[
+                                        "attributes_to_draw"
+                                    ]
+                    else:
+                        # Mixed types - use individual labels (existing behavior)
+                        for elem in page_elements:
+                            # Get element highlight params with annotate
+                            element_data = self._get_element_highlight_params(elem, annotate)
+                            if element_data:
+                                spec.add_highlight(
+                                    element=elem,
+                                    color=color,
+                                    label=getattr(elem, "text", None) or str(elem),
+                                )
+                                # Update last highlight with attributes if present
+                                if element_data.get("attributes_to_draw") and spec.highlights:
+                                    spec.highlights[-1]["attributes_to_draw"] = element_data[
+                                        "attributes_to_draw"
+                                    ]
+                # Add additional highlight groups if provided
+                if highlights:
+                    for group in highlights:
+                        group_elements = group.get("elements", [])
+                        group_color = group.get("color", color)
+                        group_label = group.get("label")
+                        # Only add elements from this page
+                        for elem in group_elements:
+                            if hasattr(elem, "page") and elem.page == page:
+                                spec.add_highlight(
+                                    element=elem, color=group_color, label=group_label
+                                )
+            specs.append(spec)
+        return specs
+    def _get_highlighter(self):
+        """Get the highlighting service for rendering.
+        For ElementCollection, we get it from the first element's page.
+        """
+        if not self._elements:
+            raise RuntimeError("Cannot get highlighter from empty ElementCollection")
+        # Try to get highlighter from first element's page
+        for elem in self._elements:
+            if hasattr(elem, "page") and hasattr(elem.page, "_highlighter"):
+                return elem.page._highlighter
+        # If no elements have pages, we can't render
+        raise RuntimeError(
+            "Cannot find HighlightingService. ElementCollection elements don't have page access."
+        )
     def __len__(self) -> int:
         """Get the number of elements in the collection."""
         return len(self._elements)
-    def __getitem__(self, index: int) -> "Element":
-        """Get an element by index."""
-        return self._elements[index]
+    def __getitem__(self, index: Union[int, slice]) -> Union["Element", "ElementCollection"]:
+        """Get an element by index or a collection by slice."""
+        if isinstance(index, slice):
+            # Return a new ElementCollection for slices
+            return ElementCollection(self._elements[index])
+        else:
+            # Return the element for integer indices
+            return self._elements[index]
     def __repr__(self) -> str:
         """Return a string representation showing the element count."""
@@ -420,6 +648,7 @@ class ElementCollection(
         # Apply content filtering if provided
         if content_filter is not None:
             from natural_pdf.utils.text_extraction import _apply_content_filter
             all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
         # Check if layout is requested
@@ -531,8 +760,9 @@ class ElementCollection(
         group_by: Optional[str] = None,
         label_format: Optional[str] = None,
         distinct: bool = False,
-        include_attrs: Optional[List[str]] = None,
+        annotate: Optional[List[str]] = None,
         replace: bool = False,
+        bins: Optional[Union[int, List[float]]] = None,
     ) -> "ElementCollection":
         """
         Adds persistent highlights for all elements in the collection to the page
@@ -550,12 +780,15 @@ class ElementCollection(
             label: Optional explicit label for the entire collection. If provided,
                    all elements are highlighted as a single group with this label,
                    ignoring 'group_by' and the default type-based grouping.
-            color: Optional explicit color for the highlight (tuple/string). Applied
-                   consistently if 'label' is provided or if grouping occurs.
+            color: Optional explicit color for the highlight (tuple/string), or
+                   matplotlib colormap name for quantitative group_by (e.g., 'viridis', 'plasma',
+                   'inferno', 'coolwarm', 'RdBu'). Applied consistently if 'label' is provided
+                   or if grouping occurs.
             group_by: Optional attribute name present on the elements. If provided
                       (and 'label' is None), elements will be grouped based on the
                       value of this attribute, and each group will be highlighted
-                      with a distinct label and color.
+                      with a distinct label and color. Automatically detects quantitative
+                      data and uses gradient colormaps when appropriate.
             label_format: Optional Python f-string to format the group label when
                           'group_by' is used. Can reference element attributes
                           (e.g., "Type: {region_type}, Conf: {confidence:.2f}").
@@ -563,11 +796,14 @@ class ElementCollection(
             distinct: If True, bypasses all grouping and highlights each element
                       individually with cycling colors (the previous default behavior).
                       (default: False)
-            include_attrs: List of attribute names from the element to display directly
-                           on the highlight itself (distinct from group label).
+            annotate: List of attribute names from the element to display directly
+                      on the highlight itself (distinct from group label).
             replace: If True, existing highlights on the affected page(s)
                      are cleared before adding these highlights.
                      If False (default), highlights are appended to existing ones.
+            bins: Optional binning specification for quantitative data when using group_by.
+                  Can be an integer (number of equal-width bins) or a list of bin edges.
+                  Only used when group_by contains quantitative data.
         Returns:
             Self for method chaining
@@ -589,7 +825,8 @@ class ElementCollection(
             color=color,
             group_by=group_by,
             label_format=label_format,
-            include_attrs=include_attrs,
+            annotate=annotate,
+            bins=bins,
             # 'replace' flag is handled during the add call below
         )
@@ -630,7 +867,7 @@ class ElementCollection(
                     "use_color_cycling", False
                 ),  # Set by _prepare if distinct
                 "element": data["element"],
-                "include_attrs": data["include_attrs"],
+                "annotate": data["annotate"],
                 # Internal call to service always appends, as clearing was handled above
                 "existing": "append",
             }
@@ -652,7 +889,8 @@ class ElementCollection(
         color: Optional[Union[Tuple, str]] = None,
         group_by: Optional[str] = None,
         label_format: Optional[str] = None,
-        include_attrs: Optional[List[str]] = None,
+        annotate: Optional[List[str]] = None,
+        bins: Optional[Union[int, List[float]]] = None,
     ) -> List[Dict]:
         """
         Determines the parameters for highlighting each element based on the strategy.
@@ -661,7 +899,7 @@ class ElementCollection(
         Returns:
             List of dictionaries, each containing parameters for a single highlight
-            (e.g., page_index, bbox/polygon, color, label, element, include_attrs, attributes_to_draw).
+            (e.g., page_index, bbox/polygon, color, label, element, annotate, attributes_to_draw).
             Color and label determination happens here.
         """
         prepared_data = []
@@ -669,11 +907,25 @@ class ElementCollection(
             return prepared_data
         # Need access to the HighlightingService to determine colors correctly.
+        # Use highlighting protocol to find a valid service from any element
         highlighter = None
-        first_element = self._elements[0]
-        if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
-            highlighter = first_element.page._highlighter
-        else:
+        for element in self._elements:
+            # Try direct page access first (for regular elements)
+            if hasattr(element, "page") and hasattr(element.page, "_highlighter"):
+                highlighter = element.page._highlighter
+                break
+            # Try highlighting protocol for FlowRegions and other complex elements
+            elif hasattr(element, "get_highlight_specs"):
+                specs = element.get_highlight_specs()
+                for spec in specs:
+                    if "page" in spec and hasattr(spec["page"], "_highlighter"):
+                        highlighter = spec["page"]._highlighter
+                        break
+                if highlighter:
+                    break
+        if not highlighter:
             logger.warning(
                 "Cannot determine highlight colors: HighlightingService not accessible from elements."
             )
@@ -686,7 +938,7 @@ class ElementCollection(
                 final_color = highlighter._determine_highlight_color(
                     label=None, color_input=None, use_color_cycling=True
                 )
-                element_data = self._get_element_highlight_params(element, include_attrs)
+                element_data = self._get_element_highlight_params(element, annotate)
                 if element_data:
                     element_data.update(
                         {"color": final_color, "label": None, "use_color_cycling": True}
@@ -699,7 +951,7 @@ class ElementCollection(
                 label=label, color_input=color, use_color_cycling=False
             )
             for element in self._elements:
-                element_data = self._get_element_highlight_params(element, include_attrs)
+                element_data = self._get_element_highlight_params(element, annotate)
                 if element_data:
                     element_data.update({"color": final_color, "label": label})
                     prepared_data.append(element_data)
@@ -707,23 +959,84 @@ class ElementCollection(
         elif group_by is not None:
             logger.debug("_prepare: Grouping by attribute strategy.")
             grouped_elements = self._group_elements_by_attr(group_by)
+            # Collect all values for quantitative detection
+            all_values = []
             for group_key, group_elements in grouped_elements.items():
-                if not group_elements:
-                    continue
-                group_label = self._format_group_label(
-                    group_key, label_format, group_elements[0], group_by
-                )
-                final_color = highlighter._determine_highlight_color(
-                    label=group_label, color_input=None, use_color_cycling=False
-                )
-                logger.debug(
-                    f"  _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
+                if group_elements:
+                    all_values.append(group_key)
+            # Import the quantitative detection function
+            from natural_pdf.utils.visualization import (
+                create_quantitative_color_mapping,
+                detect_quantitative_data,
+            )
+            # Determine if we should use quantitative color mapping
+            use_quantitative = detect_quantitative_data(all_values)
+            if use_quantitative:
+                logger.debug("  _prepare: Using quantitative color mapping.")
+                # Use quantitative color mapping with specified colormap
+                colormap_name = color if isinstance(color, str) else "viridis"
+                value_to_color = create_quantitative_color_mapping(
+                    all_values, colormap=colormap_name, bins=bins
                 )
-                for element in group_elements:
-                    element_data = self._get_element_highlight_params(element, include_attrs)
-                    if element_data:
-                        element_data.update({"color": final_color, "label": group_label})
-                        prepared_data.append(element_data)
+                # Store quantitative metadata for colorbar creation
+                quantitative_metadata = {
+                    "values": all_values,
+                    "colormap": colormap_name,
+                    "bins": bins,
+                    "attribute": group_by,
+                }
+                for group_key, group_elements in grouped_elements.items():
+                    if not group_elements:
+                        continue
+                    group_label = self._format_group_label(
+                        group_key, label_format, group_elements[0], group_by
+                    )
+                    # Get quantitative color for this value
+                    final_color = value_to_color.get(group_key)
+                    if final_color is None:
+                        # Fallback to traditional color assignment
+                        final_color = highlighter._determine_highlight_color(
+                            label=group_label, color_input=None, use_color_cycling=False
+                        )
+                    logger.debug(
+                        f"  _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
+                    )
+                    for element in group_elements:
+                        element_data = self._get_element_highlight_params(element, annotate)
+                        if element_data:
+                            element_data.update({"color": final_color, "label": group_label})
+                            # Add quantitative metadata to the first element in each group
+                            if not any("quantitative_metadata" in pd for pd in prepared_data):
+                                element_data["quantitative_metadata"] = quantitative_metadata
+                            prepared_data.append(element_data)
+            else:
+                logger.debug("  _prepare: Using categorical color mapping.")
+                # Use traditional categorical color mapping
+                for group_key, group_elements in grouped_elements.items():
+                    if not group_elements:
+                        continue
+                    group_label = self._format_group_label(
+                        group_key, label_format, group_elements[0], group_by
+                    )
+                    final_color = highlighter._determine_highlight_color(
+                        label=group_label, color_input=None, use_color_cycling=False
+                    )
+                    logger.debug(
+                        f"  _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
+                    )
+                    for element in group_elements:
+                        element_data = self._get_element_highlight_params(element, annotate)
+                        if element_data:
+                            element_data.update({"color": final_color, "label": group_label})
+                            prepared_data.append(element_data)
         else:
             logger.debug("_prepare: Default grouping strategy.")
             element_types = set(type(el).__name__ for el in self._elements)
@@ -742,7 +1055,7 @@ class ElementCollection(
                 )
                 logger.debug(f"  _prepare default group '{auto_label}' -> color {final_color}")
                 for element in self._elements:
-                    element_data = self._get_element_highlight_params(element, include_attrs)
+                    element_data = self._get_element_highlight_params(element, annotate)
                     if element_data:
                         element_data.update({"color": final_color, "label": auto_label})
                         prepared_data.append(element_data)
@@ -761,7 +1074,7 @@ class ElementCollection(
                 # Determine color *before* logging or using it (already done above for this branch)
                 logger.debug(f"  _prepare default group '{auto_label}' -> color {final_color}")
                 for element in self._elements:
-                    element_data = self._get_element_highlight_params(element, include_attrs)
+                    element_data = self._get_element_highlight_params(element, annotate)
                     if element_data:
                         element_data.update({"color": final_color, "label": auto_label})
                         prepared_data.append(element_data)
@@ -774,7 +1087,7 @@ class ElementCollection(
         color: Optional[Union[Tuple, str]],
         label: Optional[str],
         use_color_cycling: bool,
-        include_attrs: Optional[List[str]],
+        annotate: Optional[List[str]],
         existing: str,
     ):
         """Low-level helper to call the appropriate HighlightingService method for an element."""
@@ -790,7 +1103,7 @@ class ElementCollection(
             "color": color,
             "label": label,
             "use_color_cycling": use_color_cycling,
-            "include_attrs": include_attrs,
+            "annotate": annotate,
             "existing": existing,
             "element": element,
         }
@@ -825,7 +1138,7 @@ class ElementCollection(
         self,
         label: str,
         color: Optional[Union[Tuple, str]],
-        include_attrs: Optional[List[str]],
+        annotate: Optional[List[str]],
         existing: str,
     ):
         """Highlights all elements with the same explicit label and color."""
@@ -835,7 +1148,7 @@ class ElementCollection(
                 color=color,  # Use explicit color if provided
                 label=label,  # Use the explicit group label
                 use_color_cycling=False,  # Use consistent color for the label
-                include_attrs=include_attrs,
+                annotate=annotate,
                 existing=existing,
             )
@@ -843,7 +1156,7 @@ class ElementCollection(
         self,
         group_by: str,
         label_format: Optional[str],
-        include_attrs: Optional[List[str]],
+        annotate: Optional[List[str]],
         existing: str,
     ):
         """Groups elements by attribute and highlights each group distinctly."""
@@ -915,11 +1228,11 @@ class ElementCollection(
                     color=None,  # Let ColorManager choose based on label
                     label=group_label,  # Use the derived group label
                     use_color_cycling=False,  # Use consistent color for the label
-                    include_attrs=include_attrs,
+                    annotate=annotate,
                     existing=existing,
                 )
-    def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
+    def _highlight_distinctly(self, annotate: Optional[List[str]], existing: str):
         """DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
         # This method is no longer called directly by the main highlight path.
         # The distinct logic is handled within _prepare_highlight_data.
@@ -929,152 +1242,191 @@ class ElementCollection(
                 color=None,  # Let ColorManager cycle
                 label=None,  # No label for distinct elements
                 use_color_cycling=True,  # Force cycling
-                include_attrs=include_attrs,
+                annotate=annotate,
                 existing=existing,
             )
-    def show(
+    def _render_multipage_highlights(
         self,
-        # --- Visualization Parameters ---
-        group_by: Optional[str] = None,
-        label: Optional[str] = None,
-        color: Optional[Union[Tuple, str]] = None,
-        label_format: Optional[str] = None,
-        distinct: bool = False,
-        include_attrs: Optional[List[str]] = None,
-        # --- Rendering Parameters ---
-        resolution: Optional[float] = None,
-        labels: bool = True,  # Use 'labels' consistent with service
-        legend_position: str = "right",
-        render_ocr: bool = False,
-        width: Optional[int] = None,  # Add width parameter
-        page: Optional[Any] = None,  # NEW: Optional page parameter for empty collections
-        crop: bool = False,  # NEW: If True, crop output to element bounds
-    ) -> Optional["Image.Image"]:
-        """
-        Generates a temporary preview image highlighting elements in this collection
-        on their page, ignoring any persistent highlights.
-        Currently only supports collections where all elements are on the same page
-        of the same PDF.
-        Allows grouping and coloring elements based on attributes, similar to the
-        persistent `highlight()` method, but only for this temporary view.
-        Args:
-            group_by: Attribute name to group elements by for distinct colors/labels.
-            label: Explicit label for all elements (overrides group_by).
-            color: Explicit color for all elements (if label used) or base color.
-            label_format: F-string to format group labels if group_by is used.
-            distinct: Highlight each element distinctly (overrides group_by/label).
-            include_attrs: Attributes to display on individual highlights.
-            resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
-            labels: Whether to include a legend for the temporary highlights.
-            legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
-            render_ocr: Whether to render OCR text.
-            width: Optional width for the output image in pixels.
-            crop: If True, crop the resulting image to the tight bounding box
-                        containing all elements in the collection. The elements are
-                        still highlighted first, then the image is cropped.
-        Returns:
-            PIL Image object of the temporary preview, or None if rendering fails or
-            elements span multiple pages/PDFs.
+        specs_by_page,
+        resolution,
+        width,
+        labels,
+        legend_position,
+        group_by,
+        label,
+        color,
+        label_format,
+        distinct,
+        annotate,
+        render_ocr,
+        crop,
+        stack_direction="vertical",
+        stack_gap=5,
+        stack_background_color=(255, 255, 255),
+    ):
+        """Render highlights across multiple pages and stack them."""
+        from PIL import Image
-        Raises:
-            ValueError: If the collection is empty or elements are on different pages/PDFs.
-        """
-        # Apply global options as defaults, but allow explicit parameters to override
-        import natural_pdf
+        # Sort pages by index for consistent output
+        sorted_pages = sorted(
+            specs_by_page.keys(), key=lambda p: p.index if hasattr(p, "index") else 0
+        )
-        # Use global options if parameters are not explicitly set
-        if width is None:
-            width = natural_pdf.options.image.width
-        if resolution is None:
-            if natural_pdf.options.image.resolution is not None:
-                resolution = natural_pdf.options.image.resolution
-            else:
-                resolution = 144  # Default resolution when none specified
+        page_images = []
-        if not self._elements:
-            raise ValueError("Cannot show an empty collection.")
+        for page in sorted_pages:
+            element_specs = specs_by_page[page]
-        # Check if elements are on multiple PDFs
-        if self._are_on_multiple_pdfs():
-            raise ValueError(
-                "show() currently only supports collections where all elements are from the same PDF."
-            )
+            # Get highlighter service from the page
+            if not hasattr(page, "_highlighter"):
+                logger.warning(
+                    f"Page {getattr(page, 'number', '?')} has no highlighter service, skipping"
+                )
+                continue
-        # Check if elements are on multiple pages
-        if self._are_on_multiple_pages():
-            raise ValueError(
-                "show() currently only supports collections where all elements are on the same page."
-            )
+            service = page._highlighter
-        # Get the page and highlighting service from the first element
-        first_element = self._elements[0]
-        if not hasattr(first_element, "page") or not first_element.page:
-            logger.warning("Cannot show collection: First element has no associated page.")
-            return None
-        page = first_element.page
-        if not hasattr(page, "pdf") or not page.pdf:
-            logger.warning("Cannot show collection: Page has no associated PDF object.")
-            return None
+            # Prepare highlight data for this page
+            highlight_data_list = []
-        service = page._highlighter
-        if not service:
-            logger.warning("Cannot show collection: PDF object has no highlighting service.")
-            return None
+            for element_idx, spec in element_specs:
+                # Use the element index to generate consistent colors/labels across pages
+                element = spec.get(
+                    "element",
+                    self._elements[element_idx] if element_idx < len(self._elements) else None,
+                )
-        # 1. Prepare temporary highlight data based on grouping parameters
-        # This returns a list of dicts, suitable for render_preview
-        highlight_data_list = self._prepare_highlight_data(
-            distinct=distinct,
-            label=label,
-            color=color,
-            group_by=group_by,
-            label_format=label_format,
-            include_attrs=include_attrs,
-        )
+                # Prepare highlight data based on grouping parameters
+                if distinct:
+                    # Use cycling colors for distinct mode
+                    element_color = None  # Let the highlighter service pick from palette
+                    use_color_cycling = True
+                    element_label = (
+                        f"Element_{element_idx + 1}"
+                        if label is None
+                        else f"{label}_{element_idx + 1}"
+                    )
+                elif label:
+                    # Explicit label for all elements
+                    element_color = color
+                    use_color_cycling = color is None
+                    element_label = label
+                elif group_by and element:
+                    # Group by attribute
+                    try:
+                        group_key = getattr(element, group_by, None)
+                        element_label = self._format_group_label(
+                            group_key, label_format, element, group_by
+                        )
+                        element_color = None  # Let service assign color by group
+                        use_color_cycling = True
+                    except:
+                        element_label = f"Element_{element_idx + 1}"
+                        element_color = color
+                        use_color_cycling = color is None
+                else:
+                    # Default behavior
+                    element_color = color
+                    use_color_cycling = color is None
+                    element_label = f"Element_{element_idx + 1}"
+                # Build highlight data
+                highlight_item = {
+                    "page_index": spec["page_index"],
+                    "bbox": spec["bbox"],
+                    "polygon": spec.get("polygon"),
+                    "color": element_color,
+                    "label": element_label if labels else None,
+                    "use_color_cycling": use_color_cycling,
+                }
+                # Add attributes if requested
+                if annotate and element:
+                    highlight_item["attributes_to_draw"] = {}
+                    for attr_name in annotate:
+                        try:
+                            attr_value = getattr(element, attr_name, None)
+                            if attr_value is not None:
+                                highlight_item["attributes_to_draw"][attr_name] = attr_value
+                        except:
+                            pass
-        if not highlight_data_list:
-            logger.warning("No highlight data generated for show(). Rendering clean page.")
-            # Render the page without any temporary highlights
-            highlight_data_list = []
+                highlight_data_list.append(highlight_item)
-        # 2. Call render_preview on the HighlightingService
-        try:
-            # Calculate crop bounding box in PDF coordinates if crop is requested
+            # Calculate crop bbox if requested
             crop_bbox = None
             if crop:
                 try:
-                    crop_bbox = (
-                        min(el.x0 for el in self._elements),
-                        min(el.top for el in self._elements),
-                        max(el.x1 for el in self._elements),
-                        max(el.bottom for el in self._elements),
-                    )
+                    # Get bboxes from all specs on this page
+                    bboxes = [spec["bbox"] for _, spec in element_specs if spec.get("bbox")]
+                    if bboxes:
+                        crop_bbox = (
+                            min(bbox[0] for bbox in bboxes),
+                            min(bbox[1] for bbox in bboxes),
+                            max(bbox[2] for bbox in bboxes),
+                            max(bbox[3] for bbox in bboxes),
+                        )
                 except Exception as bbox_err:
-                    logger.error(
-                        f"Error determining crop bbox for collection show: {bbox_err}",
-                        exc_info=True,
-                    )
+                    logger.error(f"Error determining crop bbox: {bbox_err}")
-            img = service.render_preview(
-                page_index=page.index,
-                temporary_highlights=highlight_data_list,
-                resolution=resolution,
-                width=width,  # Pass the width parameter
-                labels=labels,  # Use 'labels'
-                legend_position=legend_position,
-                render_ocr=render_ocr,
-                crop_bbox=crop_bbox,
-            )
-            return img
-        except Exception as e:
-            logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
+            # Render this page
+            try:
+                img = service.render_preview(
+                    page_index=page.index,
+                    temporary_highlights=highlight_data_list,
+                    resolution=resolution,
+                    width=width,
+                    labels=labels,
+                    legend_position=legend_position,
+                    render_ocr=render_ocr,
+                    crop_bbox=crop_bbox,
+                )
+                if img:
+                    page_images.append(img)
+            except Exception as e:
+                logger.error(
+                    f"Error rendering page {getattr(page, 'number', '?')}: {e}", exc_info=True
+                )
+        if not page_images:
+            logger.warning("Failed to render any pages")
             return None
+        if len(page_images) == 1:
+            return page_images[0]
+        # Stack the images
+        if stack_direction == "vertical":
+            final_width = max(img.width for img in page_images)
+            final_height = (
+                sum(img.height for img in page_images) + (len(page_images) - 1) * stack_gap
+            )
+            stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
+            current_y = 0
+            for img in page_images:
+                # Center horizontally
+                x_offset = (final_width - img.width) // 2
+                stacked_image.paste(img, (x_offset, current_y))
+                current_y += img.height + stack_gap
+        else:  # horizontal
+            final_width = sum(img.width for img in page_images) + (len(page_images) - 1) * stack_gap
+            final_height = max(img.height for img in page_images)
+            stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
+            current_x = 0
+            for img in page_images:
+                # Center vertically
+                y_offset = (final_height - img.height) // 2
+                stacked_image.paste(img, (current_x, y_offset))
+                current_x += img.width + stack_gap
+        return stacked_image
     def save(
         self,
         filename: str,
@@ -1110,8 +1462,8 @@ class ElementCollection(
             else:
                 resolution = 144  # Default resolution when none specified
-        # Use to_image to generate and save the image
-        self.to_image(
+        # Use export() to save the image
+        self.export(
             path=filename,
             resolution=resolution,
             width=width,
@@ -1121,42 +1473,6 @@ class ElementCollection(
         )
         return self
-    def to_image(
-        self,
-        path: Optional[str] = None,
-        resolution: Optional[float] = None,
-        width: Optional[int] = None,
-        labels: bool = True,
-        legend_position: str = "right",
-        render_ocr: bool = False,
-    ) -> Optional["Image.Image"]:
-        """
-        Generate an image of the page with this collection's elements highlighted,
-        optionally saving it to a file.
-        Args:
-            path: Optional path to save the image to
-            resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
-            width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
-            labels: Whether to include a legend for labels
-            legend_position: Position of the legend
-            render_ocr: Whether to render OCR text with white background boxes
-        Returns:
-            PIL Image of the page with elements highlighted, or None if no valid page
-        """
-        # Get the page from the first element (if available)
-        if self._elements and hasattr(self._elements[0], "page"):
-            page = self._elements[0].page
-            # Generate the image using to_image
-            return page.to_image(
-                path=path,
-                resolution=resolution,
-                width=width,
-                labels=labels,
-                legend_position=legend_position,
-                render_ocr=render_ocr,
-            )
         return None
     def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
@@ -1216,17 +1532,57 @@ class ElementCollection(
             return str(group_key)
     def _get_element_highlight_params(
-        self, element: T, include_attrs: Optional[List[str]]
+        self, element: T, annotate: Optional[List[str]]
     ) -> Optional[Dict]:
         """Extracts common parameters needed for highlighting a single element."""
+        # For FlowRegions and other complex elements, use highlighting protocol
+        if hasattr(element, "get_highlight_specs"):
+            specs = element.get_highlight_specs()
+            if not specs:
+                logger.warning(f"Element {element} returned no highlight specs")
+                return None
+            # For now, we'll use the first spec for the prepared data
+            # The actual rendering will use all specs
+            first_spec = specs[0]
+            page = first_spec["page"]
+            base_data = {
+                "page_index": first_spec["page_index"],
+                "element": element,
+                "annotate": annotate,
+                "attributes_to_draw": {},
+                "bbox": first_spec.get("bbox"),
+                "polygon": first_spec.get("polygon"),
+                "multi_spec": len(specs) > 1,  # Flag to indicate multiple specs
+                "all_specs": specs,  # Store all specs for rendering
+            }
+            # Extract attributes if requested
+            if annotate:
+                for attr_name in annotate:
+                    try:
+                        attr_value = getattr(element, attr_name, None)
+                        if attr_value is not None:
+                            base_data["attributes_to_draw"][attr_name] = attr_value
+                    except AttributeError:
+                        logger.warning(
+                            f"Attribute '{attr_name}' not found on element {element} for annotate"
+                        )
+            return base_data
+        # Fallback for regular elements with direct page access
         if not hasattr(element, "page"):
+            logger.warning(f"Element {element} has no page attribute and no highlighting protocol")
             return None
         page = element.page
         base_data = {
             "page_index": page.index,
             "element": element,
-            "include_attrs": include_attrs,
+            "annotate": annotate,
             "attributes_to_draw": {},
             "bbox": None,
             "polygon": None,
@@ -1251,15 +1607,15 @@ class ElementCollection(
             return None
         # Extract attributes if requested
-        if include_attrs:
-            for attr_name in include_attrs:
+        if annotate:
+            for attr_name in annotate:
                 try:
                     attr_value = getattr(element, attr_name, None)
                     if attr_value is not None:
                         base_data["attributes_to_draw"][attr_name] = attr_value
                 except AttributeError:
                     logger.warning(
-                        f"Attribute '{attr_name}' not found on element {element} for include_attrs"
+                        f"Attribute '{attr_name}' not found on element {element} for annotate"
                     )
         return base_data
@@ -1416,7 +1772,7 @@ class ElementCollection(
     def correct_ocr(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
+        transform: Callable[[Any], Optional[str]],
         max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
@@ -1425,10 +1781,10 @@ class ElementCollection(
         in parallel if `max_workers` is specified.
         Iterates through elements currently in the collection. If an element's
-        'source' attribute starts with 'ocr', it calls the `correction_callback`
+        'source' attribute starts with 'ocr', it calls the `transform`
         for that element, passing the element itself.
-        The `correction_callback` should contain the logic to:
+        The `transform` should contain the logic to:
         1. Determine if the element needs correction.
         2. Perform the correction (e.g., call an LLM).
         3. Return the new text (`str`) or `None`.
@@ -1438,8 +1794,8 @@ class ElementCollection(
         Elements without a source starting with 'ocr' are skipped.
         Args:
-            correction_callback: A function accepting an element and returning
-                                 `Optional[str]` (new text or None).
+            transform: A function accepting an element and returning
+                       `Optional[str]` (new text or None).
             max_workers: The maximum number of worker threads to use for parallel
                          correction on each page. If None, defaults are used.
@@ -1449,7 +1805,7 @@ class ElementCollection(
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
             elements=self._elements,
-            correction_callback=correction_callback,
+            correction_callback=transform,
             caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
             max_workers=max_workers,
         )
@@ -1696,9 +2052,7 @@ class ElementCollection(
                     image_path = image_dir / image_filename
                     # Save image
-                    element.to_image(
-                        path=str(image_path), resolution=image_resolution, include_highlights=True
-                    )
+                    element.show(path=str(image_path), resolution=image_resolution)
                     # Add relative path to data
                     element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
@@ -1986,8 +2340,8 @@ class ElementCollection(
     # ------------------------------------------------------------------
     def apply_ocr(
         self,
-        *,
         function: Optional[Callable[["Region"], Optional[str]]] = None,
+        *,
         show_progress: bool = True,
         **kwargs,
     ) -> "ElementCollection":
@@ -2043,1154 +2397,3 @@ class ElementCollection(
         return self
     # ------------------------------------------------------------------
-class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
-    """
-    Represents a collection of Page objects, often from a single PDF document.
-    Provides methods for batch operations on these pages.
-    """
-    def __init__(self, pages: Union[List[P], Sequence[P]]):
-        """
-        Initialize a page collection.
-        Args:
-            pages: List or sequence of Page objects (can be lazy)
-        """
-        # Store the sequence as-is to preserve lazy behavior
-        # Only convert to list if we need list-specific operations
-        if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
-            self.pages = pages
-        else:
-            # Fallback for non-sequence types
-            self.pages = list(pages)
-    def __len__(self) -> int:
-        """Return the number of pages in the collection."""
-        return len(self.pages)
-    def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
-        """Support indexing and slicing."""
-        if isinstance(idx, slice):
-            return PageCollection(self.pages[idx])
-        return self.pages[idx]
-    def __iter__(self) -> Iterator[P]:
-        """Support iteration."""
-        return iter(self.pages)
-    def __repr__(self) -> str:
-        """Return a string representation showing the page count."""
-        return f"<PageCollection(count={len(self)})>"
-    def _get_items_for_apply(self) -> Iterator[P]:
-        """
-        Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
-        Returns an iterator that yields pages on-demand rather than materializing
-        all pages at once, maintaining the lazy loading behavior.
-        """
-        return iter(self.pages)
-    def _get_page_indices(self) -> List[int]:
-        """
-        Get page indices without forcing materialization of pages.
-        Returns:
-            List of page indices for the pages in this collection.
-        """
-        # Handle different types of page sequences efficiently
-        if hasattr(self.pages, '_indices'):
-            # If it's a _LazyPageList (or slice), get indices directly
-            return list(self.pages._indices)
-        else:
-            # Fallback: if pages are already materialized, get indices normally
-            # This will force materialization but only if pages aren't lazy
-            return [p.index for p in self.pages]
-    def extract_text(
-        self,
-        keep_blank_chars: bool = True,
-        apply_exclusions: bool = True,
-        strip: Optional[bool] = None,
-        **kwargs,
-    ) -> str:
-        """
-        Extract text from all pages in the collection.
-        Args:
-            keep_blank_chars: Whether to keep blank characters (default: True)
-            apply_exclusions: Whether to apply exclusion regions (default: True)
-            strip: Whether to strip whitespace from the extracted text.
-            **kwargs: Additional extraction parameters
-        Returns:
-            Combined text from all pages
-        """
-        texts = []
-        for page in self.pages:
-            text = page.extract_text(
-                keep_blank_chars=keep_blank_chars,
-                apply_exclusions=apply_exclusions,
-                **kwargs,
-            )
-            texts.append(text)
-        combined = "\n".join(texts)
-        # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
-        use_layout = kwargs.get("layout", False)
-        strip_final = strip if strip is not None else (not use_layout)
-        if strip_final:
-            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
-        return combined
-    def apply_ocr(
-        self,
-        engine: Optional[str] = None,
-        # --- Common OCR Parameters (Direct Arguments) ---
-        languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None,  # Min confidence threshold
-        device: Optional[str] = None,
-        resolution: Optional[int] = None,  # DPI for rendering
-        apply_exclusions: bool = True,  # New parameter
-        replace: bool = True,  # Whether to replace existing OCR elements
-        # --- Engine-Specific Options ---
-        options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
-    ) -> "PageCollection[P]":
-        """
-        Applies OCR to all pages within this collection using batch processing.
-        This delegates the work to the parent PDF object's `apply_ocr` method.
-        Args:
-            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
-            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
-                       **Must be codes understood by the specific selected engine.**
-                       No mapping is performed.
-            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
-            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
-            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
-            apply_exclusions: If True (default), render page images for OCR with
-                              excluded areas masked (whited out). If False, OCR
-                              the raw page images without masking exclusions.
-            replace: If True (default), remove any existing OCR elements before
-                    adding new ones. If False, add new OCR elements to existing ones.
-            options: An engine-specific options object (e.g., EasyOCROptions) or dict.
-        Returns:
-            Self for method chaining.
-        Raises:
-            RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
-            (Propagates exceptions from PDF.apply_ocr)
-        """
-        if not self.pages:
-            logger.warning("Cannot apply OCR to an empty PageCollection.")
-            return self
-        # Assume all pages share the same parent PDF object
-        first_page = self.pages[0]
-        if not hasattr(first_page, "_parent") or not first_page._parent:
-            raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
-        parent_pdf = first_page._parent
-        if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
-            raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
-        # Get the 0-based indices of the pages in this collection
-        page_indices = self._get_page_indices()
-        logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
-        # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
-        parent_pdf.apply_ocr(
-            pages=page_indices,
-            engine=engine,
-            languages=languages,
-            min_confidence=min_confidence,  # Pass the renamed parameter
-            device=device,
-            resolution=resolution,
-            apply_exclusions=apply_exclusions,  # Pass down
-            replace=replace,  # Pass the replace parameter
-            options=options,
-        )
-        # The PDF method modifies the Page objects directly by adding elements.
-        return self  # Return self for chaining
-    @overload
-    def find(
-        self,
-        *,
-        text: str,
-        contains: str = "all",
-        apply_exclusions: bool = True,
-        regex: bool = False,
-        case: bool = True,
-        **kwargs,
-    ) -> Optional[T]: ...
-    @overload
-    def find(
-        self,
-        selector: str,
-        *,
-        contains: str = "all",
-        apply_exclusions: bool = True,
-        regex: bool = False,
-        case: bool = True,
-        **kwargs,
-    ) -> Optional[T]: ...
-    def find(
-        self,
-        selector: Optional[str] = None,
-        *,
-        text: Optional[str] = None,
-        contains: str = "all",
-        apply_exclusions: bool = True,
-        regex: bool = False,
-        case: bool = True,
-        **kwargs,
-    ) -> Optional[T]:
-        """
-        Find the first element matching the selector OR text across all pages in the collection.
-        Provide EITHER `selector` OR `text`, but not both.
-        Args:
-            selector: CSS-like selector string.
-            text: Text content to search for (equivalent to 'text:contains(...)').
-            contains: How to determine if elements are inside: 'all' (fully inside),
-                     'any' (any overlap), or 'center' (center point inside).
-                     (default: "all")
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
-            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
-            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
-            **kwargs: Additional filter parameters.
-        Returns:
-            First matching element or None.
-        """
-        # Input validation happens within page.find
-        for page in self.pages:
-            element = page.find(
-                selector=selector,
-                text=text,
-                contains=contains,
-                apply_exclusions=apply_exclusions,
-                regex=regex,
-                case=case,
-                **kwargs,
-            )
-            if element:
-                return element
-        return None
-    @overload
-    def find_all(
-        self,
-        *,
-        text: str,
-        contains: str = "all",
-        apply_exclusions: bool = True,
-        regex: bool = False,
-        case: bool = True,
-        **kwargs,
-    ) -> "ElementCollection": ...
-    @overload
-    def find_all(
-        self,
-        selector: str,
-        *,
-        contains: str = "all",
-        apply_exclusions: bool = True,
-        regex: bool = False,
-        case: bool = True,
-        **kwargs,
-    ) -> "ElementCollection": ...
-    def find_all(
-        self,
-        selector: Optional[str] = None,
-        *,
-        text: Optional[str] = None,
-        contains: str = "all",
-        apply_exclusions: bool = True,
-        regex: bool = False,
-        case: bool = True,
-        **kwargs,
-    ) -> "ElementCollection":
-        """
-        Find all elements matching the selector OR text across all pages in the collection.
-        Provide EITHER `selector` OR `text`, but not both.
-        Args:
-            selector: CSS-like selector string.
-            text: Text content to search for (equivalent to 'text:contains(...)').
-            contains: How to determine if elements are inside: 'all' (fully inside),
-                     'any' (any overlap), or 'center' (center point inside).
-                     (default: "all")
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
-            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
-            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
-            **kwargs: Additional filter parameters.
-        Returns:
-            ElementCollection with matching elements from all pages.
-        """
-        all_elements = []
-        # Input validation happens within page.find_all
-        for page in self.pages:
-            elements = page.find_all(
-                selector=selector,
-                text=text,
-                contains=contains,
-                apply_exclusions=apply_exclusions,
-                regex=regex,
-                case=case,
-                **kwargs,
-            )
-            if elements:
-                all_elements.extend(elements.elements)
-        return ElementCollection(all_elements)
-    def correct_ocr(
-        self,
-        correction_callback: Callable[[Any], Optional[str]],
-        max_workers: Optional[int] = None,
-    ) -> "PageCollection[P]":
-        """
-        Applies corrections to OCR-generated text elements across all pages
-        in this collection using a user-provided callback function, executed
-        in parallel if `max_workers` is specified.
-        This method delegates to the parent PDF's `correct_ocr` method,
-        targeting all pages within this collection.
-        Args:
-            correction_callback: A function that accepts a single argument (an element
-                                 object) and returns `Optional[str]` (new text or None).
-            max_workers: The maximum number of worker threads to use for parallel
-                         correction on each page. If None, defaults are used.
-        Returns:
-            Self for method chaining.
-        Raises:
-            RuntimeError: If the collection is empty, pages lack a parent PDF reference,
-                          or the parent PDF lacks the `correct_ocr` method.
-        """
-        if not self.pages:
-            logger.warning("Cannot correct OCR for an empty PageCollection.")
-            # Return self even if empty to maintain chaining consistency
-            return self
-        # Assume all pages share the same parent PDF object
-        parent_pdf = self.pages[0]._parent
-        if (
-            not parent_pdf
-            or not hasattr(parent_pdf, "correct_ocr")
-            or not callable(parent_pdf.correct_ocr)
-        ):
-            raise RuntimeError(
-                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
-            )
-        page_indices = self._get_page_indices()
-        logger.info(
-            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
-        )
-        # Delegate the call to the parent PDF object for the relevant pages
-        # Pass the max_workers parameter down
-        parent_pdf.correct_ocr(
-            correction_callback=correction_callback,
-            pages=page_indices,
-            max_workers=max_workers,  # Pass it here
-        )
-        return self
-    def get_sections(
-        self,
-        start_elements=None,
-        end_elements=None,
-        new_section_on_page_break=False,
-        boundary_inclusion="both",
-    ) -> "ElementCollection[Region]":
-        """
-        Extract sections from a page collection based on start/end elements.
-        Args:
-            start_elements: Elements or selector string that mark the start of sections
-            end_elements: Elements or selector string that mark the end of sections
-            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
-            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
-        Returns:
-            List of Region objects representing the extracted sections
-        """
-        # Find start and end elements across all pages
-        if isinstance(start_elements, str):
-            start_elements = self.find_all(start_elements).elements
-        if isinstance(end_elements, str):
-            end_elements = self.find_all(end_elements).elements
-        # If no start elements, return empty list
-        if not start_elements:
-            return []
-        # If there are page break boundaries, we'll need to add them
-        if new_section_on_page_break:
-            # For each page boundary, create virtual "end" and "start" elements
-            for i in range(len(self.pages) - 1):
-                # Add a virtual "end" element at the bottom of the current page
-                page = self.pages[i]
-                # If end_elements is None, initialize it as an empty list
-                if end_elements is None:
-                    end_elements = []
-                # Create a region at the bottom of the page as an artificial end marker
-                from natural_pdf.elements.region import Region
-                bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
-                bottom_region.is_page_boundary = True  # Mark it as a special boundary
-                end_elements.append(bottom_region)
-                # Add a virtual "start" element at the top of the next page
-                next_page = self.pages[i + 1]
-                top_region = Region(next_page, (0, 0, next_page.width, 1))
-                top_region.is_page_boundary = True  # Mark it as a special boundary
-                start_elements.append(top_region)
-        # Get all elements from all pages and sort them in document order
-        all_elements = []
-        for page in self.pages:
-            elements = page.get_elements()
-            all_elements.extend(elements)
-        # Sort by page index, then vertical position, then horizontal position
-        all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
-        # Mark section boundaries
-        section_boundaries = []
-        # Add start element boundaries
-        for element in start_elements:
-            if element in all_elements:
-                idx = all_elements.index(element)
-                section_boundaries.append(
-                    {
-                        "index": idx,
-                        "element": element,
-                        "type": "start",
-                        "page_idx": element.page.index,
-                    }
-                )
-            elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
-                # This is a virtual page boundary element
-                section_boundaries.append(
-                    {
-                        "index": -1,  # Special index for page boundaries
-                        "element": element,
-                        "type": "start",
-                        "page_idx": element.page.index,
-                    }
-                )
-        # Add end element boundaries if provided
-        if end_elements:
-            for element in end_elements:
-                if element in all_elements:
-                    idx = all_elements.index(element)
-                    section_boundaries.append(
-                        {
-                            "index": idx,
-                            "element": element,
-                            "type": "end",
-                            "page_idx": element.page.index,
-                        }
-                    )
-                elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
-                    # This is a virtual page boundary element
-                    section_boundaries.append(
-                        {
-                            "index": -1,  # Special index for page boundaries
-                            "element": element,
-                            "type": "end",
-                            "page_idx": element.page.index,
-                        }
-                    )
-        # Sort boundaries by page index, then by actual document position
-        section_boundaries.sort(
-            key=lambda x: (
-                x["page_idx"],
-                x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
-            )
-        )
-        # Generate sections
-        sections = []
-        # --- Helper: build a FlowRegion spanning multiple pages ---
-        def _build_flow_region(start_el, end_el):
-            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
-            If *end_el* is None, the region continues to the bottom of the last
-            page in this PageCollection."""
-            # Local imports to avoid top-level cycles
-            from natural_pdf.elements.region import Region
-            from natural_pdf.flows.element import FlowElement
-            from natural_pdf.flows.flow import Flow
-            from natural_pdf.flows.region import FlowRegion
-            start_pg = start_el.page
-            end_pg = end_el.page if end_el is not None else self.pages[-1]
-            parts: list[Region] = []
-            # Slice of first page
-            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
-            # Full middle pages
-            for pg_idx in range(start_pg.index + 1, end_pg.index):
-                mid_pg = self.pages[pg_idx]
-                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
-            # Slice of last page (if distinct)
-            if end_pg is not start_pg:
-                bottom = end_el.bottom if end_el is not None else end_pg.height
-                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
-            flow = Flow(segments=parts, arrangement="vertical")
-            src_fe = FlowElement(physical_object=start_el, flow=flow)
-            return FlowRegion(
-                flow=flow,
-                constituent_regions=parts,
-                source_flow_element=src_fe,
-                boundary_element_found=end_el,
-            )
-        # ------------------------------------------------------------------
-        current_start = None
-        for i, boundary in enumerate(section_boundaries):
-            # If it's a start boundary and we don't have a current start
-            if boundary["type"] == "start" and current_start is None:
-                current_start = boundary
-            # If it's an end boundary and we have a current start
-            elif boundary["type"] == "end" and current_start is not None:
-                # Create a section from current_start to this boundary
-                start_element = current_start["element"]
-                end_element = boundary["element"]
-                # If both elements are on the same page, use the page's get_section_between
-                if start_element.page == end_element.page:
-                    section = start_element.page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Create FlowRegion spanning pages
-                    flow_region = _build_flow_region(start_element, end_element)
-                    sections.append(flow_region)
-                current_start = None
-            # If it's another start boundary and we have a current start (for splitting by starts only)
-            elif boundary["type"] == "start" and current_start is not None and not end_elements:
-                # Create a section from current_start to just before this boundary
-                start_element = current_start["element"]
-                # Find the last element before this boundary on the same page
-                if start_element.page == boundary["element"].page:
-                    # Find elements on this page
-                    page_elements = [e for e in all_elements if e.page == start_element.page]
-                    # Sort by position
-                    page_elements.sort(key=lambda e: (e.top, e.x0))
-                    # Find the last element before the boundary
-                    end_idx = (
-                        page_elements.index(boundary["element"]) - 1
-                        if boundary["element"] in page_elements
-                        else -1
-                    )
-                    end_element = page_elements[end_idx] if end_idx >= 0 else None
-                    # Create the section
-                    section = start_element.page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Cross-page section - create from current_start to the end of its page
-                    from natural_pdf.elements.region import Region
-                    start_page = start_element.page
-                    region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    region.start_element = start_element
-                    sections.append(region)
-                current_start = boundary
-        # Handle the last section if we have a current start
-        if current_start is not None:
-            start_element = current_start["element"]
-            start_page = start_element.page
-            if end_elements:
-                # With end_elements, we need an explicit end - use the last element
-                # on the last page of the collection
-                last_page = self.pages[-1]
-                last_page_elements = [e for e in all_elements if e.page == last_page]
-                last_page_elements.sort(key=lambda e: (e.top, e.x0))
-                end_element = last_page_elements[-1] if last_page_elements else None
-                # Create FlowRegion spanning multiple pages using helper
-                flow_region = _build_flow_region(start_element, end_element)
-                sections.append(flow_region)
-            else:
-                # With start_elements only, create a section to the end of the current page
-                from natural_pdf.elements.region import Region
-                region = Region(
-                    start_page, (0, start_element.top, start_page.width, start_page.height)
-                )
-                region.start_element = start_element
-                sections.append(region)
-        return ElementCollection(sections)
-    def _gather_analysis_data(
-        self,
-        analysis_keys: List[str],
-        include_content: bool,
-        include_images: bool,
-        image_dir: Optional[Path],
-        image_format: str,
-        image_resolution: int,
-    ) -> List[Dict[str, Any]]:
-        """
-        Gather analysis data from all pages in the collection.
-        Args:
-            analysis_keys: Keys in the analyses dictionary to export
-            include_content: Whether to include extracted text
-            include_images: Whether to export images
-            image_dir: Directory to save images
-            image_format: Format to save images
-            image_resolution: Resolution for exported images
-        Returns:
-            List of dictionaries containing analysis data
-        """
-        if not self.elements:
-            logger.warning("No pages found in collection")
-            return []
-        all_data = []
-        for page in self.elements:
-            # Basic page information
-            page_data = {
-                "page_number": page.number,
-                "page_index": page.index,
-                "width": page.width,
-                "height": page.height,
-            }
-            # Add PDF information if available
-            if hasattr(page, "pdf") and page.pdf:
-                page_data["pdf_path"] = page.pdf.path
-                page_data["pdf_filename"] = Path(page.pdf.path).name
-            # Include extracted text if requested
-            if include_content:
-                try:
-                    page_data["content"] = page.extract_text(preserve_whitespace=True)
-                except Exception as e:
-                    logger.error(f"Error extracting text from page {page.number}: {e}")
-                    page_data["content"] = ""
-            # Save image if requested
-            if include_images:
-                try:
-                    # Create image filename
-                    pdf_name = "unknown"
-                    if hasattr(page, "pdf") and page.pdf:
-                        pdf_name = Path(page.pdf.path).stem
-                    image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
-                    image_path = image_dir / image_filename
-                    # Save image
-                    page.save_image(
-                        str(image_path), resolution=image_resolution, include_highlights=True
-                    )
-                    # Add relative path to data
-                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
-                except Exception as e:
-                    logger.error(f"Error saving image for page {page.number}: {e}")
-                    page_data["image_path"] = None
-            # Add analyses data
-            if hasattr(page, "analyses") and page.analyses:
-                for key in analysis_keys:
-                    if key not in page.analyses:
-                        raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
-                    # Get the analysis result
-                    analysis_result = page.analyses[key]
-                    # If the result has a to_dict method, use it
-                    if hasattr(analysis_result, "to_dict"):
-                        analysis_data = analysis_result.to_dict()
-                    else:
-                        # Otherwise, use the result directly if it's dict-like
-                        try:
-                            analysis_data = dict(analysis_result)
-                        except (TypeError, ValueError):
-                            # Last resort: convert to string
-                            analysis_data = {"raw_result": str(analysis_result)}
-                    # Add analysis data to page data with the key as prefix
-                    for k, v in analysis_data.items():
-                        page_data[f"{key}.{k}"] = v
-            all_data.append(page_data)
-        return all_data
-    # --- Deskew Method --- #
-    def deskew(
-        self,
-        resolution: int = 300,
-        detection_resolution: int = 72,
-        force_overwrite: bool = False,
-        **deskew_kwargs,
-    ) -> "PDF":  # Changed return type
-        """
-        Creates a new, in-memory PDF object containing deskewed versions of the pages
-        in this collection.
-        This method delegates the actual processing to the parent PDF object's
-        `deskew` method.
-        Important: The returned PDF is image-based. Any existing text, OCR results,
-        annotations, or other elements from the original pages will *not* be carried over.
-        Args:
-            resolution: DPI resolution for rendering the output deskewed pages.
-            detection_resolution: DPI resolution used for skew detection if angles are not
-                                  already cached on the page objects.
-            force_overwrite: If False (default), raises a ValueError if any target page
-                             already contains processed elements (text, OCR, regions) to
-                             prevent accidental data loss. Set to True to proceed anyway.
-            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
-                             during automatic detection (e.g., `max_angle`, `num_peaks`).
-        Returns:
-            A new PDF object representing the deskewed document.
-        Raises:
-            ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
-            ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
-                        or if the collection is empty.
-            RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
-        """
-        if not self.pages:
-            logger.warning("Cannot deskew an empty PageCollection.")
-            raise ValueError("Cannot deskew an empty PageCollection.")
-        # Assume all pages share the same parent PDF object
-        # Need to hint the type of _parent for type checkers
-        if TYPE_CHECKING:
-            parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
-        else:
-            parent_pdf = self.pages[0]._parent
-        if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
-            raise RuntimeError(
-                "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
-            )
-        # Get the 0-based indices of the pages in this collection
-        page_indices = self._get_page_indices()
-        logger.info(
-            f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
-        )
-        # Delegate the call to the parent PDF object for the relevant pages
-        # Pass all relevant arguments through (no output_path anymore)
-        return parent_pdf.deskew(
-            pages=page_indices,
-            resolution=resolution,
-            detection_resolution=detection_resolution,
-            force_overwrite=force_overwrite,
-            **deskew_kwargs,
-        )
-    # --- End Deskew Method --- #
-    def to_image(
-        self,
-        page_width: Optional[int] = None,
-        cols: Optional[int] = 4,
-        rows: Optional[int] = None,
-        max_pages: Optional[int] = None,
-        spacing: int = 10,
-        add_labels: bool = True,  # Add new flag
-        show_category: bool = False,
-    ) -> Optional["Image.Image"]:
-        """
-        Generate a grid of page images for this collection.
-        Args:
-            page_width: Width in pixels for rendering individual pages
-            cols: Number of columns in grid (default: 4)
-            rows: Number of rows in grid (calculated automatically if None)
-            max_pages: Maximum number of pages to include (default: all)
-            spacing: Spacing between page thumbnails in pixels
-            add_labels: Whether to add page number labels
-            show_category: Whether to add category and confidence labels (if available)
-        Returns:
-            PIL Image of the page grid or None if no pages
-        """
-        # Determine default page width from global options if not explicitly provided
-        if page_width is None:
-            try:
-                import natural_pdf
-                page_width = natural_pdf.options.image.width or 300
-            except Exception:
-                # Fallback if natural_pdf import fails in some edge context
-                page_width = 300
-        # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
-        try:
-            from PIL import Image, ImageDraw, ImageFont
-        except ImportError:
-            logger.error(
-                "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
-            )
-            return None
-        if not self.pages:
-            logger.warning("Cannot generate image for empty PageCollection")
-            return None
-        # Limit pages if max_pages is specified
-        pages_to_render = self.pages[:max_pages] if max_pages else self.pages
-        # Load font once outside the loop
-        font = None
-        if add_labels:
-            try:
-                # Try loading a commonly available font first
-                font = ImageFont.truetype("DejaVuSans.ttf", 16)
-            except IOError:
-                try:
-                    font = ImageFont.load_default(16)
-                except IOError:
-                    logger.warning("Default font not found. Labels cannot be added.")
-                    add_labels = False  # Disable if no font
-        # Render individual page images
-        page_images = []
-        for page in pages_to_render:
-            try:
-                # Assume page.to_image returns a PIL Image or None
-                img = page.to_image(
-                    width=page_width, include_highlights=True
-                )  # Render with highlights for visual context
-                if img is None:
-                    logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
-                    continue
-            except Exception as img_err:
-                logger.error(
-                    f"Error generating image for page {page.number}: {img_err}", exc_info=True
-                )
-                continue
-            # Add page number label
-            if add_labels and font:
-                draw = ImageDraw.Draw(img)
-                pdf_name = (
-                    Path(page.pdf.path).stem
-                    if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
-                    else ""
-                )
-                label_text = f"p{page.number}"
-                if pdf_name:
-                    label_text += f" - {pdf_name}"
-                # Add category if requested and available
-                if show_category:
-                    # Placeholder logic - adjust based on how classification results are stored
-                    category = None
-                    confidence = None
-                    if (
-                        hasattr(page, "analyses")
-                        and page.analyses
-                        and "classification" in page.analyses
-                    ):
-                        result = page.analyses["classification"]
-                        # Adapt based on actual structure of classification result
-                        category = (
-                            getattr(result, "label", None) or result.get("label", None)
-                            if isinstance(result, dict)
-                            else None
-                        )
-                        confidence = (
-                            getattr(result, "score", None) or result.get("score", None)
-                            if isinstance(result, dict)
-                            else None
-                        )
-                    if category is not None and confidence is not None:
-                        try:
-                            category_str = f"{category} ({confidence:.2f})"  # Format confidence
-                            label_text += f"\\n{category_str}"
-                        except (TypeError, ValueError):
-                            pass  # Ignore formatting errors
-                # Calculate bounding box for multi-line text and draw background/text
-                try:
-                    # Using textbbox for potentially better accuracy with specific fonts
-                    # Note: textbbox needs Pillow 8+
-                    bbox = draw.textbbox(
-                        (5, 5), label_text, font=font, spacing=2
-                    )  # Use textbbox if available
-                    bg_rect = (
-                        max(0, bbox[0] - 2),
-                        max(0, bbox[1] - 2),
-                        min(img.width, bbox[2] + 2),
-                        min(img.height, bbox[3] + 2),
-                    )
-                    # Draw semi-transparent background
-                    overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
-                    draw_overlay = ImageDraw.Draw(overlay)
-                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))  # White with alpha
-                    img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
-                    draw = ImageDraw.Draw(img)  # Recreate draw object
-                    # Draw the potentially multi-line text
-                    draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
-                except AttributeError:  # Fallback for older Pillow without textbbox
-                    # Approximate size and draw
-                    # This might not be perfectly aligned
-                    draw.rectangle(
-                        (2, 2, 150, 40), fill=(255, 255, 255, 180)
-                    )  # Simple fixed background
-                    draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
-                except Exception as draw_err:
-                    logger.error(
-                        f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
-                    )
-            page_images.append(img)
-        if not page_images:
-            logger.warning("No page images were successfully rendered for the grid.")
-            return None
-        # Calculate grid dimensions if not provided
-        num_images = len(page_images)
-        if not rows and not cols:
-            cols = min(4, int(num_images**0.5) + 1)
-            rows = (num_images + cols - 1) // cols
-        elif rows and not cols:
-            cols = (num_images + rows - 1) // rows
-        elif cols and not rows:
-            rows = (num_images + cols - 1) // cols
-        cols = max(1, cols if cols else 1)  # Ensure at least 1
-        rows = max(1, rows if rows else 1)
-        # Get maximum dimensions for consistent grid cells
-        max_width = max(img.width for img in page_images) if page_images else 1
-        max_height = max(img.height for img in page_images) if page_images else 1
-        # Create grid image
-        grid_width = cols * max_width + (cols + 1) * spacing
-        grid_height = rows * max_height + (rows + 1) * spacing
-        grid_img = Image.new(
-            "RGB", (grid_width, grid_height), (220, 220, 220)
-        )  # Lighter gray background
-        # Place images in grid
-        for i, img in enumerate(page_images):
-            if i >= rows * cols:  # Ensure we don't exceed grid capacity
-                break
-            row = i // cols
-            col = i % cols
-            x = col * max_width + (col + 1) * spacing
-            y = row * max_height + (row + 1) * spacing
-            grid_img.paste(img, (x, y))
-        return grid_img
-    def save_pdf(
-        self,
-        output_path: Union[str, Path],
-        ocr: bool = False,
-        original: bool = False,
-        dpi: int = 300,
-    ):
-        """
-        Save the pages in this collection to a new PDF file.
-        Exactly one saving mode must be selected:
-        - `ocr=True`: Creates a new, image-based PDF using OCR results. This
-          makes the text generated during the natural-pdf session searchable,
-          but loses original vector content. Requires 'ocr-export' extras.
-        - `original=True`: Extracts the original pages from the source PDF,
-          preserving all vector content, fonts, and annotations. OCR results
-          from the natural-pdf session are NOT included. Requires 'ocr-export' extras.
-        Args:
-            output_path: Path to save the new PDF file.
-            ocr: If True, save as a searchable, image-based PDF using OCR data.
-            original: If True, save the original, vector-based pages.
-            dpi: Resolution (dots per inch) used only when ocr=True; it is
-                 forwarded to the searchable-PDF exporter.
-        Raises:
-            ValueError: If the collection is empty, or if neither or both of
-                        'ocr' and 'original' are truthy. Errors raised by the
-                        original-pages exporter (e.g. pages originating from
-                        different PDFs) propagate unchanged.
-            ImportError: If the exporter for the chosen mode is unavailable
-                         (required libraries such as 'pikepdf' / 'Pillow'
-                         are not installed).
-            RuntimeError: If the searchable-PDF exporter fails for any reason.
-        """
-        if not self.pages:
-            raise ValueError("Cannot save an empty PageCollection.")
-        # Compare truthiness instead of XOR-ing the raw values: `1 ^ 2` is
-        # truthy, so a bitwise check lets two truthy non-bool flags through.
-        if bool(ocr) == bool(original):
-            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
-        output_path_str = str(Path(output_path))
-        if ocr:
-            if create_searchable_pdf is None:
-                # Plain double quotes inside a single-quoted literal; the old
-                # `\\"` form printed literal backslashes in the install hint.
-                raise ImportError(
-                    "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
-                    'Install with: pip install "natural-pdf[ocr-export]"'
-                )
-            # Warn when any page carries content an image-based export drops:
-            # rects, lines, curves, or chars/words whose source is not "ocr".
-            has_vector_elements = any(
-                getattr(page, "rects", None)
-                or getattr(page, "lines", None)
-                or getattr(page, "curves", None)
-                or any(
-                    getattr(el, "source", None) != "ocr"
-                    for el in getattr(page, "chars", ())
-                )
-                or any(
-                    getattr(el, "source", None) != "ocr"
-                    for el in getattr(page, "words", ())
-                )
-                for page in self.pages
-            )
-            if has_vector_elements:
-                logger.warning(
-                    "Warning: Saving with ocr=True creates an image-based PDF. "
-                    "Original vector elements (rects, lines, non-OCR text/chars) "
-                    "on selected pages will not be preserved in the output file."
-                )
-            logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
-            try:
-                # Delegate to the exporter, passing this collection as the source.
-                create_searchable_pdf(self, output_path_str, dpi=dpi)
-            except Exception as e:
-                logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
-                # Normalize every exporter failure to RuntimeError for callers.
-                raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
-        else:
-            # Validation above guarantees that `original` is the truthy flag here.
-            if create_original_pdf is None:
-                raise ImportError(
-                    "Saving with original=True requires 'pikepdf'. "
-                    'Install with: pip install "natural-pdf[ocr-export]"'
-                )
-            # Warn when OCR text exists, since it will be absent from the output.
-            has_ocr_elements = False
-            for page in self.pages:
-                if hasattr(page, "find_all"):
-                    # find_all returns a collection; truthiness means non-empty.
-                    if page.find_all("text[source=ocr]"):
-                        has_ocr_elements = True
-                        break
-                elif hasattr(page, "words"):
-                    # Fallback for page objects that do not expose find_all.
-                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
-                        has_ocr_elements = True
-                        break
-            if has_ocr_elements:
-                logger.warning(
-                    "Warning: Saving with original=True preserves original page content. "
-                    "OCR text generated in this session will not be included in the saved file."
-                )
-            logger.info(f"Saving original pages PDF to: {output_path_str}")
-            # Exporter errors propagate with their original type; the former
-            # `except Exception as e: raise e` wrapper only added a traceback frame.
-            create_original_pdf(self, output_path_str)
-    # Provide .show() as a convenience alias for .to_image()
-    def show(
-        self,
-        *args,
-        **kwargs,
-    ) -> Optional["Image.Image"]:
-        """Render the collection by delegating to :py:meth:`to_image`.
-        Exists so that this class offers the same ``show()`` entry point that
-        ElementCollection already has. Every positional and keyword argument
-        is passed through untouched, and whatever ``to_image`` returns is
-        handed straight back to the caller.
-        """
-        rendered = self.to_image(*args, **kwargs)
-        return rendered

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

natural-pdf 0.1.38py3-none-any.whl → 0.2.0py3-none-any.whl