natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
natural_pdf/flows/flow.py
CHANGED
@@ -1,25 +1,43 @@
 import logging
-
+import warnings
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
 
 if TYPE_CHECKING:
+    from PIL.Image import Image as PIL_Image
+
     from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
     from natural_pdf.elements.base import Element as PhysicalElement
-    from natural_pdf.elements.
+    from natural_pdf.elements.element_collection import (
+        ElementCollection as PhysicalElementCollection,
+    )
     from natural_pdf.elements.region import Region as PhysicalRegion
-    from PIL.Image import Image as PIL_Image
 
 from .collections import FlowElementCollection
 from .element import FlowElement
 
 # Import required classes for the new methods
-from natural_pdf.tables import TableResult
 # For runtime image manipulation
 from PIL import Image as PIL_Image_Runtime
 
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
+from natural_pdf.tables import TableResult
+
 logger = logging.getLogger(__name__)
 
 
-class Flow:
+class Flow(Visualizable):
     """Defines a logical flow or sequence of physical Page or Region objects.
 
     A Flow represents a continuous logical document structure that spans across
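The headline change in this hunk is `Flow` subclassing `Visualizable`, which supplies the shared `show()`/`render()` pipeline that the new `RenderSpec` objects feed (see the `_get_render_specs` hunk further down). A minimal sketch of the caller-facing effect, assuming the 0.2.x import paths shown in this diff and a placeholder file name:

```python
from natural_pdf import PDF
from natural_pdf.flows.flow import Flow

pdf = PDF("document.pdf")  # placeholder path
flow = Flow(segments=list(pdf.pages[:2]), arrangement="vertical")

# Flow now inherits show()/render() plumbing from Visualizable, so a plain
# preview no longer needs Flow-specific PIL stitching code.
image = flow.show(color="green", labels=True)
```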
@@ -114,9 +132,9 @@ class Flow:
             segment_gap: The virtual gap (in PDF points) between segments.
         """
         # Handle PageCollection input
-        if hasattr(segments,
+        if hasattr(segments, "pages"):  # It's a PageCollection
            segments = list(segments.pages)
-
+
        if not segments:
            raise ValueError("Flow segments cannot be empty.")
        if arrangement not in ["vertical", "horizontal"]:
@@ -176,6 +194,103 @@ class Flow:
                 f"Valid options are: {valid_alignments[self.arrangement]}"
             )
 
+    def _get_highlighter(self):
+        """Get the highlighting service from the first segment."""
+        if not self.segments:
+            raise RuntimeError("Flow has no segments to get highlighter from")
+
+        # Get highlighter from first segment
+        first_segment = self.segments[0]
+        if hasattr(first_segment, "_highlighter"):
+            return first_segment._highlighter
+        elif hasattr(first_segment, "page") and hasattr(first_segment.page, "_highlighter"):
+            return first_segment.page._highlighter
+        else:
+            raise RuntimeError(
+                f"Cannot find HighlightingService from Flow segments. "
+                f"First segment type: {type(first_segment).__name__}"
+            )
+
+    def show(
+        self,
+        *,
+        # Basic rendering options
+        resolution: Optional[float] = None,
+        width: Optional[int] = None,
+        # Highlight options
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        labels: bool = True,
+        label_format: Optional[str] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        # Layout options for multi-page/region
+        layout: Literal["stack", "grid", "single"] = "stack",
+        stack_direction: Literal["vertical", "horizontal"] = "vertical",
+        gap: int = 5,
+        columns: Optional[int] = None,  # For grid layout
+        # Cropping options
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        # Flow-specific options
+        in_context: bool = False,
+        separator_color: Optional[Tuple[int, int, int]] = None,
+        separator_thickness: int = 2,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """Generate a preview image with highlights.
+
+        If in_context=True, shows segments as cropped images stacked together
+        with separators between segments.
+
+        Args:
+            resolution: DPI for rendering (default from global settings)
+            width: Target width in pixels (overrides resolution)
+            color: Default highlight color
+            labels: Whether to show labels for highlights
+            label_format: Format string for labels
+            highlights: Additional highlight groups to show
+            layout: How to arrange multiple pages/regions
+            stack_direction: Direction for stack layout
+            gap: Pixels between stacked images
+            columns: Number of columns for grid layout
+            crop: Whether to crop
+            crop_bbox: Explicit crop bounds
+            in_context: If True, use special Flow visualization with separators
+            separator_color: RGB color for separator lines (default: red)
+            separator_thickness: Thickness of separator lines
+            **kwargs: Additional parameters passed to rendering
+
+        Returns:
+            PIL Image object or None if nothing to render
+        """
+        if in_context:
+            # Use the special in_context visualization
+            return self._show_in_context(
+                resolution=resolution or 150,
+                width=width,
+                stack_direction=stack_direction,
+                stack_gap=gap,
+                separator_color=separator_color or (255, 0, 0),
+                separator_thickness=separator_thickness,
+                **kwargs,
+            )
+
+        # Otherwise use the standard show method
+        return super().show(
+            resolution=resolution,
+            width=width,
+            color=color,
+            labels=labels,
+            label_format=label_format,
+            highlights=highlights,
+            layout=layout,
+            stack_direction=stack_direction,
+            gap=gap,
+            columns=columns,
+            crop=crop,
+            crop_bbox=crop_bbox,
+            **kwargs,
+        )
+
     def find(
         self,
         selector: Optional[str] = None,
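Taken together, the two branches above give `show()` a dual personality; a usage sketch (file name is a placeholder, behavior inferred from the signature in this hunk):

```python
from natural_pdf import PDF
from natural_pdf.flows.flow import Flow

pdf = PDF("report.pdf")  # placeholder path
flow = Flow(segments=[pdf.pages[0], pdf.pages[1]], arrangement="vertical")

# Default path: delegates to Visualizable.show() via super().show(...)
overview = flow.show(color="blue", labels=True)

# Flow-specific path: cropped segments stacked with separator lines,
# routed to _show_in_context(); resolution falls back to 150 on this path.
stacked = flow.show(in_context=True, gap=10, separator_color=(255, 0, 0))
```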
@@ -224,11 +339,11 @@ class Flow:
     ) -> "FlowElementCollection":
         """
         Finds all elements within the flow that match the given selector or text criteria.
-
+
         This method efficiently groups segments by their parent pages, searches at the page level,
         then filters results appropriately for each segment. This ensures elements that intersect
         with flow segments (but aren't fully contained) are still found.
-
+
         Elements found are wrapped as FlowElement objects, anchored to this Flow,
         and returned in a FlowElementCollection.
         """
@@ -237,21 +352,26 @@ class Flow:
 
         # Step 1: Group segments by their parent pages (like in analyze_layout)
         segments_by_page = {}  # Dict[Page, List[Segment]]
-
+
         for i, segment in enumerate(self.segments):
             # Determine the page for this segment - fix type detection
-            if hasattr(segment,
+            if hasattr(segment, "page") and hasattr(segment.page, "find_all"):
                 # It's a Region object (has a parent page)
                 page_obj = segment.page
                 segment_type = "region"
-            elif
+            elif (
+                hasattr(segment, "find_all")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+                and not hasattr(segment, "page")
+            ):
                 # It's a Page object (has find_all but no parent page)
                 page_obj = segment
                 segment_type = "page"
             else:
                 logger.warning(f"Segment {i+1} does not support find_all, skipping")
                 continue
-
+
             if page_obj not in segments_by_page:
                 segments_by_page[page_obj] = []
             segments_by_page[page_obj].append((segment, segment_type))
@@ -273,7 +393,7 @@ class Flow:
                 case=case,
                 **kwargs,
             )
-
+
             if not page_matches:
                 continue
 
@@ -283,31 +403,41 @@ class Flow:
                     # Full page segment: include all elements
                     for phys_elem in page_matches.elements:
                         all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
-
+
                 elif segment_type == "region":
                     # Region segment: filter to only intersecting elements
                     for phys_elem in page_matches.elements:
                         try:
                             # Check if element intersects with this flow segment
                             if segment.intersects(phys_elem):
-                                all_flow_elements.append(
+                                all_flow_elements.append(
+                                    FlowElement(physical_object=phys_elem, flow=self)
+                                )
                         except Exception as intersect_error:
-                            logger.debug(
+                            logger.debug(
+                                f"Error checking intersection for element: {intersect_error}"
+                            )
                             # Include the element anyway if intersection check fails
-                            all_flow_elements.append(
+                            all_flow_elements.append(
+                                FlowElement(physical_object=phys_elem, flow=self)
+                            )
 
         # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
         unique_flow_elements = []
         seen_element_ids = set()
-
+
         for flow_elem in all_flow_elements:
             # Create a unique identifier for the underlying physical element
             phys_elem = flow_elem.physical_object
             elem_id = (
-
-
+                (
+                    getattr(phys_elem.page, "index", id(phys_elem.page))
+                    if hasattr(phys_elem, "page")
+                    else id(phys_elem)
+                ),
                 phys_elem.bbox if hasattr(phys_elem, "bbox") else id(phys_elem),
             )
-
+
             if elem_id not in seen_element_ids:
                 unique_flow_elements.append(flow_elem)
                 seen_element_ids.add(elem_id)
@@ -362,6 +492,7 @@ class Flow:
         show_progress: bool = False,
         content_filter: Optional[Any] = None,
         stitch_rows: Optional[Callable] = None,
+        merge_headers: Optional[bool] = None,
     ) -> TableResult:
         """
         Extract table data from all segments in the flow, combining results sequentially.
@@ -380,18 +511,24 @@ class Flow:
                 and returns its string content. For 'text' method only.
             show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
             content_filter: Optional content filter to apply during cell text extraction.
+            merge_headers: Whether to merge tables by removing repeated headers from subsequent
+                segments. If None (default), auto-detects by checking if the first row
+                of each segment matches the first row of the first segment. If segments have
+                inconsistent header patterns (some repeat, others don't), raises ValueError.
+                Useful for multi-page tables where headers repeat on each page.
             stitch_rows: Optional callable to determine when rows should be merged across
-                segment boundaries.
-
+                segment boundaries. Applied AFTER header removal if merge_headers
+                is enabled. Two overloaded signatures are supported:
+
                 • func(current_row) -> bool
                     Called only on the first row of each segment (after the first).
                     Return True to merge this first row with the last row from
                     the previous segment.
-
+
                 • func(prev_row, current_row, row_index, segment) -> bool
                     Called for every row. Return True to merge current_row with
                     the previous row in the aggregated results.
-
+
                 When True is returned, rows are concatenated cell-by-cell.
                 This is useful for handling table rows split across page
                 boundaries or segments. If None, rows are never merged.
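A short sketch of how the new `merge_headers` parameter combines with the existing options (it mirrors the docstring example in the next hunk; the file name is a placeholder):

```python
from natural_pdf import PDF
from natural_pdf.flows.flow import Flow

pdf = PDF("multi_page_table.pdf")  # placeholder path
table_flow = Flow(
    segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]], arrangement="vertical"
)

# Default: merge_headers=None auto-detects a repeated first row and warns once.
result = table_flow.extract_table()

# Explicit: always drop a first row that equals the first segment's header.
result = table_flow.extract_table(merge_headers=True)
df = result.df
```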
@@ -403,30 +540,32 @@ class Flow:
             Multi-page table extraction:
             ```python
             pdf = npdf.PDF("multi_page_table.pdf")
-
+
             # Create flow for table spanning pages 2-4
             table_flow = Flow(
                 segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
                 arrangement='vertical'
             )
-
+
             # Extract table as if it were continuous
             table_data = table_flow.extract_table()
             df = table_data.df  # Convert to pandas DataFrame
-
+
             # Custom row stitching - single parameter (simple case)
             table_data = table_flow.extract_table(
                 stitch_rows=lambda row: row and not (row[0] or "").strip()
             )
-
+
             # Custom row stitching - full parameters (advanced case)
             table_data = table_flow.extract_table(
                 stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
             )
             ```
         """
-        logger.info(
-
+        logger.info(
+            f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})"
+        )
+
         if not self.segments:
             logger.warning("Flow has no segments, returning empty table")
             return TableResult([])
@@ -434,12 +573,13 @@ class Flow:
         # Resolve predicate and determine its signature
         predicate: Optional[Callable] = None
         predicate_type: str = "none"
-
+
         if callable(stitch_rows):
             import inspect
+
             sig = inspect.signature(stitch_rows)
             param_count = len(sig.parameters)
-
+
             if param_count == 1:
                 predicate = stitch_rows
                 predicate_type = "single_param"
@@ -447,12 +587,17 @@ class Flow:
                 predicate = stitch_rows
                 predicate_type = "full_params"
             else:
-                logger.warning(
+                logger.warning(
+                    f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring."
+                )
                 predicate = None
                 predicate_type = "none"
 
-        def _default_merge(
+        def _default_merge(
+            prev_row: List[Optional[str]], cur_row: List[Optional[str]]
+        ) -> List[Optional[str]]:
             from itertools import zip_longest
+
             merged: List[Optional[str]] = []
             for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
                 if (p or "").strip() and (c or "").strip():
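The resolver above dispatches on the callable's arity via `inspect.signature`. A standalone illustration of the same mechanism (independent of natural-pdf):

```python
import inspect

def arity(fn) -> int:
    """Count declared parameters, as the stitch_rows resolver does."""
    return len(inspect.signature(fn).parameters)

assert arity(lambda row: bool(row)) == 1                  # "single_param" path
assert arity(lambda prev, curr, idx, seg: idx == 0) == 4  # "full_params" path
# Any other arity is logged and ignored, so e.g. a 2-argument lambda
# silently disables stitching instead of raising.
```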
@@ -463,6 +608,10 @@ class Flow:
 
         aggregated_rows: List[List[Optional[str]]] = []
         processed_segments = 0
+        header_row: Optional[List[Optional[str]]] = None
+        merge_headers_enabled = False
+        headers_warned = False  # Track if we've already warned about dropping headers
+        segment_has_repeated_header = []  # Track which segments have repeated headers
 
         for seg_idx, segment in enumerate(self.segments):
             try:
@@ -491,9 +640,67 @@ class Flow:
                     logger.debug(f"  No table data found in segment {seg_idx+1}")
                     continue
 
+                # Handle header detection and merging for multi-page tables
+                if seg_idx == 0:
+                    # First segment: capture potential header row
+                    if segment_rows:
+                        header_row = segment_rows[0]
+                        # Determine if we should merge headers
+                        if merge_headers is None:
+                            # Auto-detect: we'll check all subsequent segments
+                            merge_headers_enabled = False  # Will be determined later
+                        else:
+                            merge_headers_enabled = merge_headers
+                    # Track that first segment exists (for consistency checking)
+                    segment_has_repeated_header.append(False)  # First segment doesn't "repeat"
+                elif seg_idx == 1 and merge_headers is None:
+                    # Auto-detection: check if first row of second segment matches header
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    if has_header:
+                        merge_headers_enabled = True
+                        # Remove the detected repeated header from this segment
+                        segment_rows = segment_rows[1:]
+                        logger.debug(
+                            f"  Auto-detected repeated header in segment {seg_idx+1}, removed"
+                        )
+                        if not headers_warned:
+                            warnings.warn(
+                                "Detected repeated headers in multi-page table. Merging by removing "
+                                "repeated headers from subsequent pages.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+                    else:
+                        merge_headers_enabled = False
+                        logger.debug(f"  No repeated header detected in segment {seg_idx+1}")
+                elif seg_idx > 1:
+                    # Check consistency: all segments should have same pattern
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    # Remove header if merging is enabled and header is present
+                    if merge_headers_enabled and has_header:
+                        segment_rows = segment_rows[1:]
+                        logger.debug(f"  Removed repeated header from segment {seg_idx+1}")
+                elif seg_idx > 0 and merge_headers_enabled:
+                    # Explicit merge_headers=True: remove headers from subsequent segments
+                    if segment_rows and header_row and segment_rows[0] == header_row:
+                        segment_rows = segment_rows[1:]
+                        logger.debug(f"  Removed repeated header from segment {seg_idx+1}")
+                        if not headers_warned:
+                            warnings.warn(
+                                "Removing repeated headers from multi-page table during merge.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+
                 for row_idx, row in enumerate(segment_rows):
                     should_merge = False
-
+
                     if predicate is not None and aggregated_rows:
                         if predicate_type == "single_param":
                             # For single param: only call on first row of segment (row_idx == 0)
@@ -503,19 +710,41 @@ class Flow:
                         elif predicate_type == "full_params":
                             # For full params: call with all arguments
                             should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)
-
+
                     if should_merge:
                         aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
                     else:
                         aggregated_rows.append(row)
 
                 processed_segments += 1
-                logger.debug(
+                logger.debug(
+                    f"  Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}"
+                )
 
             except Exception as e:
                 logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)
                 continue
 
+        # Check for inconsistent header patterns after processing all segments
+        if merge_headers is None and len(segment_has_repeated_header) > 2:
+            # During auto-detection, check for consistency across all segments
+            expected_pattern = segment_has_repeated_header[1]  # Pattern from second segment
+            for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
+                if has_header != expected_pattern:
+                    # Inconsistent pattern detected
+                    segments_with_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
+                    ]
+                    segments_without_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
+                    ]
+                    raise ValueError(
+                        f"Inconsistent header pattern in multi-page table: "
+                        f"segments {segments_with_headers} have repeated headers, "
+                        f"but segments {segments_without_headers} do not. "
+                        f"All segments must have the same header pattern for reliable merging."
+                    )
+
         logger.info(
             f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
         )
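To make the auto-detection contract concrete: header repetition is recorded per segment, index 1 fixes the expected pattern, and indices 2+ must match it. A standalone restatement of the guard above:

```python
from typing import List

def check_header_consistency(repeated: List[bool]) -> None:
    # Mirrors the guard above: repeated[0] is the first segment (never a
    # "repeat"), repeated[1] sets the pattern, repeated[2:] must match it.
    if len(repeated) <= 2:
        return
    expected = repeated[1]
    if any(h != expected for h in repeated[2:]):
        with_h = [i for i, h in enumerate(repeated[1:], 1) if h]
        without_h = [i for i, h in enumerate(repeated[1:], 1) if not h]
        raise ValueError(
            f"Inconsistent header pattern: segments {with_h} repeat headers, "
            f"segments {without_h} do not."
        )

check_header_consistency([False, True, True])    # consistent: no error
# check_header_consistency([False, True, False]) # inconsistent: raises
```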
@@ -558,45 +787,47 @@ class Flow:
             Multi-page layout analysis:
             ```python
             pdf = npdf.PDF("document.pdf")
-
+
             # Create flow for first 3 pages
             page_flow = Flow(
                 segments=pdf.pages[:3],
                 arrangement='vertical'
             )
-
+
             # Analyze layout across all pages (efficiently)
             all_regions = page_flow.analyze_layout(engine='yolo')
-
+
             # Find all tables across the flow
             tables = all_regions.filter('region[type=table]')
             ```
         """
-        from natural_pdf.elements.
-
-        logger.info(
-
+        from natural_pdf.elements.element_collection import ElementCollection
+
+        logger.info(
+            f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})"
+        )
+
         if not self.segments:
             logger.warning("Flow has no segments, returning empty collection")
             return ElementCollection([])
 
         # Step 1: Group segments by their parent pages to avoid redundant analysis
         segments_by_page = {}  # Dict[Page, List[Segment]]
-
+
         for i, segment in enumerate(self.segments):
             # Determine the page for this segment
-            if hasattr(segment,
+            if hasattr(segment, "analyze_layout"):
                 # It's a Page object
                 page_obj = segment
                 segment_type = "page"
-            elif hasattr(segment,
+            elif hasattr(segment, "page") and hasattr(segment.page, "analyze_layout"):
                 # It's a Region object
                 page_obj = segment.page
                 segment_type = "region"
             else:
                 logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
                 continue
-
+
             if page_obj not in segments_by_page:
                 segments_by_page[page_obj] = []
             segments_by_page[page_obj].append((segment, segment_type))
@@ -605,7 +836,9 @@ class Flow:
             logger.warning("No segments with analyzable pages found")
             return ElementCollection([])
 
-        logger.debug(
+        logger.debug(
+            f"  Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages"
+        )
 
         # Step 2: Analyze each unique page only once
         all_detected_regions: List["PhysicalRegion"] = []
@@ -613,8 +846,10 @@ class Flow:
 
         for page_obj, page_segments in segments_by_page.items():
             try:
-                logger.debug(
-
+                logger.debug(
+                    f"  Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments"
+                )
+
                 # Run layout analysis once for this page
                 page_results = page_obj.analyze_layout(
                     engine=engine,
@@ -629,18 +864,22 @@ class Flow:
                 )
 
                 # Extract regions from results
-                if hasattr(page_results,
+                if hasattr(page_results, "elements"):
                     # It's an ElementCollection
                     page_regions = page_results.elements
                 elif isinstance(page_results, list):
                     # It's a list of regions
                     page_regions = page_results
                 else:
-                    logger.warning(
+                    logger.warning(
+                        f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}"
+                    )
                     continue
 
                 if not page_regions:
-                    logger.debug(
+                    logger.debug(
+                        f"  No layout regions found on page {getattr(page_obj, 'number', '?')}"
+                    )
                     continue
 
                 # Step 3: For each segment on this page, collect relevant regions
@@ -651,7 +890,7 @@ class Flow:
                         all_detected_regions.extend(page_regions)
                         segments_processed_on_page += 1
                         logger.debug(f"  Added {len(page_regions)} regions for full-page segment")
-
+
                     elif segment_type == "region":
                         # Region segment: filter to only intersecting regions
                         intersecting_regions = []
@@ -660,32 +899,41 @@ class Flow:
                                 if segment.intersects(region):
                                     intersecting_regions.append(region)
                             except Exception as intersect_error:
-                                logger.debug(
+                                logger.debug(
+                                    f"Error checking intersection for region: {intersect_error}"
+                                )
                                 # Include the region anyway if intersection check fails
                                 intersecting_regions.append(region)
-
+
                         all_detected_regions.extend(intersecting_regions)
                         segments_processed_on_page += 1
-                        logger.debug(
+                        logger.debug(
+                            f"  Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}"
+                        )
 
                 processed_pages += 1
-                logger.debug(
+                logger.debug(
+                    f"  Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}"
+                )
 
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}",
+                    exc_info=True,
+                )
                 continue
 
         # Step 4: Remove duplicates (can happen if multiple segments intersect the same region)
         unique_regions = []
        seen_region_ids = set()
-
+
        for region in all_detected_regions:
            # Create a unique identifier for this region (page + bbox)
            region_id = (
-                getattr(region.page,
-                region.bbox if hasattr(region,
+                getattr(region.page, "index", id(region.page)),
+                region.bbox if hasattr(region, "bbox") else id(region),
            )
-
+
            if region_id not in seen_region_ids:
                unique_regions.append(region)
                seen_region_ids.add(region_id)
@@ -694,87 +942,54 @@ class Flow:
         if dedupe_removed > 0:
             logger.debug(f"  Removed {dedupe_removed} duplicate regions")
 
-        logger.info(
+        logger.info(
+            f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages"
+        )
         return ElementCollection(unique_regions)
 
-    def
+    def _get_render_specs(
         self,
-
-
-
-
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         label_prefix: Optional[str] = "FlowSegment",
-        width: Optional[int] = None,
-        stack_direction: str = "vertical",
-        stack_gap: int = 5,
-        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
-        crop: bool = False,
         **kwargs,
-    ) ->
-        """
-        Generates and returns a PIL Image showing all segments in the flow with highlights.
-
-        This method visualizes the entire flow by highlighting each segment on its respective
-        page and combining the results into a single image. If multiple pages are involved,
-        they are stacked according to the flow's arrangement.
+    ) -> List[RenderSpec]:
+        """Get render specifications for this flow.
 
         Args:
-
-
-
-
-
-
-
-            stack_gap: Gap in pixels between stacked pages.
-            stack_background_color: RGB background color for the stacked image.
-            crop: If True, crop each rendered page to the bounding box of segments on that page.
-            **kwargs: Additional arguments passed to the underlying rendering methods.
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting segments in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop to segments
+            crop_bbox: Explicit crop bounds
+            label_prefix: Prefix for segment labels
+            **kwargs: Additional parameters
 
         Returns:
-
-
-        Example:
-            Visualizing a multi-page flow:
-            ```python
-            pdf = npdf.PDF("document.pdf")
-
-            # Create flow across multiple pages
-            page_flow = Flow(
-                segments=[pdf.pages[0], pdf.pages[1], pdf.pages[2]],
-                arrangement='vertical'
-            )
-
-            # Show the entire flow
-            flow_image = page_flow.show(color="green", labels=True)
-            ```
+            List of RenderSpec objects, one per page with segments
         """
-        logger.info(f"Rendering Flow with {len(self.segments)} segments")
-
         if not self.segments:
-
-            return None
-
-        # Apply global options as defaults for resolution
-        import natural_pdf
-        if resolution is None:
-            if natural_pdf.options.image.resolution is not None:
-                resolution = natural_pdf.options.image.resolution
-            else:
-                resolution = 144  # Default resolution
+            return []
 
-        #
+        # Group segments by their physical pages
         segments_by_page = {}  # Dict[Page, List[PhysicalRegion]]
-
+
         for i, segment in enumerate(self.segments):
             # Get the page for this segment
-            if hasattr(segment,
+            if hasattr(segment, "page") and segment.page is not None:
                 # It's a Region, use its page
                 page_obj = segment.page
                 if page_obj not in segments_by_page:
                     segments_by_page[page_obj] = []
                 segments_by_page[page_obj].append(segment)
-            elif
+            elif (
+                hasattr(segment, "index")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+            ):
                 # It's a full Page object, create a full-page region for it
                 page_obj = segment
                 full_page_region = segment.region(0, 0, segment.width, segment.height)
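This hunk is the heart of the refactor: the old imperative `show()` body (which called `highlighter_service.render_preview` per page) becomes a declarative `_get_render_specs()` that the `Visualizable` base class consumes. A hedged sketch of the implied `RenderSpec` surface, using only the attributes and calls visible in this diff (the file path is a placeholder, and how `Visualizable` consumes the specs is inferred, not shown here):

```python
from natural_pdf import PDF
from natural_pdf.core.render_spec import RenderSpec

page = PDF("document.pdf").pages[0]  # placeholder path

# One spec per physical page, as _get_render_specs() builds them:
spec = RenderSpec(page=page)
spec.crop_bbox = (0, 0, page.width, 200)  # optional crop, in PDF points
spec.add_highlight(bbox=(10, 10, 200, 40), color="blue", label="FlowSegment_1")
# Visualizable.show()/render() then turns a list of such specs into images.
```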
@@ -786,17 +1001,10 @@ class Flow:
                 continue
 
         if not segments_by_page:
-
-            return None
+            return []
 
-        #
-
-        if not hasattr(first_page, '_highlighter'):
-            logger.error("Cannot get highlighter service for Flow.show(). Page missing highlighter.")
-            return None
-
-        highlighter_service = first_page._highlighter
-        output_page_images: List["PIL_Image_Runtime"] = []
+        # Create RenderSpec for each page
+        specs = []
 
         # Sort pages by index for consistent output order
         sorted_pages = sorted(
@@ -804,134 +1012,243 @@ class Flow:
             key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
         )
 
-        # 3. Render each page with its relevant segments highlighted
         for page_idx, page_obj in enumerate(sorted_pages):
             segments_on_this_page = segments_by_page[page_obj]
             if not segments_on_this_page:
                 continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            spec = RenderSpec(page=page_obj)
+
+            # Handle cropping
+            if crop_bbox:
+                spec.crop_bbox = crop_bbox
+            elif crop == "content" or crop is True:
+                # Calculate bounds of segments on this page
+                x_coords = []
+                y_coords = []
+                for segment in segments_on_this_page:
+                    if hasattr(segment, "bbox") and segment.bbox:
+                        x0, y0, x1, y1 = segment.bbox
+                        x_coords.extend([x0, x1])
+                        y_coords.extend([y0, y1])
+
+                if x_coords and y_coords:
+                    spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+
+            # Add highlights in show mode
+            if mode == "show":
+                # Highlight segments
+                for i, segment in enumerate(segments_on_this_page):
+                    segment_label = None
+                    if label_prefix:
+                        # Create label for this segment
+                        global_segment_idx = None
+                        try:
+                            # Find the global index of this segment in the original flow
+                            global_segment_idx = self.segments.index(segment)
+                        except ValueError:
+                            # If it's a generated full-page region, find its source page
+                            for idx, orig_segment in enumerate(self.segments):
+                                if (
+                                    hasattr(orig_segment, "index")
+                                    and hasattr(segment, "page")
+                                    and orig_segment.index == segment.page.index
+                                ):
+                                    global_segment_idx = idx
+                                    break
+
+                        if global_segment_idx is not None:
+                            segment_label = f"{label_prefix}_{global_segment_idx + 1}"
+                        else:
+                            segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
+
+                    spec.add_highlight(
+                        bbox=segment.bbox,
+                        polygon=segment.polygon if segment.has_polygon else None,
+                        color=color or "blue",
+                        label=segment_label,
+                    )
 
-
-
+                # Add additional highlight groups if provided
+                if highlights:
+                    for group in highlights:
+                        group_elements = group.get("elements", [])
+                        group_color = group.get("color", color)
+                        group_label = group.get("label")
 
-
-
-
-
-
-
-            max_x1 = max(segment.bbox[2] for segment in segments_on_this_page)
-            max_y1 = max(segment.bbox[3] for segment in segments_on_this_page)
-            crop_bbox = (min_x0, min_y0, max_x1, max_y1)
-
-            # Render this page with highlights
-            page_image = highlighter_service.render_preview(
-                page_index=(
-                    page_obj.index
-                    if hasattr(page_obj, "index")
-                    else getattr(page_obj, "page_number", 1) - 1
-                ),
-                temporary_highlights=temp_highlights_for_page,
-                resolution=resolution,
-                width=width,
-                labels=labels,
-                legend_position=legend_position,
-                crop_bbox=crop_bbox,
-                **kwargs,
-            )
-            if page_image:
-                output_page_images.append(page_image)
+                        for elem in group_elements:
+                            # Only add if element is on this page
+                            if hasattr(elem, "page") and elem.page == page_obj:
+                                spec.add_highlight(
+                                    element=elem, color=group_color, label=group_label
+                                )
 
-
-
-
-
+            specs.append(spec)
+
+        return specs
+
+    def _show_in_context(
+        self,
+        resolution: float,
+        width: Optional[int] = None,
+        stack_direction: str = "vertical",
+        stack_gap: int = 5,
+        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        separator_color: Tuple[int, int, int] = (255, 0, 0),
+        separator_thickness: int = 2,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """
+        Show segments as cropped images stacked together with separators between segments.
+
+        Args:
+            resolution: Resolution in DPI for rendering segment images
+            width: Optional width for segment images
+            stack_direction: Direction to stack segments ('vertical' or 'horizontal')
+            stack_gap: Gap in pixels between segments
+            stack_background_color: RGB background color for the final image
+            separator_color: RGB color for separator lines between segments
+            separator_thickness: Thickness in pixels of separator lines
+            **kwargs: Additional arguments passed to segment rendering
 
-
-
+        Returns:
+            PIL Image with all segments stacked together
+        """
+        from PIL import Image, ImageDraw
 
-
+        segment_images = []
+        segment_pages = []
+
+        # Determine stacking direction
         final_stack_direction = stack_direction
         if stack_direction == "auto":
             final_stack_direction = self.arrangement
 
-        #
+        # Get cropped images for each segment
+        for i, segment in enumerate(self.segments):
+            # Get the page reference for this segment
+            if hasattr(segment, "page") and segment.page is not None:
+                segment_page = segment.page
+                # Get cropped image of the segment
+                # Use render() for clean image without highlights
+                segment_image = segment.render(
+                    resolution=resolution,
+                    crop=True,
+                    width=width,
+                    **kwargs,
+                )
+
+            elif (
+                hasattr(segment, "index")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+            ):
+                # It's a full Page object
+                segment_page = segment
+                # Use render() for clean image without highlights
+                segment_image = segment.render(resolution=resolution, width=width, **kwargs)
+            else:
+                raise ValueError(
+                    f"Segment {i+1} has no identifiable page. Segment type: {type(segment)}, attributes: {dir(segment)}"
+                )
+
+            if segment_image is not None:
+                segment_images.append(segment_image)
+                segment_pages.append(segment_page)
+            else:
+                logger.warning(f"Segment {i+1} render() returned None, skipping")
+
+        # Check if we have any valid images
+        if not segment_images:
+            logger.error("No valid segment images could be rendered")
+            return None
+
+        # We should have at least one segment image by now (or an exception would have been raised)
+        if len(segment_images) == 1:
+            return segment_images[0]
+
+        # Calculate dimensions for the final stacked image
         if final_stack_direction == "vertical":
-
-
-
-
-            )
-
-
+            # Stack vertically
+            final_width = max(img.width for img in segment_images)
+
+            # Calculate total height including gaps and separators
+            total_height = sum(img.height for img in segment_images)
+            total_height += (len(segment_images) - 1) * stack_gap
+
+            # Add separator thickness between all segments
+            num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
+            total_height += num_separators * separator_thickness
+
+            # Create the final image
+            final_image = Image.new("RGB", (final_width, total_height), stack_background_color)
+            draw = ImageDraw.Draw(final_image)
 
-            concatenated_image = PIL_Image_Runtime.new(
-                "RGB", (final_width, final_height), stack_background_color
-            )
             current_y = 0
-
-
-
-
-
-
+
+            for i, img in enumerate(segment_images):
+                # Add separator line before each segment (except the first one)
+                if i > 0:
+                    # Draw separator line
+                    draw.rectangle(
+                        [(0, current_y), (final_width, current_y + separator_thickness)],
+                        fill=separator_color,
+                    )
+                    current_y += separator_thickness
+
+                # Paste the segment image
+                paste_x = (final_width - img.width) // 2  # Center horizontally
+                final_image.paste(img, (paste_x, current_y))
+                current_y += img.height
+
+                # Add gap after segment (except for the last one)
+                if i < len(segment_images) - 1:
+                    current_y += stack_gap
+
+            return final_image
+
         elif final_stack_direction == "horizontal":
-
-
-
-
-
-
-
+            # Stack horizontally
+            final_height = max(img.height for img in segment_images)
+
+            # Calculate total width including gaps and separators
+            total_width = sum(img.width for img in segment_images)
+            total_width += (len(segment_images) - 1) * stack_gap
+
+            # Add separator thickness between all segments
+            num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
+            total_width += num_separators * separator_thickness
+
+            # Create the final image
+            final_image = Image.new("RGB", (total_width, final_height), stack_background_color)
+            draw = ImageDraw.Draw(final_image)
 
-            concatenated_image = PIL_Image_Runtime.new(
-                "RGB", (final_width, final_height), stack_background_color
-            )
             current_x = 0
-
-
-
-
-
+
+            for i, img in enumerate(segment_images):
+                # Add separator line before each segment (except the first one)
+                if i > 0:
+                    # Draw separator line
+                    draw.rectangle(
+                        [(current_x, 0), (current_x + separator_thickness, final_height)],
+                        fill=separator_color,
+                    )
+                    current_x += separator_thickness
+
+                # Paste the segment image
+                paste_y = (final_height - img.height) // 2  # Center vertically
+                final_image.paste(img, (current_x, paste_y))
+                current_x += img.width
+
+                # Add gap after segment (except for the last one)
+                if i < len(segment_images) - 1:
+                    current_x += stack_gap
+
+            return final_image
+
         else:
             raise ValueError(
-                f"Invalid stack_direction '{final_stack_direction}' for
+                f"Invalid stack_direction '{final_stack_direction}' for in_context. Must be 'vertical' or 'horizontal'."
             )
 
     # --- Helper methods for coordinate transformations and segment iteration ---
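The canvas arithmetic in `_show_in_context` is easy to restate: for n segment images there are n-1 gaps and n-1 separator lines, so, for the vertical case:

```python
# total_height = sum(img.height) + (n - 1) * stack_gap + (n - 1) * separator_thickness
heights = [800, 750, 820]  # three rendered segments (illustrative numbers)
stack_gap, separator_thickness = 5, 2
total = sum(heights) + (len(heights) - 1) * (stack_gap + separator_thickness)
assert total == 2384  # 2370 px of content + 2 * (5 + 2) px of chrome
```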
@@ -972,3 +1289,643 @@ class Flow:
|
|
972
1289
|
raise NotImplementedError(
|
973
1290
|
"Translating element coordinates to a unified flow coordinate system is not yet implemented."
|
974
1291
|
)
|
1292
|
+
|
1293
|
+
def get_sections(
|
1294
|
+
self,
|
1295
|
+
start_elements=None,
|
1296
|
+
end_elements=None,
|
1297
|
+
new_section_on_page_break: bool = False,
|
1298
|
+
include_boundaries: str = "both",
|
1299
|
+
) -> "ElementCollection":
|
1300
|
+
"""
|
1301
|
+
Extract logical sections from the Flow based on *start* and *end* boundary
|
1302
|
+
elements, mirroring the behaviour of PDF/PageCollection.get_sections().
|
1303
|
+
|
1304
|
+
This implementation is a thin wrapper that converts the Flow into a
|
1305
|
+
temporary PageCollection (constructed from the unique pages that the
|
1306
|
+
Flow spans) and then delegates the heavy‐lifting to that existing
|
1307
|
+
implementation. Any FlowElement / FlowElementCollection inputs are
|
1308
|
+
automatically unwrapped to their underlying physical elements so that
|
1309
|
+
PageCollection can work with them directly.
|
1310
|
+
|
1311
|
+
Args:
|
1312
|
+
start_elements: Elements or selector string that mark the start of
|
1313
|
+
sections (optional).
|
1314
|
+
end_elements: Elements or selector string that mark the end of
|
1315
|
+
sections (optional).
|
1316
|
+
new_section_on_page_break: Whether to start a new section at page
|
1317
|
+
boundaries (default: False).
|
1318
|
+
include_boundaries: How to include boundary elements: 'start',
|
1319
|
+
'end', 'both', or 'none' (default: 'both').
|
1320
|
+
|
1321
|
+
Returns:
|
1322
|
+
ElementCollection of Region/FlowRegion objects representing the
|
1323
|
+
extracted sections.
|
1324
|
+
"""
|
1325
|
+
# ------------------------------------------------------------------
|
1326
|
+
# Unwrap FlowElement(-Collection) inputs and selector strings so we
|
1327
|
+
# can reason about them generically.
|
1328
|
+
# ------------------------------------------------------------------
|
1329
|
+
from natural_pdf.flows.collections import FlowElementCollection
|
1330
|
+
from natural_pdf.flows.element import FlowElement
|
1331
|
+
|
1332
|
+
def _unwrap(obj):
|
1333
|
+
"""Convert Flow-specific wrappers to their underlying physical objects.
|
1334
|
+
|
1335
|
+
Keeps selector strings as-is; converts FlowElement to its physical
|
1336
|
+
element; converts FlowElementCollection to list of physical
|
1337
|
+
elements; passes through ElementCollection by taking .elements.
|
1338
|
+
"""
|
1339
|
+
|
1340
|
+
if obj is None or isinstance(obj, str):
|
1341
|
+
return obj
|
1342
|
+
|
1343
|
+
if isinstance(obj, FlowElement):
|
1344
|
+
return obj.physical_object
|
1345
|
+
|
1346
|
+
if isinstance(obj, FlowElementCollection):
|
1347
|
+
return [fe.physical_object for fe in obj.flow_elements]
|
1348
|
+
|
1349
|
+
if hasattr(obj, "elements"):
|
1350
|
+
return obj.elements
|
1351
|
+
|
1352
|
+
if isinstance(obj, (list, tuple, set)):
|
1353
|
+
out = []
|
1354
|
+
for item in obj:
|
1355
|
+
if isinstance(item, FlowElement):
|
1356
|
+
out.append(item.physical_object)
|
1357
|
+
else:
|
1358
|
+
out.append(item)
|
1359
|
+
return out
|
1360
|
+
|
1361
|
+
return obj # Fallback – unknown type
|
1362
|
+
|
1363
|
+
start_elements_unwrapped = _unwrap(start_elements)
|
1364
|
+
end_elements_unwrapped = _unwrap(end_elements)
|
1365
|
+
|
1366
|
+
# ------------------------------------------------------------------
|
1367
|
+
# PRIMARY IMPLEMENTATION – operate on each Flow **segment region**
|
1368
|
+
# independently so that sectioning happens *per-region*, not per page.
|
1369
|
+
# ------------------------------------------------------------------
|
1370
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
1371
|
+
|
1372
|
+
aggregated_sections = []
|
1373
|
+
|
1374
|
+
# Helper to decide if an element lies inside a segment (Region)
|
1375
|
+
def _element_in_segment(elem, segment_region):
|
1376
|
+
try:
|
1377
|
+
return segment_region.intersects(elem) # Region method – robust
|
1378
|
+
except Exception:
|
1379
|
+
# Fallback to bounding-box containment checks
|
1380
|
+
if not hasattr(elem, "bbox"):
|
1381
|
+
return False
|
1382
|
+
ex0, etop, ex1, ebottom = elem.bbox
|
1383
|
+
sx0, stop, sx1, sbottom = segment_region.bbox
|
1384
|
+
return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
|
1385
|
+
|
1386
|
+
for seg in self.segments:
|
1387
|
+
# Each *seg* is guaranteed to be a Region (see _normalize_segments)
|
1388
|
+
|
1389
|
+
# Resolve segment-specific boundary arguments
|
1390
|
+
seg_start_elems = None
|
1391
|
+
seg_end_elems = None
|
1392
|
+
|
1393
|
+
# --- Handle selector strings ---
|
1394
|
+
if isinstance(start_elements_unwrapped, str):
|
1395
|
+
seg_start_elems = seg.find_all(start_elements_unwrapped).elements
|
1396
|
+
elif start_elements_unwrapped is not None:
|
1397
|
+
seg_start_elems = [
|
1398
|
+
e for e in start_elements_unwrapped if _element_in_segment(e, seg)
|
1399
|
+
]
|
1400
|
+
|
1401
|
+
if isinstance(end_elements_unwrapped, str):
|
1402
|
+
seg_end_elems = seg.find_all(end_elements_unwrapped).elements
|
1403
|
+
elif end_elements_unwrapped is not None:
|
1404
|
+
seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
|
1405
|
+
|
1406
|
+
# Call Region.get_sections – this returns ElementCollection[Region]
|
1407
|
+
seg_sections = seg.get_sections(
|
1408
|
+
start_elements=seg_start_elems,
|
1409
|
+
end_elements=seg_end_elems,
|
1410
|
+
include_boundaries=include_boundaries,
|
1411
|
+
)
|
1412
|
+
|
1413
|
+
if seg_sections:
|
1414
|
+
aggregated_sections.extend(seg_sections.elements)
|
1415
|
+
|
1416
|
+
# Optionally, handle new_section_on_page_break – interpreted here as
|
1417
|
+
# *new_section_on_segment_break*: if True and there were *no* explicit
|
1418
|
+
# boundaries, treat the entire segment as a single section.
|
1419
|
+
if (
|
1420
|
+
new_section_on_page_break
|
1421
|
+
and not seg_sections
|
1422
|
+
and start_elements_unwrapped is None
|
1423
|
+
and end_elements_unwrapped is None
|
1424
|
+
):
|
1425
|
+
aggregated_sections.append(seg)
|
1426
|
+
|
1427
|
+
# ------------------------------------------------------------------
|
1428
|
+
# CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
|
1429
|
+
# span multiple segments and create FlowRegions for those cases.
|
1430
|
+
# ------------------------------------------------------------------
|
1431
|
+
|
1432
|
+
# If we have explicit start/end elements, check for cross-segment sections
|
1433
|
+
+        if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
+            # Find all start and end elements across all segments
+            all_start_elements = []
+            all_end_elements = []
+
+            # Map elements to their segments for tracking
+            element_to_segment = {}
+
+            for seg_idx, seg in enumerate(self.segments):
+                if isinstance(start_elements_unwrapped, str):
+                    seg_starts = seg.find_all(start_elements_unwrapped).elements
+                else:
+                    seg_starts = [
+                        e for e in start_elements_unwrapped if _element_in_segment(e, seg)
+                    ]
+
+                if isinstance(end_elements_unwrapped, str):
+                    seg_ends = seg.find_all(end_elements_unwrapped).elements
+                else:
+                    seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
+
+                for elem in seg_starts:
+                    all_start_elements.append((elem, seg_idx))
+                    element_to_segment[id(elem)] = seg_idx
+
+                for elem in seg_ends:
+                    all_end_elements.append((elem, seg_idx))
+                    element_to_segment[id(elem)] = seg_idx
+
+            # Sort by segment index, then by position within segment
+            all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+            all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+
+            # Look for cross-segment pairs (start in one segment, end in another)
+            cross_segment_sections = []
+            used_starts = set()
+            used_ends = set()
+
+            for start_elem, start_seg_idx in all_start_elements:
+                if id(start_elem) in used_starts:
+                    continue
+
+                # Find the next end element that comes after this start
+                matching_end = None
+                for end_elem, end_seg_idx in all_end_elements:
+                    if id(end_elem) in used_ends:
+                        continue
+
+                    # Check if this end comes after the start (by segment order or position)
+                    if end_seg_idx > start_seg_idx or (
+                        end_seg_idx == start_seg_idx
+                        and (
+                            end_elem.top > start_elem.top
+                            or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
+                        )
+                    ):
+                        matching_end = (end_elem, end_seg_idx)
+                        break
+
+                if matching_end is not None:
+                    end_elem, end_seg_idx = matching_end
+
+                    # If start and end are in different segments, create FlowRegion
+                    if start_seg_idx != end_seg_idx:
+                        cross_segment_sections.append(
+                            (start_elem, start_seg_idx, end_elem, end_seg_idx)
+                        )
+                        used_starts.add(id(start_elem))
+                        used_ends.add(id(end_elem))
+
+            # Create FlowRegions for cross-segment sections
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
+                # Build constituent regions spanning from start segment to end segment
+                constituent_regions = []
+
+                # First segment: from start element to bottom
+                start_seg = self.segments[start_seg_idx]
+                first_region = Region(
+                    start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
+                )
+                constituent_regions.append(first_region)
+
+                # Middle segments: full segments
+                for seg_idx in range(start_seg_idx + 1, end_seg_idx):
+                    constituent_regions.append(self.segments[seg_idx])
+
+                # Last segment: from top to end element
+                if end_seg_idx != start_seg_idx:
+                    end_seg = self.segments[end_seg_idx]
+                    last_region = Region(
+                        end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
+                    )
+                    constituent_regions.append(last_region)
+
+                # Create FlowRegion
+                flow_element = FlowElement(physical_object=start_elem, flow=self)
+                flow_region = FlowRegion(
+                    flow=self,
+                    constituent_regions=constituent_regions,
+                    source_flow_element=flow_element,
+                    boundary_element_found=end_elem,
+                )
+
+                # Remove any single-segment sections that are now covered by this FlowRegion
+                # This prevents duplication of content
+                aggregated_sections = [
+                    s
+                    for s in aggregated_sections
+                    if not any(
+                        cr.intersects(s)
+                        for cr in constituent_regions
+                        if hasattr(cr, "intersects") and hasattr(s, "intersects")
+                    )
+                ]
+
+                aggregated_sections.append(flow_region)
+
+        # ------------------------------------------------------------------
+        # NEW APPROACH: First collect ALL boundary elements across all segments,
+        # then pair them up to create sections (either single-segment Regions
+        # or multi-segment FlowRegions).
+        # ------------------------------------------------------------------
+        from natural_pdf.elements.element_collection import ElementCollection
+        from natural_pdf.elements.region import Region
+        from natural_pdf.flows.element import FlowElement
+        from natural_pdf.flows.region import FlowRegion
+
+        # Helper to decide if an element lies inside a segment (Region)
+        def _element_in_segment(elem, segment_region):
+            try:
+                return segment_region.intersects(elem)  # Region method – robust
+            except Exception:
+                # Fallback to bounding-box containment checks
+                if not hasattr(elem, "bbox"):
+                    return False
+                ex0, etop, ex1, ebottom = elem.bbox
+                sx0, stop, sx1, sbottom = segment_region.bbox
+                return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
+
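The fallback branch above is the standard axis-aligned bounding-box overlap test: two boxes are disjoint only when one lies entirely to the left, right, above, or below the other. A minimal standalone sketch of the same test, with hypothetical coordinates (natural-pdf uses pdfplumber-style (x0, top, x1, bottom) boxes, where top increases toward the bottom of the page):

def bboxes_overlap(a, b):
    # a and b are (x0, top, x1, bottom) tuples, with top < bottom
    ax0, atop, ax1, abottom = a
    bx0, btop, bx1, bbottom = b
    # Disjoint only when a is fully left/right of b, or fully above/below it
    return not (ax1 < bx0 or ax0 > bx1 or abottom < btop or atop > bbottom)

assert bboxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))       # partial overlap
assert not bboxes_overlap((0, 0, 10, 10), (20, 0, 30, 10))  # fully to the right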
+        # Collect ALL boundary elements across all segments with their segment indices
+        all_start_elements = []
+        all_end_elements = []
+
+        for seg_idx, seg in enumerate(self.segments):
+            # Find start elements in this segment
+            if isinstance(start_elements_unwrapped, str):
+                seg_starts = seg.find_all(start_elements_unwrapped).elements
+            elif start_elements_unwrapped is not None:
+                seg_starts = [e for e in start_elements_unwrapped if _element_in_segment(e, seg)]
+            else:
+                seg_starts = []
+
+            logger.debug(f"\n=== Processing segment {seg_idx} ===")
+            logger.debug(f"Segment bbox: {seg.bbox}")
+            logger.debug(
+                f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
+            )
+
+            logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
+            for i, elem in enumerate(seg_starts):
+                logger.debug(
+                    f" Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
+                )
+
+            # Find end elements in this segment
+            if isinstance(end_elements_unwrapped, str):
+                seg_ends = seg.find_all(end_elements_unwrapped).elements
+            elif end_elements_unwrapped is not None:
+                seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
+            else:
+                seg_ends = []
+
+            logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
+            for i, elem in enumerate(seg_ends):
+                logger.debug(
+                    f" End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
+                )
+
+            # Add to global lists with segment index
+            for elem in seg_starts:
+                all_start_elements.append((elem, seg_idx))
+            for elem in seg_ends:
+                all_end_elements.append((elem, seg_idx))
+
+        # Sort by flow order: segment index first, then position within segment
+        all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+        all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+
+        logger.debug(f"\n=== Total boundary elements found ===")
+        logger.debug(f"Total start elements: {len(all_start_elements)}")
+        logger.debug(f"Total end elements: {len(all_end_elements)}")
+
+        # Pair up start and end elements to create sections
+        sections = []
+        used_starts = set()
+        used_ends = set()
+
+        for start_elem, start_seg_idx in all_start_elements:
+            if id(start_elem) in used_starts:
+                continue
+
+            logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
+            logger.debug(
+                f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
+            )
+
+            # Find the next unused end element that comes after this start
+            matching_end = None
+            for end_elem, end_seg_idx in all_end_elements:
+                if id(end_elem) in used_ends:
+                    continue
+
+                # Check if this end comes after the start in flow order
+                if end_seg_idx > start_seg_idx or (
+                    end_seg_idx == start_seg_idx
+                    and (
+                        end_elem.top > start_elem.top
+                        or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
+                    )
+                ):
+                    matching_end = (end_elem, end_seg_idx)
+                    break
+
+            if matching_end is not None:
+                end_elem, end_seg_idx = matching_end
+                used_starts.add(id(start_elem))
+                used_ends.add(id(end_elem))
+
+                logger.debug(f" Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
+
+                # Create section based on whether it spans segments
+                if start_seg_idx == end_seg_idx:
+                    # Single segment section - use Region.get_section_between
+                    seg = self.segments[start_seg_idx]
+                    section = seg.get_section_between(start_elem, end_elem, include_boundaries)
+                    sections.append(section)
+                    logger.debug(f" Created single-segment Region")
+                else:
+                    # Multi-segment section - create FlowRegion
+                    logger.debug(
+                        f" Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
+                    )
+                    constituent_regions = []
+
+                    # First segment: from start element to bottom
+                    start_seg = self.segments[start_seg_idx]
+                    if include_boundaries in ["start", "both"]:
+                        first_top = start_elem.top
+                    else:
+                        first_top = start_elem.bottom
+                    first_region = Region(
+                        start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
+                    )
+                    constituent_regions.append(first_region)
+
+                    # Middle segments: full segments
+                    for seg_idx in range(start_seg_idx + 1, end_seg_idx):
+                        constituent_regions.append(self.segments[seg_idx])
+
+                    # Last segment: from top to end element
+                    end_seg = self.segments[end_seg_idx]
+                    if include_boundaries in ["end", "both"]:
+                        last_bottom = end_elem.bottom
+                    else:
+                        last_bottom = end_elem.top
+                    last_region = Region(
+                        end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
+                    )
+                    constituent_regions.append(last_region)
+
+                    # Create FlowRegion
+                    flow_element = FlowElement(physical_object=start_elem, flow=self)
+                    flow_region = FlowRegion(
+                        flow=self,
+                        constituent_regions=constituent_regions,
+                        source_flow_element=flow_element,
+                        boundary_element_found=end_elem,
+                    )
+                    sections.append(flow_region)
+
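The pairing loop above is a greedy sweep: both boundary lists are sorted by (segment index, top, x0), and each start claims the first unused end that follows it in flow order. A minimal sketch of the same strategy on plain tuples, with made-up data and no natural-pdf types:

# Each boundary is (segment_index, top, x0); tuple comparison mirrors the
# flow-order check above: later segment first, then lower on the page,
# then same-or-greater x0 as the tie-break.
starts = sorted([(0, 100, 10), (1, 50, 10)])
ends = sorted([(0, 300, 10), (2, 80, 10)])

pairs = []
used = [False] * len(ends)
for s in starts:
    for i, e in enumerate(ends):
        if not used[i] and e >= s:
            used[i] = True
            pairs.append((s, e))
            break

print(pairs)  # [((0, 100, 10), (0, 300, 10)), ((1, 50, 10), (2, 80, 10))]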
+        # Handle special cases when only start or only end elements are provided
+        if start_elements_unwrapped is not None and end_elements_unwrapped is None:
+            logger.debug(f"\n=== Handling start-only elements (no end elements provided) ===")
+            for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
+                if id(start_elem) in used_starts:
+                    continue
+
+                # Find next start element
+                next_start = None
+                if i + 1 < len(all_start_elements):
+                    next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
+                    # Create section from this start to just before next start
+                    if start_seg_idx == next_start_seg_idx:
+                        # Same segment
+                        seg = self.segments[start_seg_idx]
+                        # Find element just before next start
+                        all_elems = seg.get_elements()
+                        all_elems.sort(key=lambda e: (e.top, e.x0))
+                        try:
+                            next_idx = all_elems.index(next_start_elem)
+                            if next_idx > 0:
+                                end_elem = all_elems[next_idx - 1]
+                                section = seg.get_section_between(
+                                    start_elem, end_elem, include_boundaries
+                                )
+                                sections.append(section)
+                        except ValueError:
+                            pass
+                    elif next_start_seg_idx == start_seg_idx + 1:
+                        # Next start is in the immediately following segment in the flow
+                        # Create a FlowRegion that spans from current start to just before next start
+                        logger.debug(f" Next start is in next flow segment - creating FlowRegion")
+
+                        constituent_regions = []
+
+                        # First segment: from start element to bottom
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            first_top = start_elem.top
+                        else:
+                            first_top = start_elem.bottom
+                        first_region = Region(
+                            start_seg.page,
+                            (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
+                        )
+                        constituent_regions.append(first_region)
+
+                        # Next segment: from top to just before next start
+                        next_seg = self.segments[next_start_seg_idx]
+                        # Find element just before next start in the next segment
+                        next_seg_elems = next_seg.get_elements()
+                        next_seg_elems.sort(key=lambda e: (e.top, e.x0))
+
+                        last_bottom = next_start_elem.top  # Default to just before the next start
+                        try:
+                            next_idx = next_seg_elems.index(next_start_elem)
+                            if next_idx > 0:
+                                # Use the bottom of the element before next start
+                                prev_elem = next_seg_elems[next_idx - 1]
+                                last_bottom = prev_elem.bottom
+                        except ValueError:
+                            pass
+
+                        last_region = Region(
+                            next_seg.page, (next_seg.x0, next_seg.top, next_seg.x1, last_bottom)
+                        )
+                        constituent_regions.append(last_region)
+
+                        # Create FlowRegion
+                        flow_element = FlowElement(physical_object=start_elem, flow=self)
+                        flow_region = FlowRegion(
+                            flow=self,
+                            constituent_regions=constituent_regions,
+                            source_flow_element=flow_element,
+                            boundary_element_found=None,
+                        )
+                        sections.append(flow_region)
+                        logger.debug(
+                            f" Created FlowRegion with {len(constituent_regions)} constituent regions"
+                        )
+                    else:
+                        # Next start is more than one segment away - just end at current segment
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            region_top = start_elem.top
+                        else:
+                            region_top = start_elem.bottom
+                        section = Region(
+                            start_seg.page,
+                            (start_seg.x0, region_top, start_seg.x1, start_seg.bottom),
+                        )
+                        sections.append(section)
+                        logger.debug(
+                            f" Next start is {next_start_seg_idx - start_seg_idx} segments away - ending at current segment"
+                        )
+                else:
+                    # Last start element: section goes to end of flow
+                    # This could span multiple segments
+                    if start_seg_idx == len(self.segments) - 1:
+                        # Only in last segment
+                        seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            region_top = start_elem.top
+                        else:
+                            region_top = start_elem.bottom
+                        section = Region(seg.page, (seg.x0, region_top, seg.x1, seg.bottom))
+                        sections.append(section)
+                    else:
+                        # Spans to end of flow - create FlowRegion
+                        constituent_regions = []
+
+                        # First segment
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            first_top = start_elem.top
+                        else:
+                            first_top = start_elem.bottom
+                        first_region = Region(
+                            start_seg.page,
+                            (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
+                        )
+                        constituent_regions.append(first_region)
+
+                        # Remaining segments
+                        for seg_idx in range(start_seg_idx + 1, len(self.segments)):
+                            constituent_regions.append(self.segments[seg_idx])
+
+                        flow_element = FlowElement(physical_object=start_elem, flow=self)
+                        flow_region = FlowRegion(
+                            flow=self,
+                            constituent_regions=constituent_regions,
+                            source_flow_element=flow_element,
+                            boundary_element_found=None,
+                        )
+                        sections.append(flow_region)
+
+        # Handle new_section_on_page_break when no explicit boundaries
+        if (
+            new_section_on_page_break
+            and start_elements_unwrapped is None
+            and end_elements_unwrapped is None
+        ):
+            # Each segment becomes its own section
+            sections = list(self.segments)
+
+        # Sort sections by their position in the flow
+        def _section_sort_key(section):
+            if hasattr(section, "constituent_regions"):
+                # FlowRegion - use first constituent region
+                first_region = (
+                    section.constituent_regions[0] if section.constituent_regions else None
+                )
+                if first_region:
+                    # Find which segment this region belongs to
+                    for idx, seg in enumerate(self.segments):
+                        try:
+                            if seg.intersects(first_region):
+                                return (
+                                    idx,
+                                    getattr(first_region, "top", 0),
+                                    getattr(first_region, "x0", 0),
+                                )
+                        except:
+                            pass
+            else:
+                # Regular Region
+                for idx, seg in enumerate(self.segments):
+                    try:
+                        if seg.intersects(section):
+                            return (idx, getattr(section, "top", 0), getattr(section, "x0", 0))
+                    except:
+                        pass
+            return (float("inf"), 0, 0)
+
+        sections.sort(key=_section_sort_key)
+
+        logger.debug(f"\n=== Section creation complete ===")
+        logger.debug(f"Total sections created: {len(sections)}")
+        for i, section in enumerate(sections):
+            if hasattr(section, "constituent_regions"):
+                logger.debug(
+                    f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
+                )
+            else:
+                logger.debug(f"Section {i}: Region with bbox={section.bbox}")
+
+        return ElementCollection(sections)
+
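Taken together, this body yields plain Regions for sections contained in a single segment and FlowRegions for sections that straddle segment boundaries. A usage sketch, assuming this is the body of Flow.get_sections, that the public parameters are named start_elements/end_elements, that Flow is importable from natural_pdf.flows, and that pages can be passed directly as segments (the file name and selector are illustrative):

from natural_pdf import PDF
from natural_pdf.flows import Flow

pdf = PDF("report.pdf")  # hypothetical document
flow = Flow(segments=pdf.pages, arrangement="vertical")

# One section per bold heading; a section that begins near the bottom of one
# page and continues onto the next comes back as a multi-segment FlowRegion.
sections = flow.get_sections(start_elements="text:bold")
for section in sections:
    print(type(section).__name__, section.extract_text()[:60])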
+    def highlights(self, show: bool = False) -> "HighlightContext":
+        """
+        Create a highlight context for accumulating highlights.
+
+        This allows for clean syntax to show multiple highlight groups:
+
+        Example:
+            with flow.highlights() as h:
+                h.add(flow.find_all('table'), label='tables', color='blue')
+                h.add(flow.find_all('text:bold'), label='bold text', color='red')
+                h.show()
+
+        Or with automatic display:
+            with flow.highlights(show=True) as h:
+                h.add(flow.find_all('table'), label='tables')
+                h.add(flow.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context
+
+        Args:
+            show: If True, automatically show highlights when exiting context
+
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext
+
+        return HighlightContext(self, show_on_exit=show)
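The context manager composes naturally with the section logic above. A short sketch reusing the docstring's own API, under the same get_sections assumptions as before and assuming h.add accepts the collection that get_sections returns:

with flow.highlights(show=True) as h:
    h.add(flow.find_all('text:bold'), label='headings', color='red')
    h.add(flow.get_sections(start_elements='text:bold'), label='sections', color='green')
# Highlights are rendered automatically when the context exits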