natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/flows/flow.py CHANGED
@@ -1,19 +1,43 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
2
+ import warnings
3
+ from typing import (
4
+ TYPE_CHECKING,
5
+ Any,
6
+ Callable,
7
+ Dict,
8
+ List,
9
+ Literal,
10
+ Optional,
11
+ Tuple,
12
+ Union,
13
+ overload,
14
+ )
3
15
 
4
16
  if TYPE_CHECKING:
17
+ from PIL.Image import Image as PIL_Image
18
+
5
19
  from natural_pdf.core.page import Page
20
+ from natural_pdf.core.page_collection import PageCollection
6
21
  from natural_pdf.elements.base import Element as PhysicalElement
7
- from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection
22
+ from natural_pdf.elements.element_collection import (
23
+ ElementCollection as PhysicalElementCollection,
24
+ )
8
25
  from natural_pdf.elements.region import Region as PhysicalRegion
9
26
 
10
27
  from .collections import FlowElementCollection
11
28
  from .element import FlowElement
12
29
 
30
+ # Import required classes for the new methods
31
+ # For runtime image manipulation
32
+ from PIL import Image as PIL_Image_Runtime
33
+
34
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
35
+ from natural_pdf.tables import TableResult
36
+
13
37
  logger = logging.getLogger(__name__)
14
38
 
15
39
 
16
- class Flow:
40
+ class Flow(Visualizable):
17
41
  """Defines a logical flow or sequence of physical Page or Region objects.
18
42
 
19
43
  A Flow represents a continuous logical document structure that spans across
@@ -81,7 +105,7 @@ class Flow:
81
105
 
82
106
  def __init__(
83
107
  self,
84
- segments: List[Union["Page", "PhysicalRegion"]],
108
+ segments: Union[List[Union["Page", "PhysicalRegion"]], "PageCollection"],
85
109
  arrangement: Literal["vertical", "horizontal"],
86
110
  alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
87
111
  segment_gap: float = 0.0,
@@ -91,7 +115,8 @@ class Flow:
91
115
 
92
116
  Args:
93
117
  segments: An ordered list of natural_pdf.core.page.Page or
94
- natural_pdf.elements.region.Region objects that constitute the flow.
118
+ natural_pdf.elements.region.Region objects that constitute the flow,
119
+ or a PageCollection containing pages.
95
120
  arrangement: The primary direction of the flow.
96
121
  - "vertical": Segments are stacked top-to-bottom.
97
122
  - "horizontal": Segments are arranged left-to-right.
@@ -106,6 +131,10 @@ class Flow:
106
131
  - "bottom" (or "end"): Align bottom edges.
107
132
  segment_gap: The virtual gap (in PDF points) between segments.
108
133
  """
134
+ # Handle PageCollection input
135
+ if hasattr(segments, "pages"): # It's a PageCollection
136
+ segments = list(segments.pages)
137
+
109
138
  if not segments:
110
139
  raise ValueError("Flow segments cannot be empty.")
111
140
  if arrangement not in ["vertical", "horizontal"]:
@@ -165,6 +194,103 @@ class Flow:
165
194
  f"Valid options are: {valid_alignments[self.arrangement]}"
166
195
  )
167
196
 
197
+ def _get_highlighter(self):
198
+ """Get the highlighting service from the first segment."""
199
+ if not self.segments:
200
+ raise RuntimeError("Flow has no segments to get highlighter from")
201
+
202
+ # Get highlighter from first segment
203
+ first_segment = self.segments[0]
204
+ if hasattr(first_segment, "_highlighter"):
205
+ return first_segment._highlighter
206
+ elif hasattr(first_segment, "page") and hasattr(first_segment.page, "_highlighter"):
207
+ return first_segment.page._highlighter
208
+ else:
209
+ raise RuntimeError(
210
+ f"Cannot find HighlightingService from Flow segments. "
211
+ f"First segment type: {type(first_segment).__name__}"
212
+ )
213
+
214
+ def show(
215
+ self,
216
+ *,
217
+ # Basic rendering options
218
+ resolution: Optional[float] = None,
219
+ width: Optional[int] = None,
220
+ # Highlight options
221
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
222
+ labels: bool = True,
223
+ label_format: Optional[str] = None,
224
+ highlights: Optional[List[Dict[str, Any]]] = None,
225
+ # Layout options for multi-page/region
226
+ layout: Literal["stack", "grid", "single"] = "stack",
227
+ stack_direction: Literal["vertical", "horizontal"] = "vertical",
228
+ gap: int = 5,
229
+ columns: Optional[int] = None, # For grid layout
230
+ # Cropping options
231
+ crop: Union[bool, Literal["content"]] = False,
232
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
233
+ # Flow-specific options
234
+ in_context: bool = False,
235
+ separator_color: Optional[Tuple[int, int, int]] = None,
236
+ separator_thickness: int = 2,
237
+ **kwargs,
238
+ ) -> Optional["PIL_Image"]:
239
+ """Generate a preview image with highlights.
240
+
241
+ If in_context=True, shows segments as cropped images stacked together
242
+ with separators between segments.
243
+
244
+ Args:
245
+ resolution: DPI for rendering (default from global settings)
246
+ width: Target width in pixels (overrides resolution)
247
+ color: Default highlight color
248
+ labels: Whether to show labels for highlights
249
+ label_format: Format string for labels
250
+ highlights: Additional highlight groups to show
251
+ layout: How to arrange multiple pages/regions
252
+ stack_direction: Direction for stack layout
253
+ gap: Pixels between stacked images
254
+ columns: Number of columns for grid layout
255
+ crop: Whether to crop
256
+ crop_bbox: Explicit crop bounds
257
+ in_context: If True, use special Flow visualization with separators
258
+ separator_color: RGB color for separator lines (default: red)
259
+ separator_thickness: Thickness of separator lines
260
+ **kwargs: Additional parameters passed to rendering
261
+
262
+ Returns:
263
+ PIL Image object or None if nothing to render
264
+ """
265
+ if in_context:
266
+ # Use the special in_context visualization
267
+ return self._show_in_context(
268
+ resolution=resolution or 150,
269
+ width=width,
270
+ stack_direction=stack_direction,
271
+ stack_gap=gap,
272
+ separator_color=separator_color or (255, 0, 0),
273
+ separator_thickness=separator_thickness,
274
+ **kwargs,
275
+ )
276
+
277
+ # Otherwise use the standard show method
278
+ return super().show(
279
+ resolution=resolution,
280
+ width=width,
281
+ color=color,
282
+ labels=labels,
283
+ label_format=label_format,
284
+ highlights=highlights,
285
+ layout=layout,
286
+ stack_direction=stack_direction,
287
+ gap=gap,
288
+ columns=columns,
289
+ crop=crop,
290
+ crop_bbox=crop_bbox,
291
+ **kwargs,
292
+ )
293
+
168
294
  def find(
169
295
  self,
170
296
  selector: Optional[str] = None,
@@ -213,7 +339,10 @@ class Flow:
213
339
  ) -> "FlowElementCollection":
214
340
  """
215
341
  Finds all elements within the flow that match the given selector or text criteria.
216
- Elements are collected segment by segment, preserving the flow order.
342
+
343
+ This method efficiently groups segments by their parent pages, searches at the page level,
344
+ then filters results appropriately for each segment. This ensures elements that intersect
345
+ with flow segments (but aren't fully contained) are still found.
217
346
 
218
347
  Elements found are wrapped as FlowElement objects, anchored to this Flow,
219
348
  and returned in a FlowElementCollection.
@@ -221,13 +350,42 @@ class Flow:
221
350
  from .collections import FlowElementCollection
222
351
  from .element import FlowElement
223
352
 
353
+ # Step 1: Group segments by their parent pages (like in analyze_layout)
354
+ segments_by_page = {} # Dict[Page, List[Segment]]
355
+
356
+ for i, segment in enumerate(self.segments):
357
+ # Determine the page for this segment - fix type detection
358
+ if hasattr(segment, "page") and hasattr(segment.page, "find_all"):
359
+ # It's a Region object (has a parent page)
360
+ page_obj = segment.page
361
+ segment_type = "region"
362
+ elif (
363
+ hasattr(segment, "find_all")
364
+ and hasattr(segment, "width")
365
+ and hasattr(segment, "height")
366
+ and not hasattr(segment, "page")
367
+ ):
368
+ # It's a Page object (has find_all but no parent page)
369
+ page_obj = segment
370
+ segment_type = "page"
371
+ else:
372
+ logger.warning(f"Segment {i+1} does not support find_all, skipping")
373
+ continue
374
+
375
+ if page_obj not in segments_by_page:
376
+ segments_by_page[page_obj] = []
377
+ segments_by_page[page_obj].append((segment, segment_type))
378
+
379
+ if not segments_by_page:
380
+ logger.warning("No segments with searchable pages found")
381
+ return FlowElementCollection([])
382
+
383
+ # Step 2: Search each unique page only once
224
384
  all_flow_elements: List["FlowElement"] = []
225
385
 
226
- # Iterate through segments in their defined flow order
227
- for physical_segment in self.segments:
228
- # Find all matching physical elements within the current segment
229
- # Region.find_all() should return elements in local reading order.
230
- matches_in_segment: "PhysicalElementCollection" = physical_segment.find_all(
386
+ for page_obj, page_segments in segments_by_page.items():
387
+ # Find all matching elements on this page
388
+ page_matches = page_obj.find_all(
231
389
  selector=selector,
232
390
  text=text,
233
391
  apply_exclusions=apply_exclusions,
@@ -235,16 +393,56 @@ class Flow:
235
393
  case=case,
236
394
  **kwargs,
237
395
  )
238
- if matches_in_segment:
239
- # Wrap each found physical element as a FlowElement and add to the list
240
- # This preserves the order from matches_in_segment.elements
241
- for phys_elem in matches_in_segment.elements:
242
- all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
243
396
 
244
- # The global sort that was here previously has been removed.
245
- # The order is now determined by segment sequence, then by local order within each segment.
397
+ if not page_matches:
398
+ continue
399
+
400
+ # Step 3: For each segment on this page, collect relevant elements
401
+ for segment, segment_type in page_segments:
402
+ if segment_type == "page":
403
+ # Full page segment: include all elements
404
+ for phys_elem in page_matches.elements:
405
+ all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
406
+
407
+ elif segment_type == "region":
408
+ # Region segment: filter to only intersecting elements
409
+ for phys_elem in page_matches.elements:
410
+ try:
411
+ # Check if element intersects with this flow segment
412
+ if segment.intersects(phys_elem):
413
+ all_flow_elements.append(
414
+ FlowElement(physical_object=phys_elem, flow=self)
415
+ )
416
+ except Exception as intersect_error:
417
+ logger.debug(
418
+ f"Error checking intersection for element: {intersect_error}"
419
+ )
420
+ # Include the element anyway if intersection check fails
421
+ all_flow_elements.append(
422
+ FlowElement(physical_object=phys_elem, flow=self)
423
+ )
424
+
425
+ # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
426
+ unique_flow_elements = []
427
+ seen_element_ids = set()
428
+
429
+ for flow_elem in all_flow_elements:
430
+ # Create a unique identifier for the underlying physical element
431
+ phys_elem = flow_elem.physical_object
432
+ elem_id = (
433
+ (
434
+ getattr(phys_elem.page, "index", id(phys_elem.page))
435
+ if hasattr(phys_elem, "page")
436
+ else id(phys_elem)
437
+ ),
438
+ phys_elem.bbox if hasattr(phys_elem, "bbox") else id(phys_elem),
439
+ )
440
+
441
+ if elem_id not in seen_element_ids:
442
+ unique_flow_elements.append(flow_elem)
443
+ seen_element_ids.add(elem_id)
246
444
 
247
- return FlowElementCollection(all_flow_elements)
445
+ return FlowElementCollection(unique_flow_elements)
248
446
 
249
447
  def __repr__(self) -> str:
250
448
  return (
@@ -252,6 +450,807 @@ class Flow:
252
450
  f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
253
451
  )
254
452
 
453
+ @overload
454
+ def extract_table(
455
+ self,
456
+ method: Optional[str] = None,
457
+ table_settings: Optional[dict] = None,
458
+ use_ocr: bool = False,
459
+ ocr_config: Optional[dict] = None,
460
+ text_options: Optional[dict] = None,
461
+ cell_extraction_func: Optional[Any] = None,
462
+ show_progress: bool = False,
463
+ content_filter: Optional[Any] = None,
464
+ stitch_rows: Callable[[List[Optional[str]]], bool] = None,
465
+ ) -> TableResult: ...
466
+
467
+ @overload
468
+ def extract_table(
469
+ self,
470
+ method: Optional[str] = None,
471
+ table_settings: Optional[dict] = None,
472
+ use_ocr: bool = False,
473
+ ocr_config: Optional[dict] = None,
474
+ text_options: Optional[dict] = None,
475
+ cell_extraction_func: Optional[Any] = None,
476
+ show_progress: bool = False,
477
+ content_filter: Optional[Any] = None,
478
+ stitch_rows: Callable[
479
+ [List[Optional[str]], List[Optional[str]], int, Union["Page", "PhysicalRegion"]],
480
+ bool,
481
+ ] = None,
482
+ ) -> TableResult: ...
483
+
484
+ def extract_table(
485
+ self,
486
+ method: Optional[str] = None,
487
+ table_settings: Optional[dict] = None,
488
+ use_ocr: bool = False,
489
+ ocr_config: Optional[dict] = None,
490
+ text_options: Optional[dict] = None,
491
+ cell_extraction_func: Optional[Any] = None,
492
+ show_progress: bool = False,
493
+ content_filter: Optional[Any] = None,
494
+ stitch_rows: Optional[Callable] = None,
495
+ merge_headers: Optional[bool] = None,
496
+ ) -> TableResult:
497
+ """
498
+ Extract table data from all segments in the flow, combining results sequentially.
499
+
500
+ This method extracts table data from each segment in flow order and combines
501
+ the results into a single logical table. This is particularly useful for
502
+ multi-page tables or tables that span across columns.
503
+
504
+ Args:
505
+ method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
506
+ table_settings: Settings for pdfplumber table extraction.
507
+ use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
508
+ ocr_config: OCR configuration parameters.
509
+ text_options: Dictionary of options for the 'text' method.
510
+ cell_extraction_func: Optional callable function that takes a cell Region object
511
+ and returns its string content. For 'text' method only.
512
+ show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
513
+ content_filter: Optional content filter to apply during cell text extraction.
514
+ merge_headers: Whether to merge tables by removing repeated headers from subsequent
515
+ segments. If None (default), auto-detects by checking if the first row
516
+ of each segment matches the first row of the first segment. If segments have
517
+ inconsistent header patterns (some repeat, others don't), raises ValueError.
518
+ Useful for multi-page tables where headers repeat on each page.
519
+ stitch_rows: Optional callable to determine when rows should be merged across
520
+ segment boundaries. Applied AFTER header removal if merge_headers
521
+ is enabled. Two overloaded signatures are supported:
522
+
523
+ • func(current_row) -> bool
524
+ Called only on the first row of each segment (after the first).
525
+ Return True to merge this first row with the last row from
526
+ the previous segment.
527
+
528
+ • func(prev_row, current_row, row_index, segment) -> bool
529
+ Called for every row. Return True to merge current_row with
530
+ the previous row in the aggregated results.
531
+
532
+ When True is returned, rows are concatenated cell-by-cell.
533
+ This is useful for handling table rows split across page
534
+ boundaries or segments. If None, rows are never merged.
535
+
536
+ Returns:
537
+ TableResult object containing the aggregated table data from all segments.
538
+
539
+ Example:
540
+ Multi-page table extraction:
541
+ ```python
542
+ pdf = npdf.PDF("multi_page_table.pdf")
543
+
544
+ # Create flow for table spanning pages 2-4
545
+ table_flow = Flow(
546
+ segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
547
+ arrangement='vertical'
548
+ )
549
+
550
+ # Extract table as if it were continuous
551
+ table_data = table_flow.extract_table()
552
+ df = table_data.df # Convert to pandas DataFrame
553
+
554
+ # Custom row stitching - single parameter (simple case)
555
+ table_data = table_flow.extract_table(
556
+ stitch_rows=lambda row: row and not (row[0] or "").strip()
557
+ )
558
+
559
+ # Custom row stitching - full parameters (advanced case)
560
+ table_data = table_flow.extract_table(
561
+ stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
562
+ )
563
+ ```
564
+ """
565
+ logger.info(
566
+ f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})"
567
+ )
568
+
569
+ if not self.segments:
570
+ logger.warning("Flow has no segments, returning empty table")
571
+ return TableResult([])
572
+
573
+ # Resolve predicate and determine its signature
574
+ predicate: Optional[Callable] = None
575
+ predicate_type: str = "none"
576
+
577
+ if callable(stitch_rows):
578
+ import inspect
579
+
580
+ sig = inspect.signature(stitch_rows)
581
+ param_count = len(sig.parameters)
582
+
583
+ if param_count == 1:
584
+ predicate = stitch_rows
585
+ predicate_type = "single_param"
586
+ elif param_count == 4:
587
+ predicate = stitch_rows
588
+ predicate_type = "full_params"
589
+ else:
590
+ logger.warning(
591
+ f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring."
592
+ )
593
+ predicate = None
594
+ predicate_type = "none"
595
+
596
+ def _default_merge(
597
+ prev_row: List[Optional[str]], cur_row: List[Optional[str]]
598
+ ) -> List[Optional[str]]:
599
+ from itertools import zip_longest
600
+
601
+ merged: List[Optional[str]] = []
602
+ for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
603
+ if (p or "").strip() and (c or "").strip():
604
+ merged.append(f"{p} {c}".strip())
605
+ else:
606
+ merged.append((p or "") + (c or ""))
607
+ return merged
608
+
609
+ aggregated_rows: List[List[Optional[str]]] = []
610
+ processed_segments = 0
611
+ header_row: Optional[List[Optional[str]]] = None
612
+ merge_headers_enabled = False
613
+ headers_warned = False # Track if we've already warned about dropping headers
614
+ segment_has_repeated_header = [] # Track which segments have repeated headers
615
+
616
+ for seg_idx, segment in enumerate(self.segments):
617
+ try:
618
+ logger.debug(f" Extracting table from segment {seg_idx+1}/{len(self.segments)}")
619
+
620
+ segment_result = segment.extract_table(
621
+ method=method,
622
+ table_settings=table_settings.copy() if table_settings else None,
623
+ use_ocr=use_ocr,
624
+ ocr_config=ocr_config,
625
+ text_options=text_options.copy() if text_options else None,
626
+ cell_extraction_func=cell_extraction_func,
627
+ show_progress=show_progress,
628
+ content_filter=content_filter,
629
+ )
630
+
631
+ if not segment_result:
632
+ continue
633
+
634
+ if hasattr(segment_result, "_rows"):
635
+ segment_rows = list(segment_result._rows)
636
+ else:
637
+ segment_rows = list(segment_result)
638
+
639
+ if not segment_rows:
640
+ logger.debug(f" No table data found in segment {seg_idx+1}")
641
+ continue
642
+
643
+ # Handle header detection and merging for multi-page tables
644
+ if seg_idx == 0:
645
+ # First segment: capture potential header row
646
+ if segment_rows:
647
+ header_row = segment_rows[0]
648
+ # Determine if we should merge headers
649
+ if merge_headers is None:
650
+ # Auto-detect: we'll check all subsequent segments
651
+ merge_headers_enabled = False # Will be determined later
652
+ else:
653
+ merge_headers_enabled = merge_headers
654
+ # Track that first segment exists (for consistency checking)
655
+ segment_has_repeated_header.append(False) # First segment doesn't "repeat"
656
+ elif seg_idx == 1 and merge_headers is None:
657
+ # Auto-detection: check if first row of second segment matches header
658
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
659
+ segment_has_repeated_header.append(has_header)
660
+
661
+ if has_header:
662
+ merge_headers_enabled = True
663
+ # Remove the detected repeated header from this segment
664
+ segment_rows = segment_rows[1:]
665
+ logger.debug(
666
+ f" Auto-detected repeated header in segment {seg_idx+1}, removed"
667
+ )
668
+ if not headers_warned:
669
+ warnings.warn(
670
+ "Detected repeated headers in multi-page table. Merging by removing "
671
+ "repeated headers from subsequent pages.",
672
+ UserWarning,
673
+ stacklevel=2,
674
+ )
675
+ headers_warned = True
676
+ else:
677
+ merge_headers_enabled = False
678
+ logger.debug(f" No repeated header detected in segment {seg_idx+1}")
679
+ elif seg_idx > 1:
680
+ # Check consistency: all segments should have same pattern
681
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
682
+ segment_has_repeated_header.append(has_header)
683
+
684
+ # Remove header if merging is enabled and header is present
685
+ if merge_headers_enabled and has_header:
686
+ segment_rows = segment_rows[1:]
687
+ logger.debug(f" Removed repeated header from segment {seg_idx+1}")
688
+ elif seg_idx > 0 and merge_headers_enabled:
689
+ # Explicit merge_headers=True: remove headers from subsequent segments
690
+ if segment_rows and header_row and segment_rows[0] == header_row:
691
+ segment_rows = segment_rows[1:]
692
+ logger.debug(f" Removed repeated header from segment {seg_idx+1}")
693
+ if not headers_warned:
694
+ warnings.warn(
695
+ "Removing repeated headers from multi-page table during merge.",
696
+ UserWarning,
697
+ stacklevel=2,
698
+ )
699
+ headers_warned = True
700
+
701
+ for row_idx, row in enumerate(segment_rows):
702
+ should_merge = False
703
+
704
+ if predicate is not None and aggregated_rows:
705
+ if predicate_type == "single_param":
706
+ # For single param: only call on first row of segment (row_idx == 0)
707
+ # and pass the current row
708
+ if row_idx == 0:
709
+ should_merge = predicate(row)
710
+ elif predicate_type == "full_params":
711
+ # For full params: call with all arguments
712
+ should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)
713
+
714
+ if should_merge:
715
+ aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
716
+ else:
717
+ aggregated_rows.append(row)
718
+
719
+ processed_segments += 1
720
+ logger.debug(
721
+ f" Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}"
722
+ )
723
+
724
+ except Exception as e:
725
+ logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)
726
+ continue
727
+
728
+ # Check for inconsistent header patterns after processing all segments
729
+ if merge_headers is None and len(segment_has_repeated_header) > 2:
730
+ # During auto-detection, check for consistency across all segments
731
+ expected_pattern = segment_has_repeated_header[1] # Pattern from second segment
732
+ for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
733
+ if has_header != expected_pattern:
734
+ # Inconsistent pattern detected
735
+ segments_with_headers = [
736
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
737
+ ]
738
+ segments_without_headers = [
739
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
740
+ ]
741
+ raise ValueError(
742
+ f"Inconsistent header pattern in multi-page table: "
743
+ f"segments {segments_with_headers} have repeated headers, "
744
+ f"but segments {segments_without_headers} do not. "
745
+ f"All segments must have the same header pattern for reliable merging."
746
+ )
747
+
748
+ logger.info(
749
+ f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
750
+ )
751
+ return TableResult(aggregated_rows)
752
+
753
+ def analyze_layout(
754
+ self,
755
+ engine: Optional[str] = None,
756
+ options: Optional[Any] = None,
757
+ confidence: Optional[float] = None,
758
+ classes: Optional[List[str]] = None,
759
+ exclude_classes: Optional[List[str]] = None,
760
+ device: Optional[str] = None,
761
+ existing: str = "replace",
762
+ model_name: Optional[str] = None,
763
+ client: Optional[Any] = None,
764
+ ) -> "PhysicalElementCollection":
765
+ """
766
+ Analyze layout across all segments in the flow.
767
+
768
+ This method efficiently groups segments by their parent pages, runs layout analysis
769
+ only once per unique page, then filters results appropriately for each segment.
770
+ This avoids redundant analysis when multiple flow segments come from the same page.
771
+
772
+ Args:
773
+ engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
774
+ options: Specific LayoutOptions object for advanced configuration.
775
+ confidence: Minimum confidence threshold.
776
+ classes: Specific classes to detect.
777
+ exclude_classes: Classes to exclude.
778
+ device: Device for inference.
779
+ existing: How to handle existing detected regions: 'replace' (default) or 'append'.
780
+ model_name: Optional model name for the engine.
781
+ client: Optional client for API-based engines.
782
+
783
+ Returns:
784
+ ElementCollection containing all detected Region objects from all segments.
785
+
786
+ Example:
787
+ Multi-page layout analysis:
788
+ ```python
789
+ pdf = npdf.PDF("document.pdf")
790
+
791
+ # Create flow for first 3 pages
792
+ page_flow = Flow(
793
+ segments=pdf.pages[:3],
794
+ arrangement='vertical'
795
+ )
796
+
797
+ # Analyze layout across all pages (efficiently)
798
+ all_regions = page_flow.analyze_layout(engine='yolo')
799
+
800
+ # Find all tables across the flow
801
+ tables = all_regions.filter('region[type=table]')
802
+ ```
803
+ """
804
+ from natural_pdf.elements.element_collection import ElementCollection
805
+
806
+ logger.info(
807
+ f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})"
808
+ )
809
+
810
+ if not self.segments:
811
+ logger.warning("Flow has no segments, returning empty collection")
812
+ return ElementCollection([])
813
+
814
+ # Step 1: Group segments by their parent pages to avoid redundant analysis
815
+ segments_by_page = {} # Dict[Page, List[Segment]]
816
+
817
+ for i, segment in enumerate(self.segments):
818
+ # Determine the page for this segment
819
+ if hasattr(segment, "analyze_layout"):
820
+ # It's a Page object
821
+ page_obj = segment
822
+ segment_type = "page"
823
+ elif hasattr(segment, "page") and hasattr(segment.page, "analyze_layout"):
824
+ # It's a Region object
825
+ page_obj = segment.page
826
+ segment_type = "region"
827
+ else:
828
+ logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
829
+ continue
830
+
831
+ if page_obj not in segments_by_page:
832
+ segments_by_page[page_obj] = []
833
+ segments_by_page[page_obj].append((segment, segment_type))
834
+
835
+ if not segments_by_page:
836
+ logger.warning("No segments with analyzable pages found")
837
+ return ElementCollection([])
838
+
839
+ logger.debug(
840
+ f" Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages"
841
+ )
842
+
843
+ # Step 2: Analyze each unique page only once
844
+ all_detected_regions: List["PhysicalRegion"] = []
845
+ processed_pages = 0
846
+
847
+ for page_obj, page_segments in segments_by_page.items():
848
+ try:
849
+ logger.debug(
850
+ f" Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments"
851
+ )
852
+
853
+ # Run layout analysis once for this page
854
+ page_results = page_obj.analyze_layout(
855
+ engine=engine,
856
+ options=options,
857
+ confidence=confidence,
858
+ classes=classes,
859
+ exclude_classes=exclude_classes,
860
+ device=device,
861
+ existing=existing,
862
+ model_name=model_name,
863
+ client=client,
864
+ )
865
+
866
+ # Extract regions from results
867
+ if hasattr(page_results, "elements"):
868
+ # It's an ElementCollection
869
+ page_regions = page_results.elements
870
+ elif isinstance(page_results, list):
871
+ # It's a list of regions
872
+ page_regions = page_results
873
+ else:
874
+ logger.warning(
875
+ f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}"
876
+ )
877
+ continue
878
+
879
+ if not page_regions:
880
+ logger.debug(
881
+ f" No layout regions found on page {getattr(page_obj, 'number', '?')}"
882
+ )
883
+ continue
884
+
885
+ # Step 3: For each segment on this page, collect relevant regions
886
+ segments_processed_on_page = 0
887
+ for segment, segment_type in page_segments:
888
+ if segment_type == "page":
889
+ # Full page segment: include all detected regions
890
+ all_detected_regions.extend(page_regions)
891
+ segments_processed_on_page += 1
892
+ logger.debug(f" Added {len(page_regions)} regions for full-page segment")
893
+
894
+ elif segment_type == "region":
895
+ # Region segment: filter to only intersecting regions
896
+ intersecting_regions = []
897
+ for region in page_regions:
898
+ try:
899
+ if segment.intersects(region):
900
+ intersecting_regions.append(region)
901
+ except Exception as intersect_error:
902
+ logger.debug(
903
+ f"Error checking intersection for region: {intersect_error}"
904
+ )
905
+ # Include the region anyway if intersection check fails
906
+ intersecting_regions.append(region)
907
+
908
+ all_detected_regions.extend(intersecting_regions)
909
+ segments_processed_on_page += 1
910
+ logger.debug(
911
+ f" Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}"
912
+ )
913
+
914
+ processed_pages += 1
915
+ logger.debug(
916
+ f" Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}"
917
+ )
918
+
919
+ except Exception as e:
920
+ logger.error(
921
+ f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}",
922
+ exc_info=True,
923
+ )
924
+ continue
925
+
926
+ # Step 4: Remove duplicates (can happen if multiple segments intersect the same region)
927
+ unique_regions = []
928
+ seen_region_ids = set()
929
+
930
+ for region in all_detected_regions:
931
+ # Create a unique identifier for this region (page + bbox)
932
+ region_id = (
933
+ getattr(region.page, "index", id(region.page)),
934
+ region.bbox if hasattr(region, "bbox") else id(region),
935
+ )
936
+
937
+ if region_id not in seen_region_ids:
938
+ unique_regions.append(region)
939
+ seen_region_ids.add(region_id)
940
+
941
+ dedupe_removed = len(all_detected_regions) - len(unique_regions)
942
+ if dedupe_removed > 0:
943
+ logger.debug(f" Removed {dedupe_removed} duplicate regions")
944
+
945
+ logger.info(
946
+ f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages"
947
+ )
948
+ return ElementCollection(unique_regions)
949
+
950
+ def _get_render_specs(
951
+ self,
952
+ mode: Literal["show", "render"] = "show",
953
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
954
+ highlights: Optional[List[Dict[str, Any]]] = None,
955
+ crop: Union[bool, Literal["content"]] = False,
956
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
957
+ label_prefix: Optional[str] = "FlowSegment",
958
+ **kwargs,
959
+ ) -> List[RenderSpec]:
960
+ """Get render specifications for this flow.
961
+
962
+ Args:
963
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
964
+ color: Color for highlighting segments in show mode
965
+ highlights: Additional highlight groups to show
966
+ crop: Whether to crop to segments
967
+ crop_bbox: Explicit crop bounds
968
+ label_prefix: Prefix for segment labels
969
+ **kwargs: Additional parameters
970
+
971
+ Returns:
972
+ List of RenderSpec objects, one per page with segments
973
+ """
974
+ if not self.segments:
975
+ return []
976
+
977
+ # Group segments by their physical pages
978
+ segments_by_page = {} # Dict[Page, List[PhysicalRegion]]
979
+
980
+ for i, segment in enumerate(self.segments):
981
+ # Get the page for this segment
982
+ if hasattr(segment, "page") and segment.page is not None:
983
+ # It's a Region, use its page
984
+ page_obj = segment.page
985
+ if page_obj not in segments_by_page:
986
+ segments_by_page[page_obj] = []
987
+ segments_by_page[page_obj].append(segment)
988
+ elif (
989
+ hasattr(segment, "index")
990
+ and hasattr(segment, "width")
991
+ and hasattr(segment, "height")
992
+ ):
993
+ # It's a full Page object, create a full-page region for it
994
+ page_obj = segment
995
+ full_page_region = segment.region(0, 0, segment.width, segment.height)
996
+ if page_obj not in segments_by_page:
997
+ segments_by_page[page_obj] = []
998
+ segments_by_page[page_obj].append(full_page_region)
999
+ else:
1000
+ logger.warning(f"Segment {i+1} has no identifiable page, skipping")
1001
+ continue
1002
+
1003
+ if not segments_by_page:
1004
+ return []
1005
+
1006
+ # Create RenderSpec for each page
1007
+ specs = []
1008
+
1009
+ # Sort pages by index for consistent output order
1010
+ sorted_pages = sorted(
1011
+ segments_by_page.keys(),
1012
+ key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
1013
+ )
1014
+
1015
+ for page_idx, page_obj in enumerate(sorted_pages):
1016
+ segments_on_this_page = segments_by_page[page_obj]
1017
+ if not segments_on_this_page:
1018
+ continue
1019
+
1020
+ spec = RenderSpec(page=page_obj)
1021
+
1022
+ # Handle cropping
1023
+ if crop_bbox:
1024
+ spec.crop_bbox = crop_bbox
1025
+ elif crop == "content" or crop is True:
1026
+ # Calculate bounds of segments on this page
1027
+ x_coords = []
1028
+ y_coords = []
1029
+ for segment in segments_on_this_page:
1030
+ if hasattr(segment, "bbox") and segment.bbox:
1031
+ x0, y0, x1, y1 = segment.bbox
1032
+ x_coords.extend([x0, x1])
1033
+ y_coords.extend([y0, y1])
1034
+
1035
+ if x_coords and y_coords:
1036
+ spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
1037
+
1038
+ # Add highlights in show mode
1039
+ if mode == "show":
1040
+ # Highlight segments
1041
+ for i, segment in enumerate(segments_on_this_page):
1042
+ segment_label = None
1043
+ if label_prefix:
1044
+ # Create label for this segment
1045
+ global_segment_idx = None
1046
+ try:
1047
+ # Find the global index of this segment in the original flow
1048
+ global_segment_idx = self.segments.index(segment)
1049
+ except ValueError:
1050
+ # If it's a generated full-page region, find its source page
1051
+ for idx, orig_segment in enumerate(self.segments):
1052
+ if (
1053
+ hasattr(orig_segment, "index")
1054
+ and hasattr(segment, "page")
1055
+ and orig_segment.index == segment.page.index
1056
+ ):
1057
+ global_segment_idx = idx
1058
+ break
1059
+
1060
+ if global_segment_idx is not None:
1061
+ segment_label = f"{label_prefix}_{global_segment_idx + 1}"
1062
+ else:
1063
+ segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
1064
+
1065
+ spec.add_highlight(
1066
+ bbox=segment.bbox,
1067
+ polygon=segment.polygon if segment.has_polygon else None,
1068
+ color=color or "blue",
1069
+ label=segment_label,
1070
+ )
1071
+
1072
+ # Add additional highlight groups if provided
1073
+ if highlights:
1074
+ for group in highlights:
1075
+ group_elements = group.get("elements", [])
1076
+ group_color = group.get("color", color)
1077
+ group_label = group.get("label")
1078
+
1079
+ for elem in group_elements:
1080
+ # Only add if element is on this page
1081
+ if hasattr(elem, "page") and elem.page == page_obj:
1082
+ spec.add_highlight(
1083
+ element=elem, color=group_color, label=group_label
1084
+ )
1085
+
1086
+ specs.append(spec)
1087
+
1088
+ return specs
1089
+
1090
+ def _show_in_context(
1091
+ self,
1092
+ resolution: float,
1093
+ width: Optional[int] = None,
1094
+ stack_direction: str = "vertical",
1095
+ stack_gap: int = 5,
1096
+ stack_background_color: Tuple[int, int, int] = (255, 255, 255),
1097
+ separator_color: Tuple[int, int, int] = (255, 0, 0),
1098
+ separator_thickness: int = 2,
1099
+ **kwargs,
1100
+ ) -> Optional["PIL_Image"]:
1101
+ """
1102
+ Show segments as cropped images stacked together with separators between segments.
1103
+
1104
+ Args:
1105
+ resolution: Resolution in DPI for rendering segment images
1106
+ width: Optional width for segment images
1107
+ stack_direction: Direction to stack segments ('vertical' or 'horizontal')
1108
+ stack_gap: Gap in pixels between segments
1109
+ stack_background_color: RGB background color for the final image
1110
+ separator_color: RGB color for separator lines between segments
1111
+ separator_thickness: Thickness in pixels of separator lines
1112
+ **kwargs: Additional arguments passed to segment rendering
1113
+
1114
+ Returns:
1115
+ PIL Image with all segments stacked together
1116
+ """
1117
+ from PIL import Image, ImageDraw
1118
+
1119
+ segment_images = []
1120
+ segment_pages = []
1121
+
1122
+ # Determine stacking direction
1123
+ final_stack_direction = stack_direction
1124
+ if stack_direction == "auto":
1125
+ final_stack_direction = self.arrangement
1126
+
1127
+ # Get cropped images for each segment
1128
+ for i, segment in enumerate(self.segments):
1129
+ # Get the page reference for this segment
1130
+ if hasattr(segment, "page") and segment.page is not None:
1131
+ segment_page = segment.page
1132
+ # Get cropped image of the segment
1133
+ # Use render() for clean image without highlights
1134
+ segment_image = segment.render(
1135
+ resolution=resolution,
1136
+ crop=True,
1137
+ width=width,
1138
+ **kwargs,
1139
+ )
1140
+
1141
+ elif (
1142
+ hasattr(segment, "index")
1143
+ and hasattr(segment, "width")
1144
+ and hasattr(segment, "height")
1145
+ ):
1146
+ # It's a full Page object
1147
+ segment_page = segment
1148
+ # Use render() for clean image without highlights
1149
+ segment_image = segment.render(resolution=resolution, width=width, **kwargs)
1150
+ else:
1151
+ raise ValueError(
1152
+ f"Segment {i+1} has no identifiable page. Segment type: {type(segment)}, attributes: {dir(segment)}"
1153
+ )
1154
+
1155
+ if segment_image is not None:
1156
+ segment_images.append(segment_image)
1157
+ segment_pages.append(segment_page)
1158
+ else:
1159
+ logger.warning(f"Segment {i+1} render() returned None, skipping")
1160
+
1161
+ # Check if we have any valid images
1162
+ if not segment_images:
1163
+ logger.error("No valid segment images could be rendered")
1164
+ return None
1165
+
1166
+ # We should have at least one segment image by now (or an exception would have been raised)
1167
+ if len(segment_images) == 1:
1168
+ return segment_images[0]
1169
+
1170
+ # Calculate dimensions for the final stacked image
1171
+ if final_stack_direction == "vertical":
1172
+ # Stack vertically
1173
+ final_width = max(img.width for img in segment_images)
1174
+
1175
+ # Calculate total height including gaps and separators
1176
+ total_height = sum(img.height for img in segment_images)
1177
+ total_height += (len(segment_images) - 1) * stack_gap
1178
+
1179
+ # Add separator thickness between all segments
1180
+ num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
1181
+ total_height += num_separators * separator_thickness
1182
+
1183
+ # Create the final image
1184
+ final_image = Image.new("RGB", (final_width, total_height), stack_background_color)
1185
+ draw = ImageDraw.Draw(final_image)
1186
+
1187
+ current_y = 0
1188
+
1189
+ for i, img in enumerate(segment_images):
1190
+ # Add separator line before each segment (except the first one)
1191
+ if i > 0:
1192
+ # Draw separator line
1193
+ draw.rectangle(
1194
+ [(0, current_y), (final_width, current_y + separator_thickness)],
1195
+ fill=separator_color,
1196
+ )
1197
+ current_y += separator_thickness
1198
+
1199
+ # Paste the segment image
1200
+ paste_x = (final_width - img.width) // 2 # Center horizontally
1201
+ final_image.paste(img, (paste_x, current_y))
1202
+ current_y += img.height
1203
+
1204
+ # Add gap after segment (except for the last one)
1205
+ if i < len(segment_images) - 1:
1206
+ current_y += stack_gap
1207
+
1208
+ return final_image
1209
+
1210
+ elif final_stack_direction == "horizontal":
1211
+ # Stack horizontally
1212
+ final_height = max(img.height for img in segment_images)
1213
+
1214
+ # Calculate total width including gaps and separators
1215
+ total_width = sum(img.width for img in segment_images)
1216
+ total_width += (len(segment_images) - 1) * stack_gap
1217
+
1218
+ # Add separator thickness between all segments
1219
+ num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
1220
+ total_width += num_separators * separator_thickness
1221
+
1222
+ # Create the final image
1223
+ final_image = Image.new("RGB", (total_width, final_height), stack_background_color)
1224
+ draw = ImageDraw.Draw(final_image)
1225
+
1226
+ current_x = 0
1227
+
1228
+ for i, img in enumerate(segment_images):
1229
+ # Add separator line before each segment (except the first one)
1230
+ if i > 0:
1231
+ # Draw separator line
1232
+ draw.rectangle(
1233
+ [(current_x, 0), (current_x + separator_thickness, final_height)],
1234
+ fill=separator_color,
1235
+ )
1236
+ current_x += separator_thickness
1237
+
1238
+ # Paste the segment image
1239
+ paste_y = (final_height - img.height) // 2 # Center vertically
1240
+ final_image.paste(img, (current_x, paste_y))
1241
+ current_x += img.width
1242
+
1243
+ # Add gap after segment (except for the last one)
1244
+ if i < len(segment_images) - 1:
1245
+ current_x += stack_gap
1246
+
1247
+ return final_image
1248
+
1249
+ else:
1250
+ raise ValueError(
1251
+ f"Invalid stack_direction '{final_stack_direction}' for in_context. Must be 'vertical' or 'horizontal'."
1252
+ )
1253
+
255
1254
  # --- Helper methods for coordinate transformations and segment iteration ---
256
1255
  # These will be crucial for FlowElement's directional methods.
257
1256
 
@@ -290,3 +1289,643 @@ class Flow:
290
1289
  raise NotImplementedError(
291
1290
  "Translating element coordinates to a unified flow coordinate system is not yet implemented."
292
1291
  )
1292
+
1293
+ def get_sections(
1294
+ self,
1295
+ start_elements=None,
1296
+ end_elements=None,
1297
+ new_section_on_page_break: bool = False,
1298
+ include_boundaries: str = "both",
1299
+ ) -> "ElementCollection":
1300
+ """
1301
+ Extract logical sections from the Flow based on *start* and *end* boundary
1302
+ elements, mirroring the behaviour of PDF/PageCollection.get_sections().
1303
+
1304
+ This implementation is a thin wrapper that converts the Flow into a
1305
+ temporary PageCollection (constructed from the unique pages that the
1306
+ Flow spans) and then delegates the heavy‐lifting to that existing
1307
+ implementation. Any FlowElement / FlowElementCollection inputs are
1308
+ automatically unwrapped to their underlying physical elements so that
1309
+ PageCollection can work with them directly.
1310
+
1311
+ Args:
1312
+ start_elements: Elements or selector string that mark the start of
1313
+ sections (optional).
1314
+ end_elements: Elements or selector string that mark the end of
1315
+ sections (optional).
1316
+ new_section_on_page_break: Whether to start a new section at page
1317
+ boundaries (default: False).
1318
+ include_boundaries: How to include boundary elements: 'start',
1319
+ 'end', 'both', or 'none' (default: 'both').
1320
+
1321
+ Returns:
1322
+ ElementCollection of Region/FlowRegion objects representing the
1323
+ extracted sections.
1324
+ """
1325
+ # ------------------------------------------------------------------
1326
+ # Unwrap FlowElement(-Collection) inputs and selector strings so we
1327
+ # can reason about them generically.
1328
+ # ------------------------------------------------------------------
1329
+ from natural_pdf.flows.collections import FlowElementCollection
1330
+ from natural_pdf.flows.element import FlowElement
1331
+
1332
+ def _unwrap(obj):
1333
+ """Convert Flow-specific wrappers to their underlying physical objects.
1334
+
1335
+ Keeps selector strings as-is; converts FlowElement to its physical
1336
+ element; converts FlowElementCollection to list of physical
1337
+ elements; passes through ElementCollection by taking .elements.
1338
+ """
1339
+
1340
+ if obj is None or isinstance(obj, str):
1341
+ return obj
1342
+
1343
+ if isinstance(obj, FlowElement):
1344
+ return obj.physical_object
1345
+
1346
+ if isinstance(obj, FlowElementCollection):
1347
+ return [fe.physical_object for fe in obj.flow_elements]
1348
+
1349
+ if hasattr(obj, "elements"):
1350
+ return obj.elements
1351
+
1352
+ if isinstance(obj, (list, tuple, set)):
1353
+ out = []
1354
+ for item in obj:
1355
+ if isinstance(item, FlowElement):
1356
+ out.append(item.physical_object)
1357
+ else:
1358
+ out.append(item)
1359
+ return out
1360
+
1361
+ return obj # Fallback – unknown type
1362
+
1363
+ start_elements_unwrapped = _unwrap(start_elements)
1364
+ end_elements_unwrapped = _unwrap(end_elements)
1365
+
1366
+ # ------------------------------------------------------------------
1367
+ # PRIMARY IMPLEMENTATION – operate on each Flow **segment region**
1368
+ # independently so that sectioning happens *per-region*, not per page.
1369
+ # ------------------------------------------------------------------
1370
+ from natural_pdf.elements.element_collection import ElementCollection
1371
+
1372
+ aggregated_sections = []
1373
+
1374
+ # Helper to decide if an element lies inside a segment (Region)
1375
+ def _element_in_segment(elem, segment_region):
1376
+ try:
1377
+ return segment_region.intersects(elem) # Region method – robust
1378
+ except Exception:
1379
+ # Fallback to bounding-box containment checks
1380
+ if not hasattr(elem, "bbox"):
1381
+ return False
1382
+ ex0, etop, ex1, ebottom = elem.bbox
1383
+ sx0, stop, sx1, sbottom = segment_region.bbox
1384
+ return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
1385
+
1386
+ for seg in self.segments:
1387
+ # Each *seg* is guaranteed to be a Region (see _normalize_segments)
1388
+
1389
+ # Resolve segment-specific boundary arguments
1390
+ seg_start_elems = None
1391
+ seg_end_elems = None
1392
+
1393
+ # --- Handle selector strings ---
1394
+ if isinstance(start_elements_unwrapped, str):
1395
+ seg_start_elems = seg.find_all(start_elements_unwrapped).elements
1396
+ elif start_elements_unwrapped is not None:
1397
+ seg_start_elems = [
1398
+ e for e in start_elements_unwrapped if _element_in_segment(e, seg)
1399
+ ]
1400
+
1401
+ if isinstance(end_elements_unwrapped, str):
1402
+ seg_end_elems = seg.find_all(end_elements_unwrapped).elements
1403
+ elif end_elements_unwrapped is not None:
1404
+ seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1405
+
1406
+ # Call Region.get_sections – this returns ElementCollection[Region]
1407
+ seg_sections = seg.get_sections(
1408
+ start_elements=seg_start_elems,
1409
+ end_elements=seg_end_elems,
1410
+ include_boundaries=include_boundaries,
1411
+ )
1412
+
1413
+ if seg_sections:
1414
+ aggregated_sections.extend(seg_sections.elements)
1415
+
1416
+ # Optionally, handle new_section_on_page_break – interpreted here as
1417
+ # *new_section_on_segment_break*: if True and there were *no* explicit
1418
+ # boundaries, treat the entire segment as a single section.
1419
+ if (
1420
+ new_section_on_page_break
1421
+ and not seg_sections
1422
+ and start_elements_unwrapped is None
1423
+ and end_elements_unwrapped is None
1424
+ ):
1425
+ aggregated_sections.append(seg)
1426
+
1427
+ # ------------------------------------------------------------------
1428
+ # CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
1429
+ # span multiple segments and create FlowRegions for those cases.
1430
+ # ------------------------------------------------------------------
1431
+
1432
+ # If we have explicit start/end elements, check for cross-segment sections
1433
+ if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
1434
+ # Find all start and end elements across all segments
1435
+ all_start_elements = []
1436
+ all_end_elements = []
1437
+
1438
+ # Map elements to their segments for tracking
1439
+ element_to_segment = {}
1440
+
1441
+ for seg_idx, seg in enumerate(self.segments):
1442
+ if isinstance(start_elements_unwrapped, str):
1443
+ seg_starts = seg.find_all(start_elements_unwrapped).elements
1444
+ else:
1445
+ seg_starts = [
1446
+ e for e in start_elements_unwrapped if _element_in_segment(e, seg)
1447
+ ]
1448
+
1449
+ if isinstance(end_elements_unwrapped, str):
1450
+ seg_ends = seg.find_all(end_elements_unwrapped).elements
1451
+ else:
1452
+ seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1453
+
1454
+ for elem in seg_starts:
1455
+ all_start_elements.append((elem, seg_idx))
1456
+ element_to_segment[id(elem)] = seg_idx
1457
+
1458
+ for elem in seg_ends:
1459
+ all_end_elements.append((elem, seg_idx))
1460
+ element_to_segment[id(elem)] = seg_idx
1461
+
1462
+ # Sort by segment index, then by position within segment
1463
+ all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1464
+ all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1465
+
1466
+ # Look for cross-segment pairs (start in one segment, end in another)
1467
+ cross_segment_sections = []
1468
+ used_starts = set()
1469
+ used_ends = set()
1470
+
1471
+ for start_elem, start_seg_idx in all_start_elements:
1472
+ if id(start_elem) in used_starts:
1473
+ continue
1474
+
1475
+ # Find the next end element that comes after this start
1476
+ matching_end = None
1477
+ for end_elem, end_seg_idx in all_end_elements:
1478
+ if id(end_elem) in used_ends:
1479
+ continue
1480
+
1481
+ # Check if this end comes after the start (by segment order or position)
1482
+ if end_seg_idx > start_seg_idx or (
1483
+ end_seg_idx == start_seg_idx
1484
+ and (
1485
+ end_elem.top > start_elem.top
1486
+ or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
1487
+ )
1488
+ ):
1489
+ matching_end = (end_elem, end_seg_idx)
1490
+ break
1491
+
1492
+ if matching_end is not None:
1493
+ end_elem, end_seg_idx = matching_end
1494
+
1495
+ # If start and end are in different segments, create FlowRegion
1496
+ if start_seg_idx != end_seg_idx:
1497
+ cross_segment_sections.append(
1498
+ (start_elem, start_seg_idx, end_elem, end_seg_idx)
1499
+ )
1500
+ used_starts.add(id(start_elem))
1501
+ used_ends.add(id(end_elem))
1502
+
1503
+ # Create FlowRegions for cross-segment sections
1504
+ from natural_pdf.elements.region import Region
1505
+ from natural_pdf.flows.element import FlowElement
1506
+ from natural_pdf.flows.region import FlowRegion
1507
+
1508
+ for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
1509
+ # Build constituent regions spanning from start segment to end segment
1510
+ constituent_regions = []
1511
+
1512
+ # First segment: from start element to bottom
1513
+ start_seg = self.segments[start_seg_idx]
1514
+ first_region = Region(
1515
+ start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
1516
+ )
1517
+ constituent_regions.append(first_region)
1518
+
1519
+ # Middle segments: full segments
1520
+ for seg_idx in range(start_seg_idx + 1, end_seg_idx):
1521
+ constituent_regions.append(self.segments[seg_idx])
1522
+
1523
+ # Last segment: from top to end element
1524
+ if end_seg_idx != start_seg_idx:
1525
+ end_seg = self.segments[end_seg_idx]
1526
+ last_region = Region(
1527
+ end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
1528
+ )
1529
+ constituent_regions.append(last_region)
1530
+
1531
+ # Create FlowRegion
1532
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1533
+ flow_region = FlowRegion(
1534
+ flow=self,
1535
+ constituent_regions=constituent_regions,
1536
+ source_flow_element=flow_element,
1537
+ boundary_element_found=end_elem,
1538
+ )
1539
+
1540
+ # Remove any single-segment sections that are now covered by this FlowRegion
1541
+ # This prevents duplication of content
1542
+ aggregated_sections = [
1543
+ s
1544
+ for s in aggregated_sections
1545
+ if not any(
1546
+ cr.intersects(s)
1547
+ for cr in constituent_regions
1548
+ if hasattr(cr, "intersects") and hasattr(s, "intersects")
1549
+ )
1550
+ ]
1551
+
1552
+ aggregated_sections.append(flow_region)
1553
+
1554
+ # ------------------------------------------------------------------
1555
+ # NEW APPROACH: First collect ALL boundary elements across all segments,
1556
+ # then pair them up to create sections (either single-segment Regions
1557
+ # or multi-segment FlowRegions).
1558
+ # ------------------------------------------------------------------
1559
+ from natural_pdf.elements.element_collection import ElementCollection
1560
+ from natural_pdf.elements.region import Region
1561
+ from natural_pdf.flows.element import FlowElement
1562
+ from natural_pdf.flows.region import FlowRegion
1563
+
1564
+ # Helper to decide if an element lies inside a segment (Region)
1565
+ def _element_in_segment(elem, segment_region):
1566
+ try:
1567
+ return segment_region.intersects(elem) # Region method – robust
1568
+ except Exception:
1569
+ # Fallback to bounding-box containment checks
1570
+ if not hasattr(elem, "bbox"):
1571
+ return False
1572
+ ex0, etop, ex1, ebottom = elem.bbox
1573
+ sx0, stop, sx1, sbottom = segment_region.bbox
1574
+ return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
1575
+
1576
+ # Collect ALL boundary elements across all segments with their segment indices
1577
+ all_start_elements = []
1578
+ all_end_elements = []
1579
+
1580
+ for seg_idx, seg in enumerate(self.segments):
1581
+ # Find start elements in this segment
1582
+ if isinstance(start_elements_unwrapped, str):
1583
+ seg_starts = seg.find_all(start_elements_unwrapped).elements
1584
+ elif start_elements_unwrapped is not None:
1585
+ seg_starts = [e for e in start_elements_unwrapped if _element_in_segment(e, seg)]
1586
+ else:
1587
+ seg_starts = []
1588
+
1589
+ logger.debug(f"\n=== Processing segment {seg_idx} ===")
1590
+ logger.debug(f"Segment bbox: {seg.bbox}")
1591
+ logger.debug(
1592
+ f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
1593
+ )
1594
+
1595
+ logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
1596
+ for i, elem in enumerate(seg_starts):
1597
+ logger.debug(
1598
+ f" Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
1599
+ )
1600
+
1601
+ # Find end elements in this segment
1602
+ if isinstance(end_elements_unwrapped, str):
1603
+ seg_ends = seg.find_all(end_elements_unwrapped).elements
1604
+ elif end_elements_unwrapped is not None:
1605
+ seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1606
+ else:
1607
+ seg_ends = []
1608
+
1609
+ logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
1610
+ for i, elem in enumerate(seg_ends):
1611
+ logger.debug(
1612
+ f" End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
1613
+ )
1614
+
1615
+ # Add to global lists with segment index
1616
+ for elem in seg_starts:
1617
+ all_start_elements.append((elem, seg_idx))
1618
+ for elem in seg_ends:
1619
+ all_end_elements.append((elem, seg_idx))
1620
+
1621
+ # Sort by flow order: segment index first, then position within segment
1622
+ all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1623
+ all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1624
+
1625
+ logger.debug(f"\n=== Total boundary elements found ===")
1626
+ logger.debug(f"Total start elements: {len(all_start_elements)}")
1627
+ logger.debug(f"Total end elements: {len(all_end_elements)}")
1628
+
1629
+ # Pair up start and end elements to create sections
1630
+ sections = []
1631
+ used_starts = set()
1632
+ used_ends = set()
1633
+
1634
+ for start_elem, start_seg_idx in all_start_elements:
1635
+ if id(start_elem) in used_starts:
1636
+ continue
1637
+
1638
+ logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
1639
+ logger.debug(
1640
+ f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
1641
+ )
1642
+
1643
+ # Find the next unused end element that comes after this start
1644
+ matching_end = None
1645
+ for end_elem, end_seg_idx in all_end_elements:
1646
+ if id(end_elem) in used_ends:
1647
+ continue
1648
+
1649
+ # Check if this end comes after the start in flow order
1650
+ if end_seg_idx > start_seg_idx or (
1651
+ end_seg_idx == start_seg_idx
1652
+ and (
1653
+ end_elem.top > start_elem.top
1654
+ or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
1655
+ )
1656
+ ):
1657
+ matching_end = (end_elem, end_seg_idx)
1658
+ break
1659
+
1660
+ if matching_end is not None:
1661
+ end_elem, end_seg_idx = matching_end
1662
+ used_starts.add(id(start_elem))
1663
+ used_ends.add(id(end_elem))
1664
+
1665
+ logger.debug(f" Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
1666
+
1667
+ # Create section based on whether it spans segments
1668
+ if start_seg_idx == end_seg_idx:
1669
+ # Single segment section - use Region.get_section_between
1670
+ seg = self.segments[start_seg_idx]
1671
+ section = seg.get_section_between(start_elem, end_elem, include_boundaries)
1672
+ sections.append(section)
1673
+ logger.debug(f" Created single-segment Region")
1674
+ else:
1675
+ # Multi-segment section - create FlowRegion
1676
+ logger.debug(
1677
+ f" Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
1678
+ )
1679
+ constituent_regions = []
1680
+
1681
+ # First segment: from start element to bottom
1682
+ start_seg = self.segments[start_seg_idx]
1683
+ if include_boundaries in ["start", "both"]:
1684
+ first_top = start_elem.top
1685
+ else:
1686
+ first_top = start_elem.bottom
1687
+ first_region = Region(
1688
+ start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
1689
+ )
1690
+ constituent_regions.append(first_region)
1691
+
1692
+ # Middle segments: full segments
1693
+ for seg_idx in range(start_seg_idx + 1, end_seg_idx):
1694
+ constituent_regions.append(self.segments[seg_idx])
1695
+
1696
+ # Last segment: from top to end element
1697
+ end_seg = self.segments[end_seg_idx]
1698
+ if include_boundaries in ["end", "both"]:
1699
+ last_bottom = end_elem.bottom
1700
+ else:
1701
+ last_bottom = end_elem.top
1702
+ last_region = Region(
1703
+ end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
1704
+ )
1705
+ constituent_regions.append(last_region)
1706
+
1707
+ # Create FlowRegion
1708
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1709
+ flow_region = FlowRegion(
1710
+ flow=self,
1711
+ constituent_regions=constituent_regions,
1712
+ source_flow_element=flow_element,
1713
+ boundary_element_found=end_elem,
1714
+ )
1715
+ sections.append(flow_region)
1716
+
1717
+ # Handle special cases when only start or only end elements are provided
1718
+ if start_elements_unwrapped is not None and end_elements_unwrapped is None:
1719
+ logger.debug(f"\n=== Handling start-only elements (no end elements provided) ===")
1720
+ for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
1721
+ if id(start_elem) in used_starts:
1722
+ continue
1723
+
1724
+ # Find next start element
1725
+ next_start = None
1726
+ if i + 1 < len(all_start_elements):
1727
+ next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
1728
+ # Create section from this start to just before next start
1729
+ if start_seg_idx == next_start_seg_idx:
1730
+ # Same segment
1731
+ seg = self.segments[start_seg_idx]
1732
+ # Find element just before next start
1733
+ all_elems = seg.get_elements()
1734
+ all_elems.sort(key=lambda e: (e.top, e.x0))
1735
+ try:
1736
+ next_idx = all_elems.index(next_start_elem)
1737
+ if next_idx > 0:
1738
+ end_elem = all_elems[next_idx - 1]
1739
+ section = seg.get_section_between(
1740
+ start_elem, end_elem, include_boundaries
1741
+ )
1742
+ sections.append(section)
1743
+ except ValueError:
1744
+ pass
1745
+ elif next_start_seg_idx == start_seg_idx + 1:
1746
+ # Next start is in the immediately following segment in the flow
1747
+ # Create a FlowRegion that spans from current start to just before next start
1748
+ logger.debug(f" Next start is in next flow segment - creating FlowRegion")
1749
+
1750
+ constituent_regions = []
1751
+
1752
+ # First segment: from start element to bottom
1753
+ start_seg = self.segments[start_seg_idx]
1754
+ if include_boundaries in ["start", "both"]:
1755
+ first_top = start_elem.top
1756
+ else:
1757
+ first_top = start_elem.bottom
1758
+ first_region = Region(
1759
+ start_seg.page,
1760
+ (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
1761
+ )
1762
+ constituent_regions.append(first_region)
1763
+
1764
+ # Next segment: from top to just before next start
1765
+ next_seg = self.segments[next_start_seg_idx]
1766
+ # Find element just before next start in the next segment
1767
+ next_seg_elems = next_seg.get_elements()
1768
+ next_seg_elems.sort(key=lambda e: (e.top, e.x0))
1769
+
1770
+ last_bottom = next_start_elem.top # Default to just before the next start
1771
+ try:
1772
+ next_idx = next_seg_elems.index(next_start_elem)
1773
+ if next_idx > 0:
1774
+ # Use the bottom of the element before next start
1775
+ prev_elem = next_seg_elems[next_idx - 1]
1776
+ last_bottom = prev_elem.bottom
1777
+ except ValueError:
1778
+ pass
1779
+
1780
+ last_region = Region(
1781
+ next_seg.page, (next_seg.x0, next_seg.top, next_seg.x1, last_bottom)
1782
+ )
1783
+ constituent_regions.append(last_region)
1784
+
1785
+ # Create FlowRegion
1786
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1787
+ flow_region = FlowRegion(
1788
+ flow=self,
1789
+ constituent_regions=constituent_regions,
1790
+ source_flow_element=flow_element,
1791
+ boundary_element_found=None,
1792
+ )
1793
+ sections.append(flow_region)
1794
+ logger.debug(
1795
+ f" Created FlowRegion with {len(constituent_regions)} constituent regions"
1796
+ )
1797
+ else:
1798
+ # Next start is more than one segment away - just end at current segment
1799
+ start_seg = self.segments[start_seg_idx]
1800
+ if include_boundaries in ["start", "both"]:
1801
+ region_top = start_elem.top
1802
+ else:
1803
+ region_top = start_elem.bottom
1804
+ section = Region(
1805
+ start_seg.page,
1806
+ (start_seg.x0, region_top, start_seg.x1, start_seg.bottom),
1807
+ )
1808
+ sections.append(section)
1809
+ logger.debug(
1810
+ f" Next start is {next_start_seg_idx - start_seg_idx} segments away - ending at current segment"
1811
+ )
1812
+ else:
1813
+ # Last start element: section goes to end of flow
1814
+ # This could span multiple segments
1815
+ if start_seg_idx == len(self.segments) - 1:
1816
+ # Only in last segment
1817
+ seg = self.segments[start_seg_idx]
1818
+ if include_boundaries in ["start", "both"]:
1819
+ region_top = start_elem.top
1820
+ else:
1821
+ region_top = start_elem.bottom
1822
+ section = Region(seg.page, (seg.x0, region_top, seg.x1, seg.bottom))
1823
+ sections.append(section)
1824
+ else:
1825
+ # Spans to end of flow - create FlowRegion
1826
+ constituent_regions = []
1827
+
1828
+ # First segment
1829
+ start_seg = self.segments[start_seg_idx]
1830
+ if include_boundaries in ["start", "both"]:
1831
+ first_top = start_elem.top
1832
+ else:
1833
+ first_top = start_elem.bottom
1834
+ first_region = Region(
1835
+ start_seg.page,
1836
+ (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
1837
+ )
1838
+ constituent_regions.append(first_region)
1839
+
1840
+ # Remaining segments
1841
+ for seg_idx in range(start_seg_idx + 1, len(self.segments)):
1842
+ constituent_regions.append(self.segments[seg_idx])
1843
+
1844
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1845
+ flow_region = FlowRegion(
1846
+ flow=self,
1847
+ constituent_regions=constituent_regions,
1848
+ source_flow_element=flow_element,
1849
+ boundary_element_found=None,
1850
+ )
1851
+ sections.append(flow_region)
1852
+
1853
+ # Handle new_section_on_page_break when no explicit boundaries
1854
+ if (
1855
+ new_section_on_page_break
1856
+ and start_elements_unwrapped is None
1857
+ and end_elements_unwrapped is None
1858
+ ):
1859
+ # Each segment becomes its own section
1860
+ sections = list(self.segments)
1861
+
1862
+ # Sort sections by their position in the flow
1863
+ def _section_sort_key(section):
1864
+ if hasattr(section, "constituent_regions"):
1865
+ # FlowRegion - use first constituent region
1866
+ first_region = (
1867
+ section.constituent_regions[0] if section.constituent_regions else None
1868
+ )
1869
+ if first_region:
1870
+ # Find which segment this region belongs to
1871
+ for idx, seg in enumerate(self.segments):
1872
+ try:
1873
+ if seg.intersects(first_region):
1874
+ return (
1875
+ idx,
1876
+ getattr(first_region, "top", 0),
1877
+ getattr(first_region, "x0", 0),
1878
+ )
1879
+ except:
1880
+ pass
1881
+ else:
1882
+ # Regular Region
1883
+ for idx, seg in enumerate(self.segments):
1884
+ try:
1885
+ if seg.intersects(section):
1886
+ return (idx, getattr(section, "top", 0), getattr(section, "x0", 0))
1887
+ except:
1888
+ pass
1889
+ return (float("inf"), 0, 0)
1890
+
1891
+ sections.sort(key=_section_sort_key)
1892
+
1893
+ logger.debug(f"\n=== Section creation complete ===")
1894
+ logger.debug(f"Total sections created: {len(sections)}")
1895
+ for i, section in enumerate(sections):
1896
+ if hasattr(section, "constituent_regions"):
1897
+ logger.debug(
1898
+ f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
1899
+ )
1900
+ else:
1901
+ logger.debug(f"Section {i}: Region with bbox={section.bbox}")
1902
+
1903
+ return ElementCollection(sections)
1904
+
1905
+ def highlights(self, show: bool = False) -> "HighlightContext":
1906
+ """
1907
+ Create a highlight context for accumulating highlights.
1908
+
1909
+ This allows for clean syntax to show multiple highlight groups:
1910
+
1911
+ Example:
1912
+ with flow.highlights() as h:
1913
+ h.add(flow.find_all('table'), label='tables', color='blue')
1914
+ h.add(flow.find_all('text:bold'), label='bold text', color='red')
1915
+ h.show()
1916
+
1917
+ Or with automatic display:
1918
+ with flow.highlights(show=True) as h:
1919
+ h.add(flow.find_all('table'), label='tables')
1920
+ h.add(flow.find_all('text:bold'), label='bold')
1921
+ # Automatically shows when exiting the context
1922
+
1923
+ Args:
1924
+ show: If True, automatically show highlights when exiting context
1925
+
1926
+ Returns:
1927
+ HighlightContext for accumulating highlights
1928
+ """
1929
+ from natural_pdf.core.highlighting_service import HighlightContext
1930
+
1931
+ return HighlightContext(self, show_on_exit=show)