natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/flows/flow.py CHANGED
@@ -1,25 +1,43 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union, Tuple, Callable, overload
2
+ import warnings
3
+ from typing import (
4
+ TYPE_CHECKING,
5
+ Any,
6
+ Callable,
7
+ Dict,
8
+ List,
9
+ Literal,
10
+ Optional,
11
+ Tuple,
12
+ Union,
13
+ overload,
14
+ )
3
15
 
4
16
  if TYPE_CHECKING:
17
+ from PIL.Image import Image as PIL_Image
18
+
5
19
  from natural_pdf.core.page import Page
20
+ from natural_pdf.core.page_collection import PageCollection
6
21
  from natural_pdf.elements.base import Element as PhysicalElement
7
- from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection, PageCollection
22
+ from natural_pdf.elements.element_collection import (
23
+ ElementCollection as PhysicalElementCollection,
24
+ )
8
25
  from natural_pdf.elements.region import Region as PhysicalRegion
9
- from PIL.Image import Image as PIL_Image
10
26
 
11
27
  from .collections import FlowElementCollection
12
28
  from .element import FlowElement
13
29
 
14
30
  # Import required classes for the new methods
15
- from natural_pdf.tables import TableResult
16
31
  # For runtime image manipulation
17
32
  from PIL import Image as PIL_Image_Runtime
18
33
 
34
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
35
+ from natural_pdf.tables import TableResult
36
+
19
37
  logger = logging.getLogger(__name__)
20
38
 
21
39
 
22
- class Flow:
40
+ class Flow(Visualizable):
23
41
  """Defines a logical flow or sequence of physical Page or Region objects.
24
42
 
25
43
  A Flow represents a continuous logical document structure that spans across
@@ -114,9 +132,9 @@ class Flow:
114
132
  segment_gap: The virtual gap (in PDF points) between segments.
115
133
  """
116
134
  # Handle PageCollection input
117
- if hasattr(segments, 'pages'): # It's a PageCollection
135
+ if hasattr(segments, "pages"): # It's a PageCollection
118
136
  segments = list(segments.pages)
119
-
137
+
120
138
  if not segments:
121
139
  raise ValueError("Flow segments cannot be empty.")
122
140
  if arrangement not in ["vertical", "horizontal"]:
@@ -176,6 +194,103 @@ class Flow:
176
194
  f"Valid options are: {valid_alignments[self.arrangement]}"
177
195
  )
178
196
 
197
+ def _get_highlighter(self):
198
+ """Get the highlighting service from the first segment."""
199
+ if not self.segments:
200
+ raise RuntimeError("Flow has no segments to get highlighter from")
201
+
202
+ # Get highlighter from first segment
203
+ first_segment = self.segments[0]
204
+ if hasattr(first_segment, "_highlighter"):
205
+ return first_segment._highlighter
206
+ elif hasattr(first_segment, "page") and hasattr(first_segment.page, "_highlighter"):
207
+ return first_segment.page._highlighter
208
+ else:
209
+ raise RuntimeError(
210
+ f"Cannot find HighlightingService from Flow segments. "
211
+ f"First segment type: {type(first_segment).__name__}"
212
+ )
213
+
214
+ def show(
215
+ self,
216
+ *,
217
+ # Basic rendering options
218
+ resolution: Optional[float] = None,
219
+ width: Optional[int] = None,
220
+ # Highlight options
221
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
222
+ labels: bool = True,
223
+ label_format: Optional[str] = None,
224
+ highlights: Optional[List[Dict[str, Any]]] = None,
225
+ # Layout options for multi-page/region
226
+ layout: Literal["stack", "grid", "single"] = "stack",
227
+ stack_direction: Literal["vertical", "horizontal"] = "vertical",
228
+ gap: int = 5,
229
+ columns: Optional[int] = None, # For grid layout
230
+ # Cropping options
231
+ crop: Union[bool, Literal["content"]] = False,
232
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
233
+ # Flow-specific options
234
+ in_context: bool = False,
235
+ separator_color: Optional[Tuple[int, int, int]] = None,
236
+ separator_thickness: int = 2,
237
+ **kwargs,
238
+ ) -> Optional["PIL_Image"]:
239
+ """Generate a preview image with highlights.
240
+
241
+ If in_context=True, shows segments as cropped images stacked together
242
+ with separators between segments.
243
+
244
+ Args:
245
+ resolution: DPI for rendering (default from global settings)
246
+ width: Target width in pixels (overrides resolution)
247
+ color: Default highlight color
248
+ labels: Whether to show labels for highlights
249
+ label_format: Format string for labels
250
+ highlights: Additional highlight groups to show
251
+ layout: How to arrange multiple pages/regions
252
+ stack_direction: Direction for stack layout
253
+ gap: Pixels between stacked images
254
+ columns: Number of columns for grid layout
255
+ crop: Whether to crop
256
+ crop_bbox: Explicit crop bounds
257
+ in_context: If True, use special Flow visualization with separators
258
+ separator_color: RGB color for separator lines (default: red)
259
+ separator_thickness: Thickness of separator lines
260
+ **kwargs: Additional parameters passed to rendering
261
+
262
+ Returns:
263
+ PIL Image object or None if nothing to render
264
+ """
265
+ if in_context:
266
+ # Use the special in_context visualization
267
+ return self._show_in_context(
268
+ resolution=resolution or 150,
269
+ width=width,
270
+ stack_direction=stack_direction,
271
+ stack_gap=gap,
272
+ separator_color=separator_color or (255, 0, 0),
273
+ separator_thickness=separator_thickness,
274
+ **kwargs,
275
+ )
276
+
277
+ # Otherwise use the standard show method
278
+ return super().show(
279
+ resolution=resolution,
280
+ width=width,
281
+ color=color,
282
+ labels=labels,
283
+ label_format=label_format,
284
+ highlights=highlights,
285
+ layout=layout,
286
+ stack_direction=stack_direction,
287
+ gap=gap,
288
+ columns=columns,
289
+ crop=crop,
290
+ crop_bbox=crop_bbox,
291
+ **kwargs,
292
+ )
293
+
179
294
  def find(
180
295
  self,
181
296
  selector: Optional[str] = None,
@@ -224,11 +339,11 @@ class Flow:
224
339
  ) -> "FlowElementCollection":
225
340
  """
226
341
  Finds all elements within the flow that match the given selector or text criteria.
227
-
342
+
228
343
  This method efficiently groups segments by their parent pages, searches at the page level,
229
344
  then filters results appropriately for each segment. This ensures elements that intersect
230
345
  with flow segments (but aren't fully contained) are still found.
231
-
346
+
232
347
  Elements found are wrapped as FlowElement objects, anchored to this Flow,
233
348
  and returned in a FlowElementCollection.
234
349
  """
@@ -237,21 +352,26 @@ class Flow:
237
352
 
238
353
  # Step 1: Group segments by their parent pages (like in analyze_layout)
239
354
  segments_by_page = {} # Dict[Page, List[Segment]]
240
-
355
+
241
356
  for i, segment in enumerate(self.segments):
242
357
  # Determine the page for this segment - fix type detection
243
- if hasattr(segment, 'page') and hasattr(segment.page, 'find_all'):
358
+ if hasattr(segment, "page") and hasattr(segment.page, "find_all"):
244
359
  # It's a Region object (has a parent page)
245
360
  page_obj = segment.page
246
361
  segment_type = "region"
247
- elif hasattr(segment, 'find_all') and hasattr(segment, 'width') and hasattr(segment, 'height') and not hasattr(segment, 'page'):
362
+ elif (
363
+ hasattr(segment, "find_all")
364
+ and hasattr(segment, "width")
365
+ and hasattr(segment, "height")
366
+ and not hasattr(segment, "page")
367
+ ):
248
368
  # It's a Page object (has find_all but no parent page)
249
369
  page_obj = segment
250
370
  segment_type = "page"
251
371
  else:
252
372
  logger.warning(f"Segment {i+1} does not support find_all, skipping")
253
373
  continue
254
-
374
+
255
375
  if page_obj not in segments_by_page:
256
376
  segments_by_page[page_obj] = []
257
377
  segments_by_page[page_obj].append((segment, segment_type))
@@ -273,7 +393,7 @@ class Flow:
273
393
  case=case,
274
394
  **kwargs,
275
395
  )
276
-
396
+
277
397
  if not page_matches:
278
398
  continue
279
399
 
@@ -283,31 +403,41 @@ class Flow:
283
403
  # Full page segment: include all elements
284
404
  for phys_elem in page_matches.elements:
285
405
  all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
286
-
406
+
287
407
  elif segment_type == "region":
288
408
  # Region segment: filter to only intersecting elements
289
409
  for phys_elem in page_matches.elements:
290
410
  try:
291
411
  # Check if element intersects with this flow segment
292
412
  if segment.intersects(phys_elem):
293
- all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
413
+ all_flow_elements.append(
414
+ FlowElement(physical_object=phys_elem, flow=self)
415
+ )
294
416
  except Exception as intersect_error:
295
- logger.debug(f"Error checking intersection for element: {intersect_error}")
417
+ logger.debug(
418
+ f"Error checking intersection for element: {intersect_error}"
419
+ )
296
420
  # Include the element anyway if intersection check fails
297
- all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
421
+ all_flow_elements.append(
422
+ FlowElement(physical_object=phys_elem, flow=self)
423
+ )
298
424
 
299
425
  # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
300
426
  unique_flow_elements = []
301
427
  seen_element_ids = set()
302
-
428
+
303
429
  for flow_elem in all_flow_elements:
304
430
  # Create a unique identifier for the underlying physical element
305
431
  phys_elem = flow_elem.physical_object
306
432
  elem_id = (
307
- getattr(phys_elem.page, 'index', id(phys_elem.page)) if hasattr(phys_elem, 'page') else id(phys_elem),
308
- phys_elem.bbox if hasattr(phys_elem, 'bbox') else id(phys_elem)
433
+ (
434
+ getattr(phys_elem.page, "index", id(phys_elem.page))
435
+ if hasattr(phys_elem, "page")
436
+ else id(phys_elem)
437
+ ),
438
+ phys_elem.bbox if hasattr(phys_elem, "bbox") else id(phys_elem),
309
439
  )
310
-
440
+
311
441
  if elem_id not in seen_element_ids:
312
442
  unique_flow_elements.append(flow_elem)
313
443
  seen_element_ids.add(elem_id)
@@ -362,6 +492,7 @@ class Flow:
362
492
  show_progress: bool = False,
363
493
  content_filter: Optional[Any] = None,
364
494
  stitch_rows: Optional[Callable] = None,
495
+ merge_headers: Optional[bool] = None,
365
496
  ) -> TableResult:
366
497
  """
367
498
  Extract table data from all segments in the flow, combining results sequentially.
@@ -380,18 +511,24 @@ class Flow:
380
511
  and returns its string content. For 'text' method only.
381
512
  show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
382
513
  content_filter: Optional content filter to apply during cell text extraction.
514
+ merge_headers: Whether to merge tables by removing repeated headers from subsequent
515
+ segments. If None (default), auto-detects by checking if the first row
516
+ of each segment matches the first row of the first segment. If segments have
517
+ inconsistent header patterns (some repeat, others don't), raises ValueError.
518
+ Useful for multi-page tables where headers repeat on each page.
383
519
  stitch_rows: Optional callable to determine when rows should be merged across
384
- segment boundaries. Two overloaded signatures are supported:
385
-
520
+ segment boundaries. Applied AFTER header removal if merge_headers
521
+ is enabled. Two overloaded signatures are supported:
522
+
386
523
  • func(current_row) -> bool
387
524
  Called only on the first row of each segment (after the first).
388
525
  Return True to merge this first row with the last row from
389
526
  the previous segment.
390
-
527
+
391
528
  • func(prev_row, current_row, row_index, segment) -> bool
392
529
  Called for every row. Return True to merge current_row with
393
530
  the previous row in the aggregated results.
394
-
531
+
395
532
  When True is returned, rows are concatenated cell-by-cell.
396
533
  This is useful for handling table rows split across page
397
534
  boundaries or segments. If None, rows are never merged.
@@ -403,30 +540,32 @@ class Flow:
403
540
  Multi-page table extraction:
404
541
  ```python
405
542
  pdf = npdf.PDF("multi_page_table.pdf")
406
-
543
+
407
544
  # Create flow for table spanning pages 2-4
408
545
  table_flow = Flow(
409
546
  segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
410
547
  arrangement='vertical'
411
548
  )
412
-
549
+
413
550
  # Extract table as if it were continuous
414
551
  table_data = table_flow.extract_table()
415
552
  df = table_data.df # Convert to pandas DataFrame
416
-
553
+
417
554
  # Custom row stitching - single parameter (simple case)
418
555
  table_data = table_flow.extract_table(
419
556
  stitch_rows=lambda row: row and not (row[0] or "").strip()
420
557
  )
421
-
558
+
422
559
  # Custom row stitching - full parameters (advanced case)
423
560
  table_data = table_flow.extract_table(
424
561
  stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
425
562
  )
426
563
  ```
427
564
  """
428
- logger.info(f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})")
429
-
565
+ logger.info(
566
+ f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})"
567
+ )
568
+
430
569
  if not self.segments:
431
570
  logger.warning("Flow has no segments, returning empty table")
432
571
  return TableResult([])
@@ -434,12 +573,13 @@ class Flow:
434
573
  # Resolve predicate and determine its signature
435
574
  predicate: Optional[Callable] = None
436
575
  predicate_type: str = "none"
437
-
576
+
438
577
  if callable(stitch_rows):
439
578
  import inspect
579
+
440
580
  sig = inspect.signature(stitch_rows)
441
581
  param_count = len(sig.parameters)
442
-
582
+
443
583
  if param_count == 1:
444
584
  predicate = stitch_rows
445
585
  predicate_type = "single_param"
@@ -447,12 +587,17 @@ class Flow:
447
587
  predicate = stitch_rows
448
588
  predicate_type = "full_params"
449
589
  else:
450
- logger.warning(f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring.")
590
+ logger.warning(
591
+ f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring."
592
+ )
451
593
  predicate = None
452
594
  predicate_type = "none"
453
595
 
454
- def _default_merge(prev_row: List[Optional[str]], cur_row: List[Optional[str]]) -> List[Optional[str]]:
596
+ def _default_merge(
597
+ prev_row: List[Optional[str]], cur_row: List[Optional[str]]
598
+ ) -> List[Optional[str]]:
455
599
  from itertools import zip_longest
600
+
456
601
  merged: List[Optional[str]] = []
457
602
  for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
458
603
  if (p or "").strip() and (c or "").strip():
@@ -463,6 +608,10 @@ class Flow:
463
608
 
464
609
  aggregated_rows: List[List[Optional[str]]] = []
465
610
  processed_segments = 0
611
+ header_row: Optional[List[Optional[str]]] = None
612
+ merge_headers_enabled = False
613
+ headers_warned = False # Track if we've already warned about dropping headers
614
+ segment_has_repeated_header = [] # Track which segments have repeated headers
466
615
 
467
616
  for seg_idx, segment in enumerate(self.segments):
468
617
  try:
@@ -491,9 +640,67 @@ class Flow:
491
640
  logger.debug(f" No table data found in segment {seg_idx+1}")
492
641
  continue
493
642
 
643
+ # Handle header detection and merging for multi-page tables
644
+ if seg_idx == 0:
645
+ # First segment: capture potential header row
646
+ if segment_rows:
647
+ header_row = segment_rows[0]
648
+ # Determine if we should merge headers
649
+ if merge_headers is None:
650
+ # Auto-detect: we'll check all subsequent segments
651
+ merge_headers_enabled = False # Will be determined later
652
+ else:
653
+ merge_headers_enabled = merge_headers
654
+ # Track that first segment exists (for consistency checking)
655
+ segment_has_repeated_header.append(False) # First segment doesn't "repeat"
656
+ elif seg_idx == 1 and merge_headers is None:
657
+ # Auto-detection: check if first row of second segment matches header
658
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
659
+ segment_has_repeated_header.append(has_header)
660
+
661
+ if has_header:
662
+ merge_headers_enabled = True
663
+ # Remove the detected repeated header from this segment
664
+ segment_rows = segment_rows[1:]
665
+ logger.debug(
666
+ f" Auto-detected repeated header in segment {seg_idx+1}, removed"
667
+ )
668
+ if not headers_warned:
669
+ warnings.warn(
670
+ "Detected repeated headers in multi-page table. Merging by removing "
671
+ "repeated headers from subsequent pages.",
672
+ UserWarning,
673
+ stacklevel=2,
674
+ )
675
+ headers_warned = True
676
+ else:
677
+ merge_headers_enabled = False
678
+ logger.debug(f" No repeated header detected in segment {seg_idx+1}")
679
+ elif seg_idx > 1:
680
+ # Check consistency: all segments should have same pattern
681
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
682
+ segment_has_repeated_header.append(has_header)
683
+
684
+ # Remove header if merging is enabled and header is present
685
+ if merge_headers_enabled and has_header:
686
+ segment_rows = segment_rows[1:]
687
+ logger.debug(f" Removed repeated header from segment {seg_idx+1}")
688
+ elif seg_idx > 0 and merge_headers_enabled:
689
+ # Explicit merge_headers=True: remove headers from subsequent segments
690
+ if segment_rows and header_row and segment_rows[0] == header_row:
691
+ segment_rows = segment_rows[1:]
692
+ logger.debug(f" Removed repeated header from segment {seg_idx+1}")
693
+ if not headers_warned:
694
+ warnings.warn(
695
+ "Removing repeated headers from multi-page table during merge.",
696
+ UserWarning,
697
+ stacklevel=2,
698
+ )
699
+ headers_warned = True
700
+
494
701
  for row_idx, row in enumerate(segment_rows):
495
702
  should_merge = False
496
-
703
+
497
704
  if predicate is not None and aggregated_rows:
498
705
  if predicate_type == "single_param":
499
706
  # For single param: only call on first row of segment (row_idx == 0)
@@ -503,19 +710,41 @@ class Flow:
503
710
  elif predicate_type == "full_params":
504
711
  # For full params: call with all arguments
505
712
  should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)
506
-
713
+
507
714
  if should_merge:
508
715
  aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
509
716
  else:
510
717
  aggregated_rows.append(row)
511
718
 
512
719
  processed_segments += 1
513
- logger.debug(f" Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}")
720
+ logger.debug(
721
+ f" Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}"
722
+ )
514
723
 
515
724
  except Exception as e:
516
725
  logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)
517
726
  continue
518
727
 
728
+ # Check for inconsistent header patterns after processing all segments
729
+ if merge_headers is None and len(segment_has_repeated_header) > 2:
730
+ # During auto-detection, check for consistency across all segments
731
+ expected_pattern = segment_has_repeated_header[1] # Pattern from second segment
732
+ for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
733
+ if has_header != expected_pattern:
734
+ # Inconsistent pattern detected
735
+ segments_with_headers = [
736
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
737
+ ]
738
+ segments_without_headers = [
739
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
740
+ ]
741
+ raise ValueError(
742
+ f"Inconsistent header pattern in multi-page table: "
743
+ f"segments {segments_with_headers} have repeated headers, "
744
+ f"but segments {segments_without_headers} do not. "
745
+ f"All segments must have the same header pattern for reliable merging."
746
+ )
747
+
519
748
  logger.info(
520
749
  f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
521
750
  )
@@ -558,45 +787,47 @@ class Flow:
558
787
  Multi-page layout analysis:
559
788
  ```python
560
789
  pdf = npdf.PDF("document.pdf")
561
-
790
+
562
791
  # Create flow for first 3 pages
563
792
  page_flow = Flow(
564
793
  segments=pdf.pages[:3],
565
794
  arrangement='vertical'
566
795
  )
567
-
796
+
568
797
  # Analyze layout across all pages (efficiently)
569
798
  all_regions = page_flow.analyze_layout(engine='yolo')
570
-
799
+
571
800
  # Find all tables across the flow
572
801
  tables = all_regions.filter('region[type=table]')
573
802
  ```
574
803
  """
575
- from natural_pdf.elements.collections import ElementCollection
576
-
577
- logger.info(f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})")
578
-
804
+ from natural_pdf.elements.element_collection import ElementCollection
805
+
806
+ logger.info(
807
+ f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})"
808
+ )
809
+
579
810
  if not self.segments:
580
811
  logger.warning("Flow has no segments, returning empty collection")
581
812
  return ElementCollection([])
582
813
 
583
814
  # Step 1: Group segments by their parent pages to avoid redundant analysis
584
815
  segments_by_page = {} # Dict[Page, List[Segment]]
585
-
816
+
586
817
  for i, segment in enumerate(self.segments):
587
818
  # Determine the page for this segment
588
- if hasattr(segment, 'analyze_layout'):
819
+ if hasattr(segment, "analyze_layout"):
589
820
  # It's a Page object
590
821
  page_obj = segment
591
822
  segment_type = "page"
592
- elif hasattr(segment, 'page') and hasattr(segment.page, 'analyze_layout'):
823
+ elif hasattr(segment, "page") and hasattr(segment.page, "analyze_layout"):
593
824
  # It's a Region object
594
825
  page_obj = segment.page
595
826
  segment_type = "region"
596
827
  else:
597
828
  logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
598
829
  continue
599
-
830
+
600
831
  if page_obj not in segments_by_page:
601
832
  segments_by_page[page_obj] = []
602
833
  segments_by_page[page_obj].append((segment, segment_type))
@@ -605,7 +836,9 @@ class Flow:
605
836
  logger.warning("No segments with analyzable pages found")
606
837
  return ElementCollection([])
607
838
 
608
- logger.debug(f" Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages")
839
+ logger.debug(
840
+ f" Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages"
841
+ )
609
842
 
610
843
  # Step 2: Analyze each unique page only once
611
844
  all_detected_regions: List["PhysicalRegion"] = []
@@ -613,8 +846,10 @@ class Flow:
613
846
 
614
847
  for page_obj, page_segments in segments_by_page.items():
615
848
  try:
616
- logger.debug(f" Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments")
617
-
849
+ logger.debug(
850
+ f" Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments"
851
+ )
852
+
618
853
  # Run layout analysis once for this page
619
854
  page_results = page_obj.analyze_layout(
620
855
  engine=engine,
@@ -629,18 +864,22 @@ class Flow:
629
864
  )
630
865
 
631
866
  # Extract regions from results
632
- if hasattr(page_results, 'elements'):
867
+ if hasattr(page_results, "elements"):
633
868
  # It's an ElementCollection
634
869
  page_regions = page_results.elements
635
870
  elif isinstance(page_results, list):
636
871
  # It's a list of regions
637
872
  page_regions = page_results
638
873
  else:
639
- logger.warning(f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}")
874
+ logger.warning(
875
+ f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}"
876
+ )
640
877
  continue
641
878
 
642
879
  if not page_regions:
643
- logger.debug(f" No layout regions found on page {getattr(page_obj, 'number', '?')}")
880
+ logger.debug(
881
+ f" No layout regions found on page {getattr(page_obj, 'number', '?')}"
882
+ )
644
883
  continue
645
884
 
646
885
  # Step 3: For each segment on this page, collect relevant regions
@@ -651,7 +890,7 @@ class Flow:
651
890
  all_detected_regions.extend(page_regions)
652
891
  segments_processed_on_page += 1
653
892
  logger.debug(f" Added {len(page_regions)} regions for full-page segment")
654
-
893
+
655
894
  elif segment_type == "region":
656
895
  # Region segment: filter to only intersecting regions
657
896
  intersecting_regions = []
@@ -660,32 +899,41 @@ class Flow:
660
899
  if segment.intersects(region):
661
900
  intersecting_regions.append(region)
662
901
  except Exception as intersect_error:
663
- logger.debug(f"Error checking intersection for region: {intersect_error}")
902
+ logger.debug(
903
+ f"Error checking intersection for region: {intersect_error}"
904
+ )
664
905
  # Include the region anyway if intersection check fails
665
906
  intersecting_regions.append(region)
666
-
907
+
667
908
  all_detected_regions.extend(intersecting_regions)
668
909
  segments_processed_on_page += 1
669
- logger.debug(f" Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}")
910
+ logger.debug(
911
+ f" Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}"
912
+ )
670
913
 
671
914
  processed_pages += 1
672
- logger.debug(f" Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}")
915
+ logger.debug(
916
+ f" Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}"
917
+ )
673
918
 
674
919
  except Exception as e:
675
- logger.error(f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}", exc_info=True)
920
+ logger.error(
921
+ f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}",
922
+ exc_info=True,
923
+ )
676
924
  continue
677
925
 
678
926
  # Step 4: Remove duplicates (can happen if multiple segments intersect the same region)
679
927
  unique_regions = []
680
928
  seen_region_ids = set()
681
-
929
+
682
930
  for region in all_detected_regions:
683
931
  # Create a unique identifier for this region (page + bbox)
684
932
  region_id = (
685
- getattr(region.page, 'index', id(region.page)),
686
- region.bbox if hasattr(region, 'bbox') else id(region)
933
+ getattr(region.page, "index", id(region.page)),
934
+ region.bbox if hasattr(region, "bbox") else id(region),
687
935
  )
688
-
936
+
689
937
  if region_id not in seen_region_ids:
690
938
  unique_regions.append(region)
691
939
  seen_region_ids.add(region_id)
@@ -694,87 +942,54 @@ class Flow:
694
942
  if dedupe_removed > 0:
695
943
  logger.debug(f" Removed {dedupe_removed} duplicate regions")
696
944
 
697
- logger.info(f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages")
945
+ logger.info(
946
+ f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages"
947
+ )
698
948
  return ElementCollection(unique_regions)
699
949
 
700
- def show(
950
+ def _get_render_specs(
701
951
  self,
702
- resolution: Optional[float] = None,
703
- labels: bool = True,
704
- legend_position: str = "right",
705
- color: Optional[Union[Tuple, str]] = "blue",
952
+ mode: Literal["show", "render"] = "show",
953
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
954
+ highlights: Optional[List[Dict[str, Any]]] = None,
955
+ crop: Union[bool, Literal["content"]] = False,
956
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
706
957
  label_prefix: Optional[str] = "FlowSegment",
707
- width: Optional[int] = None,
708
- stack_direction: str = "vertical",
709
- stack_gap: int = 5,
710
- stack_background_color: Tuple[int, int, int] = (255, 255, 255),
711
- crop: bool = False,
712
958
  **kwargs,
713
- ) -> Optional["PIL_Image"]:
714
- """
715
- Generates and returns a PIL Image showing all segments in the flow with highlights.
716
-
717
- This method visualizes the entire flow by highlighting each segment on its respective
718
- page and combining the results into a single image. If multiple pages are involved,
719
- they are stacked according to the flow's arrangement.
959
+ ) -> List[RenderSpec]:
960
+ """Get render specifications for this flow.
720
961
 
721
962
  Args:
722
- resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
723
- labels: Whether to include a legend for highlights.
724
- legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
725
- color: Color for highlighting the flow segments.
726
- label_prefix: Prefix for segment labels (e.g., 'FlowSegment').
727
- width: Optional width for the output image (overrides resolution).
728
- stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
729
- stack_gap: Gap in pixels between stacked pages.
730
- stack_background_color: RGB background color for the stacked image.
731
- crop: If True, crop each rendered page to the bounding box of segments on that page.
732
- **kwargs: Additional arguments passed to the underlying rendering methods.
963
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
964
+ color: Color for highlighting segments in show mode
965
+ highlights: Additional highlight groups to show
966
+ crop: Whether to crop to segments
967
+ crop_bbox: Explicit crop bounds
968
+ label_prefix: Prefix for segment labels
969
+ **kwargs: Additional parameters
733
970
 
734
971
  Returns:
735
- PIL Image of the rendered pages with highlighted flow segments, or None if rendering fails.
736
-
737
- Example:
738
- Visualizing a multi-page flow:
739
- ```python
740
- pdf = npdf.PDF("document.pdf")
741
-
742
- # Create flow across multiple pages
743
- page_flow = Flow(
744
- segments=[pdf.pages[0], pdf.pages[1], pdf.pages[2]],
745
- arrangement='vertical'
746
- )
747
-
748
- # Show the entire flow
749
- flow_image = page_flow.show(color="green", labels=True)
750
- ```
972
+ List of RenderSpec objects, one per page with segments
751
973
  """
752
- logger.info(f"Rendering Flow with {len(self.segments)} segments")
753
-
754
974
  if not self.segments:
755
- logger.warning("Flow has no segments to show")
756
- return None
757
-
758
- # Apply global options as defaults for resolution
759
- import natural_pdf
760
- if resolution is None:
761
- if natural_pdf.options.image.resolution is not None:
762
- resolution = natural_pdf.options.image.resolution
763
- else:
764
- resolution = 144 # Default resolution
975
+ return []
765
976
 
766
- # 1. Group segments by their physical pages
977
+ # Group segments by their physical pages
767
978
  segments_by_page = {} # Dict[Page, List[PhysicalRegion]]
768
-
979
+
769
980
  for i, segment in enumerate(self.segments):
770
981
  # Get the page for this segment
771
- if hasattr(segment, 'page') and segment.page is not None:
982
+ if hasattr(segment, "page") and segment.page is not None:
772
983
  # It's a Region, use its page
773
984
  page_obj = segment.page
774
985
  if page_obj not in segments_by_page:
775
986
  segments_by_page[page_obj] = []
776
987
  segments_by_page[page_obj].append(segment)
777
- elif hasattr(segment, 'index') and hasattr(segment, 'width') and hasattr(segment, 'height'):
988
+ elif (
989
+ hasattr(segment, "index")
990
+ and hasattr(segment, "width")
991
+ and hasattr(segment, "height")
992
+ ):
778
993
  # It's a full Page object, create a full-page region for it
779
994
  page_obj = segment
780
995
  full_page_region = segment.region(0, 0, segment.width, segment.height)
@@ -786,17 +1001,10 @@ class Flow:
786
1001
  continue
787
1002
 
788
1003
  if not segments_by_page:
789
- logger.warning("No segments with identifiable pages found")
790
- return None
1004
+ return []
791
1005
 
792
- # 2. Get a highlighter service from the first page
793
- first_page = next(iter(segments_by_page.keys()))
794
- if not hasattr(first_page, '_highlighter'):
795
- logger.error("Cannot get highlighter service for Flow.show(). Page missing highlighter.")
796
- return None
797
-
798
- highlighter_service = first_page._highlighter
799
- output_page_images: List["PIL_Image_Runtime"] = []
1006
+ # Create RenderSpec for each page
1007
+ specs = []
800
1008
 
801
1009
  # Sort pages by index for consistent output order
802
1010
  sorted_pages = sorted(
@@ -804,134 +1012,243 @@ class Flow:
804
1012
  key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
805
1013
  )
806
1014
 
807
- # 3. Render each page with its relevant segments highlighted
808
1015
  for page_idx, page_obj in enumerate(sorted_pages):
809
1016
  segments_on_this_page = segments_by_page[page_obj]
810
1017
  if not segments_on_this_page:
811
1018
  continue
812
1019
 
813
- temp_highlights_for_page = []
814
- for i, segment in enumerate(segments_on_this_page):
815
- segment_label = None
816
- if labels and label_prefix:
817
- # Create label for this segment
818
- global_segment_idx = None
819
- try:
820
- # Find the global index of this segment in the original flow
821
- global_segment_idx = self.segments.index(segment)
822
- except ValueError:
823
- # If it's a generated full-page region, find its source page
824
- for idx, orig_segment in enumerate(self.segments):
825
- if (hasattr(orig_segment, 'index') and hasattr(segment, 'page')
826
- and orig_segment.index == segment.page.index):
827
- global_segment_idx = idx
828
- break
829
-
830
- if global_segment_idx is not None:
831
- segment_label = f"{label_prefix}_{global_segment_idx + 1}"
832
- else:
833
- segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
834
-
835
- temp_highlights_for_page.append(
836
- {
837
- "page_index": (
838
- page_obj.index
839
- if hasattr(page_obj, "index")
840
- else getattr(page_obj, "page_number", 1) - 1
841
- ),
842
- "bbox": segment.bbox,
843
- "polygon": segment.polygon if hasattr(segment, 'polygon') and hasattr(segment, 'has_polygon') and segment.has_polygon else None,
844
- "color": color,
845
- "label": segment_label,
846
- "use_color_cycling": False, # Keep specific color
847
- }
848
- )
1020
+ spec = RenderSpec(page=page_obj)
1021
+
1022
+ # Handle cropping
1023
+ if crop_bbox:
1024
+ spec.crop_bbox = crop_bbox
1025
+ elif crop == "content" or crop is True:
1026
+ # Calculate bounds of segments on this page
1027
+ x_coords = []
1028
+ y_coords = []
1029
+ for segment in segments_on_this_page:
1030
+ if hasattr(segment, "bbox") and segment.bbox:
1031
+ x0, y0, x1, y1 = segment.bbox
1032
+ x_coords.extend([x0, x1])
1033
+ y_coords.extend([y0, y1])
1034
+
1035
+ if x_coords and y_coords:
1036
+ spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
1037
+
1038
+ # Add highlights in show mode
1039
+ if mode == "show":
1040
+ # Highlight segments
1041
+ for i, segment in enumerate(segments_on_this_page):
1042
+ segment_label = None
1043
+ if label_prefix:
1044
+ # Create label for this segment
1045
+ global_segment_idx = None
1046
+ try:
1047
+ # Find the global index of this segment in the original flow
1048
+ global_segment_idx = self.segments.index(segment)
1049
+ except ValueError:
1050
+ # If it's a generated full-page region, find its source page
1051
+ for idx, orig_segment in enumerate(self.segments):
1052
+ if (
1053
+ hasattr(orig_segment, "index")
1054
+ and hasattr(segment, "page")
1055
+ and orig_segment.index == segment.page.index
1056
+ ):
1057
+ global_segment_idx = idx
1058
+ break
1059
+
1060
+ if global_segment_idx is not None:
1061
+ segment_label = f"{label_prefix}_{global_segment_idx + 1}"
1062
+ else:
1063
+ segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
1064
+
1065
+ spec.add_highlight(
1066
+ bbox=segment.bbox,
1067
+ polygon=segment.polygon if segment.has_polygon else None,
1068
+ color=color or "blue",
1069
+ label=segment_label,
1070
+ )
849
1071
 
850
- if not temp_highlights_for_page:
851
- continue
1072
+ # Add additional highlight groups if provided
1073
+ if highlights:
1074
+ for group in highlights:
1075
+ group_elements = group.get("elements", [])
1076
+ group_color = group.get("color", color)
1077
+ group_label = group.get("label")
852
1078
 
853
- # Calculate crop bbox if cropping is enabled
854
- crop_bbox = None
855
- if crop and segments_on_this_page:
856
- # Calculate the bounding box that encompasses all segments on this page
857
- min_x0 = min(segment.bbox[0] for segment in segments_on_this_page)
858
- min_y0 = min(segment.bbox[1] for segment in segments_on_this_page)
859
- max_x1 = max(segment.bbox[2] for segment in segments_on_this_page)
860
- max_y1 = max(segment.bbox[3] for segment in segments_on_this_page)
861
- crop_bbox = (min_x0, min_y0, max_x1, max_y1)
862
-
863
- # Render this page with highlights
864
- page_image = highlighter_service.render_preview(
865
- page_index=(
866
- page_obj.index
867
- if hasattr(page_obj, "index")
868
- else getattr(page_obj, "page_number", 1) - 1
869
- ),
870
- temporary_highlights=temp_highlights_for_page,
871
- resolution=resolution,
872
- width=width,
873
- labels=labels,
874
- legend_position=legend_position,
875
- crop_bbox=crop_bbox,
876
- **kwargs,
877
- )
878
- if page_image:
879
- output_page_images.append(page_image)
1079
+ for elem in group_elements:
1080
+ # Only add if element is on this page
1081
+ if hasattr(elem, "page") and elem.page == page_obj:
1082
+ spec.add_highlight(
1083
+ element=elem, color=group_color, label=group_label
1084
+ )
880
1085
 
881
- # 4. Stack the generated page images if multiple
882
- if not output_page_images:
883
- logger.warning("Flow.show() produced no page images")
884
- return None
1086
+ specs.append(spec)
1087
+
1088
+ return specs
1089
+
1090
def _show_in_context(
    self,
    resolution: float,
    width: Optional[int] = None,
    stack_direction: str = "vertical",
    stack_gap: int = 5,
    stack_background_color: Tuple[int, int, int] = (255, 255, 255),
    separator_color: Tuple[int, int, int] = (255, 0, 0),
    separator_thickness: int = 2,
    **kwargs,
) -> Optional["PIL_Image"]:
    """
    Render each flow segment on its own and stack the images with separators.

    Every segment in ``self.segments`` is rendered through its own ``render()``
    method (clean image, no highlights). Segments that carry a ``page`` are
    rendered with ``crop=True``; segments that look like full pages (they expose
    ``index``, ``width`` and ``height``) are rendered uncropped. The resulting
    images are laid out along one axis, centred on the other axis, with a gap
    and a solid separator bar between consecutive images.

    Args:
        resolution: Resolution in DPI, forwarded to each segment's ``render()``.
        width: Optional width, forwarded to each segment's ``render()``.
        stack_direction: ``'vertical'``, ``'horizontal'``, or ``'auto'`` to use
            ``self.arrangement``.
        stack_gap: Gap in pixels inserted after every image except the last.
        stack_background_color: RGB fill of the canvas behind the images.
        separator_color: RGB colour of the separator bars.
        separator_thickness: Thickness in pixels of each separator bar. Values
            of zero or less draw no bar.
        **kwargs: Additional arguments forwarded to each segment's ``render()``.

    Returns:
        The stacked PIL image; the single rendered image unchanged when only one
        segment produced output; ``None`` when no segment produced an image.

    Raises:
        ValueError: If a segment has neither a page nor page-like attributes, or
            if more than one image was rendered and the resolved stack direction
            is neither ``'vertical'`` nor ``'horizontal'``.
    """
    from PIL import Image, ImageDraw

    # "auto" defers to the arrangement the flow itself was created with.
    final_stack_direction = self.arrangement if stack_direction == "auto" else stack_direction

    # Render every segment individually; render() gives a clean image without highlights.
    segment_images = []
    for i, segment in enumerate(self.segments):
        if hasattr(segment, "page") and segment.page is not None:
            # Region-like segment: crop the render to the segment itself.
            segment_image = segment.render(
                resolution=resolution,
                crop=True,
                width=width,
                **kwargs,
            )
        elif (
            hasattr(segment, "index")
            and hasattr(segment, "width")
            and hasattr(segment, "height")
        ):
            # Full page object: render the page as a whole.
            segment_image = segment.render(resolution=resolution, width=width, **kwargs)
        else:
            raise ValueError(
                f"Segment {i+1} has no identifiable page. Segment type: {type(segment)}, attributes: {dir(segment)}"
            )

        if segment_image is not None:
            segment_images.append(segment_image)
        else:
            logger.warning(f"Segment {i+1} render() returned None, skipping")

    if not segment_images:
        logger.error("No valid segment images could be rendered")
        return None

    # A single image needs no stacking (and the direction is not validated here).
    if len(segment_images) == 1:
        return segment_images[0]

    # At least two images remain, so there is one joint between each neighbouring pair.
    joints = len(segment_images) - 1

    if final_stack_direction == "vertical":
        final_width = max(img.width for img in segment_images)
        total_height = sum(img.height for img in segment_images)
        total_height += joints * (stack_gap + separator_thickness)

        final_image = Image.new("RGB", (final_width, total_height), stack_background_color)
        draw = ImageDraw.Draw(final_image)

        current_y = 0
        for i, img in enumerate(segment_images):
            if i > 0:
                if separator_thickness > 0:
                    # ImageDraw.rectangle includes both corner points, so the far
                    # corner is one pixel short of the exclusive bound; otherwise
                    # the bar is a pixel too thick and bleeds beside narrower images.
                    draw.rectangle(
                        [
                            (0, current_y),
                            (final_width - 1, current_y + separator_thickness - 1),
                        ],
                        fill=separator_color,
                    )
                current_y += separator_thickness

            paste_x = (final_width - img.width) // 2  # Center horizontally
            final_image.paste(img, (paste_x, current_y))
            current_y += img.height

            # Gap follows every image except the last one.
            if i < joints:
                current_y += stack_gap

        return final_image

    elif final_stack_direction == "horizontal":
        final_height = max(img.height for img in segment_images)
        total_width = sum(img.width for img in segment_images)
        total_width += joints * (stack_gap + separator_thickness)

        final_image = Image.new("RGB", (total_width, final_height), stack_background_color)
        draw = ImageDraw.Draw(final_image)

        current_x = 0
        for i, img in enumerate(segment_images):
            if i > 0:
                if separator_thickness > 0:
                    # Same inclusive-corner adjustment as in the vertical layout.
                    draw.rectangle(
                        [
                            (current_x, 0),
                            (current_x + separator_thickness - 1, final_height - 1),
                        ],
                        fill=separator_color,
                    )
                current_x += separator_thickness

            paste_y = (final_height - img.height) // 2  # Center vertically
            final_image.paste(img, (current_x, paste_y))
            current_x += img.width

            # Gap follows every image except the last one.
            if i < joints:
                current_x += stack_gap

        return final_image

    else:
        raise ValueError(
            f"Invalid stack_direction '{final_stack_direction}' for in_context. Must be 'vertical' or 'horizontal'."
        )
936
1253
 
937
1254
  # --- Helper methods for coordinate transformations and segment iteration ---
@@ -972,3 +1289,643 @@ class Flow:
972
1289
  raise NotImplementedError(
973
1290
  "Translating element coordinates to a unified flow coordinate system is not yet implemented."
974
1291
  )
1292
+
1293
+ def get_sections(
1294
+ self,
1295
+ start_elements=None,
1296
+ end_elements=None,
1297
+ new_section_on_page_break: bool = False,
1298
+ include_boundaries: str = "both",
1299
+ ) -> "ElementCollection":
1300
+ """
1301
+ Extract logical sections from the Flow based on *start* and *end* boundary
1302
+ elements, mirroring the behaviour of PDF/PageCollection.get_sections().
1303
+
1304
+ This implementation is a thin wrapper that converts the Flow into a
1305
+ temporary PageCollection (constructed from the unique pages that the
1306
+ Flow spans) and then delegates the heavy‐lifting to that existing
1307
+ implementation. Any FlowElement / FlowElementCollection inputs are
1308
+ automatically unwrapped to their underlying physical elements so that
1309
+ PageCollection can work with them directly.
1310
+
1311
+ Args:
1312
+ start_elements: Elements or selector string that mark the start of
1313
+ sections (optional).
1314
+ end_elements: Elements or selector string that mark the end of
1315
+ sections (optional).
1316
+ new_section_on_page_break: Whether to start a new section at page
1317
+ boundaries (default: False).
1318
+ include_boundaries: How to include boundary elements: 'start',
1319
+ 'end', 'both', or 'none' (default: 'both').
1320
+
1321
+ Returns:
1322
+ ElementCollection of Region/FlowRegion objects representing the
1323
+ extracted sections.
1324
+ """
1325
+ # ------------------------------------------------------------------
1326
+ # Unwrap FlowElement(-Collection) inputs and selector strings so we
1327
+ # can reason about them generically.
1328
+ # ------------------------------------------------------------------
1329
+ from natural_pdf.flows.collections import FlowElementCollection
1330
+ from natural_pdf.flows.element import FlowElement
1331
+
1332
+ def _unwrap(obj):
1333
+ """Convert Flow-specific wrappers to their underlying physical objects.
1334
+
1335
+ Keeps selector strings as-is; converts FlowElement to its physical
1336
+ element; converts FlowElementCollection to list of physical
1337
+ elements; passes through ElementCollection by taking .elements.
1338
+ """
1339
+
1340
+ if obj is None or isinstance(obj, str):
1341
+ return obj
1342
+
1343
+ if isinstance(obj, FlowElement):
1344
+ return obj.physical_object
1345
+
1346
+ if isinstance(obj, FlowElementCollection):
1347
+ return [fe.physical_object for fe in obj.flow_elements]
1348
+
1349
+ if hasattr(obj, "elements"):
1350
+ return obj.elements
1351
+
1352
+ if isinstance(obj, (list, tuple, set)):
1353
+ out = []
1354
+ for item in obj:
1355
+ if isinstance(item, FlowElement):
1356
+ out.append(item.physical_object)
1357
+ else:
1358
+ out.append(item)
1359
+ return out
1360
+
1361
+ return obj # Fallback – unknown type
1362
+
1363
+ start_elements_unwrapped = _unwrap(start_elements)
1364
+ end_elements_unwrapped = _unwrap(end_elements)
1365
+
1366
+ # ------------------------------------------------------------------
1367
+ # PRIMARY IMPLEMENTATION – operate on each Flow **segment region**
1368
+ # independently so that sectioning happens *per-region*, not per page.
1369
+ # ------------------------------------------------------------------
1370
+ from natural_pdf.elements.element_collection import ElementCollection
1371
+
1372
+ aggregated_sections = []
1373
+
1374
+ # Helper to decide if an element lies inside a segment (Region)
1375
+ def _element_in_segment(elem, segment_region):
1376
+ try:
1377
+ return segment_region.intersects(elem) # Region method – robust
1378
+ except Exception:
1379
+ # Fallback to bounding-box containment checks
1380
+ if not hasattr(elem, "bbox"):
1381
+ return False
1382
+ ex0, etop, ex1, ebottom = elem.bbox
1383
+ sx0, stop, sx1, sbottom = segment_region.bbox
1384
+ return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
1385
+
1386
+ for seg in self.segments:
1387
+ # Each *seg* is guaranteed to be a Region (see _normalize_segments)
1388
+
1389
+ # Resolve segment-specific boundary arguments
1390
+ seg_start_elems = None
1391
+ seg_end_elems = None
1392
+
1393
+ # --- Handle selector strings ---
1394
+ if isinstance(start_elements_unwrapped, str):
1395
+ seg_start_elems = seg.find_all(start_elements_unwrapped).elements
1396
+ elif start_elements_unwrapped is not None:
1397
+ seg_start_elems = [
1398
+ e for e in start_elements_unwrapped if _element_in_segment(e, seg)
1399
+ ]
1400
+
1401
+ if isinstance(end_elements_unwrapped, str):
1402
+ seg_end_elems = seg.find_all(end_elements_unwrapped).elements
1403
+ elif end_elements_unwrapped is not None:
1404
+ seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1405
+
1406
+ # Call Region.get_sections – this returns ElementCollection[Region]
1407
+ seg_sections = seg.get_sections(
1408
+ start_elements=seg_start_elems,
1409
+ end_elements=seg_end_elems,
1410
+ include_boundaries=include_boundaries,
1411
+ )
1412
+
1413
+ if seg_sections:
1414
+ aggregated_sections.extend(seg_sections.elements)
1415
+
1416
+ # Optionally, handle new_section_on_page_break – interpreted here as
1417
+ # *new_section_on_segment_break*: if True and there were *no* explicit
1418
+ # boundaries, treat the entire segment as a single section.
1419
+ if (
1420
+ new_section_on_page_break
1421
+ and not seg_sections
1422
+ and start_elements_unwrapped is None
1423
+ and end_elements_unwrapped is None
1424
+ ):
1425
+ aggregated_sections.append(seg)
1426
+
1427
+ # ------------------------------------------------------------------
1428
+ # CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
1429
+ # span multiple segments and create FlowRegions for those cases.
1430
+ # ------------------------------------------------------------------
1431
+
1432
+ # If we have explicit start/end elements, check for cross-segment sections
1433
+ if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
1434
+ # Find all start and end elements across all segments
1435
+ all_start_elements = []
1436
+ all_end_elements = []
1437
+
1438
+ # Map elements to their segments for tracking
1439
+ element_to_segment = {}
1440
+
1441
+ for seg_idx, seg in enumerate(self.segments):
1442
+ if isinstance(start_elements_unwrapped, str):
1443
+ seg_starts = seg.find_all(start_elements_unwrapped).elements
1444
+ else:
1445
+ seg_starts = [
1446
+ e for e in start_elements_unwrapped if _element_in_segment(e, seg)
1447
+ ]
1448
+
1449
+ if isinstance(end_elements_unwrapped, str):
1450
+ seg_ends = seg.find_all(end_elements_unwrapped).elements
1451
+ else:
1452
+ seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1453
+
1454
+ for elem in seg_starts:
1455
+ all_start_elements.append((elem, seg_idx))
1456
+ element_to_segment[id(elem)] = seg_idx
1457
+
1458
+ for elem in seg_ends:
1459
+ all_end_elements.append((elem, seg_idx))
1460
+ element_to_segment[id(elem)] = seg_idx
1461
+
1462
+ # Sort by segment index, then by position within segment
1463
+ all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1464
+ all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1465
+
1466
+ # Look for cross-segment pairs (start in one segment, end in another)
1467
+ cross_segment_sections = []
1468
+ used_starts = set()
1469
+ used_ends = set()
1470
+
1471
+ for start_elem, start_seg_idx in all_start_elements:
1472
+ if id(start_elem) in used_starts:
1473
+ continue
1474
+
1475
+ # Find the next end element that comes after this start
1476
+ matching_end = None
1477
+ for end_elem, end_seg_idx in all_end_elements:
1478
+ if id(end_elem) in used_ends:
1479
+ continue
1480
+
1481
+ # Check if this end comes after the start (by segment order or position)
1482
+ if end_seg_idx > start_seg_idx or (
1483
+ end_seg_idx == start_seg_idx
1484
+ and (
1485
+ end_elem.top > start_elem.top
1486
+ or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
1487
+ )
1488
+ ):
1489
+ matching_end = (end_elem, end_seg_idx)
1490
+ break
1491
+
1492
+ if matching_end is not None:
1493
+ end_elem, end_seg_idx = matching_end
1494
+
1495
+ # If start and end are in different segments, create FlowRegion
1496
+ if start_seg_idx != end_seg_idx:
1497
+ cross_segment_sections.append(
1498
+ (start_elem, start_seg_idx, end_elem, end_seg_idx)
1499
+ )
1500
+ used_starts.add(id(start_elem))
1501
+ used_ends.add(id(end_elem))
1502
+
1503
+ # Create FlowRegions for cross-segment sections
1504
+ from natural_pdf.elements.region import Region
1505
+ from natural_pdf.flows.element import FlowElement
1506
+ from natural_pdf.flows.region import FlowRegion
1507
+
1508
+ for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
1509
+ # Build constituent regions spanning from start segment to end segment
1510
+ constituent_regions = []
1511
+
1512
+ # First segment: from start element to bottom
1513
+ start_seg = self.segments[start_seg_idx]
1514
+ first_region = Region(
1515
+ start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
1516
+ )
1517
+ constituent_regions.append(first_region)
1518
+
1519
+ # Middle segments: full segments
1520
+ for seg_idx in range(start_seg_idx + 1, end_seg_idx):
1521
+ constituent_regions.append(self.segments[seg_idx])
1522
+
1523
+ # Last segment: from top to end element
1524
+ if end_seg_idx != start_seg_idx:
1525
+ end_seg = self.segments[end_seg_idx]
1526
+ last_region = Region(
1527
+ end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
1528
+ )
1529
+ constituent_regions.append(last_region)
1530
+
1531
+ # Create FlowRegion
1532
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1533
+ flow_region = FlowRegion(
1534
+ flow=self,
1535
+ constituent_regions=constituent_regions,
1536
+ source_flow_element=flow_element,
1537
+ boundary_element_found=end_elem,
1538
+ )
1539
+
1540
+ # Remove any single-segment sections that are now covered by this FlowRegion
1541
+ # This prevents duplication of content
1542
+ aggregated_sections = [
1543
+ s
1544
+ for s in aggregated_sections
1545
+ if not any(
1546
+ cr.intersects(s)
1547
+ for cr in constituent_regions
1548
+ if hasattr(cr, "intersects") and hasattr(s, "intersects")
1549
+ )
1550
+ ]
1551
+
1552
+ aggregated_sections.append(flow_region)
1553
+
1554
+ # ------------------------------------------------------------------
1555
+ # NEW APPROACH: First collect ALL boundary elements across all segments,
1556
+ # then pair them up to create sections (either single-segment Regions
1557
+ # or multi-segment FlowRegions).
1558
+ # ------------------------------------------------------------------
1559
+ from natural_pdf.elements.element_collection import ElementCollection
1560
+ from natural_pdf.elements.region import Region
1561
+ from natural_pdf.flows.element import FlowElement
1562
+ from natural_pdf.flows.region import FlowRegion
1563
+
1564
+ # Helper to decide if an element lies inside a segment (Region)
1565
+ def _element_in_segment(elem, segment_region):
1566
+ try:
1567
+ return segment_region.intersects(elem) # Region method – robust
1568
+ except Exception:
1569
+ # Fallback to bounding-box containment checks
1570
+ if not hasattr(elem, "bbox"):
1571
+ return False
1572
+ ex0, etop, ex1, ebottom = elem.bbox
1573
+ sx0, stop, sx1, sbottom = segment_region.bbox
1574
+ return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
1575
+
1576
+ # Collect ALL boundary elements across all segments with their segment indices
1577
+ all_start_elements = []
1578
+ all_end_elements = []
1579
+
1580
+ for seg_idx, seg in enumerate(self.segments):
1581
+ # Find start elements in this segment
1582
+ if isinstance(start_elements_unwrapped, str):
1583
+ seg_starts = seg.find_all(start_elements_unwrapped).elements
1584
+ elif start_elements_unwrapped is not None:
1585
+ seg_starts = [e for e in start_elements_unwrapped if _element_in_segment(e, seg)]
1586
+ else:
1587
+ seg_starts = []
1588
+
1589
+ logger.debug(f"\n=== Processing segment {seg_idx} ===")
1590
+ logger.debug(f"Segment bbox: {seg.bbox}")
1591
+ logger.debug(
1592
+ f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
1593
+ )
1594
+
1595
+ logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
1596
+ for i, elem in enumerate(seg_starts):
1597
+ logger.debug(
1598
+ f" Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
1599
+ )
1600
+
1601
+ # Find end elements in this segment
1602
+ if isinstance(end_elements_unwrapped, str):
1603
+ seg_ends = seg.find_all(end_elements_unwrapped).elements
1604
+ elif end_elements_unwrapped is not None:
1605
+ seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1606
+ else:
1607
+ seg_ends = []
1608
+
1609
+ logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
1610
+ for i, elem in enumerate(seg_ends):
1611
+ logger.debug(
1612
+ f" End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
1613
+ )
1614
+
1615
+ # Add to global lists with segment index
1616
+ for elem in seg_starts:
1617
+ all_start_elements.append((elem, seg_idx))
1618
+ for elem in seg_ends:
1619
+ all_end_elements.append((elem, seg_idx))
1620
+
1621
+ # Sort by flow order: segment index first, then position within segment
1622
+ all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1623
+ all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1624
+
1625
+ logger.debug(f"\n=== Total boundary elements found ===")
1626
+ logger.debug(f"Total start elements: {len(all_start_elements)}")
1627
+ logger.debug(f"Total end elements: {len(all_end_elements)}")
1628
+
1629
+ # Pair up start and end elements to create sections
1630
+ sections = []
1631
+ used_starts = set()
1632
+ used_ends = set()
1633
+
1634
+ for start_elem, start_seg_idx in all_start_elements:
1635
+ if id(start_elem) in used_starts:
1636
+ continue
1637
+
1638
+ logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
1639
+ logger.debug(
1640
+ f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
1641
+ )
1642
+
1643
+ # Find the next unused end element that comes after this start
1644
+ matching_end = None
1645
+ for end_elem, end_seg_idx in all_end_elements:
1646
+ if id(end_elem) in used_ends:
1647
+ continue
1648
+
1649
+ # Check if this end comes after the start in flow order
1650
+ if end_seg_idx > start_seg_idx or (
1651
+ end_seg_idx == start_seg_idx
1652
+ and (
1653
+ end_elem.top > start_elem.top
1654
+ or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
1655
+ )
1656
+ ):
1657
+ matching_end = (end_elem, end_seg_idx)
1658
+ break
1659
+
1660
+ if matching_end is not None:
1661
+ end_elem, end_seg_idx = matching_end
1662
+ used_starts.add(id(start_elem))
1663
+ used_ends.add(id(end_elem))
1664
+
1665
+ logger.debug(f" Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
1666
+
1667
+ # Create section based on whether it spans segments
1668
+ if start_seg_idx == end_seg_idx:
1669
+ # Single segment section - use Region.get_section_between
1670
+ seg = self.segments[start_seg_idx]
1671
+ section = seg.get_section_between(start_elem, end_elem, include_boundaries)
1672
+ sections.append(section)
1673
+ logger.debug(f" Created single-segment Region")
1674
+ else:
1675
+ # Multi-segment section - create FlowRegion
1676
+ logger.debug(
1677
+ f" Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
1678
+ )
1679
+ constituent_regions = []
1680
+
1681
+ # First segment: from start element to bottom
1682
+ start_seg = self.segments[start_seg_idx]
1683
+ if include_boundaries in ["start", "both"]:
1684
+ first_top = start_elem.top
1685
+ else:
1686
+ first_top = start_elem.bottom
1687
+ first_region = Region(
1688
+ start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
1689
+ )
1690
+ constituent_regions.append(first_region)
1691
+
1692
+ # Middle segments: full segments
1693
+ for seg_idx in range(start_seg_idx + 1, end_seg_idx):
1694
+ constituent_regions.append(self.segments[seg_idx])
1695
+
1696
+ # Last segment: from top to end element
1697
+ end_seg = self.segments[end_seg_idx]
1698
+ if include_boundaries in ["end", "both"]:
1699
+ last_bottom = end_elem.bottom
1700
+ else:
1701
+ last_bottom = end_elem.top
1702
+ last_region = Region(
1703
+ end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
1704
+ )
1705
+ constituent_regions.append(last_region)
1706
+
1707
+ # Create FlowRegion
1708
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1709
+ flow_region = FlowRegion(
1710
+ flow=self,
1711
+ constituent_regions=constituent_regions,
1712
+ source_flow_element=flow_element,
1713
+ boundary_element_found=end_elem,
1714
+ )
1715
+ sections.append(flow_region)
1716
+
1717
+ # Handle special cases when only start or only end elements are provided
1718
+ if start_elements_unwrapped is not None and end_elements_unwrapped is None:
1719
+ logger.debug(f"\n=== Handling start-only elements (no end elements provided) ===")
1720
+ for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
1721
+ if id(start_elem) in used_starts:
1722
+ continue
1723
+
1724
+ # Find next start element
1725
+ next_start = None
1726
+ if i + 1 < len(all_start_elements):
1727
+ next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
1728
+ # Create section from this start to just before next start
1729
+ if start_seg_idx == next_start_seg_idx:
1730
+ # Same segment
1731
+ seg = self.segments[start_seg_idx]
1732
+ # Find element just before next start
1733
+ all_elems = seg.get_elements()
1734
+ all_elems.sort(key=lambda e: (e.top, e.x0))
1735
+ try:
1736
+ next_idx = all_elems.index(next_start_elem)
1737
+ if next_idx > 0:
1738
+ end_elem = all_elems[next_idx - 1]
1739
+ section = seg.get_section_between(
1740
+ start_elem, end_elem, include_boundaries
1741
+ )
1742
+ sections.append(section)
1743
+ except ValueError:
1744
+ pass
1745
+ elif next_start_seg_idx == start_seg_idx + 1:
1746
+ # Next start is in the immediately following segment in the flow
1747
+ # Create a FlowRegion that spans from current start to just before next start
1748
+ logger.debug(f" Next start is in next flow segment - creating FlowRegion")
1749
+
1750
+ constituent_regions = []
1751
+
1752
+ # First segment: from start element to bottom
1753
+ start_seg = self.segments[start_seg_idx]
1754
+ if include_boundaries in ["start", "both"]:
1755
+ first_top = start_elem.top
1756
+ else:
1757
+ first_top = start_elem.bottom
1758
+ first_region = Region(
1759
+ start_seg.page,
1760
+ (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
1761
+ )
1762
+ constituent_regions.append(first_region)
1763
+
1764
+ # Next segment: from top to just before next start
1765
+ next_seg = self.segments[next_start_seg_idx]
1766
+ # Find element just before next start in the next segment
1767
+ next_seg_elems = next_seg.get_elements()
1768
+ next_seg_elems.sort(key=lambda e: (e.top, e.x0))
1769
+
1770
+ last_bottom = next_start_elem.top # Default to just before the next start
1771
+ try:
1772
+ next_idx = next_seg_elems.index(next_start_elem)
1773
+ if next_idx > 0:
1774
+ # Use the bottom of the element before next start
1775
+ prev_elem = next_seg_elems[next_idx - 1]
1776
+ last_bottom = prev_elem.bottom
1777
+ except ValueError:
1778
+ pass
1779
+
1780
+ last_region = Region(
1781
+ next_seg.page, (next_seg.x0, next_seg.top, next_seg.x1, last_bottom)
1782
+ )
1783
+ constituent_regions.append(last_region)
1784
+
1785
+ # Create FlowRegion
1786
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1787
+ flow_region = FlowRegion(
1788
+ flow=self,
1789
+ constituent_regions=constituent_regions,
1790
+ source_flow_element=flow_element,
1791
+ boundary_element_found=None,
1792
+ )
1793
+ sections.append(flow_region)
1794
+ logger.debug(
1795
+ f" Created FlowRegion with {len(constituent_regions)} constituent regions"
1796
+ )
1797
+ else:
1798
+ # Next start is more than one segment away - just end at current segment
1799
+ start_seg = self.segments[start_seg_idx]
1800
+ if include_boundaries in ["start", "both"]:
1801
+ region_top = start_elem.top
1802
+ else:
1803
+ region_top = start_elem.bottom
1804
+ section = Region(
1805
+ start_seg.page,
1806
+ (start_seg.x0, region_top, start_seg.x1, start_seg.bottom),
1807
+ )
1808
+ sections.append(section)
1809
+ logger.debug(
1810
+ f" Next start is {next_start_seg_idx - start_seg_idx} segments away - ending at current segment"
1811
+ )
1812
+ else:
1813
+ # Last start element: section goes to end of flow
1814
+ # This could span multiple segments
1815
+ if start_seg_idx == len(self.segments) - 1:
1816
+ # Only in last segment
1817
+ seg = self.segments[start_seg_idx]
1818
+ if include_boundaries in ["start", "both"]:
1819
+ region_top = start_elem.top
1820
+ else:
1821
+ region_top = start_elem.bottom
1822
+ section = Region(seg.page, (seg.x0, region_top, seg.x1, seg.bottom))
1823
+ sections.append(section)
1824
+ else:
1825
+ # Spans to end of flow - create FlowRegion
1826
+ constituent_regions = []
1827
+
1828
+ # First segment
1829
+ start_seg = self.segments[start_seg_idx]
1830
+ if include_boundaries in ["start", "both"]:
1831
+ first_top = start_elem.top
1832
+ else:
1833
+ first_top = start_elem.bottom
1834
+ first_region = Region(
1835
+ start_seg.page,
1836
+ (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
1837
+ )
1838
+ constituent_regions.append(first_region)
1839
+
1840
+ # Remaining segments
1841
+ for seg_idx in range(start_seg_idx + 1, len(self.segments)):
1842
+ constituent_regions.append(self.segments[seg_idx])
1843
+
1844
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1845
+ flow_region = FlowRegion(
1846
+ flow=self,
1847
+ constituent_regions=constituent_regions,
1848
+ source_flow_element=flow_element,
1849
+ boundary_element_found=None,
1850
+ )
1851
+ sections.append(flow_region)
1852
+
1853
+ # Handle new_section_on_page_break when no explicit boundaries
1854
+ if (
1855
+ new_section_on_page_break
1856
+ and start_elements_unwrapped is None
1857
+ and end_elements_unwrapped is None
1858
+ ):
1859
+ # Each segment becomes its own section
1860
+ sections = list(self.segments)
1861
+
1862
+ # Sort sections by their position in the flow
1863
def _section_sort_key(section):
    """Return a sort key that places ``section`` by its position in the flow.

    The key is ``(segment_index, top, x0)``. ``segment_index`` is the index
    of the first entry in ``self.segments`` (taken from the enclosing
    method's scope) whose ``intersects`` call reports a hit. ``top`` and
    ``x0`` are read from the anchor region and default to 0 when missing.

    For an object exposing ``constituent_regions`` (a FlowRegion) the anchor
    is its first constituent region; for anything else the anchor is the
    section itself. Sections that cannot be placed (no constituent regions,
    a falsy anchor, or no intersecting segment) sort last via
    ``(inf, 0, 0)``.
    """
    unplaced = (float("inf"), 0, 0)

    if hasattr(section, "constituent_regions"):
        # FlowRegion: its first constituent region decides the position.
        regions = section.constituent_regions
        anchor = regions[0] if regions else None
        if not anchor:
            return unplaced
    else:
        # Regular Region: the section is its own anchor.
        anchor = section

    for idx, seg in enumerate(self.segments):
        try:
            if seg.intersects(anchor):
                return (idx, getattr(anchor, "top", 0), getattr(anchor, "x0", 0))
        except Exception:
            # Best effort: a segment that cannot be compared is skipped.
            # Deliberately not a bare ``except`` so KeyboardInterrupt and
            # SystemExit still propagate.
            continue
    return unplaced
1890
+
1891
+ sections.sort(key=_section_sort_key)
1892
+
1893
+ logger.debug(f"\n=== Section creation complete ===")
1894
+ logger.debug(f"Total sections created: {len(sections)}")
1895
+ for i, section in enumerate(sections):
1896
+ if hasattr(section, "constituent_regions"):
1897
+ logger.debug(
1898
+ f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
1899
+ )
1900
+ else:
1901
+ logger.debug(f"Section {i}: Region with bbox={section.bbox}")
1902
+
1903
+ return ElementCollection(sections)
1904
+
1905
def highlights(self, show: bool = False) -> "HighlightContext":
    """
    Build a ``HighlightContext`` bound to this object for collecting highlights.

    The context lets several highlight groups be gathered and displayed
    together:

    Example:
        with flow.highlights() as h:
            h.add(flow.find_all('table'), label='tables', color='blue')
            h.add(flow.find_all('text:bold'), label='bold text', color='red')
            h.show()

    To display without an explicit ``h.show()`` call:
        with flow.highlights(show=True) as h:
            h.add(flow.find_all('table'), label='tables')
            h.add(flow.find_all('text:bold'), label='bold')
            # Shown automatically when the ``with`` block exits

    Args:
        show: Forwarded to the context as ``show_on_exit``; when True the
            highlights are shown automatically on leaving the context.

    Returns:
        The newly created HighlightContext.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid an import cycle - TODO confirm).
    from natural_pdf.core.highlighting_service import HighlightContext

    context = HighlightContext(self, show_on_exit=show)
    return context