natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/flows/flow.py CHANGED
@@ -1,15 +1,21 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
2
+ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union, Tuple, Callable, overload
3
3
 
4
4
  if TYPE_CHECKING:
5
5
  from natural_pdf.core.page import Page
6
6
  from natural_pdf.elements.base import Element as PhysicalElement
7
- from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection
7
+ from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection, PageCollection
8
8
  from natural_pdf.elements.region import Region as PhysicalRegion
9
+ from PIL.Image import Image as PIL_Image
9
10
 
10
11
  from .collections import FlowElementCollection
11
12
  from .element import FlowElement
12
13
 
14
+ # Import required classes for the new methods
15
+ from natural_pdf.tables import TableResult
16
+ # For runtime image manipulation
17
+ from PIL import Image as PIL_Image_Runtime
18
+
13
19
  logger = logging.getLogger(__name__)
14
20
 
15
21
 
@@ -81,7 +87,7 @@ class Flow:
81
87
 
82
88
  def __init__(
83
89
  self,
84
- segments: List[Union["Page", "PhysicalRegion"]],
90
+ segments: Union[List[Union["Page", "PhysicalRegion"]], "PageCollection"],
85
91
  arrangement: Literal["vertical", "horizontal"],
86
92
  alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
87
93
  segment_gap: float = 0.0,
@@ -91,7 +97,8 @@ class Flow:
91
97
 
92
98
  Args:
93
99
  segments: An ordered list of natural_pdf.core.page.Page or
94
- natural_pdf.elements.region.Region objects that constitute the flow.
100
+ natural_pdf.elements.region.Region objects that constitute the flow,
101
+ or a PageCollection containing pages.
95
102
  arrangement: The primary direction of the flow.
96
103
  - "vertical": Segments are stacked top-to-bottom.
97
104
  - "horizontal": Segments are arranged left-to-right.
@@ -106,6 +113,10 @@ class Flow:
106
113
  - "bottom" (or "end"): Align bottom edges.
107
114
  segment_gap: The virtual gap (in PDF points) between segments.
108
115
  """
116
+ # Handle PageCollection input
117
+ if hasattr(segments, 'pages'): # It's a PageCollection
118
+ segments = list(segments.pages)
119
+
109
120
  if not segments:
110
121
  raise ValueError("Flow segments cannot be empty.")
111
122
  if arrangement not in ["vertical", "horizontal"]:
@@ -213,21 +224,48 @@ class Flow:
213
224
  ) -> "FlowElementCollection":
214
225
  """
215
226
  Finds all elements within the flow that match the given selector or text criteria.
216
- Elements are collected segment by segment, preserving the flow order.
217
-
227
+
228
+ This method efficiently groups segments by their parent pages, searches at the page level,
229
+ then filters results appropriately for each segment. This ensures elements that intersect
230
+ with flow segments (but aren't fully contained) are still found.
231
+
218
232
  Elements found are wrapped as FlowElement objects, anchored to this Flow,
219
233
  and returned in a FlowElementCollection.
220
234
  """
221
235
  from .collections import FlowElementCollection
222
236
  from .element import FlowElement
223
237
 
238
+ # Step 1: Group segments by their parent pages (like in analyze_layout)
239
+ segments_by_page = {} # Dict[Page, List[Segment]]
240
+
241
+ for i, segment in enumerate(self.segments):
242
+ # Determine the page for this segment - fix type detection
243
+ if hasattr(segment, 'page') and hasattr(segment.page, 'find_all'):
244
+ # It's a Region object (has a parent page)
245
+ page_obj = segment.page
246
+ segment_type = "region"
247
+ elif hasattr(segment, 'find_all') and hasattr(segment, 'width') and hasattr(segment, 'height') and not hasattr(segment, 'page'):
248
+ # It's a Page object (has find_all but no parent page)
249
+ page_obj = segment
250
+ segment_type = "page"
251
+ else:
252
+ logger.warning(f"Segment {i+1} does not support find_all, skipping")
253
+ continue
254
+
255
+ if page_obj not in segments_by_page:
256
+ segments_by_page[page_obj] = []
257
+ segments_by_page[page_obj].append((segment, segment_type))
258
+
259
+ if not segments_by_page:
260
+ logger.warning("No segments with searchable pages found")
261
+ return FlowElementCollection([])
262
+
263
+ # Step 2: Search each unique page only once
224
264
  all_flow_elements: List["FlowElement"] = []
225
265
 
226
- # Iterate through segments in their defined flow order
227
- for physical_segment in self.segments:
228
- # Find all matching physical elements within the current segment
229
- # Region.find_all() should return elements in local reading order.
230
- matches_in_segment: "PhysicalElementCollection" = physical_segment.find_all(
266
+ for page_obj, page_segments in segments_by_page.items():
267
+ # Find all matching elements on this page
268
+ page_matches = page_obj.find_all(
231
269
  selector=selector,
232
270
  text=text,
233
271
  apply_exclusions=apply_exclusions,
@@ -235,16 +273,46 @@ class Flow:
235
273
  case=case,
236
274
  **kwargs,
237
275
  )
238
- if matches_in_segment:
239
- # Wrap each found physical element as a FlowElement and add to the list
240
- # This preserves the order from matches_in_segment.elements
241
- for phys_elem in matches_in_segment.elements:
242
- all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
243
-
244
- # The global sort that was here previously has been removed.
245
- # The order is now determined by segment sequence, then by local order within each segment.
276
+
277
+ if not page_matches:
278
+ continue
279
+
280
+ # Step 3: For each segment on this page, collect relevant elements
281
+ for segment, segment_type in page_segments:
282
+ if segment_type == "page":
283
+ # Full page segment: include all elements
284
+ for phys_elem in page_matches.elements:
285
+ all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
286
+
287
+ elif segment_type == "region":
288
+ # Region segment: filter to only intersecting elements
289
+ for phys_elem in page_matches.elements:
290
+ try:
291
+ # Check if element intersects with this flow segment
292
+ if segment.intersects(phys_elem):
293
+ all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
294
+ except Exception as intersect_error:
295
+ logger.debug(f"Error checking intersection for element: {intersect_error}")
296
+ # Include the element anyway if intersection check fails
297
+ all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
298
+
299
+ # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
300
+ unique_flow_elements = []
301
+ seen_element_ids = set()
302
+
303
+ for flow_elem in all_flow_elements:
304
+ # Create a unique identifier for the underlying physical element
305
+ phys_elem = flow_elem.physical_object
306
+ elem_id = (
307
+ getattr(phys_elem.page, 'index', id(phys_elem.page)) if hasattr(phys_elem, 'page') else id(phys_elem),
308
+ phys_elem.bbox if hasattr(phys_elem, 'bbox') else id(phys_elem)
309
+ )
310
+
311
+ if elem_id not in seen_element_ids:
312
+ unique_flow_elements.append(flow_elem)
313
+ seen_element_ids.add(elem_id)
246
314
 
247
- return FlowElementCollection(all_flow_elements)
315
+ return FlowElementCollection(unique_flow_elements)
248
316
 
249
317
  def __repr__(self) -> str:
250
318
  return (
@@ -252,6 +320,620 @@ class Flow:
252
320
  f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
253
321
  )
254
322
 
323
@overload
def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[dict] = None,
    cell_extraction_func: Optional[Any] = None,
    show_progress: bool = False,
    content_filter: Optional[Any] = None,
    stitch_rows: Optional[Callable[[List[Optional[str]]], bool]] = None,
) -> TableResult: ...

@overload
def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[dict] = None,
    cell_extraction_func: Optional[Any] = None,
    show_progress: bool = False,
    content_filter: Optional[Any] = None,
    stitch_rows: Optional[
        Callable[
            [List[Optional[str]], List[Optional[str]], int, Union["Page", "PhysicalRegion"]],
            bool,
        ]
    ] = None,
) -> TableResult: ...

def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[dict] = None,
    cell_extraction_func: Optional[Any] = None,
    show_progress: bool = False,
    content_filter: Optional[Any] = None,
    stitch_rows: Optional[Callable] = None,
) -> TableResult:
    """
    Extract table data from every segment in the flow and combine it sequentially.

    Each segment's own ``extract_table`` is called in flow order and the resulting
    rows are appended to a single aggregated table. This is useful for tables that
    continue across pages or columns.

    Args:
        method: Extraction method forwarded to each segment
            ('tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None for auto-detect).
        table_settings: Settings for pdfplumber table extraction. A shallow copy is
            passed to every segment so one segment cannot mutate another's settings.
        use_ocr: Whether to use OCR for text extraction (forwarded unchanged).
        ocr_config: OCR configuration parameters (forwarded unchanged).
        text_options: Options for the 'text' method. A shallow copy is passed to
            every segment.
        cell_extraction_func: Optional callable that takes a cell Region and returns
            its string content (forwarded unchanged).
        show_progress: Forwarded to each segment's extraction.
        content_filter: Optional content filter forwarded to each segment.
        stitch_rows: Optional predicate deciding whether a row should be merged
            into the last aggregated row. The number of declared parameters
            selects the calling convention:

            • func(current_row) -> bool
                Called only for the first row of a segment, and only when at
                least one row has already been aggregated.

            • func(prev_row, current_row, row_index, segment) -> bool
                Called for every row once at least one row has been aggregated.
                ``row_index`` is the row's index within its segment.

            When the predicate returns a truthy value the two rows are merged
            cell-by-cell: cells that are both non-blank are joined with a
            single space, otherwise they are concatenated. A predicate with
            any other parameter count, a non-callable value, or a callable
            whose signature cannot be inspected is ignored with a warning.
            If None, rows are never merged.

    Returns:
        TableResult containing the aggregated rows from all segments. Segments
        whose extraction raises are logged and skipped.

    Example:
        Multi-page table extraction:
        ```python
        pdf = npdf.PDF("multi_page_table.pdf")

        # Create flow for table spanning pages 2-4
        table_flow = Flow(
            segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
            arrangement='vertical'
        )

        # Extract table as if it were continuous
        table_data = table_flow.extract_table()
        df = table_data.df  # Convert to pandas DataFrame

        # Custom row stitching - single parameter (simple case)
        table_data = table_flow.extract_table(
            stitch_rows=lambda row: row and not (row[0] or "").strip()
        )

        # Custom row stitching - full parameters (advanced case)
        table_data = table_flow.extract_table(
            stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
        )
        ```
    """
    import inspect
    from itertools import zip_longest

    logger.info(f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})")

    if not self.segments:
        logger.warning("Flow has no segments, returning empty table")
        return TableResult([])

    # Resolve the stitching predicate and work out which calling convention it uses.
    predicate: Optional[Callable] = None
    predicate_type: str = "none"

    if callable(stitch_rows):
        try:
            param_count: Optional[int] = len(inspect.signature(stitch_rows).parameters)
        except (TypeError, ValueError) as sig_error:
            # inspect.signature cannot introspect every callable (e.g. some builtins);
            # ignore the predicate rather than abort the whole extraction.
            logger.warning(f"Could not inspect stitch_rows signature ({sig_error}). Ignoring.")
            param_count = None

        if param_count == 1:
            predicate = stitch_rows
            predicate_type = "single_param"
        elif param_count == 4:
            predicate = stitch_rows
            predicate_type = "full_params"
        elif param_count is not None:
            logger.warning(f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring.")
    elif stitch_rows is not None:
        logger.warning(f"stitch_rows must be callable, got {type(stitch_rows).__name__}. Ignoring.")

    def _default_merge(prev_row: List[Optional[str]], cur_row: List[Optional[str]]) -> List[Optional[str]]:
        """Merge two rows cell-by-cell, padding the shorter row with empty strings."""
        merged: List[Optional[str]] = []
        for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
            if (p or "").strip() and (c or "").strip():
                # Both cells carry text: join them with a single space.
                merged.append(f"{p} {c}".strip())
            else:
                merged.append((p or "") + (c or ""))
        return merged

    aggregated_rows: List[List[Optional[str]]] = []
    processed_segments = 0

    for seg_idx, segment in enumerate(self.segments):
        try:
            logger.debug(f"  Extracting table from segment {seg_idx+1}/{len(self.segments)}")

            segment_result = segment.extract_table(
                method=method,
                table_settings=table_settings.copy() if table_settings else None,
                use_ocr=use_ocr,
                ocr_config=ocr_config,
                text_options=text_options.copy() if text_options else None,
                cell_extraction_func=cell_extraction_func,
                show_progress=show_progress,
                content_filter=content_filter,
            )

            if not segment_result:
                continue

            # Prefer the result's internal row list when present; otherwise iterate it.
            if hasattr(segment_result, "_rows"):
                segment_rows = list(segment_result._rows)
            else:
                segment_rows = list(segment_result)

            if not segment_rows:
                logger.debug(f"  No table data found in segment {seg_idx+1}")
                continue

            rows_before = len(aggregated_rows)

            for row_idx, row in enumerate(segment_rows):
                should_merge = False

                # A merge needs a previous row, so the predicate is never consulted
                # while the aggregate is still empty.
                if predicate is not None and aggregated_rows:
                    if predicate_type == "single_param":
                        # Single-parameter form: only the first row of a segment is tested.
                        if row_idx == 0:
                            should_merge = predicate(row)
                    elif predicate_type == "full_params":
                        should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)

                if should_merge:
                    aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
                else:
                    aggregated_rows.append(row)

            processed_segments += 1
            # Report the net growth so the "post-merge" figure is accurate.
            logger.debug(
                f"  Added {len(aggregated_rows) - rows_before} rows (post-merge) from segment {seg_idx+1}"
            )

        except Exception as e:
            # Best effort: a failing segment is logged and skipped so the remaining
            # segments can still contribute rows.
            logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)

    logger.info(
        f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
    )
    return TableResult(aggregated_rows)
523
+
524
def analyze_layout(
    self,
    engine: Optional[str] = None,
    options: Optional[Any] = None,
    confidence: Optional[float] = None,
    classes: Optional[List[str]] = None,
    exclude_classes: Optional[List[str]] = None,
    device: Optional[str] = None,
    existing: str = "replace",
    model_name: Optional[str] = None,
    client: Optional[Any] = None,
) -> "PhysicalElementCollection":
    """
    Analyze layout across all segments in the flow.

    Segments are grouped by the object that provides ``analyze_layout`` (the
    segment itself, or its ``page``), analysis runs once per group, and the
    detected regions are then attributed to each segment: a full-page segment
    receives every region, a region segment receives only the regions it
    intersects. Duplicates are removed before returning.

    Args:
        engine: Name of the layout engine (e.g., 'yolo', 'tatr'); forwarded unchanged.
        options: Specific LayoutOptions object for advanced configuration.
        confidence: Minimum confidence threshold.
        classes: Specific classes to detect.
        exclude_classes: Classes to exclude.
        device: Device for inference.
        existing: How to handle existing detected regions ('replace' by default);
            forwarded unchanged to the page-level call.
        model_name: Optional model name for the engine.
        client: Optional client for API-based engines.

    Returns:
        ElementCollection containing the unique detected regions from all
        segments. Pages whose analysis raises are logged and skipped.

    Example:
        Multi-page layout analysis:
        ```python
        pdf = npdf.PDF("document.pdf")

        # Create flow for first 3 pages
        page_flow = Flow(
            segments=pdf.pages[:3],
            arrangement='vertical'
        )

        # Analyze layout across all pages (efficiently)
        all_regions = page_flow.analyze_layout(engine='yolo')

        # Find all tables across the flow
        tables = all_regions.filter('region[type=table]')
        ```
    """
    from natural_pdf.elements.collections import ElementCollection

    logger.info(f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})")

    if not self.segments:
        logger.warning("Flow has no segments, returning empty collection")
        return ElementCollection([])

    # Step 1: Group segments by their parent pages to avoid redundant analysis.
    # NOTE(review): detection here tests the segment's own `analyze_layout` first,
    # whereas find_all tests `segment.page` first — confirm a Region never exposes
    # `analyze_layout` itself, otherwise it would be grouped as a page.
    segments_by_page = {}  # Dict[Page, List[Tuple[Segment, str]]]

    for i, segment in enumerate(self.segments):
        if hasattr(segment, 'analyze_layout'):
            # The segment can be analysed directly: treat it as a full page.
            page_obj = segment
            segment_type = "page"
        elif hasattr(segment, 'page') and hasattr(segment.page, 'analyze_layout'):
            # The segment delegates to its parent page: treat it as a region.
            page_obj = segment.page
            segment_type = "region"
        else:
            logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
            continue

        segments_by_page.setdefault(page_obj, []).append((segment, segment_type))

    if not segments_by_page:
        logger.warning("No segments with analyzable pages found")
        return ElementCollection([])

    logger.debug(f"  Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages")

    # Step 2: Analyze each unique page only once.
    all_detected_regions: List["PhysicalRegion"] = []
    processed_pages = 0

    for page_obj, page_segments in segments_by_page.items():
        try:
            logger.debug(f"  Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments")

            page_results = page_obj.analyze_layout(
                engine=engine,
                options=options,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes,
                device=device,
                existing=existing,
                model_name=model_name,
                client=client,
            )

            # Accept either a collection exposing `.elements` or a plain list.
            if hasattr(page_results, 'elements'):
                page_regions = page_results.elements
            elif isinstance(page_results, list):
                page_regions = page_results
            else:
                logger.warning(f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}")
                continue

            if not page_regions:
                logger.debug(f"    No layout regions found on page {getattr(page_obj, 'number', '?')}")
                continue

            # Step 3: For each segment on this page, collect the relevant regions.
            segments_processed_on_page = 0
            for segment, segment_type in page_segments:
                if segment_type == "page":
                    # Full page segment: include all detected regions.
                    all_detected_regions.extend(page_regions)
                    segments_processed_on_page += 1
                    logger.debug(f"    Added {len(page_regions)} regions for full-page segment")

                elif segment_type == "region":
                    # Region segment: keep only the regions it intersects.
                    intersecting_regions = []
                    for region in page_regions:
                        try:
                            if segment.intersects(region):
                                intersecting_regions.append(region)
                        except Exception as intersect_error:
                            logger.debug(f"Error checking intersection for region: {intersect_error}")
                            # Deliberately permissive: keep the region if the check fails.
                            intersecting_regions.append(region)

                    all_detected_regions.extend(intersecting_regions)
                    segments_processed_on_page += 1
                    logger.debug(f"    Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}")

            processed_pages += 1
            logger.debug(f"    Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}")

        except Exception as e:
            # Best effort: one failing page must not discard results from the others.
            logger.error(f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}", exc_info=True)
            continue

    # Step 4: Remove duplicates (several segments can intersect the same region).
    # NOTE(review): the key is (page, bbox), so two distinct detections with an
    # identical bbox on the same page collapse into one — confirm this is intended.
    unique_regions = []
    seen_region_ids = set()

    for region in all_detected_regions:
        # Guard the attribute lookups so a region without `page`/`bbox` falls back
        # to object identity instead of raising after all analysis work is done
        # (mirrors the de-duplication key built in find_all).
        region_id = (
            getattr(region.page, 'index', id(region.page)) if hasattr(region, 'page') else id(region),
            region.bbox if hasattr(region, 'bbox') else id(region),
        )

        if region_id not in seen_region_ids:
            unique_regions.append(region)
            seen_region_ids.add(region_id)

    dedupe_removed = len(all_detected_regions) - len(unique_regions)
    if dedupe_removed > 0:
        logger.debug(f"  Removed {dedupe_removed} duplicate regions")

    logger.info(f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages")
    return ElementCollection(unique_regions)
699
+
700
def show(
    self,
    resolution: Optional[float] = None,
    labels: bool = True,
    legend_position: str = "right",
    color: Optional[Union[Tuple, str]] = "blue",
    label_prefix: Optional[str] = "FlowSegment",
    width: Optional[int] = None,
    stack_direction: str = "vertical",
    stack_gap: int = 5,
    stack_background_color: Tuple[int, int, int] = (255, 255, 255),
    crop: bool = False,
    **kwargs,
) -> Optional["PIL_Image"]:
    """
    Render the flow's segments as highlights on their pages and return one image.

    Segments are grouped by page, each page is rendered once with its segments
    highlighted, and the page images are stacked into a single image when more
    than one page is involved. Pages are rendered in ascending page-index order.

    Args:
        resolution: Resolution in DPI for page rendering. If None, the global
            ``natural_pdf.options.image.resolution`` is used, falling back to 144.
        labels: Whether to include a legend for highlights.
        legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
        color: Color for highlighting the flow segments.
        label_prefix: Prefix for segment labels (e.g., 'FlowSegment'). Labels are
            only generated when both ``labels`` and ``label_prefix`` are truthy.
        width: Optional width for the output image, forwarded to the renderer.
        stack_direction: 'vertical' or 'horizontal'; 'auto' uses the flow's
            own arrangement.
        stack_gap: Gap in pixels between stacked pages.
        stack_background_color: RGB background color for the stacked image.
        crop: If True, crop each rendered page to the bounding box of the
            segments on that page.
        **kwargs: Additional arguments passed to the underlying renderer.

    Returns:
        PIL Image with the highlighted flow segments, or None when there is
        nothing to render or no highlighter is available.

    Raises:
        ValueError: If several pages must be stacked and the stack direction is
            invalid, or the stacked image would have zero width or height.

    Example:
        Visualizing a multi-page flow:
        ```python
        pdf = npdf.PDF("document.pdf")

        # Create flow across multiple pages
        page_flow = Flow(
            segments=[pdf.pages[0], pdf.pages[1], pdf.pages[2]],
            arrangement='vertical'
        )

        # Show the entire flow
        flow_image = page_flow.show(color="green", labels=True)
        ```
    """
    logger.info(f"Rendering Flow with {len(self.segments)} segments")

    if not self.segments:
        logger.warning("Flow has no segments to show")
        return None

    # Fall back to the package-wide resolution, then to 144 DPI.
    import natural_pdf

    if resolution is None:
        configured = natural_pdf.options.image.resolution
        resolution = configured if configured is not None else 144

    # 1. Bucket segments under the page they belong to.
    regions_per_page = {}  # Dict[Page, List[PhysicalRegion]]

    for position, candidate in enumerate(self.segments, start=1):
        if hasattr(candidate, 'page') and candidate.page is not None:
            # Region-like segment: file it under its parent page.
            regions_per_page.setdefault(candidate.page, []).append(candidate)
        elif hasattr(candidate, 'index') and hasattr(candidate, 'width') and hasattr(candidate, 'height'):
            # Page-like segment: represent it by a region covering the whole page.
            whole_page = candidate.region(0, 0, candidate.width, candidate.height)
            regions_per_page.setdefault(candidate, []).append(whole_page)
        else:
            logger.warning(f"Segment {position} has no identifiable page, skipping")

    if not regions_per_page:
        logger.warning("No segments with identifiable pages found")
        return None

    # 2. Borrow the highlighter from the first page encountered.
    anchor_page = next(iter(regions_per_page))
    if not hasattr(anchor_page, '_highlighter'):
        logger.error("Cannot get highlighter service for Flow.show(). Page missing highlighter.")
        return None

    renderer = anchor_page._highlighter
    rendered: List["PIL_Image"] = []

    def _page_sort_key(p):
        return p.index if hasattr(p, "index") else getattr(p, "page_number", 0)

    def _label_for(region, page_pos, region_pos):
        """Build the legend label for one highlighted region."""
        try:
            flow_pos = self.segments.index(region)
        except ValueError:
            # Generated full-page regions are not in the flow; match by page index.
            flow_pos = next(
                (
                    idx
                    for idx, original in enumerate(self.segments)
                    if hasattr(original, 'index')
                    and hasattr(region, 'page')
                    and original.index == region.page.index
                ),
                None,
            )
        if flow_pos is None:
            return f"{label_prefix}_p{page_pos + 1}s{region_pos + 1}"
        return f"{label_prefix}_{flow_pos + 1}"

    # 3. Render every page once, with its segments highlighted.
    for page_pos, page in enumerate(sorted(regions_per_page, key=_page_sort_key)):
        page_regions = regions_per_page[page]
        if not page_regions:
            continue

        if hasattr(page, "index"):
            zero_based_index = page.index
        else:
            zero_based_index = getattr(page, "page_number", 1) - 1

        want_labels = labels and label_prefix
        highlights = []
        for region_pos, region in enumerate(page_regions):
            has_outline = (
                hasattr(region, 'polygon') and hasattr(region, 'has_polygon') and region.has_polygon
            )
            highlights.append(
                {
                    "page_index": zero_based_index,
                    "bbox": region.bbox,
                    "polygon": region.polygon if has_outline else None,
                    "color": color,
                    "label": _label_for(region, page_pos, region_pos) if want_labels else None,
                    "use_color_cycling": False,  # Keep specific color
                }
            )

        # Optional crop: the union of all segment boxes on this page.
        crop_bbox = None
        if crop:
            boxes = [region.bbox for region in page_regions]
            crop_bbox = (
                min(box[0] for box in boxes),
                min(box[1] for box in boxes),
                max(box[2] for box in boxes),
                max(box[3] for box in boxes),
            )

        page_image = renderer.render_preview(
            page_index=zero_based_index,
            temporary_highlights=highlights,
            resolution=resolution,
            width=width,
            labels=labels,
            legend_position=legend_position,
            crop_bbox=crop_bbox,
            **kwargs,
        )
        if page_image:
            rendered.append(page_image)

    # 4. Combine the page images.
    if not rendered:
        logger.warning("Flow.show() produced no page images")
        return None

    if len(rendered) == 1:
        return rendered[0]

    final_stack_direction = self.arrangement if stack_direction == "auto" else stack_direction

    if final_stack_direction not in ("vertical", "horizontal"):
        raise ValueError(
            f"Invalid stack_direction '{final_stack_direction}' for Flow.show(). Must be 'vertical' or 'horizontal'."
        )

    vertical = final_stack_direction == "vertical"
    total_gap = (len(rendered) - 1) * stack_gap

    # Along the stacking axis sizes add up (plus gaps); across it the largest wins.
    if vertical:
        final_width = max(img.width for img in rendered)
        final_height = sum(img.height for img in rendered) + total_gap
    else:
        final_width = sum(img.width for img in rendered) + total_gap
        final_height = max(img.height for img in rendered)

    if final_width == 0 or final_height == 0:
        raise ValueError("Cannot create concatenated image with zero width or height.")

    canvas = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)

    cursor = 0
    for img in rendered:
        # Centre each image across the stacking axis, advance along it.
        if vertical:
            canvas.paste(img, ((final_width - img.width) // 2, cursor))
            cursor += img.height + stack_gap
        else:
            canvas.paste(img, (cursor, (final_height - img.height) // 2))
            cursor += img.width + stack_gap
    return canvas
936
+
255
937
  # --- Helper methods for coordinate transformations and segment iteration ---
256
938
  # These will be crucial for FlowElement's directional methods.
257
939