natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,17 +8,20 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Uni
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
10
10
 
11
+ from natural_pdf.utils.layout import merge_bboxes
12
+
11
13
  if TYPE_CHECKING:
12
14
  from natural_pdf.core.page import Page
13
15
  from natural_pdf.elements.base import Element
14
16
  from natural_pdf.elements.collections import ElementCollection
15
17
  from natural_pdf.elements.region import Region
18
+ from natural_pdf.flows.region import FlowRegion
16
19
 
17
20
  logger = logging.getLogger(__name__)
18
21
 
19
22
 
20
23
  def _normalize_markers(
21
- markers: Union[str, List[str], "ElementCollection", None], obj: Union["Page", "Region"]
24
+ markers: Union[str, List[str], "ElementCollection", None], obj: Union["Page", "Region", "FlowRegion"]
22
25
  ) -> List[str]:
23
26
  """
24
27
  Normalize markers parameter to a list of text strings for guide creation.
@@ -37,6 +40,21 @@ def _normalize_markers(
37
40
  if markers is None:
38
41
  return []
39
42
 
43
+ # Handle FlowRegion by collecting markers from all constituent regions
44
+ if hasattr(obj, "constituent_regions"):
45
+ all_markers = []
46
+ for region in obj.constituent_regions:
47
+ region_markers = _normalize_markers(markers, region)
48
+ all_markers.extend(region_markers)
49
+ # Remove duplicates while preserving order
50
+ seen = set()
51
+ unique_markers = []
52
+ for m in all_markers:
53
+ if m not in seen:
54
+ seen.add(m)
55
+ unique_markers.append(m)
56
+ return unique_markers
57
+
40
58
  if isinstance(markers, str):
41
59
  # Single selector or text string
42
60
  if markers.startswith(("text", "region", "line", "rect", "blob", "image")):
@@ -115,7 +133,7 @@ class GuidesList(UserList):
115
133
  def from_content(
116
134
  self,
117
135
  markers: Union[str, List[str], "ElementCollection", None],
118
- obj: Optional[Union["Page", "Region"]] = None,
136
+ obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
119
137
  align: Literal["left", "right", "center", "between"] = "left",
120
138
  outer: bool = True,
121
139
  tolerance: float = 5,
@@ -131,7 +149,7 @@ class GuidesList(UserList):
131
149
  - List[str]: list of selectors or literal text strings
132
150
  - ElementCollection: collection of elements to extract text from
133
151
  - None: no markers
134
- obj: Page/Region to search (uses parent's context if None)
152
+ obj: Page/Region/FlowRegion to search (uses parent's context if None)
135
153
  align: How to align guides relative to found elements
136
154
  outer: Whether to add outer boundary guides
137
155
  tolerance: Tolerance for snapping to element edges
@@ -143,6 +161,88 @@ class GuidesList(UserList):
143
161
  if target_obj is None:
144
162
  raise ValueError("No object provided and no context available")
145
163
 
164
+ # Check if parent is in flow mode
165
+ if self._parent.is_flow_region:
166
+ # Create guides across all constituent regions
167
+ all_guides = []
168
+ for region in self._parent.context.constituent_regions:
169
+ # Normalize markers for this region
170
+ marker_texts = _normalize_markers(markers, region)
171
+
172
+ # Create guides for this region
173
+ region_guides = Guides.from_content(
174
+ obj=region,
175
+ axis=self._axis,
176
+ markers=marker_texts,
177
+ align=align,
178
+ outer=outer,
179
+ tolerance=tolerance,
180
+ )
181
+
182
+ # Collect guides from this region
183
+ if self._axis == "vertical":
184
+ all_guides.extend(region_guides.vertical)
185
+ else:
186
+ all_guides.extend(region_guides.horizontal)
187
+
188
+ # Update parent's flow guides structure
189
+ if append:
190
+ # Append to existing
191
+ existing = [coord for coord, _ in
192
+ (self._parent._unified_vertical if self._axis == "vertical"
193
+ else self._parent._unified_horizontal)]
194
+ all_guides = existing + all_guides
195
+
196
+ # Remove duplicates and sort
197
+ unique_guides = sorted(list(set(all_guides)))
198
+
199
+ # Clear and rebuild unified view
200
+ if self._axis == "vertical":
201
+ self._parent._unified_vertical = []
202
+ for coord in unique_guides:
203
+ # Find which region(s) this guide belongs to
204
+ for region in self._parent.context.constituent_regions:
205
+ if hasattr(region, "bbox"):
206
+ x0, _, x1, _ = region.bbox
207
+ if x0 <= coord <= x1:
208
+ self._parent._unified_vertical.append((coord, region))
209
+ break
210
+ self._parent._vertical_cache = None
211
+ self.data = unique_guides
212
+ else:
213
+ self._parent._unified_horizontal = []
214
+ for coord in unique_guides:
215
+ # Find which region(s) this guide belongs to
216
+ for region in self._parent.context.constituent_regions:
217
+ if hasattr(region, "bbox"):
218
+ _, y0, _, y1 = region.bbox
219
+ if y0 <= coord <= y1:
220
+ self._parent._unified_horizontal.append((coord, region))
221
+ break
222
+ self._parent._horizontal_cache = None
223
+ self.data = unique_guides
224
+
225
+ # Update per-region guides
226
+ for region in self._parent.context.constituent_regions:
227
+ region_verticals = []
228
+ region_horizontals = []
229
+
230
+ for coord, r in self._parent._unified_vertical:
231
+ if r == region:
232
+ region_verticals.append(coord)
233
+
234
+ for coord, r in self._parent._unified_horizontal:
235
+ if r == region:
236
+ region_horizontals.append(coord)
237
+
238
+ self._parent._flow_guides[region] = (
239
+ sorted(region_verticals),
240
+ sorted(region_horizontals)
241
+ )
242
+
243
+ return self._parent
244
+
245
+ # Original single-region logic
146
246
  # Normalize markers to list of text strings
147
247
  marker_texts = _normalize_markers(markers, target_obj)
148
248
 
@@ -181,7 +281,7 @@ class GuidesList(UserList):
181
281
 
182
282
  def from_lines(
183
283
  self,
184
- obj: Optional[Union["Page", "Region"]] = None,
284
+ obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
185
285
  threshold: Union[float, str] = "auto",
186
286
  source_label: Optional[str] = None,
187
287
  max_lines: Optional[int] = None,
@@ -198,7 +298,7 @@ class GuidesList(UserList):
198
298
  Create guides from detected line elements.
199
299
 
200
300
  Args:
201
- obj: Page/Region to search (uses parent's context if None)
301
+ obj: Page/Region/FlowRegion to search (uses parent's context if None)
202
302
  threshold: Line detection threshold ('auto' or float 0.0-1.0)
203
303
  source_label: Filter lines by source label (for vector method)
204
304
  max_lines: Maximum lines to use (alias: n)
@@ -236,6 +336,90 @@ class GuidesList(UserList):
236
336
  axis_key = "min_gap_h" if self._axis == "horizontal" else "min_gap_v"
237
337
  detect_kwargs.setdefault(axis_key, min_gap)
238
338
 
339
+ # Check if parent is in flow mode
340
+ if self._parent.is_flow_region:
341
+ # Create guides across all constituent regions
342
+ all_guides = []
343
+
344
+ for region in self._parent.context.constituent_regions:
345
+ # Create guides for this specific region
346
+ region_guides = Guides.from_lines(
347
+ obj=region,
348
+ axis=self._axis,
349
+ threshold=threshold,
350
+ source_label=source_label,
351
+ max_lines_h=max_lines_h,
352
+ max_lines_v=max_lines_v,
353
+ outer=outer,
354
+ detection_method=detection_method,
355
+ resolution=resolution,
356
+ **detect_kwargs
357
+ )
358
+
359
+ # Collect guides from this region
360
+ if self._axis == "vertical":
361
+ all_guides.extend(region_guides.vertical)
362
+ else:
363
+ all_guides.extend(region_guides.horizontal)
364
+
365
+ # Update parent's flow guides structure
366
+ if append:
367
+ # Append to existing
368
+ existing = [coord for coord, _ in
369
+ (self._parent._unified_vertical if self._axis == "vertical"
370
+ else self._parent._unified_horizontal)]
371
+ all_guides = existing + all_guides
372
+
373
+ # Remove duplicates and sort
374
+ unique_guides = sorted(list(set(all_guides)))
375
+
376
+ # Clear and rebuild unified view
377
+ if self._axis == "vertical":
378
+ self._parent._unified_vertical = []
379
+ for coord in unique_guides:
380
+ # Find which region(s) this guide belongs to
381
+ for region in self._parent.context.constituent_regions:
382
+ if hasattr(region, "bbox"):
383
+ x0, _, x1, _ = region.bbox
384
+ if x0 <= coord <= x1:
385
+ self._parent._unified_vertical.append((coord, region))
386
+ break
387
+ self._parent._vertical_cache = None
388
+ self.data = unique_guides
389
+ else:
390
+ self._parent._unified_horizontal = []
391
+ for coord in unique_guides:
392
+ # Find which region(s) this guide belongs to
393
+ for region in self._parent.context.constituent_regions:
394
+ if hasattr(region, "bbox"):
395
+ _, y0, _, y1 = region.bbox
396
+ if y0 <= coord <= y1:
397
+ self._parent._unified_horizontal.append((coord, region))
398
+ break
399
+ self._parent._horizontal_cache = None
400
+ self.data = unique_guides
401
+
402
+ # Update per-region guides
403
+ for region in self._parent.context.constituent_regions:
404
+ region_verticals = []
405
+ region_horizontals = []
406
+
407
+ for coord, r in self._parent._unified_vertical:
408
+ if r == region:
409
+ region_verticals.append(coord)
410
+
411
+ for coord, r in self._parent._unified_horizontal:
412
+ if r == region:
413
+ region_horizontals.append(coord)
414
+
415
+ self._parent._flow_guides[region] = (
416
+ sorted(region_verticals),
417
+ sorted(region_horizontals)
418
+ )
419
+
420
+ return self._parent
421
+
422
+ # Original single-region logic
239
423
  # Create guides for this axis
240
424
  new_guides = Guides.from_lines(
241
425
  obj=target_obj,
@@ -274,14 +458,14 @@ class GuidesList(UserList):
274
458
  return self._parent
275
459
 
276
460
  def from_whitespace(
277
- self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10,
461
+ self, obj: Optional[Union["Page", "Region", "FlowRegion"]] = None, min_gap: float = 10,
278
462
  *, append: bool = False
279
463
  ) -> "Guides":
280
464
  """
281
465
  Create guides from whitespace gaps.
282
466
 
283
467
  Args:
284
- obj: Page/Region to analyze (uses parent's context if None)
468
+ obj: Page/Region/FlowRegion to analyze (uses parent's context if None)
285
469
  min_gap: Minimum gap size to consider
286
470
 
287
471
  Returns:
@@ -291,6 +475,83 @@ class GuidesList(UserList):
291
475
  if target_obj is None:
292
476
  raise ValueError("No object provided and no context available")
293
477
 
478
+ # Check if parent is in flow mode
479
+ if self._parent.is_flow_region:
480
+ # Create guides across all constituent regions
481
+ all_guides = []
482
+
483
+ for region in self._parent.context.constituent_regions:
484
+ # Create guides for this specific region
485
+ region_guides = Guides.from_whitespace(
486
+ obj=region,
487
+ axis=self._axis,
488
+ min_gap=min_gap
489
+ )
490
+
491
+ # Collect guides from this region
492
+ if self._axis == "vertical":
493
+ all_guides.extend(region_guides.vertical)
494
+ else:
495
+ all_guides.extend(region_guides.horizontal)
496
+
497
+ # Update parent's flow guides structure
498
+ if append:
499
+ # Append to existing
500
+ existing = [coord for coord, _ in
501
+ (self._parent._unified_vertical if self._axis == "vertical"
502
+ else self._parent._unified_horizontal)]
503
+ all_guides = existing + all_guides
504
+
505
+ # Remove duplicates and sort
506
+ unique_guides = sorted(list(set(all_guides)))
507
+
508
+ # Clear and rebuild unified view
509
+ if self._axis == "vertical":
510
+ self._parent._unified_vertical = []
511
+ for coord in unique_guides:
512
+ # Find which region(s) this guide belongs to
513
+ for region in self._parent.context.constituent_regions:
514
+ if hasattr(region, "bbox"):
515
+ x0, _, x1, _ = region.bbox
516
+ if x0 <= coord <= x1:
517
+ self._parent._unified_vertical.append((coord, region))
518
+ break
519
+ self._parent._vertical_cache = None
520
+ self.data = unique_guides
521
+ else:
522
+ self._parent._unified_horizontal = []
523
+ for coord in unique_guides:
524
+ # Find which region(s) this guide belongs to
525
+ for region in self._parent.context.constituent_regions:
526
+ if hasattr(region, "bbox"):
527
+ _, y0, _, y1 = region.bbox
528
+ if y0 <= coord <= y1:
529
+ self._parent._unified_horizontal.append((coord, region))
530
+ break
531
+ self._parent._horizontal_cache = None
532
+ self.data = unique_guides
533
+
534
+ # Update per-region guides
535
+ for region in self._parent.context.constituent_regions:
536
+ region_verticals = []
537
+ region_horizontals = []
538
+
539
+ for coord, r in self._parent._unified_vertical:
540
+ if r == region:
541
+ region_verticals.append(coord)
542
+
543
+ for coord, r in self._parent._unified_horizontal:
544
+ if r == region:
545
+ region_horizontals.append(coord)
546
+
547
+ self._parent._flow_guides[region] = (
548
+ sorted(region_verticals),
549
+ sorted(region_horizontals)
550
+ )
551
+
552
+ return self._parent
553
+
554
+ # Original single-region logic
294
555
  # Create guides for this axis
295
556
  new_guides = Guides.from_whitespace(obj=target_obj, axis=self._axis, min_gap=min_gap)
296
557
 
@@ -618,9 +879,9 @@ class Guides:
618
879
 
619
880
  def __init__(
620
881
  self,
621
- verticals: Optional[Union[List[float], "Page", "Region"]] = None,
882
+ verticals: Optional[Union[List[float], "Page", "Region", "FlowRegion"]] = None,
622
883
  horizontals: Optional[List[float]] = None,
623
- context: Optional[Union["Page", "Region"]] = None,
884
+ context: Optional[Union["Page", "Region", "FlowRegion"]] = None,
624
885
  bounds: Optional[Tuple[float, float, float, float]] = None,
625
886
  relative: bool = False,
626
887
  snap_behavior: Literal["raise", "warn", "ignore"] = "warn",
@@ -629,21 +890,21 @@ class Guides:
629
890
  Initialize a Guides object.
630
891
 
631
892
  Args:
632
- verticals: List of x-coordinates for vertical guides, or a Page/Region as context
893
+ verticals: List of x-coordinates for vertical guides, or a Page/Region/FlowRegion as context
633
894
  horizontals: List of y-coordinates for horizontal guides
634
- context: Page or Region object these guides were created from
895
+ context: Page, Region, or FlowRegion object these guides were created from
635
896
  bounds: Bounding box (x0, top, x1, bottom) if context not provided
636
897
  relative: Whether coordinates are relative (0-1) or absolute
637
898
  snap_behavior: How to handle snapping conflicts ('raise', 'warn', or 'ignore')
638
899
  """
639
- # Handle Guides(page) shorthand
900
+ # Handle Guides(page) or Guides(flow_region) shorthand
640
901
  if (
641
902
  verticals is not None
642
903
  and not isinstance(verticals, (list, tuple))
643
904
  and horizontals is None
644
905
  and context is None
645
906
  ):
646
- # First argument is a page/region, not coordinates
907
+ # First argument is a page/region/flow_region, not coordinates
647
908
  context = verticals
648
909
  verticals = None
649
910
 
@@ -652,6 +913,19 @@ class Guides:
652
913
  self.relative = relative
653
914
  self.snap_behavior = snap_behavior
654
915
 
916
+ # Check if we're dealing with a FlowRegion
917
+ self.is_flow_region = hasattr(context, "constituent_regions")
918
+
919
+ # If FlowRegion, we'll store guides per constituent region
920
+ if self.is_flow_region:
921
+ self._flow_guides: Dict["Region", Tuple[List[float], List[float]]] = {}
922
+ # For unified view across all regions
923
+ self._unified_vertical: List[Tuple[float, "Region"]] = []
924
+ self._unified_horizontal: List[Tuple[float, "Region"]] = []
925
+ # Cache for sorted unique coordinates
926
+ self._vertical_cache: Optional[List[float]] = None
927
+ self._horizontal_cache: Optional[List[float]] = None
928
+
655
929
  # Initialize with GuidesList instances
656
930
  self._vertical = GuidesList(self, "vertical", sorted([float(x) for x in (verticals or [])]))
657
931
  self._horizontal = GuidesList(
@@ -683,11 +957,26 @@ class Guides:
683
957
  @property
684
958
  def vertical(self) -> GuidesList:
685
959
  """Get vertical guide coordinates."""
960
+ if self.is_flow_region and self._vertical_cache is not None:
961
+ # Return cached unified view
962
+ self._vertical.data = self._vertical_cache
963
+ elif self.is_flow_region and self._unified_vertical:
964
+ # Build unified view from flow guides
965
+ all_verticals = []
966
+ for coord, region in self._unified_vertical:
967
+ all_verticals.append(coord)
968
+ # Remove duplicates and sort
969
+ self._vertical_cache = sorted(list(set(all_verticals)))
970
+ self._vertical.data = self._vertical_cache
686
971
  return self._vertical
687
972
 
688
973
  @vertical.setter
689
974
  def vertical(self, value: Union[List[float], "Guides", None]):
690
975
  """Set vertical guides from a list of coordinates or another Guides object."""
976
+ if self.is_flow_region:
977
+ # Invalidate cache when setting new values
978
+ self._vertical_cache = None
979
+
691
980
  if value is None:
692
981
  self._vertical.data = []
693
982
  elif isinstance(value, Guides):
@@ -710,11 +999,26 @@ class Guides:
710
999
  @property
711
1000
  def horizontal(self) -> GuidesList:
712
1001
  """Get horizontal guide coordinates."""
1002
+ if self.is_flow_region and self._horizontal_cache is not None:
1003
+ # Return cached unified view
1004
+ self._horizontal.data = self._horizontal_cache
1005
+ elif self.is_flow_region and self._unified_horizontal:
1006
+ # Build unified view from flow guides
1007
+ all_horizontals = []
1008
+ for coord, region in self._unified_horizontal:
1009
+ all_horizontals.append(coord)
1010
+ # Remove duplicates and sort
1011
+ self._horizontal_cache = sorted(list(set(all_horizontals)))
1012
+ self._horizontal.data = self._horizontal_cache
713
1013
  return self._horizontal
714
1014
 
715
1015
  @horizontal.setter
716
1016
  def horizontal(self, value: Union[List[float], "Guides", None]):
717
1017
  """Set horizontal guides from a list of coordinates or another Guides object."""
1018
+ if self.is_flow_region:
1019
+ # Invalidate cache when setting new values
1020
+ self._horizontal_cache = None
1021
+
718
1022
  if value is None:
719
1023
  self._horizontal.data = []
720
1024
  elif isinstance(value, Guides):
@@ -821,7 +1125,7 @@ class Guides:
821
1125
  @classmethod
822
1126
  def from_lines(
823
1127
  cls,
824
- obj: Union["Page", "Region"],
1128
+ obj: Union["Page", "Region", "FlowRegion"],
825
1129
  axis: Literal["vertical", "horizontal", "both"] = "both",
826
1130
  threshold: Union[float, str] = "auto",
827
1131
  source_label: Optional[str] = None,
@@ -836,7 +1140,7 @@ class Guides:
836
1140
  Create guides from detected line elements.
837
1141
 
838
1142
  Args:
839
- obj: Page or Region to detect lines from
1143
+ obj: Page, Region, or FlowRegion to detect lines from
840
1144
  axis: Which orientations to detect
841
1145
  threshold: Detection threshold ('auto' or float 0.0-1.0) - used for pixel detection
842
1146
  source_label: Filter for line source (vector method) or label for detected lines (pixel method)
@@ -856,6 +1160,45 @@ class Guides:
856
1160
  Returns:
857
1161
  New Guides object with detected line positions
858
1162
  """
1163
+ # Handle FlowRegion
1164
+ if hasattr(obj, "constituent_regions"):
1165
+ guides = cls(context=obj)
1166
+
1167
+ # Process each constituent region
1168
+ for region in obj.constituent_regions:
1169
+ # Create guides for this specific region
1170
+ region_guides = cls.from_lines(
1171
+ region,
1172
+ axis=axis,
1173
+ threshold=threshold,
1174
+ source_label=source_label,
1175
+ max_lines_h=max_lines_h,
1176
+ max_lines_v=max_lines_v,
1177
+ outer=outer,
1178
+ detection_method=detection_method,
1179
+ resolution=resolution,
1180
+ **detect_kwargs
1181
+ )
1182
+
1183
+ # Store in flow guides
1184
+ guides._flow_guides[region] = (
1185
+ list(region_guides.vertical),
1186
+ list(region_guides.horizontal)
1187
+ )
1188
+
1189
+ # Add to unified view
1190
+ for v in region_guides.vertical:
1191
+ guides._unified_vertical.append((v, region))
1192
+ for h in region_guides.horizontal:
1193
+ guides._unified_horizontal.append((h, region))
1194
+
1195
+ # Invalidate caches to force rebuild on next access
1196
+ guides._vertical_cache = None
1197
+ guides._horizontal_cache = None
1198
+
1199
+ return guides
1200
+
1201
+ # Original single-region logic follows...
859
1202
  # Get bounds for potential outer guides
860
1203
  if hasattr(obj, "bbox"):
861
1204
  bounds = obj.bbox
@@ -1028,7 +1371,7 @@ class Guides:
1028
1371
  @classmethod
1029
1372
  def from_content(
1030
1373
  cls,
1031
- obj: Union["Page", "Region"],
1374
+ obj: Union["Page", "Region", "FlowRegion"],
1032
1375
  axis: Literal["vertical", "horizontal"] = "vertical",
1033
1376
  markers: Union[str, List[str], "ElementCollection", None] = None,
1034
1377
  align: Literal["left", "right", "center", "between"] = "left",
@@ -1039,7 +1382,7 @@ class Guides:
1039
1382
  Create guides based on text content positions.
1040
1383
 
1041
1384
  Args:
1042
- obj: Page or Region to search for content
1385
+ obj: Page, Region, or FlowRegion to search for content
1043
1386
  axis: Whether to create vertical or horizontal guides
1044
1387
  markers: Content to search for. Can be:
1045
1388
  - str: single selector (e.g., 'text:contains("Name")') or literal text
@@ -1053,6 +1396,41 @@ class Guides:
1053
1396
  Returns:
1054
1397
  New Guides object aligned to text content
1055
1398
  """
1399
+ # Handle FlowRegion
1400
+ if hasattr(obj, "constituent_regions"):
1401
+ guides = cls(context=obj)
1402
+
1403
+ # Process each constituent region
1404
+ for region in obj.constituent_regions:
1405
+ # Create guides for this specific region
1406
+ region_guides = cls.from_content(
1407
+ region,
1408
+ axis=axis,
1409
+ markers=markers,
1410
+ align=align,
1411
+ outer=outer,
1412
+ tolerance=tolerance
1413
+ )
1414
+
1415
+ # Store in flow guides
1416
+ guides._flow_guides[region] = (
1417
+ list(region_guides.vertical),
1418
+ list(region_guides.horizontal)
1419
+ )
1420
+
1421
+ # Add to unified view
1422
+ for v in region_guides.vertical:
1423
+ guides._unified_vertical.append((v, region))
1424
+ for h in region_guides.horizontal:
1425
+ guides._unified_horizontal.append((h, region))
1426
+
1427
+ # Invalidate caches
1428
+ guides._vertical_cache = None
1429
+ guides._horizontal_cache = None
1430
+
1431
+ return guides
1432
+
1433
+ # Original single-region logic follows...
1056
1434
  guides_coords = []
1057
1435
  bounds = None
1058
1436
 
@@ -1141,7 +1519,7 @@ class Guides:
1141
1519
  @classmethod
1142
1520
  def from_whitespace(
1143
1521
  cls,
1144
- obj: Union["Page", "Region"],
1522
+ obj: Union["Page", "Region", "FlowRegion"],
1145
1523
  axis: Literal["vertical", "horizontal", "both"] = "both",
1146
1524
  min_gap: float = 10,
1147
1525
  ) -> "Guides":
@@ -1212,6 +1590,117 @@ class Guides:
1212
1590
  logger.warning("No context available for whitespace detection")
1213
1591
  return self
1214
1592
 
1593
+ # Handle FlowRegion case - collect all text elements across regions
1594
+ if self.is_flow_region:
1595
+ all_text_elements = []
1596
+ region_bounds = {}
1597
+
1598
+ for region in self.context.constituent_regions:
1599
+ # Get text elements from this region
1600
+ if hasattr(region, "find_all"):
1601
+ try:
1602
+ text_elements = region.find_all("text", apply_exclusions=False)
1603
+ elements = text_elements.elements if hasattr(text_elements, "elements") else text_elements
1604
+ all_text_elements.extend(elements)
1605
+
1606
+ # Store bounds for each region
1607
+ if hasattr(region, "bbox"):
1608
+ region_bounds[region] = region.bbox
1609
+ elif hasattr(region, "x0"):
1610
+ region_bounds[region] = (region.x0, region.top, region.x1, region.bottom)
1611
+ except Exception as e:
1612
+ logger.warning(f"Error getting text elements from region: {e}")
1613
+
1614
+ if not all_text_elements:
1615
+ logger.warning("No text elements found across flow regions for whitespace detection")
1616
+ return self
1617
+
1618
+ # Find whitespace gaps across all regions
1619
+ if axis == "vertical":
1620
+ gaps = self._find_vertical_whitespace_gaps(all_text_elements, min_gap, threshold)
1621
+ # Get all vertical guides across regions
1622
+ all_guides = []
1623
+ guide_to_region_map = {} # Map guide coordinate to its original list of regions
1624
+ for coord, region in self._unified_vertical:
1625
+ all_guides.append(coord)
1626
+ guide_to_region_map.setdefault(coord, []).append(region)
1627
+
1628
+ if gaps and all_guides:
1629
+ # Keep a copy of original guides to maintain mapping
1630
+ original_guides = all_guides.copy()
1631
+
1632
+ # Snap guides to gaps
1633
+ self._snap_guides_to_gaps(all_guides, gaps, axis)
1634
+
1635
+ # Update the unified view with snapped positions
1636
+ self._unified_vertical = []
1637
+ for i, new_coord in enumerate(all_guides):
1638
+ # Find the original region for this guide using the original position
1639
+ original_coord = original_guides[i]
1640
+ # A guide might be associated with multiple regions, add them all
1641
+ regions = guide_to_region_map.get(original_coord, [])
1642
+ for region in regions:
1643
+ self._unified_vertical.append((new_coord, region))
1644
+
1645
+ # Update individual region guides
1646
+ for region in self._flow_guides:
1647
+ region_verticals = []
1648
+ for coord, r in self._unified_vertical:
1649
+ if r == region:
1650
+ region_verticals.append(coord)
1651
+ self._flow_guides[region] = (
1652
+ sorted(list(set(region_verticals))), # Deduplicate here
1653
+ self._flow_guides[region][1]
1654
+ )
1655
+
1656
+ # Invalidate cache
1657
+ self._vertical_cache = None
1658
+
1659
+ elif axis == "horizontal":
1660
+ gaps = self._find_horizontal_whitespace_gaps(all_text_elements, min_gap, threshold)
1661
+ # Get all horizontal guides across regions
1662
+ all_guides = []
1663
+ guide_to_region_map = {} # Map guide coordinate to its original list of regions
1664
+ for coord, region in self._unified_horizontal:
1665
+ all_guides.append(coord)
1666
+ guide_to_region_map.setdefault(coord, []).append(region)
1667
+
1668
+ if gaps and all_guides:
1669
+ # Keep a copy of original guides to maintain mapping
1670
+ original_guides = all_guides.copy()
1671
+
1672
+ # Snap guides to gaps
1673
+ self._snap_guides_to_gaps(all_guides, gaps, axis)
1674
+
1675
+ # Update the unified view with snapped positions
1676
+ self._unified_horizontal = []
1677
+ for i, new_coord in enumerate(all_guides):
1678
+ # Find the original region for this guide using the original position
1679
+ original_coord = original_guides[i]
1680
+ regions = guide_to_region_map.get(original_coord, [])
1681
+ for region in regions:
1682
+ self._unified_horizontal.append((new_coord, region))
1683
+
1684
+ # Update individual region guides
1685
+ for region in self._flow_guides:
1686
+ region_horizontals = []
1687
+ for coord, r in self._unified_horizontal:
1688
+ if r == region:
1689
+ region_horizontals.append(coord)
1690
+ self._flow_guides[region] = (
1691
+ self._flow_guides[region][0],
1692
+ sorted(list(set(region_horizontals))) # Deduplicate here
1693
+ )
1694
+
1695
+ # Invalidate cache
1696
+ self._horizontal_cache = None
1697
+
1698
+ else:
1699
+ raise ValueError("axis must be 'vertical' or 'horizontal'")
1700
+
1701
+ return self
1702
+
1703
+ # Original single-region logic
1215
1704
  # Get elements for trough detection
1216
1705
  text_elements = self._get_text_elements()
1217
1706
  if not text_elements:
@@ -1303,14 +1792,47 @@ class Guides:
1303
1792
  combined_verticals = sorted([float(x) for x in set(self.vertical + other.vertical)])
1304
1793
  combined_horizontals = sorted([float(y) for y in set(self.horizontal + other.horizontal)])
1305
1794
 
1306
- # Use context from self if available
1307
- return Guides(
1795
+ # Handle FlowRegion context merging
1796
+ new_context = self.context or other.context
1797
+
1798
+ # If both are flow regions, we might need a more complex merge,
1799
+ # but for now, just picking one context is sufficient.
1800
+
1801
+ # Create the new Guides object
1802
+ new_guides = Guides(
1308
1803
  verticals=combined_verticals,
1309
1804
  horizontals=combined_horizontals,
1310
- context=self.context or other.context,
1805
+ context=new_context,
1311
1806
  bounds=self.bounds or other.bounds,
1312
1807
  )
1313
1808
 
1809
+ # If the new context is a FlowRegion, we need to rebuild the flow-related state
1810
+ if new_guides.is_flow_region:
1811
+ # Re-initialize flow guides from both sources
1812
+ # This is a simplification; a true merge would be more complex.
1813
+ # For now, we combine the flow_guides dictionaries.
1814
+ if hasattr(self, "_flow_guides"):
1815
+ new_guides._flow_guides.update(self._flow_guides)
1816
+ if hasattr(other, "_flow_guides"):
1817
+ new_guides._flow_guides.update(other._flow_guides)
1818
+
1819
+ # Re-initialize unified views
1820
+ if hasattr(self, "_unified_vertical"):
1821
+ new_guides._unified_vertical.extend(self._unified_vertical)
1822
+ if hasattr(other, "_unified_vertical"):
1823
+ new_guides._unified_vertical.extend(other._unified_vertical)
1824
+
1825
+ if hasattr(self, "_unified_horizontal"):
1826
+ new_guides._unified_horizontal.extend(self._unified_horizontal)
1827
+ if hasattr(other, "_unified_horizontal"):
1828
+ new_guides._unified_horizontal.extend(other._unified_horizontal)
1829
+
1830
+ # Invalidate caches to force rebuild
1831
+ new_guides._vertical_cache = None
1832
+ new_guides._horizontal_cache = None
1833
+
1834
+ return new_guides
1835
+
1314
1836
  def show(self, on=None, **kwargs):
1315
1837
  """
1316
1838
  Display the guides overlaid on a page or region.
@@ -1324,6 +1846,122 @@ class Guides:
1324
1846
  Returns:
1325
1847
  PIL Image with guides drawn on it.
1326
1848
  """
1849
+ # Handle FlowRegion case
1850
+ if self.is_flow_region and (on is None or on == self.context):
1851
+ if not self._flow_guides:
1852
+ raise ValueError("No guides to show for FlowRegion")
1853
+
1854
+ # Get stacking parameters from kwargs or use defaults
1855
+ stack_direction = kwargs.get('stack_direction', 'vertical')
1856
+ stack_gap = kwargs.get('stack_gap', 5)
1857
+ stack_background_color = kwargs.get('stack_background_color', (255, 255, 255))
1858
+
1859
+ # First, render all constituent regions without guides to get base images
1860
+ base_images = []
1861
+ region_infos = [] # Store region info for guide coordinate mapping
1862
+
1863
+ for region in self.context.constituent_regions:
1864
+ try:
1865
+ # Render region without guides
1866
+ img = region.to_image(**kwargs)
1867
+ if img:
1868
+ base_images.append(img)
1869
+
1870
+ # Calculate scaling factors for this region
1871
+ scale_x = img.width / region.width
1872
+ scale_y = img.height / region.height
1873
+
1874
+ region_infos.append({
1875
+ 'region': region,
1876
+ 'img_width': img.width,
1877
+ 'img_height': img.height,
1878
+ 'scale_x': scale_x,
1879
+ 'scale_y': scale_y,
1880
+ 'pdf_x0': region.x0,
1881
+ 'pdf_top': region.top,
1882
+ 'pdf_x1': region.x1,
1883
+ 'pdf_bottom': region.bottom
1884
+ })
1885
+ except Exception as e:
1886
+ logger.warning(f"Failed to render region: {e}")
1887
+
1888
+ if not base_images:
1889
+ raise ValueError("Failed to render any images for FlowRegion")
1890
+
1891
+ # Calculate final canvas size based on stacking direction
1892
+ if stack_direction == "vertical":
1893
+ final_width = max(img.width for img in base_images)
1894
+ final_height = (
1895
+ sum(img.height for img in base_images)
1896
+ + (len(base_images) - 1) * stack_gap
1897
+ )
1898
+ else: # horizontal
1899
+ final_width = (
1900
+ sum(img.width for img in base_images)
1901
+ + (len(base_images) - 1) * stack_gap
1902
+ )
1903
+ final_height = max(img.height for img in base_images)
1904
+
1905
+ # Create unified canvas
1906
+ canvas = Image.new("RGB", (final_width, final_height), stack_background_color)
1907
+ draw = ImageDraw.Draw(canvas)
1908
+
1909
+ # Paste base images and track positions
1910
+ region_positions = [] # (region_info, paste_x, paste_y)
1911
+
1912
+ if stack_direction == "vertical":
1913
+ current_y = 0
1914
+ for i, (img, info) in enumerate(zip(base_images, region_infos)):
1915
+ paste_x = (final_width - img.width) // 2 # Center horizontally
1916
+ canvas.paste(img, (paste_x, current_y))
1917
+ region_positions.append((info, paste_x, current_y))
1918
+ current_y += img.height + stack_gap
1919
+ else: # horizontal
1920
+ current_x = 0
1921
+ for i, (img, info) in enumerate(zip(base_images, region_infos)):
1922
+ paste_y = (final_height - img.height) // 2 # Center vertically
1923
+ canvas.paste(img, (current_x, paste_y))
1924
+ region_positions.append((info, current_x, paste_y))
1925
+ current_x += img.width + stack_gap
1926
+
1927
+ # Now draw guides on the unified canvas
1928
+ # Draw vertical guides (blue) - these extend through the full canvas height
1929
+ for v_coord in self.vertical:
1930
+ # Find which region(s) this guide intersects
1931
+ for info, paste_x, paste_y in region_positions:
1932
+ if info['pdf_x0'] <= v_coord <= info['pdf_x1']:
1933
+ # This guide is within this region's x-bounds
1934
+ # Convert PDF coordinate to pixel coordinate relative to the region
1935
+ adjusted_x = v_coord - info['pdf_x0']
1936
+ pixel_x = adjusted_x * info['scale_x'] + paste_x
1937
+
1938
+ # Draw full-height line on canvas (not clipped to region)
1939
+ if 0 <= pixel_x <= final_width:
1940
+ x_pixel = int(pixel_x)
1941
+ draw.line([(x_pixel, 0), (x_pixel, final_height - 1)],
1942
+ fill=(0, 0, 255, 200), width=2)
1943
+ break # Only draw once per guide
1944
+
1945
+ # Draw horizontal guides (red) - these extend through the full canvas width
1946
+ for h_coord in self.horizontal:
1947
+ # Find which region(s) this guide intersects
1948
+ for info, paste_x, paste_y in region_positions:
1949
+ if info['pdf_top'] <= h_coord <= info['pdf_bottom']:
1950
+ # This guide is within this region's y-bounds
1951
+ # Convert PDF coordinate to pixel coordinate relative to the region
1952
+ adjusted_y = h_coord - info['pdf_top']
1953
+ pixel_y = adjusted_y * info['scale_y'] + paste_y
1954
+
1955
+ # Draw full-width line on canvas (not clipped to region)
1956
+ if 0 <= pixel_y <= final_height:
1957
+ y_pixel = int(pixel_y)
1958
+ draw.line([(0, y_pixel), (final_width - 1, y_pixel)],
1959
+ fill=(255, 0, 0, 200), width=2)
1960
+ break # Only draw once per guide
1961
+
1962
+ return canvas
1963
+
1964
+ # Original single-region logic follows...
1327
1965
  # Determine what to display guides on
1328
1966
  target = on if on is not None else self.context
1329
1967
 
@@ -1950,7 +2588,9 @@ class Guides:
1950
2588
  source: str = "guides",
1951
2589
  cell_padding: float = 0.5,
1952
2590
  include_outer_boundaries: bool = False,
1953
- ) -> Dict[str, int]:
2591
+ *,
2592
+ multi_page: Literal["auto", True, False] = "auto",
2593
+ ) -> Dict[str, Any]:
1954
2594
  """
1955
2595
  Create table structure (table, rows, columns, cells) from guide coordinates.
1956
2596
 
@@ -1959,11 +2599,331 @@ class Guides:
1959
2599
  source: Source label for created regions (for identification)
1960
2600
  cell_padding: Internal padding for cell regions in points
1961
2601
  include_outer_boundaries: Whether to add boundaries at edges if missing
2602
+ multi_page: Controls multi-page table creation for FlowRegions.
2603
+ - "auto": (default) Creates a multi-page grid if guides span pages.
2604
+ - True: Forces creation of a multi-page grid.
2605
+ - False: Creates separate grids for each page.
1962
2606
 
1963
2607
  Returns:
1964
- Dictionary with counts: {'table': 1, 'rows': N, 'columns': M, 'cells': N*M}
2608
+ Dictionary with 'counts' and 'regions' created.
2609
+ """
2610
+ # Dispatch to appropriate implementation based on context and flags
2611
+ if self.is_flow_region:
2612
+ spans_pages = self._spans_pages()
2613
+ if multi_page is True or (multi_page == "auto" and spans_pages):
2614
+ return self._build_grid_multi_page(
2615
+ source=source,
2616
+ cell_padding=cell_padding,
2617
+ include_outer_boundaries=include_outer_boundaries,
2618
+ )
2619
+ else:
2620
+ # FlowRegion context, but creating separate tables per page
2621
+ total_counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
2622
+ all_regions = {"table": [], "rows": [], "columns": [], "cells": []}
2623
+
2624
+ for region in self.context.constituent_regions:
2625
+ if region in self._flow_guides:
2626
+ verticals, horizontals = self._flow_guides[region]
2627
+
2628
+ region_guides = Guides(
2629
+ verticals=verticals,
2630
+ horizontals=horizontals,
2631
+ context=region
2632
+ )
2633
+
2634
+ try:
2635
+ result = region_guides._build_grid_single_page(
2636
+ target=region,
2637
+ source=source,
2638
+ cell_padding=cell_padding,
2639
+ include_outer_boundaries=include_outer_boundaries
2640
+ )
2641
+
2642
+ for key in total_counts:
2643
+ total_counts[key] += result["counts"][key]
2644
+
2645
+ if result["regions"]["table"]:
2646
+ all_regions["table"].append(result["regions"]["table"])
2647
+ all_regions["rows"].extend(result["regions"]["rows"])
2648
+ all_regions["columns"].extend(result["regions"]["columns"])
2649
+ all_regions["cells"].extend(result["regions"]["cells"])
2650
+
2651
+ except Exception as e:
2652
+ logger.warning(f"Failed to build grid on region: {e}")
2653
+
2654
+ logger.info(
2655
+ f"Created {total_counts['table']} tables, {total_counts['rows']} rows, "
2656
+ f"{total_counts['columns']} columns, and {total_counts['cells']} cells "
2657
+ f"from guides across {len(self._flow_guides)} regions"
2658
+ )
2659
+
2660
+ return {"counts": total_counts, "regions": all_regions}
2661
+
2662
+ # Fallback for single page/region
2663
+ return self._build_grid_single_page(
2664
+ target=target,
2665
+ source=source,
2666
+ cell_padding=cell_padding,
2667
+ include_outer_boundaries=include_outer_boundaries,
2668
+ )
2669
+
2670
def _build_grid_multi_page(
    self,
    source: str,
    cell_padding: float,
    include_outer_boundaries: bool,
) -> Dict[str, Any]:
    """Build one logical table grid across all constituent regions of a FlowRegion.

    Phase 1 builds a physical grid on each constituent region, using only the
    guides that fall inside that region's bbox plus the region's own edges.
    Phase 2 stitches those physical pieces into ``FlowRegion`` objects (table,
    columns, and any row/cell that is split by a break with no guide on it) and
    registers the logical regions with the element manager of every page involved.

    Args:
        source: Source label assigned to the created regions.
        cell_padding: Padding forwarded to the per-region grid builder.
        include_outer_boundaries: Accepted for signature parity with the
            single-page builder. It is not consulted here because each region's
            own edges are always added as boundaries.

    Returns:
        A dict with ``"counts"`` (table/rows/columns/cells totals) and
        ``"regions"`` (the multi-page table plus lists of rows, columns, cells).

    Raises:
        ValueError: If the context is not a FlowRegion with a usable ``flow``.
    """
    from natural_pdf.flows.region import FlowRegion

    if not self.is_flow_region or not hasattr(self.context, "flow") or not self.context.flow:
        raise ValueError("Multi-page grid building requires a FlowRegion with a valid Flow.")

    # Phase 1: Build a physical grid on each region, clipping guides to that region.
    # ``built_regions[k]`` is the region that produced ``results_by_region[k]``; the two
    # lists must stay aligned because regions can be skipped below.
    built_regions = []
    results_by_region = []
    unified_verticals = self.vertical.data
    unified_horizontals = self.horizontal.data

    for region in self.context.constituent_regions:
        bounds = region.bbox
        if not bounds:
            continue

        # Keep only guides inside this region and always include the region's own
        # edges so cells are closed off at page breaks.
        clipped_verticals = sorted(
            {bounds[0], bounds[2], *(v for v in unified_verticals if bounds[0] <= v <= bounds[2])}
        )
        clipped_horizontals = sorted(
            {bounds[1], bounds[3], *(h for h in unified_horizontals if bounds[1] <= h <= bounds[3])}
        )

        if len(clipped_verticals) < 2 or len(clipped_horizontals) < 2:
            continue  # Not enough guides to form a cell

        region_guides = Guides(
            verticals=clipped_verticals,
            horizontals=clipped_horizontals,
            context=region,
        )

        grid_parts = region_guides._build_grid_single_page(
            target=region,
            source=source,
            cell_padding=cell_padding,
            include_outer_boundaries=False,  # Boundaries are already handled
        )

        if grid_parts["counts"]["table"] > 0:
            built_regions.append(region)
            results_by_region.append(grid_parts)

    if not results_by_region:
        return {
            "counts": {"table": 0, "rows": 0, "columns": 0, "cells": 0},
            "regions": {"table": None, "rows": [], "columns": [], "cells": []},
        }

    # Phase 2: Stitch physical regions into logical FlowRegions based on orientation
    flow = self.context.flow

    # The overall table is always a FlowRegion
    physical_tables = [res["regions"]["table"] for res in results_by_region]
    multi_page_table = FlowRegion(
        flow=flow, constituent_regions=physical_tables, source_flow_element=None
    )
    multi_page_table.source = source
    multi_page_table.region_type = "table"
    multi_page_table.metadata.update(
        {"is_multi_page": True, "num_rows": self.n_rows, "num_cols": self.n_cols}
    )

    final_rows = []
    final_cols = []
    final_cells = []

    orientation = self._get_flow_orientation()

    if orientation == "vertical":
        # Work on copies so the per-region result lists are not mutated.
        page_rows = [list(res["regions"]["rows"]) for res in results_by_region]
        page_cells = [list(res["regions"]["cells"]) for res in results_by_region]

        # Walk each break between consecutive built regions and merge split rows/cells.
        for i in range(len(results_by_region) - 1):
            # Use the region that actually produced result ``i``; indexing the
            # context's region list directly is wrong once any region was skipped.
            upper_region = built_regions[i]

            # A guide sitting on the break means the rows end cleanly there.
            is_break_bounded = any(
                abs(h - upper_region.bottom) < 0.1 for h in unified_horizontals
            )
            if is_break_bounded or not page_rows[i] or not page_rows[i + 1]:
                continue

            # No guide at break -> merge last row of A with first row of B
            last_row_A = page_rows[i].pop(-1)
            first_row_B = page_rows[i + 1].pop(0)

            merged_row = FlowRegion(flow, [last_row_A, first_row_B], source_flow_element=None)
            merged_row.source = source
            merged_row.region_type = "table_row"
            merged_row.metadata.update(
                {"row_index": last_row_A.metadata.get("row_index"), "is_multi_page": True}
            )
            page_rows[i].append(merged_row)  # Takes the place of A's last row

            # Merge the corresponding cells using explicit row/col indices
            last_row_idx = last_row_A.metadata.get("row_index")
            first_row_idx = first_row_B.metadata.get("row_index")

            last_cells_A = [
                c for c in page_cells[i] if c.metadata.get("row_index") == last_row_idx
            ]
            first_cells_B = [
                c for c in page_cells[i + 1] if c.metadata.get("row_index") == first_row_idx
            ]

            # Remove them from their page lists
            page_cells[i] = [
                c for c in page_cells[i] if c.metadata.get("row_index") != last_row_idx
            ]
            page_cells[i + 1] = [
                c for c in page_cells[i + 1] if c.metadata.get("row_index") != first_row_idx
            ]

            # Sort both lists by column index to keep alignment stable
            last_cells_A.sort(key=lambda c: c.metadata.get("col_index", 0))
            first_cells_B.sort(key=lambda c: c.metadata.get("col_index", 0))

            # Pair-wise merge
            for cell_A, cell_B in zip(last_cells_A, first_cells_B):
                merged_cell = FlowRegion(flow, [cell_A, cell_B], source_flow_element=None)
                merged_cell.source = source
                merged_cell.region_type = "table_cell"
                merged_cell.metadata.update(
                    {
                        "row_index": cell_A.metadata.get("row_index"),
                        "col_index": cell_A.metadata.get("col_index"),
                        "is_multi_page": True,
                    }
                )
                page_cells[i].append(merged_cell)

        # Flatten the potentially modified lists of rows and cells
        final_rows = [row for rows_list in page_rows for row in rows_list]
        final_cells = [cell for cells_list in page_cells for cell in cells_list]

        # Stitch columns, which always span vertically. ``zip`` stops at the
        # shortest per-region column list.
        physical_cols_by_index = zip(*(res["regions"]["columns"] for res in results_by_region))
        for j, physical_cols in enumerate(physical_cols_by_index):
            col_fr = FlowRegion(
                flow=flow, constituent_regions=list(physical_cols), source_flow_element=None
            )
            col_fr.source = source
            col_fr.region_type = "table_column"
            col_fr.metadata.update({"col_index": j, "is_multi_page": True})
            final_cols.append(col_fr)

    else:
        if orientation == "horizontal":
            # Merging the last column of A with the first column of B is not
            # implemented; the physical pieces are returned unstitched.
            logger.warning("Horizontal table stitching not fully implemented.")
        # Horizontal or unknown orientation: just flatten everything
        final_rows = [row for res in results_by_region for row in res["regions"]["rows"]]
        final_cols = [col for res in results_by_region for col in res["regions"]["columns"]]
        final_cells = [cell for res in results_by_region for cell in res["regions"]["cells"]]

    # Page-level registry: replace the physical table fragments with the multi-page
    # table so that page.find('table') finds the logical table, not fragments.
    constituent_pages = set()
    for region in self.context.constituent_regions:
        if hasattr(region, "page") and hasattr(region.page, "_element_mgr"):
            constituent_pages.add(region.page)

    # Only the specific Region tables created during this build are removed,
    # not every table that happens to share the same source.
    physical_tables_to_remove = set(physical_tables)

    for page in constituent_pages:
        try:
            existing_tables = page.find_all("table")
            tables_to_remove = [
                table
                for table in existing_tables
                if table in physical_tables_to_remove and not isinstance(table, FlowRegion)
            ]

            for table in tables_to_remove:
                page._element_mgr.remove_element(table, element_type="regions")
                logger.debug(f"Removed physical table fragment from page {page.page_number}")

            # Now register the multi-page table
            page._element_mgr.add_element(multi_page_table, element_type="regions")
            logger.debug(f"Registered multi-page table with page {page.page_number}")

        except Exception as e:
            # Best effort: a registry failure on one page must not abort the build.
            logger.warning(
                f"Failed to register multi-page table with page {page.page_number}: {e}"
            )

    # Also register multi-page rows, columns and cells with each page they touch so
    # that page.find('table_cell') etc. work across the multi-page structure.
    for kind, logical_regions in (
        ("row", final_rows),
        ("column", final_cols),
        ("cell", final_cells),
    ):
        for logical_region in logical_regions:
            if not hasattr(logical_region, "constituent_regions"):
                continue  # Plain single-page region
            for constituent_region in logical_region.constituent_regions:
                if not (
                    hasattr(constituent_region, "page")
                    and hasattr(constituent_region.page, "_element_mgr")
                ):
                    continue
                try:
                    constituent_region.page._element_mgr.add_element(
                        logical_region, element_type="regions"
                    )
                except Exception as e:
                    logger.warning(f"Failed to register multi-page {kind}: {e}")

    final_counts = {
        "table": 1,
        "rows": len(final_rows),
        "columns": len(final_cols),
        "cells": len(final_cells),
    }
    final_regions = {
        "table": multi_page_table,
        "rows": final_rows,
        "columns": final_cols,
        "cells": final_cells,
    }

    logger.info(
        f"Created 1 multi-page table, {final_counts['rows']} logical rows, "
        f"{final_counts['columns']} logical columns from guides and registered with all constituent pages"
    )

    return {"counts": final_counts, "regions": final_regions}
2911
+
2912
+ def _build_grid_single_page(
2913
+ self,
2914
+ target: Optional[Union["Page", "Region"]] = None,
2915
+ source: str = "guides",
2916
+ cell_padding: float = 0.5,
2917
+ include_outer_boundaries: bool = False,
2918
+ ) -> Dict[str, Any]:
2919
+ """
2920
+ Private method to create table structure on a single page or region.
2921
+ (Refactored from the original public build_grid method).
1965
2922
  """
1966
- # Determine target object
2923
+ # This method now only handles a single page/region context.
2924
+ # Looping for FlowRegions is handled by the public `build_grid` method.
2925
+
2926
+ # Original single-region logic follows...
1967
2927
  target_obj = target or self.context
1968
2928
  if not target_obj:
1969
2929
  raise ValueError("No target object available. Provide target parameter or context.")
@@ -2055,8 +3015,9 @@ class Guides:
2055
3015
  f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries"
2056
3016
  )
2057
3017
 
2058
- # Track creation counts
3018
+ # Track creation counts and regions
2059
3019
  counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
3020
+ created_regions = {"table": None, "rows": [], "columns": [], "cells": []}
2060
3021
 
2061
3022
  # Create overall table region
2062
3023
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
@@ -2076,6 +3037,7 @@ class Guides:
2076
3037
  )
2077
3038
  element_manager.add_element(table_region, element_type="regions")
2078
3039
  counts["table"] = 1
3040
+ created_regions["table"] = table_region
2079
3041
 
2080
3042
  # Create row regions
2081
3043
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
@@ -2089,6 +3051,7 @@ class Guides:
2089
3051
  row_region.metadata.update({"row_index": i, "source_guides": True})
2090
3052
  element_manager.add_element(row_region, element_type="regions")
2091
3053
  counts["rows"] += 1
3054
+ created_regions["rows"].append(row_region)
2092
3055
 
2093
3056
  # Create column regions
2094
3057
  if len(col_boundaries) >= 2 and len(row_boundaries) >= 2:
@@ -2102,6 +3065,7 @@ class Guides:
2102
3065
  col_region.metadata.update({"col_index": j, "source_guides": True})
2103
3066
  element_manager.add_element(col_region, element_type="regions")
2104
3067
  counts["columns"] += 1
3068
+ created_regions["columns"].append(col_region)
2105
3069
 
2106
3070
  # Create cell regions
2107
3071
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
@@ -2136,13 +3100,14 @@ class Guides:
2136
3100
  )
2137
3101
  element_manager.add_element(cell_region, element_type="regions")
2138
3102
  counts["cells"] += 1
3103
+ created_regions["cells"].append(cell_region)
2139
3104
 
2140
3105
  logger.info(
2141
3106
  f"Created {counts['table']} table, {counts['rows']} rows, "
2142
3107
  f"{counts['columns']} columns, and {counts['cells']} cells from guides"
2143
3108
  )
2144
3109
 
2145
- return counts
3110
+ return {"counts": counts, "regions": created_regions}
2146
3111
 
2147
3112
  def __repr__(self) -> str:
2148
3113
  """String representation of the guides."""
@@ -2157,6 +3122,22 @@ class Guides:
2157
3122
  if not self.context:
2158
3123
  return []
2159
3124
 
3125
+ # Handle FlowRegion context
3126
+ if self.is_flow_region:
3127
+ all_text_elements = []
3128
+ for region in self.context.constituent_regions:
3129
+ if hasattr(region, "find_all"):
3130
+ try:
3131
+ text_elements = region.find_all("text", apply_exclusions=False)
3132
+ elements = (
3133
+ text_elements.elements if hasattr(text_elements, "elements") else text_elements
3134
+ )
3135
+ all_text_elements.extend(elements)
3136
+ except Exception as e:
3137
+ logger.warning(f"Error getting text elements from region: {e}")
3138
+ return all_text_elements
3139
+
3140
+ # Original single-region logic
2160
3141
  # Get text elements from the context
2161
3142
  if hasattr(self.context, "find_all"):
2162
3143
  try:
@@ -2171,6 +3152,31 @@ class Guides:
2171
3152
  logger.warning("Context does not support text element search")
2172
3153
  return []
2173
3154
 
3155
def _spans_pages(self) -> bool:
    """Report whether some guide coordinate is recorded on more than one page.

    Only meaningful for a FlowRegion context; any other context yields False.
    Vertical guides are examined first, then horizontal ones. Each unified
    entry is a ``(coordinate, region)`` pair and the page is identified by
    ``region.page.page_number``.
    """
    if not self.is_flow_region:
        return False

    for attr_name in ("_unified_vertical", "_unified_horizontal"):
        # Map each coordinate to the set of page numbers it appears on.
        pages_by_coord = {}
        for coord, region in getattr(self, attr_name):
            pages_by_coord.setdefault(coord, set()).add(region.page.page_number)

        if any(len(page_numbers) > 1 for page_numbers in pages_by_coord.values()):
            return True

    return False
3179
+
2174
3180
  # -------------------------------------------------------------------------
2175
3181
  # Instance methods for fluent chaining (avoid name conflicts with class methods)
2176
3182
  # -------------------------------------------------------------------------
@@ -2318,3 +3324,24 @@ class Guides:
2318
3324
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2319
3325
 
2320
3326
  return self
3327
+
3328
def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
    """Classify the layout of a FlowRegion from its first two constituent regions.

    Returns "unknown" when the context is not a FlowRegion, has fewer than two
    constituent regions, or either of the first two regions has no bbox.
    Otherwise the gap between the two regions is measured on each axis
    (overlap counts as zero): a strictly larger vertical gap gives "vertical",
    anything else, including a tie, gives "horizontal".
    """
    if not self.is_flow_region:
        return "unknown"

    regions = self.context.constituent_regions
    if len(regions) < 2:
        return "unknown"

    # Only the first pair of regions is inspected.
    first, second = regions[0], regions[1]
    if not (first.bbox and second.bbox):
        return "unknown"

    # Signed separation along each axis; negative values mean the regions overlap.
    raw_gap_x = max(first.x0, second.x0) - min(first.x1, second.x1)
    raw_gap_y = max(first.top, second.top) - min(first.bottom, second.bottom)

    # Clamp overlaps to zero so only real gaps decide the direction.
    gap_x = raw_gap_x if raw_gap_x > 0 else 0
    gap_y = raw_gap_y if raw_gap_y > 0 else 0

    return "vertical" if gap_y > gap_x else "horizontal"