natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Files changed (55)
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
  50. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
  51. optimization/memory_comparison.py +1 -1
  52. optimization/pdf_analyzer.py +2 -2
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
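The hunks below appear to come from natural_pdf/analyzers/guides.py (the +354/-258 entry above); diffs for the other files in the list are not reproduced here. Two renames in the file list change public import paths. A minimal sketch of the before/after imports, assuming the exported class names (ElementCollection, PDFCollection) are unchanged by the moves:

    # 0.1.40 import paths
    # from natural_pdf.elements.collections import ElementCollection
    # from natural_pdf.collections.pdf_collection import PDFCollection

    # 0.2.1.dev0 import paths (see the file renames above and the first hunk below)
    from natural_pdf.elements.element_collection import ElementCollection
    from natural_pdf.core.pdf_collection import PDFCollection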
@@ -13,7 +13,7 @@ from natural_pdf.utils.layout import merge_bboxes
  if TYPE_CHECKING:
  from natural_pdf.core.page import Page
  from natural_pdf.elements.base import Element
- from natural_pdf.elements.collections import ElementCollection
+ from natural_pdf.elements.element_collection import ElementCollection
  from natural_pdf.elements.region import Region
  from natural_pdf.flows.region import FlowRegion

@@ -21,7 +21,8 @@ logger = logging.getLogger(__name__)


  def _normalize_markers(
- markers: Union[str, List[str], "ElementCollection", None], obj: Union["Page", "Region", "FlowRegion"]
+ markers: Union[str, List[str], "ElementCollection", None],
+ obj: Union["Page", "Region", "FlowRegion"],
  ) -> List[str]:
  """
  Normalize markers parameter to a list of text strings for guide creation.
@@ -168,7 +169,7 @@ class GuidesList(UserList):
  for region in self._parent.context.constituent_regions:
  # Normalize markers for this region
  marker_texts = _normalize_markers(markers, region)
-
+
  # Create guides for this region
  region_guides = Guides.from_content(
  obj=region,
@@ -178,24 +179,29 @@ class GuidesList(UserList):
  outer=outer,
  tolerance=tolerance,
  )
-
+
  # Collect guides from this region
  if self._axis == "vertical":
  all_guides.extend(region_guides.vertical)
  else:
  all_guides.extend(region_guides.horizontal)
-
+
  # Update parent's flow guides structure
  if append:
  # Append to existing
- existing = [coord for coord, _ in
- (self._parent._unified_vertical if self._axis == "vertical"
- else self._parent._unified_horizontal)]
+ existing = [
+ coord
+ for coord, _ in (
+ self._parent._unified_vertical
+ if self._axis == "vertical"
+ else self._parent._unified_horizontal
+ )
+ ]
  all_guides = existing + all_guides
-
+
  # Remove duplicates and sort
  unique_guides = sorted(list(set(all_guides)))
-
+
  # Clear and rebuild unified view
  if self._axis == "vertical":
  self._parent._unified_vertical = []
@@ -221,27 +227,27 @@ class GuidesList(UserList):
  break
  self._parent._horizontal_cache = None
  self.data = unique_guides
-
+
  # Update per-region guides
  for region in self._parent.context.constituent_regions:
  region_verticals = []
  region_horizontals = []
-
+
  for coord, r in self._parent._unified_vertical:
  if r == region:
  region_verticals.append(coord)
-
+
  for coord, r in self._parent._unified_horizontal:
  if r == region:
  region_horizontals.append(coord)
-
+
  self._parent._flow_guides[region] = (
  sorted(region_verticals),
- sorted(region_horizontals)
+ sorted(region_horizontals),
  )
-
+
  return self._parent
-
+
  # Original single-region logic
  # Normalize markers to list of text strings
  marker_texts = _normalize_markers(markers, target_obj)
@@ -286,7 +292,7 @@ class GuidesList(UserList):
  source_label: Optional[str] = None,
  max_lines: Optional[int] = None,
  outer: bool = False,
- detection_method: str = "vector",
+ detection_method: str = "pixels",
  resolution: int = 192,
  *,
  n: Optional[int] = None,
@@ -340,7 +346,7 @@ class GuidesList(UserList):
  if self._parent.is_flow_region:
  # Create guides across all constituent regions
  all_guides = []
-
+
  for region in self._parent.context.constituent_regions:
  # Create guides for this specific region
  region_guides = Guides.from_lines(
@@ -353,26 +359,31 @@ class GuidesList(UserList):
  outer=outer,
  detection_method=detection_method,
  resolution=resolution,
- **detect_kwargs
+ **detect_kwargs,
  )
-
+
  # Collect guides from this region
  if self._axis == "vertical":
  all_guides.extend(region_guides.vertical)
  else:
  all_guides.extend(region_guides.horizontal)
-
+
  # Update parent's flow guides structure
  if append:
  # Append to existing
- existing = [coord for coord, _ in
- (self._parent._unified_vertical if self._axis == "vertical"
- else self._parent._unified_horizontal)]
+ existing = [
+ coord
+ for coord, _ in (
+ self._parent._unified_vertical
+ if self._axis == "vertical"
+ else self._parent._unified_horizontal
+ )
+ ]
  all_guides = existing + all_guides
-
+
  # Remove duplicates and sort
  unique_guides = sorted(list(set(all_guides)))
-
+
  # Clear and rebuild unified view
  if self._axis == "vertical":
  self._parent._unified_vertical = []
@@ -398,25 +409,25 @@ class GuidesList(UserList):
  break
  self._parent._horizontal_cache = None
  self.data = unique_guides
-
+
  # Update per-region guides
  for region in self._parent.context.constituent_regions:
  region_verticals = []
  region_horizontals = []
-
+
  for coord, r in self._parent._unified_vertical:
  if r == region:
  region_verticals.append(coord)
-
+
  for coord, r in self._parent._unified_horizontal:
  if r == region:
  region_horizontals.append(coord)
-
+
  self._parent._flow_guides[region] = (
  sorted(region_verticals),
- sorted(region_horizontals)
+ sorted(region_horizontals),
  )
-
+
  return self._parent

  # Original single-region logic
@@ -458,8 +469,11 @@ class GuidesList(UserList):
  return self._parent

  def from_whitespace(
- self, obj: Optional[Union["Page", "Region", "FlowRegion"]] = None, min_gap: float = 10,
- *, append: bool = False
+ self,
+ obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
+ min_gap: float = 10,
+ *,
+ append: bool = False,
  ) -> "Guides":
  """
  Create guides from whitespace gaps.
@@ -479,32 +493,33 @@ class GuidesList(UserList):
  if self._parent.is_flow_region:
  # Create guides across all constituent regions
  all_guides = []
-
+
  for region in self._parent.context.constituent_regions:
  # Create guides for this specific region
- region_guides = Guides.from_whitespace(
- obj=region,
- axis=self._axis,
- min_gap=min_gap
- )
-
+ region_guides = Guides.from_whitespace(obj=region, axis=self._axis, min_gap=min_gap)
+
  # Collect guides from this region
  if self._axis == "vertical":
  all_guides.extend(region_guides.vertical)
  else:
  all_guides.extend(region_guides.horizontal)
-
+
  # Update parent's flow guides structure
  if append:
  # Append to existing
- existing = [coord for coord, _ in
- (self._parent._unified_vertical if self._axis == "vertical"
- else self._parent._unified_horizontal)]
+ existing = [
+ coord
+ for coord, _ in (
+ self._parent._unified_vertical
+ if self._axis == "vertical"
+ else self._parent._unified_horizontal
+ )
+ ]
  all_guides = existing + all_guides
-
+
  # Remove duplicates and sort
  unique_guides = sorted(list(set(all_guides)))
-
+
  # Clear and rebuild unified view
  if self._axis == "vertical":
  self._parent._unified_vertical = []
@@ -530,25 +545,25 @@ class GuidesList(UserList):
  break
  self._parent._horizontal_cache = None
  self.data = unique_guides
-
+
  # Update per-region guides
  for region in self._parent.context.constituent_regions:
  region_verticals = []
  region_horizontals = []
-
+
  for coord, r in self._parent._unified_vertical:
  if r == region:
  region_verticals.append(coord)
-
+
  for coord, r in self._parent._unified_horizontal:
  if r == region:
  region_horizontals.append(coord)
-
+
  self._parent._flow_guides[region] = (
  sorted(region_verticals),
- sorted(region_horizontals)
+ sorted(region_horizontals),
  )
-
+
  return self._parent

  # Original single-region logic
@@ -915,7 +930,7 @@ class Guides:

  # Check if we're dealing with a FlowRegion
  self.is_flow_region = hasattr(context, "constituent_regions")
-
+
  # If FlowRegion, we'll store guides per constituent region
  if self.is_flow_region:
  self._flow_guides: Dict["Region", Tuple[List[float], List[float]]] = {}
@@ -976,7 +991,7 @@ class Guides:
  if self.is_flow_region:
  # Invalidate cache when setting new values
  self._vertical_cache = None
-
+
  if value is None:
  self._vertical.data = []
  elif isinstance(value, Guides):
@@ -1018,7 +1033,7 @@ class Guides:
  if self.is_flow_region:
  # Invalidate cache when setting new values
  self._horizontal_cache = None
-
+
  if value is None:
  self._horizontal.data = []
  elif isinstance(value, Guides):
@@ -1132,7 +1147,7 @@ class Guides:
  max_lines_h: Optional[int] = None,
  max_lines_v: Optional[int] = None,
  outer: bool = False,
- detection_method: str = "vector",
+ detection_method: str = "pixels",
  resolution: int = 192,
  **detect_kwargs,
  ) -> "Guides":
@@ -1163,7 +1178,7 @@ class Guides:
  # Handle FlowRegion
  if hasattr(obj, "constituent_regions"):
  guides = cls(context=obj)
-
+
  # Process each constituent region
  for region in obj.constituent_regions:
  # Create guides for this specific region
@@ -1177,27 +1192,27 @@ class Guides:
  outer=outer,
  detection_method=detection_method,
  resolution=resolution,
- **detect_kwargs
+ **detect_kwargs,
  )
-
+
  # Store in flow guides
  guides._flow_guides[region] = (
  list(region_guides.vertical),
- list(region_guides.horizontal)
+ list(region_guides.horizontal),
  )
-
+
  # Add to unified view
  for v in region_guides.vertical:
  guides._unified_vertical.append((v, region))
  for h in region_guides.horizontal:
  guides._unified_horizontal.append((h, region))
-
+
  # Invalidate caches to force rebuild on next access
  guides._vertical_cache = None
  guides._horizontal_cache = None
-
+
  return guides
-
+
  # Original single-region logic follows...
  # Get bounds for potential outer guides
  if hasattr(obj, "bbox"):
@@ -1228,12 +1243,17 @@ class Guides:
  }

  # Handle threshold parameter
- if threshold == "auto":
+ if threshold == "auto" and detection_method == "vector":
  # Auto mode: use very low thresholds with max_lines constraints
  detect_params["peak_threshold_h"] = 0.0
  detect_params["peak_threshold_v"] = 0.0
  detect_params["max_lines_h"] = max_lines_h
  detect_params["max_lines_v"] = max_lines_v
+ if threshold == "auto" and detection_method == "pixels":
+ detect_params["peak_threshold_h"] = 0.5
+ detect_params["peak_threshold_v"] = 0.5
+ detect_params["max_lines_h"] = max_lines_h
+ detect_params["max_lines_v"] = max_lines_v
  else:
  # Fixed threshold mode
  detect_params["peak_threshold_h"] = (
@@ -1275,6 +1295,7 @@ class Guides:
  lines = []

  # Filter by the source we just used
+
  lines = [
  l for l in lines if getattr(l, "source", None) == detect_params["source_label"]
  ]
@@ -1399,7 +1420,7 @@ class Guides:
  # Handle FlowRegion
  if hasattr(obj, "constituent_regions"):
  guides = cls(context=obj)
-
+
  # Process each constituent region
  for region in obj.constituent_regions:
  # Create guides for this specific region
@@ -1409,27 +1430,27 @@ class Guides:
  markers=markers,
  align=align,
  outer=outer,
- tolerance=tolerance
+ tolerance=tolerance,
  )
-
+
  # Store in flow guides
  guides._flow_guides[region] = (
  list(region_guides.vertical),
- list(region_guides.horizontal)
+ list(region_guides.horizontal),
  )
-
+
  # Add to unified view
  for v in region_guides.vertical:
  guides._unified_vertical.append((v, region))
  for h in region_guides.horizontal:
  guides._unified_horizontal.append((h, region))
-
+
  # Invalidate caches
  guides._vertical_cache = None
  guides._horizontal_cache = None
-
+
  return guides
-
+
  # Original single-region logic follows...
  guides_coords = []
  bounds = None
@@ -1594,27 +1615,38 @@ class Guides:
  if self.is_flow_region:
  all_text_elements = []
  region_bounds = {}
-
+
  for region in self.context.constituent_regions:
  # Get text elements from this region
  if hasattr(region, "find_all"):
  try:
  text_elements = region.find_all("text", apply_exclusions=False)
- elements = text_elements.elements if hasattr(text_elements, "elements") else text_elements
+ elements = (
+ text_elements.elements
+ if hasattr(text_elements, "elements")
+ else text_elements
+ )
  all_text_elements.extend(elements)
-
+
  # Store bounds for each region
  if hasattr(region, "bbox"):
  region_bounds[region] = region.bbox
  elif hasattr(region, "x0"):
- region_bounds[region] = (region.x0, region.top, region.x1, region.bottom)
+ region_bounds[region] = (
+ region.x0,
+ region.top,
+ region.x1,
+ region.bottom,
+ )
  except Exception as e:
  logger.warning(f"Error getting text elements from region: {e}")
-
+
  if not all_text_elements:
- logger.warning("No text elements found across flow regions for whitespace detection")
+ logger.warning(
+ "No text elements found across flow regions for whitespace detection"
+ )
  return self
-
+
  # Find whitespace gaps across all regions
  if axis == "vertical":
  gaps = self._find_vertical_whitespace_gaps(all_text_elements, min_gap, threshold)
@@ -1624,14 +1656,14 @@ class Guides:
  for coord, region in self._unified_vertical:
  all_guides.append(coord)
  guide_to_region_map.setdefault(coord, []).append(region)
-
+
  if gaps and all_guides:
  # Keep a copy of original guides to maintain mapping
  original_guides = all_guides.copy()
-
+
  # Snap guides to gaps
  self._snap_guides_to_gaps(all_guides, gaps, axis)
-
+
  # Update the unified view with snapped positions
  self._unified_vertical = []
  for i, new_coord in enumerate(all_guides):
@@ -1641,7 +1673,7 @@ class Guides:
  regions = guide_to_region_map.get(original_coord, [])
  for region in regions:
  self._unified_vertical.append((new_coord, region))
-
+
  # Update individual region guides
  for region in self._flow_guides:
  region_verticals = []
@@ -1649,13 +1681,13 @@ class Guides:
  if r == region:
  region_verticals.append(coord)
  self._flow_guides[region] = (
- sorted(list(set(region_verticals))), # Deduplicate here
- self._flow_guides[region][1]
+ sorted(list(set(region_verticals))),  # Deduplicate here
+ self._flow_guides[region][1],
  )
-
+
  # Invalidate cache
  self._vertical_cache = None
-
+
  elif axis == "horizontal":
  gaps = self._find_horizontal_whitespace_gaps(all_text_elements, min_gap, threshold)
  # Get all horizontal guides across regions
@@ -1664,14 +1696,14 @@ class Guides:
  for coord, region in self._unified_horizontal:
  all_guides.append(coord)
  guide_to_region_map.setdefault(coord, []).append(region)
-
+
  if gaps and all_guides:
  # Keep a copy of original guides to maintain mapping
  original_guides = all_guides.copy()
-
+
  # Snap guides to gaps
  self._snap_guides_to_gaps(all_guides, gaps, axis)
-
+
  # Update the unified view with snapped positions
  self._unified_horizontal = []
  for i, new_coord in enumerate(all_guides):
@@ -1680,7 +1712,7 @@ class Guides:
  regions = guide_to_region_map.get(original_coord, [])
  for region in regions:
  self._unified_horizontal.append((new_coord, region))
-
+
  # Update individual region guides
  for region in self._flow_guides:
  region_horizontals = []
@@ -1689,17 +1721,17 @@ class Guides:
  region_horizontals.append(coord)
  self._flow_guides[region] = (
  self._flow_guides[region][0],
- sorted(list(set(region_horizontals))) # Deduplicate here
+ sorted(list(set(region_horizontals))),  # Deduplicate here
  )
-
+
  # Invalidate cache
  self._horizontal_cache = None
-
+
  else:
  raise ValueError("axis must be 'vertical' or 'horizontal'")
-
+
  return self
-
+
  # Original single-region logic
  # Get elements for trough detection
  text_elements = self._get_text_elements()
@@ -1794,10 +1826,10 @@ class Guides:

  # Handle FlowRegion context merging
  new_context = self.context or other.context
-
+
  # If both are flow regions, we might need a more complex merge,
  # but for now, just picking one context is sufficient.
-
+
  # Create the new Guides object
  new_guides = Guides(
  verticals=combined_verticals,
@@ -1815,7 +1847,7 @@ class Guides:
  new_guides._flow_guides.update(self._flow_guides)
  if hasattr(other, "_flow_guides"):
  new_guides._flow_guides.update(other._flow_guides)
-
+
  # Re-initialize unified views
  if hasattr(self, "_unified_vertical"):
  new_guides._unified_vertical.extend(self._unified_vertical)
@@ -1826,7 +1858,7 @@ class Guides:
  new_guides._unified_horizontal.extend(self._unified_horizontal)
  if hasattr(other, "_unified_horizontal"):
  new_guides._unified_horizontal.extend(other._unified_horizontal)
-
+
  # Invalidate caches to force rebuild
  new_guides._vertical_cache = None
  new_guides._horizontal_cache = None
@@ -1850,65 +1882,73 @@ class Guides:
  if self.is_flow_region and (on is None or on == self.context):
  if not self._flow_guides:
  raise ValueError("No guides to show for FlowRegion")
-
+
  # Get stacking parameters from kwargs or use defaults
- stack_direction = kwargs.get('stack_direction', 'vertical')
- stack_gap = kwargs.get('stack_gap', 5)
- stack_background_color = kwargs.get('stack_background_color', (255, 255, 255))
-
+ stack_direction = kwargs.get("stack_direction", "vertical")
+ stack_gap = kwargs.get("stack_gap", 5)
+ stack_background_color = kwargs.get("stack_background_color", (255, 255, 255))
+
  # First, render all constituent regions without guides to get base images
  base_images = []
  region_infos = [] # Store region info for guide coordinate mapping
-
+
  for region in self.context.constituent_regions:
  try:
- # Render region without guides
- img = region.to_image(**kwargs)
+ # Render region without guides using new system
+ if hasattr(region, "render"):
+ img = region.render(
+ resolution=kwargs.get("resolution", 150),
+ width=kwargs.get("width", None),
+ crop=True,  # Always crop regions to their bounds
+ )
+ else:
+ # Fallback to old method
+ img = region.render(**kwargs)
  if img:
  base_images.append(img)
-
+
  # Calculate scaling factors for this region
  scale_x = img.width / region.width
  scale_y = img.height / region.height
-
- region_infos.append({
- 'region': region,
- 'img_width': img.width,
- 'img_height': img.height,
- 'scale_x': scale_x,
- 'scale_y': scale_y,
- 'pdf_x0': region.x0,
- 'pdf_top': region.top,
- 'pdf_x1': region.x1,
- 'pdf_bottom': region.bottom
- })
+
+ region_infos.append(
+ {
+ "region": region,
+ "img_width": img.width,
+ "img_height": img.height,
+ "scale_x": scale_x,
+ "scale_y": scale_y,
+ "pdf_x0": region.x0,
+ "pdf_top": region.top,
+ "pdf_x1": region.x1,
+ "pdf_bottom": region.bottom,
+ }
+ )
  except Exception as e:
  logger.warning(f"Failed to render region: {e}")
-
+
  if not base_images:
  raise ValueError("Failed to render any images for FlowRegion")
-
+
  # Calculate final canvas size based on stacking direction
  if stack_direction == "vertical":
  final_width = max(img.width for img in base_images)
  final_height = (
- sum(img.height for img in base_images)
- + (len(base_images) - 1) * stack_gap
+ sum(img.height for img in base_images) + (len(base_images) - 1) * stack_gap
  )
  else: # horizontal
  final_width = (
- sum(img.width for img in base_images)
- + (len(base_images) - 1) * stack_gap
+ sum(img.width for img in base_images) + (len(base_images) - 1) * stack_gap
  )
  final_height = max(img.height for img in base_images)
-
+
  # Create unified canvas
  canvas = Image.new("RGB", (final_width, final_height), stack_background_color)
  draw = ImageDraw.Draw(canvas)
-
+
  # Paste base images and track positions
  region_positions = [] # (region_info, paste_x, paste_y)
-
+
  if stack_direction == "vertical":
  current_y = 0
  for i, (img, info) in enumerate(zip(base_images, region_infos)):
@@ -1923,44 +1963,50 @@ class Guides:
  canvas.paste(img, (current_x, paste_y))
  region_positions.append((info, current_x, paste_y))
  current_x += img.width + stack_gap
-
+
  # Now draw guides on the unified canvas
  # Draw vertical guides (blue) - these extend through the full canvas height
  for v_coord in self.vertical:
  # Find which region(s) this guide intersects
  for info, paste_x, paste_y in region_positions:
- if info['pdf_x0'] <= v_coord <= info['pdf_x1']:
+ if info["pdf_x0"] <= v_coord <= info["pdf_x1"]:
  # This guide is within this region's x-bounds
  # Convert PDF coordinate to pixel coordinate relative to the region
- adjusted_x = v_coord - info['pdf_x0']
- pixel_x = adjusted_x * info['scale_x'] + paste_x
-
+ adjusted_x = v_coord - info["pdf_x0"]
+ pixel_x = adjusted_x * info["scale_x"] + paste_x
+
  # Draw full-height line on canvas (not clipped to region)
  if 0 <= pixel_x <= final_width:
  x_pixel = int(pixel_x)
- draw.line([(x_pixel, 0), (x_pixel, final_height - 1)],
- fill=(0, 0, 255, 200), width=2)
+ draw.line(
+ [(x_pixel, 0), (x_pixel, final_height - 1)],
+ fill=(0, 0, 255, 200),
+ width=2,
+ )
  break # Only draw once per guide
-
+
  # Draw horizontal guides (red) - these extend through the full canvas width
  for h_coord in self.horizontal:
  # Find which region(s) this guide intersects
  for info, paste_x, paste_y in region_positions:
- if info['pdf_top'] <= h_coord <= info['pdf_bottom']:
+ if info["pdf_top"] <= h_coord <= info["pdf_bottom"]:
  # This guide is within this region's y-bounds
  # Convert PDF coordinate to pixel coordinate relative to the region
- adjusted_y = h_coord - info['pdf_top']
- pixel_y = adjusted_y * info['scale_y'] + paste_y
-
+ adjusted_y = h_coord - info["pdf_top"]
+ pixel_y = adjusted_y * info["scale_y"] + paste_y
+
  # Draw full-width line on canvas (not clipped to region)
  if 0 <= pixel_y <= final_height:
  y_pixel = int(pixel_y)
- draw.line([(0, y_pixel), (final_width - 1, y_pixel)],
- fill=(255, 0, 0, 200), width=2)
+ draw.line(
+ [(0, y_pixel), (final_width - 1, y_pixel)],
+ fill=(255, 0, 0, 200),
+ width=2,
+ )
  break # Only draw once per guide
-
+
  return canvas
-
+
  # Original single-region logic follows...
  # Determine what to display guides on
  target = on if on is not None else self.context
@@ -1981,10 +2027,15 @@ class Guides:
  raise ValueError("No target specified and no context available for guides display")

  # Prepare kwargs for image generation
- image_kwargs = kwargs.copy()
+ image_kwargs = {}

- # Always turn off highlights to avoid visual clutter
- image_kwargs["include_highlights"] = False
+ # Extract only the parameters that the new render() method accepts
+ if "resolution" in kwargs:
+ image_kwargs["resolution"] = kwargs["resolution"]
+ if "width" in kwargs:
+ image_kwargs["width"] = kwargs["width"]
+ if "crop" in kwargs:
+ image_kwargs["crop"] = kwargs["crop"]

  # If target is a region-like object, crop to just that region
  if hasattr(target, "bbox") and hasattr(target, "page"):
@@ -1992,13 +2043,17 @@ class Guides:
  image_kwargs["crop"] = True

  # Get base image
- if hasattr(target, "to_image"):
- img = target.to_image(**image_kwargs)
+ if hasattr(target, "render"):
+ # Use the new unified rendering system
+ img = target.render(**image_kwargs)
+ elif hasattr(target, "render"):
+ # Fallback to old method if available
+ img = target.render(**image_kwargs)
  elif hasattr(target, "mode") and hasattr(target, "size"):
  # It's already a PIL Image
  img = target
  else:
- raise ValueError(f"Object {target} does not support to_image() and is not a PIL Image")
+ raise ValueError(f"Object {target} does not support render() and is not a PIL Image")

  if img is None:
  raise ValueError("Failed to generate base image")
@@ -2599,64 +2654,70 @@ class Guides:
  source: Source label for created regions (for identification)
  cell_padding: Internal padding for cell regions in points
  include_outer_boundaries: Whether to add boundaries at edges if missing
- multi_page: Controls multi-page table creation for FlowRegions.
- - "auto": (default) Creates a multi-page grid if guides span pages.
- - True: Forces creation of a multi-page grid.
- - False: Creates separate grids for each page.
+ multi_page: Controls multi-region table creation for FlowRegions.
+ - "auto": (default) Creates a unified grid if there are multiple regions or guides span pages.
+ - True: Forces creation of a unified multi-region grid.
+ - False: Creates separate grids for each region.

  Returns:
  Dictionary with 'counts' and 'regions' created.
  """
  # Dispatch to appropriate implementation based on context and flags
  if self.is_flow_region:
+ # Check if we should create a unified multi-region grid
+ has_multiple_regions = len(self.context.constituent_regions) > 1
  spans_pages = self._spans_pages()
- if multi_page is True or (multi_page == "auto" and spans_pages):
+
+ # Create unified grid if:
+ # - multi_page is explicitly True, OR
+ # - multi_page is "auto" AND (spans pages OR has multiple regions)
+ if multi_page is True or (
+ multi_page == "auto" and (spans_pages or has_multiple_regions)
+ ):
  return self._build_grid_multi_page(
  source=source,
  cell_padding=cell_padding,
  include_outer_boundaries=include_outer_boundaries,
  )
  else:
- # FlowRegion context, but creating separate tables per page
+ # Single region FlowRegion or multi_page=False: create separate tables per region
  total_counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
  all_regions = {"table": [], "rows": [], "columns": [], "cells": []}
-
+
  for region in self.context.constituent_regions:
  if region in self._flow_guides:
  verticals, horizontals = self._flow_guides[region]
-
+
  region_guides = Guides(
- verticals=verticals,
- horizontals=horizontals,
- context=region
+ verticals=verticals, horizontals=horizontals, context=region
  )
-
+
  try:
  result = region_guides._build_grid_single_page(
  target=region,
  source=source,
  cell_padding=cell_padding,
- include_outer_boundaries=include_outer_boundaries
+ include_outer_boundaries=include_outer_boundaries,
  )
-
+
  for key in total_counts:
  total_counts[key] += result["counts"][key]
-
+
  if result["regions"]["table"]:
  all_regions["table"].append(result["regions"]["table"])
  all_regions["rows"].extend(result["regions"]["rows"])
  all_regions["columns"].extend(result["regions"]["columns"])
  all_regions["cells"].extend(result["regions"]["cells"])
-
+
  except Exception as e:
  logger.warning(f"Failed to build grid on region: {e}")
-
+
  logger.info(
  f"Created {total_counts['table']} tables, {total_counts['rows']} rows, "
  f"{total_counts['columns']} columns, and {total_counts['cells']} cells "
  f"from guides across {len(self._flow_guides)} regions"
  )
-
+
  return {"counts": total_counts, "regions": all_regions}

  # Fallback for single page/region
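The dispatch above chooses between a unified multi-region grid and per-region grids based on multi_page. A minimal usage sketch, assuming the public wrapper around _build_grid_single_page/_build_grid_multi_page is Guides.build_grid and that flow_region is an existing FlowRegion (both names are assumptions, not taken from this hunk):

    # Build guides over a FlowRegion, then turn them into table/row/column/cell regions
    guides = Guides.from_lines(obj=flow_region)       # detection_method now defaults to "pixels"

    unified = guides.build_grid(multi_page="auto")    # one grid when guides span pages or regions
    separate = guides.build_grid(multi_page=False)    # one grid per constituent region
    # Both return {"counts": {...}, "regions": {...}} per the docstring above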
@@ -2673,7 +2734,16 @@ class Guides:
  cell_padding: float,
  include_outer_boundaries: bool,
  ) -> Dict[str, Any]:
- """Builds a single, coherent grid across multiple pages of a FlowRegion."""
+ """
+ Builds a single, coherent grid across multiple regions of a FlowRegion.
+
+ Creates physical Region objects for each constituent region with _fragment
+ region types (e.g., table_column_fragment), then stitches them into logical
+ FlowRegion objects. Both are registered with pages, but the fragment types
+ allow easy differentiation:
+ - find_all('table_column') returns only logical columns
+ - find_all('table_column_fragment') returns only physical fragments
+ """
  from natural_pdf.flows.region import FlowRegion

  if not self.is_flow_region or not hasattr(self.context, "flow") or not self.context.flow:
@@ -2699,9 +2769,9 @@ class Guides:
  # Ensure the region's own boundaries are included to close off cells at page breaks
  clipped_verticals = sorted(list(set([bounds[0], bounds[2]] + clipped_verticals)))
  clipped_horizontals = sorted(list(set([bounds[1], bounds[3]] + clipped_horizontals)))
-
+
  if len(clipped_verticals) < 2 or len(clipped_horizontals) < 2:
- continue # Not enough guides to form a cell
+ continue  # Not enough guides to form a cell

  region_guides = Guides(
  verticals=clipped_verticals,
@@ -2713,10 +2783,30 @@ class Guides:
  target=region,
  source=source,
  cell_padding=cell_padding,
- include_outer_boundaries=False, # Boundaries are already handled
+ include_outer_boundaries=False,  # Boundaries are already handled
  )

  if grid_parts["counts"]["table"] > 0:
+ # Mark physical regions as fragments by updating their region_type
+ # This happens before stitching into logical FlowRegions
+ if len(self.context.constituent_regions) > 1:
+ # Update region types to indicate these are fragments
+ if grid_parts["regions"]["table"]:
+ grid_parts["regions"]["table"].region_type = "table_fragment"
+ grid_parts["regions"]["table"].metadata["is_fragment"] = True
+
+ for row in grid_parts["regions"]["rows"]:
+ row.region_type = "table_row_fragment"
+ row.metadata["is_fragment"] = True
+
+ for col in grid_parts["regions"]["columns"]:
+ col.region_type = "table_column_fragment"
+ col.metadata["is_fragment"] = True
+
+ for cell in grid_parts["regions"]["cells"]:
+ cell.region_type = "table_cell_fragment"
+ cell.metadata["is_fragment"] = True
+
  results_by_region.append(grid_parts)

  if not results_by_region:
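The fragment region types introduced above make it possible to separate logical (stitched) regions from their per-page physical pieces when querying. A small sketch mirroring the docstring, assuming page is a natural_pdf Page on which a multi-region grid was built:

    # Logical columns: FlowRegion objects stitched across constituent regions
    logical_columns = page.find_all("table_column")

    # Physical fragments: the per-region Region pieces, registered separately
    column_fragments = page.find_all("table_column_fragment")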
@@ -2738,7 +2828,7 @@ class Guides:
  multi_page_table.metadata.update(
  {"is_multi_page": True, "num_rows": self.n_rows, "num_cols": self.n_cols}
  )
-
+
  # Initialize final region collections
  final_rows = []
  final_cols = []
@@ -2756,32 +2846,53 @@ class Guides:
  # Iterate through page breaks to merge split rows/cells
  for i in range(len(results_by_region) - 1):
  region_A = self.context.constituent_regions[i]
-
+
  # Check if a guide exists at the boundary
- is_break_bounded = any(abs(h - region_A.bottom) < 0.1 for h in self.horizontal.data)
+ is_break_bounded = any(
+ abs(h - region_A.bottom) < 0.1 for h in self.horizontal.data
+ )

- if not is_break_bounded and page_rows[i] and page_rows[i+1]:
+ if not is_break_bounded and page_rows[i] and page_rows[i + 1]:
  # No guide at break -> merge last row of A with first row of B
  last_row_A = page_rows[i].pop(-1)
- first_row_B = page_rows[i+1].pop(0)
-
- merged_row = FlowRegion(flow, [last_row_A, first_row_B], source_flow_element=None)
+ first_row_B = page_rows[i + 1].pop(0)
+
+ merged_row = FlowRegion(
+ flow, [last_row_A, first_row_B], source_flow_element=None
+ )
  merged_row.source = source
  merged_row.region_type = "table_row"
- merged_row.metadata.update({"row_index": last_row_A.metadata.get("row_index"), "is_multi_page": True})
- page_rows[i].append(merged_row) # Add merged row back in place of A's last
+ merged_row.metadata.update(
+ {
+ "row_index": last_row_A.metadata.get("row_index"),
+ "is_multi_page": True,
+ }
+ )
+ page_rows[i].append(merged_row)  # Add merged row back in place of A's last

  # Merge the corresponding cells using explicit row/col indices
  last_row_idx = last_row_A.metadata.get("row_index")
  first_row_idx = first_row_B.metadata.get("row_index")

  # Cells belonging to those rows
- last_cells_A = [c for c in page_cells[i] if c.metadata.get("row_index") == last_row_idx]
- first_cells_B = [c for c in page_cells[i+1] if c.metadata.get("row_index") == first_row_idx]
+ last_cells_A = [
+ c for c in page_cells[i] if c.metadata.get("row_index") == last_row_idx
+ ]
+ first_cells_B = [
+ c
+ for c in page_cells[i + 1]
+ if c.metadata.get("row_index") == first_row_idx
+ ]

  # Remove them from their page lists
- page_cells[i] = [c for c in page_cells[i] if c.metadata.get("row_index") != last_row_idx]
- page_cells[i+1] = [c for c in page_cells[i+1] if c.metadata.get("row_index") != first_row_idx]
+ page_cells[i] = [
+ c for c in page_cells[i] if c.metadata.get("row_index") != last_row_idx
+ ]
+ page_cells[i + 1] = [
+ c
+ for c in page_cells[i + 1]
+ if c.metadata.get("row_index") != first_row_idx
+ ]

  # Sort both lists by column index to keep alignment stable
  last_cells_A.sort(key=lambda c: c.metadata.get("col_index", 0))
@@ -2789,14 +2900,18 @@ class Guides:

  # Pair-wise merge
  for cell_A, cell_B in zip(last_cells_A, first_cells_B):
- merged_cell = FlowRegion(flow, [cell_A, cell_B], source_flow_element=None)
+ merged_cell = FlowRegion(
+ flow, [cell_A, cell_B], source_flow_element=None
+ )
  merged_cell.source = source
  merged_cell.region_type = "table_cell"
- merged_cell.metadata.update({
- "row_index": cell_A.metadata.get("row_index"),
- "col_index": cell_A.metadata.get("col_index"),
- "is_multi_page": True
- })
+ merged_cell.metadata.update(
+ {
+ "row_index": cell_A.metadata.get("row_index"),
+ "col_index": cell_A.metadata.get("col_index"),
+ "is_multi_page": True,
+ }
+ )
  page_cells[i].append(merged_cell)

  # Flatten the potentially modified lists of rows and cells
@@ -2804,23 +2919,27 @@ class Guides:
  final_cells = [cell for cells_list in page_cells for cell in cells_list]

  # Stitch columns, which always span vertically
- physical_cols_by_index = zip(*(res["regions"]["columns"] for res in results_by_region))
+ physical_cols_by_index = zip(
+ *(res["regions"]["columns"] for res in results_by_region)
+ )
  for j, physical_cols in enumerate(physical_cols_by_index):
- col_fr = FlowRegion(flow=flow, constituent_regions=list(physical_cols), source_flow_element=None)
+ col_fr = FlowRegion(
+ flow=flow, constituent_regions=list(physical_cols), source_flow_element=None
+ )
  col_fr.source = source
  col_fr.region_type = "table_column"
  col_fr.metadata.update({"col_index": j, "is_multi_page": True})
  final_cols.append(col_fr)

  elif orientation == "horizontal":
- # Symmetric logic for horizontal flow (not fully implemented here for brevity)
- # This would merge last column of A with first column of B if no vertical guide exists
+ # Symmetric logic for horizontal flow (not fully implemented here for brevity)
+ # This would merge last column of A with first column of B if no vertical guide exists
  logger.warning("Horizontal table stitching not fully implemented.")
  final_rows = [row for res in results_by_region for row in res["regions"]["rows"]]
  final_cols = [col for res in results_by_region for col in res["regions"]["columns"]]
  final_cells = [cell for res in results_by_region for cell in res["regions"]["cells"]]
-
- else: # Unknown orientation, just flatten everything
+
+ else:  # Unknown orientation, just flatten everything
  final_rows = [row for res in results_by_region for row in res["regions"]["rows"]]
  final_cols = [col for res in results_by_region for col in res["regions"]["columns"]]
  final_cells = [cell for res in results_by_region for cell in res["regions"]["cells"]]
@@ -2829,65 +2948,40 @@ class Guides:
  # This ensures that page.find('table') finds the logical multi-page table, not fragments
  constituent_pages = set()
  for region in self.context.constituent_regions:
- if hasattr(region, 'page') and hasattr(region.page, '_element_mgr'):
+ if hasattr(region, "page") and hasattr(region.page, "_element_mgr"):
  constituent_pages.add(region.page)
-
- # First, remove ONLY the specific individual Region tables that were created during this build
- # (i.e., the physical_tables), not ALL tables with the same source
- physical_tables_to_remove = set(physical_tables) # Convert to set for fast lookup
-
+
+ # Register the logical multi-page table with all constituent pages
+ # Note: Physical table fragments are already registered with region_type="table_fragment"
  for page in constituent_pages:
  try:
- # Find and remove only the specific physical tables that are part of this multi-page table
- existing_tables = page.find_all('table')
- tables_to_remove = [
- table for table in existing_tables
- if (table in physical_tables_to_remove and
- not isinstance(table, FlowRegion)) # Only remove the specific Region tables we created
- ]
-
- for table in tables_to_remove:
- page._element_mgr.remove_element(table, element_type="regions")
- logger.debug(f"Removed physical table fragment from page {page.page_number}")
-
- # Now register the multi-page table
  page._element_mgr.add_element(multi_page_table, element_type="regions")
  logger.debug(f"Registered multi-page table with page {page.page_number}")
-
+
  except Exception as e:
- logger.warning(f"Failed to register multi-page table with page {page.page_number}: {e}")
-
- # SMART PAGE-LEVEL REGISTRY: Also register rows, columns, and cells with their respective pages
- # This ensures that page.find('table_cell') etc. also work across the multi-page structure
- for row in final_rows:
- if hasattr(row, 'constituent_regions'):
- # This is a FlowRegion row spanning multiple pages
- for constituent_region in row.constituent_regions:
- if hasattr(constituent_region, 'page') and hasattr(constituent_region.page, '_element_mgr'):
- try:
- constituent_region.page._element_mgr.add_element(row, element_type="regions")
- except Exception as e:
- logger.warning(f"Failed to register multi-page row: {e}")
-
- for col in final_cols:
- if hasattr(col, 'constituent_regions'):
- # This is a FlowRegion column spanning multiple pages
- for constituent_region in col.constituent_regions:
- if hasattr(constituent_region, 'page') and hasattr(constituent_region.page, '_element_mgr'):
- try:
- constituent_region.page._element_mgr.add_element(col, element_type="regions")
- except Exception as e:
- logger.warning(f"Failed to register multi-page column: {e}")
-
- for cell in final_cells:
- if hasattr(cell, 'constituent_regions'):
- # This is a FlowRegion cell spanning multiple pages
- for constituent_region in cell.constituent_regions:
- if hasattr(constituent_region, 'page') and hasattr(constituent_region.page, '_element_mgr'):
- try:
- constituent_region.page._element_mgr.add_element(cell, element_type="regions")
- except Exception as e:
- logger.warning(f"Failed to register multi-page cell: {e}")
+ logger.warning(
+ f"Failed to register multi-page table with page {page.page_number}: {e}"
+ )
+
+ # SMART PAGE-LEVEL REGISTRY: Register logical FlowRegion elements.
+ # Physical fragments are already registered with their pages with _fragment region types,
+ # so users can differentiate between logical regions and physical fragments.
+ for page in constituent_pages:
+ try:
+ # Register all logical rows with this page
+ for row in final_rows:
+ page._element_mgr.add_element(row, element_type="regions")
+
+ # Register all logical columns with this page
+ for col in final_cols:
+ page._element_mgr.add_element(col, element_type="regions")
+
+ # Register all logical cells with this page
+ for cell in final_cells:
+ page._element_mgr.add_element(cell, element_type="regions")
+
+ except Exception as e:
+ logger.warning(f"Failed to register multi-region table elements with page: {e}")

  final_counts = {
  "table": 1,
@@ -3130,7 +3224,9 @@ class Guides:
  try:
  text_elements = region.find_all("text", apply_exclusions=False)
  elements = (
- text_elements.elements if hasattr(text_elements, "elements") else text_elements
+ text_elements.elements
+ if hasattr(text_elements, "elements")
+ else text_elements
  )
  all_text_elements.extend(elements)
  except Exception as e:
@@ -3161,7 +3257,7 @@ class Guides:
  v_guide_pages = {}
  for coord, region in self._unified_vertical:
  v_guide_pages.setdefault(coord, set()).add(region.page.page_number)
-
+
  for pages in v_guide_pages.values():
  if len(pages) > 1:
  return True
@@ -3331,7 +3427,7 @@ class Guides:
  return "unknown"

  r1 = self.context.constituent_regions[0]
- r2 = self.context.constituent_regions[1] # Compare first two regions
+ r2 = self.context.constituent_regions[1]  # Compare first two regions

  if not r1.bbox or not r2.bbox:
  return "unknown"

  return "unknown"