natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
natural_pdf/flows/flow.py CHANGED
@@ -1366,546 +1366,241 @@ class Flow(Visualizable):
1366
1366
  end_elements_unwrapped = _unwrap(end_elements)
1367
1367
 
1368
1368
  # ------------------------------------------------------------------
1369
- # PRIMARY IMPLEMENTATION operate on each Flow **segment region**
1370
- # independently so that sectioning happens *per-region*, not per page.
1371
- # ------------------------------------------------------------------
1372
- from natural_pdf.elements.element_collection import ElementCollection
1373
-
1374
- aggregated_sections = []
1375
-
1376
- # Helper to decide if an element lies inside a segment (Region)
1377
- def _element_in_segment(elem, segment_region):
1378
- try:
1379
- return segment_region.intersects(elem) # Region method – robust
1380
- except Exception:
1381
- # Fallback to bounding-box containment checks
1382
- if not hasattr(elem, "bbox"):
1383
- return False
1384
- ex0, etop, ex1, ebottom = elem.bbox
1385
- sx0, stop, sx1, sbottom = segment_region.bbox
1386
- return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
1387
-
1388
- for seg in self.segments:
1389
- # Each *seg* is guaranteed to be a Region (see _normalize_segments)
1390
-
1391
- # Resolve segment-specific boundary arguments
1392
- seg_start_elems = None
1393
- seg_end_elems = None
1394
-
1395
- # --- Handle selector strings ---
1396
- if isinstance(start_elements_unwrapped, str):
1397
- seg_start_elems = seg.find_all(start_elements_unwrapped).elements
1398
- elif start_elements_unwrapped is not None:
1399
- seg_start_elems = [
1400
- e for e in start_elements_unwrapped if _element_in_segment(e, seg)
1401
- ]
1402
-
1403
- if isinstance(end_elements_unwrapped, str):
1404
- seg_end_elems = seg.find_all(end_elements_unwrapped).elements
1405
- elif end_elements_unwrapped is not None:
1406
- seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1407
-
1408
- # Call Region.get_sections – this returns ElementCollection[Region]
1409
- seg_sections = seg.get_sections(
1410
- start_elements=seg_start_elems,
1411
- end_elements=seg_end_elems,
1412
- include_boundaries=include_boundaries,
1413
- orientation=orientation,
1414
- )
1415
-
1416
- if seg_sections:
1417
- aggregated_sections.extend(seg_sections.elements)
1418
-
1419
- # Optionally, handle new_section_on_page_break – interpreted here as
1420
- # *new_section_on_segment_break*: if True and there were *no* explicit
1421
- # boundaries, treat the entire segment as a single section.
1422
- if (
1423
- new_section_on_page_break
1424
- and not seg_sections
1425
- and start_elements_unwrapped is None
1426
- and end_elements_unwrapped is None
1427
- ):
1428
- aggregated_sections.append(seg)
1429
-
1430
- # ------------------------------------------------------------------
1431
- # CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
1432
- # span multiple segments and create FlowRegions for those cases.
1433
- # ------------------------------------------------------------------
1434
-
1435
- # If we have explicit start/end elements, check for cross-segment sections
1436
- if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
1437
- # Find all start and end elements across all segments
1438
- all_start_elements = []
1439
- all_end_elements = []
1440
-
1441
- # Map elements to their segments for tracking
1442
- element_to_segment = {}
1443
-
1444
- for seg_idx, seg in enumerate(self.segments):
1445
- if isinstance(start_elements_unwrapped, str):
1446
- seg_starts = seg.find_all(start_elements_unwrapped).elements
1447
- else:
1448
- seg_starts = [
1449
- e for e in start_elements_unwrapped if _element_in_segment(e, seg)
1450
- ]
1451
-
1452
- if isinstance(end_elements_unwrapped, str):
1453
- seg_ends = seg.find_all(end_elements_unwrapped).elements
1454
- else:
1455
- seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1456
-
1457
- for elem in seg_starts:
1458
- all_start_elements.append((elem, seg_idx))
1459
- element_to_segment[id(elem)] = seg_idx
1460
-
1461
- for elem in seg_ends:
1462
- all_end_elements.append((elem, seg_idx))
1463
- element_to_segment[id(elem)] = seg_idx
1464
-
1465
- # Sort by segment index, then by position within segment
1466
- all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1467
- all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1468
-
1469
- # Look for cross-segment pairs (start in one segment, end in another)
1470
- cross_segment_sections = []
1471
- used_starts = set()
1472
- used_ends = set()
1473
-
1474
- for start_elem, start_seg_idx in all_start_elements:
1475
- if id(start_elem) in used_starts:
1476
- continue
1477
-
1478
- # Find the next end element that comes after this start
1479
- matching_end = None
1480
- for end_elem, end_seg_idx in all_end_elements:
1481
- if id(end_elem) in used_ends:
1482
- continue
1483
-
1484
- # Check if this end comes after the start (by segment order or position)
1485
- if end_seg_idx > start_seg_idx or (
1486
- end_seg_idx == start_seg_idx
1487
- and (
1488
- end_elem.top > start_elem.top
1489
- or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
1490
- )
1491
- ):
1492
- matching_end = (end_elem, end_seg_idx)
1493
- break
1494
-
1495
- if matching_end is not None:
1496
- end_elem, end_seg_idx = matching_end
1497
-
1498
- # If start and end are in different segments, create FlowRegion
1499
- if start_seg_idx != end_seg_idx:
1500
- cross_segment_sections.append(
1501
- (start_elem, start_seg_idx, end_elem, end_seg_idx)
1502
- )
1503
- used_starts.add(id(start_elem))
1504
- used_ends.add(id(end_elem))
1505
-
1506
- # Create FlowRegions for cross-segment sections
1507
- from natural_pdf.elements.region import Region
1508
- from natural_pdf.flows.element import FlowElement
1509
- from natural_pdf.flows.region import FlowRegion
1510
-
1511
- for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
1512
- # Build constituent regions spanning from start segment to end segment
1513
- constituent_regions = []
1514
-
1515
- # First segment: from start element to bottom
1516
- start_seg = self.segments[start_seg_idx]
1517
- first_region = Region(
1518
- start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
1519
- )
1520
- constituent_regions.append(first_region)
1521
-
1522
- # Middle segments: full segments
1523
- for seg_idx in range(start_seg_idx + 1, end_seg_idx):
1524
- constituent_regions.append(self.segments[seg_idx])
1525
-
1526
- # Last segment: from top to end element
1527
- if end_seg_idx != start_seg_idx:
1528
- end_seg = self.segments[end_seg_idx]
1529
- last_region = Region(
1530
- end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
1531
- )
1532
- constituent_regions.append(last_region)
1533
-
1534
- # Create FlowRegion
1535
- flow_element = FlowElement(physical_object=start_elem, flow=self)
1536
- flow_region = FlowRegion(
1537
- flow=self,
1538
- constituent_regions=constituent_regions,
1539
- source_flow_element=flow_element,
1540
- boundary_element_found=end_elem,
1541
- )
1542
-
1543
- # Remove any single-segment sections that are now covered by this FlowRegion
1544
- # This prevents duplication of content
1545
- aggregated_sections = [
1546
- s
1547
- for s in aggregated_sections
1548
- if not any(
1549
- cr.intersects(s)
1550
- for cr in constituent_regions
1551
- if hasattr(cr, "intersects") and hasattr(s, "intersects")
1552
- )
1553
- ]
1554
-
1555
- aggregated_sections.append(flow_region)
1556
-
1557
- # ------------------------------------------------------------------
1558
- # NEW APPROACH: First collect ALL boundary elements across all segments,
1559
- # then pair them up to create sections (either single-segment Regions
1560
- # or multi-segment FlowRegions).
1369
+ # For Flow, we need to handle sections that may span segments
1370
+ # We'll process all segments together, not independently
1561
1371
  # ------------------------------------------------------------------
1562
1372
  from natural_pdf.elements.element_collection import ElementCollection
1563
1373
  from natural_pdf.elements.region import Region
1564
1374
  from natural_pdf.flows.element import FlowElement
1565
1375
  from natural_pdf.flows.region import FlowRegion
1566
1376
 
1567
- # Helper to decide if an element lies inside a segment (Region)
1568
- def _element_in_segment(elem, segment_region):
1569
- try:
1570
- return segment_region.intersects(elem) # Region method – robust
1571
- except Exception:
1572
- # Fallback to bounding-box containment checks
1573
- if not hasattr(elem, "bbox"):
1574
- return False
1575
- ex0, etop, ex1, ebottom = elem.bbox
1576
- sx0, stop, sx1, sbottom = segment_region.bbox
1577
- return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
1578
-
1579
- # Collect ALL boundary elements across all segments with their segment indices
1580
- all_start_elements = []
1581
- all_end_elements = []
1582
-
1583
- for seg_idx, seg in enumerate(self.segments):
1584
- # Find start elements in this segment
1377
+ # Helper to check if element is in segment
1378
+ def _element_in_segment(elem, segment):
1379
+ # Simple bbox check
1380
+ return (
1381
+ elem.page == segment.page
1382
+ and elem.top >= segment.top
1383
+ and elem.bottom <= segment.bottom
1384
+ and elem.x0 >= segment.x0
1385
+ and elem.x1 <= segment.x1
1386
+ )
1387
+
1388
+ # Collect all boundary elements with their segment info
1389
+ all_starts = []
1390
+ all_ends = []
1391
+
1392
+ for seg_idx, segment in enumerate(self.segments):
1393
+ # Find starts in this segment
1585
1394
  if isinstance(start_elements_unwrapped, str):
1586
- seg_starts = seg.find_all(start_elements_unwrapped).elements
1587
- elif start_elements_unwrapped is not None:
1588
- seg_starts = [e for e in start_elements_unwrapped if _element_in_segment(e, seg)]
1395
+ seg_starts = segment.find_all(start_elements_unwrapped).elements
1396
+ elif start_elements_unwrapped:
1397
+ seg_starts = [
1398
+ e for e in start_elements_unwrapped if _element_in_segment(e, segment)
1399
+ ]
1589
1400
  else:
1590
1401
  seg_starts = []
1591
1402
 
1592
- logger.debug(f"\n=== Processing segment {seg_idx} ===")
1593
- logger.debug(f"Segment bbox: {seg.bbox}")
1594
- logger.debug(
1595
- f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
1596
- )
1597
-
1598
- logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
1599
- for i, elem in enumerate(seg_starts):
1600
- logger.debug(
1601
- f" Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
1602
- )
1403
+ for elem in seg_starts:
1404
+ all_starts.append((elem, seg_idx, segment))
1603
1405
 
1604
- # Find end elements in this segment
1406
+ # Find ends in this segment
1605
1407
  if isinstance(end_elements_unwrapped, str):
1606
- seg_ends = seg.find_all(end_elements_unwrapped).elements
1607
- elif end_elements_unwrapped is not None:
1608
- seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
1408
+ seg_ends = segment.find_all(end_elements_unwrapped).elements
1409
+ elif end_elements_unwrapped:
1410
+ seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, segment)]
1609
1411
  else:
1610
1412
  seg_ends = []
1611
1413
 
1612
- logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
1613
- for i, elem in enumerate(seg_ends):
1614
- logger.debug(
1615
- f" End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
1616
- )
1617
-
1618
- # Add to global lists with segment index
1619
- for elem in seg_starts:
1620
- all_start_elements.append((elem, seg_idx))
1621
1414
  for elem in seg_ends:
1622
- all_end_elements.append((elem, seg_idx))
1415
+ all_ends.append((elem, seg_idx, segment))
1623
1416
 
1624
- # Sort by flow order: segment index first, then position within segment
1625
- all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1626
- all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1417
+ # Sort by segment index, then position
1418
+ all_starts.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1419
+ all_ends.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
1627
1420
 
1628
- logger.debug(f"\n=== Total boundary elements found ===")
1629
- logger.debug(f"Total start elements: {len(all_start_elements)}")
1630
- logger.debug(f"Total end elements: {len(all_end_elements)}")
1421
+ # If no boundary elements found, return empty collection
1422
+ if not all_starts and not all_ends:
1423
+ return ElementCollection([])
1631
1424
 
1632
- # Pair up start and end elements to create sections
1633
1425
  sections = []
1634
- used_starts = set()
1635
- used_ends = set()
1636
-
1637
- for start_elem, start_seg_idx in all_start_elements:
1638
- if id(start_elem) in used_starts:
1639
- continue
1640
-
1641
- logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
1642
- logger.debug(
1643
- f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
1644
- )
1645
-
1646
- # Find the next unused end element that comes after this start
1647
- matching_end = None
1648
- for end_elem, end_seg_idx in all_end_elements:
1649
- if id(end_elem) in used_ends:
1650
- continue
1651
1426
 
1652
- # Check if this end comes after the start in flow order
1653
- if end_seg_idx > start_seg_idx or (
1654
- end_seg_idx == start_seg_idx
1655
- and (
1656
- end_elem.top > start_elem.top
1657
- or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
1658
- )
1659
- ):
1660
- matching_end = (end_elem, end_seg_idx)
1661
- break
1662
-
1663
- if matching_end is not None:
1664
- end_elem, end_seg_idx = matching_end
1665
- used_starts.add(id(start_elem))
1666
- used_ends.add(id(end_elem))
1667
-
1668
- logger.debug(f" Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
1669
-
1670
- # Create section based on whether it spans segments
1671
- if start_seg_idx == end_seg_idx:
1672
- # Single segment section - use Region.get_section_between
1673
- seg = self.segments[start_seg_idx]
1674
- section = seg.get_section_between(start_elem, end_elem, include_boundaries)
1675
- sections.append(section)
1676
- logger.debug(f" Created single-segment Region")
1677
- else:
1678
- # Multi-segment section - create FlowRegion
1679
- logger.debug(
1680
- f" Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
1681
- )
1682
- constituent_regions = []
1683
-
1684
- # First segment: from start element to bottom
1685
- start_seg = self.segments[start_seg_idx]
1686
- if include_boundaries in ["start", "both"]:
1687
- first_top = start_elem.top
1688
- else:
1689
- first_top = start_elem.bottom
1690
- first_region = Region(
1691
- start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
1692
- )
1693
- constituent_regions.append(first_region)
1427
+ # Case 1: Only start elements provided
1428
+ if all_starts and not all_ends:
1429
+ for i in range(len(all_starts)):
1430
+ start_elem, start_seg_idx, start_seg = all_starts[i]
1694
1431
 
1695
- # Middle segments: full segments
1696
- for seg_idx in range(start_seg_idx + 1, end_seg_idx):
1697
- constituent_regions.append(self.segments[seg_idx])
1432
+ # Find end (next start or end of flow)
1433
+ if i + 1 < len(all_starts):
1434
+ # Section ends at next start
1435
+ end_elem, end_seg_idx, end_seg = all_starts[i + 1]
1698
1436
 
1699
- # Last segment: from top to end element
1700
- end_seg = self.segments[end_seg_idx]
1701
- if include_boundaries in ["end", "both"]:
1702
- last_bottom = end_elem.bottom
1437
+ if start_seg_idx == end_seg_idx:
1438
+ # Same segment - create regular Region
1439
+ section = start_seg.get_section_between(
1440
+ start_elem, end_elem, include_boundaries, orientation
1441
+ )
1442
+ if section:
1443
+ sections.append(section)
1703
1444
  else:
1704
- last_bottom = end_elem.top
1705
- last_region = Region(
1706
- end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
1707
- )
1708
- constituent_regions.append(last_region)
1709
-
1710
- # Create FlowRegion
1711
- flow_element = FlowElement(physical_object=start_elem, flow=self)
1712
- flow_region = FlowRegion(
1713
- flow=self,
1714
- constituent_regions=constituent_regions,
1715
- source_flow_element=flow_element,
1716
- boundary_element_found=end_elem,
1717
- )
1718
- sections.append(flow_region)
1445
+ # Cross-segment - create FlowRegion
1446
+ regions = []
1719
1447
 
1720
- # Handle special cases when only start or only end elements are provided
1721
- if start_elements_unwrapped is not None and end_elements_unwrapped is None:
1722
- logger.debug(f"\n=== Handling start-only elements (no end elements provided) ===")
1723
- for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
1724
- if id(start_elem) in used_starts:
1725
- continue
1726
-
1727
- # Find next start element
1728
- next_start = None
1729
- if i + 1 < len(all_start_elements):
1730
- next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
1731
- # Create section from this start to just before next start
1732
- if start_seg_idx == next_start_seg_idx:
1733
- # Same segment
1734
- seg = self.segments[start_seg_idx]
1735
- # Find element just before next start
1736
- all_elems = seg.get_elements()
1737
- all_elems.sort(key=lambda e: (e.top, e.x0))
1738
- try:
1739
- next_idx = all_elems.index(next_start_elem)
1740
- if next_idx > 0:
1741
- end_elem = all_elems[next_idx - 1]
1742
- section = seg.get_section_between(
1743
- start_elem, end_elem, include_boundaries
1744
- )
1745
- sections.append(section)
1746
- except ValueError:
1747
- pass
1748
- elif next_start_seg_idx == start_seg_idx + 1:
1749
- # Next start is in the immediately following segment in the flow
1750
- # Create a FlowRegion that spans from current start to just before next start
1751
- logger.debug(f" Next start is in next flow segment - creating FlowRegion")
1752
-
1753
- constituent_regions = []
1754
-
1755
- # First segment: from start element to bottom
1756
- start_seg = self.segments[start_seg_idx]
1757
- if include_boundaries in ["start", "both"]:
1758
- first_top = start_elem.top
1448
+ # First segment: from start to bottom
1449
+ if include_boundaries in ["both", "start"]:
1450
+ top = start_elem.top
1759
1451
  else:
1760
- first_top = start_elem.bottom
1761
- first_region = Region(
1762
- start_seg.page,
1763
- (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
1452
+ top = start_elem.bottom
1453
+ regions.append(
1454
+ Region(
1455
+ start_seg.page, (start_seg.x0, top, start_seg.x1, start_seg.bottom)
1456
+ )
1764
1457
  )
1765
- constituent_regions.append(first_region)
1766
-
1767
- # Next segment: from top to just before next start
1768
- next_seg = self.segments[next_start_seg_idx]
1769
- # Find element just before next start in the next segment
1770
- next_seg_elems = next_seg.get_elements()
1771
- next_seg_elems.sort(key=lambda e: (e.top, e.x0))
1772
1458
 
1773
- last_bottom = next_start_elem.top # Default to just before the next start
1774
- try:
1775
- next_idx = next_seg_elems.index(next_start_elem)
1776
- if next_idx > 0:
1777
- # Use the bottom of the element before next start
1778
- prev_elem = next_seg_elems[next_idx - 1]
1779
- last_bottom = prev_elem.bottom
1780
- except ValueError:
1781
- pass
1459
+ # Middle segments (full)
1460
+ for idx in range(start_seg_idx + 1, end_seg_idx):
1461
+ regions.append(self.segments[idx])
1782
1462
 
1783
- last_region = Region(
1784
- next_seg.page, (next_seg.x0, next_seg.top, next_seg.x1, last_bottom)
1463
+ # Last segment: from top to end element
1464
+ if include_boundaries in ["both", "end"]:
1465
+ bottom = end_elem.bottom
1466
+ else:
1467
+ bottom = end_elem.top
1468
+ regions.append(
1469
+ Region(end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, bottom))
1785
1470
  )
1786
- constituent_regions.append(last_region)
1787
1471
 
1788
1472
  # Create FlowRegion
1789
1473
  flow_element = FlowElement(physical_object=start_elem, flow=self)
1790
1474
  flow_region = FlowRegion(
1791
1475
  flow=self,
1792
- constituent_regions=constituent_regions,
1476
+ constituent_regions=regions,
1793
1477
  source_flow_element=flow_element,
1794
- boundary_element_found=None,
1478
+ boundary_element_found=end_elem,
1795
1479
  )
1480
+ flow_region.start_element = start_elem
1481
+ flow_region.end_element = end_elem
1482
+ flow_region._boundary_exclusions = include_boundaries
1796
1483
  sections.append(flow_region)
1797
- logger.debug(
1798
- f" Created FlowRegion with {len(constituent_regions)} constituent regions"
1484
+ else:
1485
+ # Last section - goes to end of flow
1486
+ if start_seg_idx == len(self.segments) - 1:
1487
+ # Within last segment
1488
+ section = start_seg.get_section_between(
1489
+ start_elem, None, include_boundaries, orientation
1799
1490
  )
1491
+ if section:
1492
+ sections.append(section)
1800
1493
  else:
1801
- # Next start is more than one segment away - just end at current segment
1802
- start_seg = self.segments[start_seg_idx]
1803
- if include_boundaries in ["start", "both"]:
1804
- region_top = start_elem.top
1494
+ # Spans to end
1495
+ regions = []
1496
+
1497
+ # First segment: from start to bottom
1498
+ if include_boundaries in ["both", "start"]:
1499
+ top = start_elem.top
1805
1500
  else:
1806
- region_top = start_elem.bottom
1807
- section = Region(
1808
- start_seg.page,
1809
- (start_seg.x0, region_top, start_seg.x1, start_seg.bottom),
1501
+ top = start_elem.bottom
1502
+ regions.append(
1503
+ Region(
1504
+ start_seg.page, (start_seg.x0, top, start_seg.x1, start_seg.bottom)
1505
+ )
1810
1506
  )
1811
- sections.append(section)
1812
- logger.debug(
1813
- f" Next start is {next_start_seg_idx - start_seg_idx} segments away - ending at current segment"
1507
+
1508
+ # Remaining segments (full)
1509
+ for idx in range(start_seg_idx + 1, len(self.segments)):
1510
+ regions.append(self.segments[idx])
1511
+
1512
+ # Create FlowRegion
1513
+ flow_element = FlowElement(physical_object=start_elem, flow=self)
1514
+ flow_region = FlowRegion(
1515
+ flow=self,
1516
+ constituent_regions=regions,
1517
+ source_flow_element=flow_element,
1518
+ boundary_element_found=None,
1814
1519
  )
1815
- else:
1816
- # Last start element: section goes to end of flow
1817
- # This could span multiple segments
1818
- if start_seg_idx == len(self.segments) - 1:
1819
- # Only in last segment
1820
- seg = self.segments[start_seg_idx]
1821
- if include_boundaries in ["start", "both"]:
1822
- region_top = start_elem.top
1823
- else:
1824
- region_top = start_elem.bottom
1825
- section = Region(seg.page, (seg.x0, region_top, seg.x1, seg.bottom))
1826
- sections.append(section)
1520
+ flow_region.start_element = start_elem
1521
+ flow_region._boundary_exclusions = include_boundaries
1522
+ sections.append(flow_region)
1523
+
1524
+ # Case 2: Both start and end elements
1525
+ elif all_starts and all_ends:
1526
+ # Match starts with ends
1527
+ used_ends = set()
1528
+
1529
+ for start_elem, start_seg_idx, start_seg in all_starts:
1530
+ # Find matching end
1531
+ best_end = None
1532
+
1533
+ for end_elem, end_seg_idx, end_seg in all_ends:
1534
+ if id(end_elem) in used_ends:
1535
+ continue
1536
+
1537
+ # End must come after start
1538
+ if end_seg_idx > start_seg_idx or (
1539
+ end_seg_idx == start_seg_idx and end_elem.top >= start_elem.bottom
1540
+ ):
1541
+ best_end = (end_elem, end_seg_idx, end_seg)
1542
+ break
1543
+
1544
+ if best_end:
1545
+ end_elem, end_seg_idx, end_seg = best_end
1546
+ used_ends.add(id(end_elem))
1547
+
1548
+ if start_seg_idx == end_seg_idx:
1549
+ # Same segment
1550
+ section = start_seg.get_section_between(
1551
+ start_elem, end_elem, include_boundaries, orientation
1552
+ )
1553
+ if section:
1554
+ sections.append(section)
1827
1555
  else:
1828
- # Spans to end of flow - create FlowRegion
1829
- constituent_regions = []
1556
+ # Cross-segment FlowRegion
1557
+ regions = []
1830
1558
 
1831
1559
  # First segment
1832
- start_seg = self.segments[start_seg_idx]
1833
- if include_boundaries in ["start", "both"]:
1834
- first_top = start_elem.top
1560
+ if include_boundaries in ["both", "start"]:
1561
+ top = start_elem.top
1835
1562
  else:
1836
- first_top = start_elem.bottom
1837
- first_region = Region(
1838
- start_seg.page,
1839
- (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
1563
+ top = start_elem.bottom
1564
+ regions.append(
1565
+ Region(
1566
+ start_seg.page, (start_seg.x0, top, start_seg.x1, start_seg.bottom)
1567
+ )
1840
1568
  )
1841
- constituent_regions.append(first_region)
1842
1569
 
1843
- # Remaining segments
1844
- for seg_idx in range(start_seg_idx + 1, len(self.segments)):
1845
- constituent_regions.append(self.segments[seg_idx])
1570
+ # Middle segments
1571
+ for idx in range(start_seg_idx + 1, end_seg_idx):
1572
+ regions.append(self.segments[idx])
1846
1573
 
1574
+ # Last segment
1575
+ if include_boundaries in ["both", "end"]:
1576
+ bottom = end_elem.bottom
1577
+ else:
1578
+ bottom = end_elem.top
1579
+ regions.append(
1580
+ Region(end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, bottom))
1581
+ )
1582
+
1583
+ # Create FlowRegion
1847
1584
  flow_element = FlowElement(physical_object=start_elem, flow=self)
1848
1585
  flow_region = FlowRegion(
1849
1586
  flow=self,
1850
- constituent_regions=constituent_regions,
1587
+ constituent_regions=regions,
1851
1588
  source_flow_element=flow_element,
1852
- boundary_element_found=None,
1589
+ boundary_element_found=end_elem,
1853
1590
  )
1591
+ flow_region.start_element = start_elem
1592
+ flow_region.end_element = end_elem
1593
+ flow_region._boundary_exclusions = include_boundaries
1854
1594
  sections.append(flow_region)
1855
1595
 
1856
- # Handle new_section_on_page_break when no explicit boundaries
1857
- if (
1858
- new_section_on_page_break
1859
- and start_elements_unwrapped is None
1860
- and end_elements_unwrapped is None
1861
- ):
1862
- # Each segment becomes its own section
1863
- sections = list(self.segments)
1864
-
1865
- # Sort sections by their position in the flow
1866
- def _section_sort_key(section):
1867
- if hasattr(section, "constituent_regions"):
1868
- # FlowRegion - use first constituent region
1869
- first_region = (
1870
- section.constituent_regions[0] if section.constituent_regions else None
1871
- )
1872
- if first_region:
1873
- # Find which segment this region belongs to
1874
- for idx, seg in enumerate(self.segments):
1875
- try:
1876
- if seg.intersects(first_region):
1877
- return (
1878
- idx,
1879
- getattr(first_region, "top", 0),
1880
- getattr(first_region, "x0", 0),
1881
- )
1882
- except:
1883
- pass
1884
- else:
1885
- # Regular Region
1886
- for idx, seg in enumerate(self.segments):
1887
- try:
1888
- if seg.intersects(section):
1889
- return (idx, getattr(section, "top", 0), getattr(section, "x0", 0))
1890
- except:
1891
- pass
1892
- return (float("inf"), 0, 0)
1893
-
1894
- sections.sort(key=_section_sort_key)
1895
-
1896
- logger.debug(f"\n=== Section creation complete ===")
1897
- logger.debug(f"Total sections created: {len(sections)}")
1898
- for i, section in enumerate(sections):
1899
- if hasattr(section, "constituent_regions"):
1900
- logger.debug(
1901
- f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
1902
- )
1903
- else:
1904
- logger.debug(f"Section {i}: Region with bbox={section.bbox}")
1596
+ # Case 3: Only end elements (sections from beginning to each end)
1597
+ elif not all_starts and all_ends:
1598
+ # TODO: Handle this case if needed
1599
+ pass
1905
1600
 
1906
1601
  return ElementCollection(sections)
1907
1602
 
1908
- def highlights(self, show: bool = False) -> "HighlightContext":
1603
+ def highlights(self, show: bool = False):
1909
1604
  """
1910
1605
  Create a highlight context for accumulating highlights.
1911
1606