natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +88 -22
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0
natural_pdf/flows/flow.py
CHANGED
@@ -1366,546 +1366,241 @@ class Flow(Visualizable):
|
|
1366
1366
|
end_elements_unwrapped = _unwrap(end_elements)
|
1367
1367
|
|
1368
1368
|
# ------------------------------------------------------------------
|
1369
|
-
#
|
1370
|
-
#
|
1371
|
-
# ------------------------------------------------------------------
|
1372
|
-
from natural_pdf.elements.element_collection import ElementCollection
|
1373
|
-
|
1374
|
-
aggregated_sections = []
|
1375
|
-
|
1376
|
-
# Helper to decide if an element lies inside a segment (Region)
|
1377
|
-
def _element_in_segment(elem, segment_region):
|
1378
|
-
try:
|
1379
|
-
return segment_region.intersects(elem) # Region method – robust
|
1380
|
-
except Exception:
|
1381
|
-
# Fallback to bounding-box containment checks
|
1382
|
-
if not hasattr(elem, "bbox"):
|
1383
|
-
return False
|
1384
|
-
ex0, etop, ex1, ebottom = elem.bbox
|
1385
|
-
sx0, stop, sx1, sbottom = segment_region.bbox
|
1386
|
-
return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
|
1387
|
-
|
1388
|
-
for seg in self.segments:
|
1389
|
-
# Each *seg* is guaranteed to be a Region (see _normalize_segments)
|
1390
|
-
|
1391
|
-
# Resolve segment-specific boundary arguments
|
1392
|
-
seg_start_elems = None
|
1393
|
-
seg_end_elems = None
|
1394
|
-
|
1395
|
-
# --- Handle selector strings ---
|
1396
|
-
if isinstance(start_elements_unwrapped, str):
|
1397
|
-
seg_start_elems = seg.find_all(start_elements_unwrapped).elements
|
1398
|
-
elif start_elements_unwrapped is not None:
|
1399
|
-
seg_start_elems = [
|
1400
|
-
e for e in start_elements_unwrapped if _element_in_segment(e, seg)
|
1401
|
-
]
|
1402
|
-
|
1403
|
-
if isinstance(end_elements_unwrapped, str):
|
1404
|
-
seg_end_elems = seg.find_all(end_elements_unwrapped).elements
|
1405
|
-
elif end_elements_unwrapped is not None:
|
1406
|
-
seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
|
1407
|
-
|
1408
|
-
# Call Region.get_sections – this returns ElementCollection[Region]
|
1409
|
-
seg_sections = seg.get_sections(
|
1410
|
-
start_elements=seg_start_elems,
|
1411
|
-
end_elements=seg_end_elems,
|
1412
|
-
include_boundaries=include_boundaries,
|
1413
|
-
orientation=orientation,
|
1414
|
-
)
|
1415
|
-
|
1416
|
-
if seg_sections:
|
1417
|
-
aggregated_sections.extend(seg_sections.elements)
|
1418
|
-
|
1419
|
-
# Optionally, handle new_section_on_page_break – interpreted here as
|
1420
|
-
# *new_section_on_segment_break*: if True and there were *no* explicit
|
1421
|
-
# boundaries, treat the entire segment as a single section.
|
1422
|
-
if (
|
1423
|
-
new_section_on_page_break
|
1424
|
-
and not seg_sections
|
1425
|
-
and start_elements_unwrapped is None
|
1426
|
-
and end_elements_unwrapped is None
|
1427
|
-
):
|
1428
|
-
aggregated_sections.append(seg)
|
1429
|
-
|
1430
|
-
# ------------------------------------------------------------------
|
1431
|
-
# CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
|
1432
|
-
# span multiple segments and create FlowRegions for those cases.
|
1433
|
-
# ------------------------------------------------------------------
|
1434
|
-
|
1435
|
-
# If we have explicit start/end elements, check for cross-segment sections
|
1436
|
-
if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
|
1437
|
-
# Find all start and end elements across all segments
|
1438
|
-
all_start_elements = []
|
1439
|
-
all_end_elements = []
|
1440
|
-
|
1441
|
-
# Map elements to their segments for tracking
|
1442
|
-
element_to_segment = {}
|
1443
|
-
|
1444
|
-
for seg_idx, seg in enumerate(self.segments):
|
1445
|
-
if isinstance(start_elements_unwrapped, str):
|
1446
|
-
seg_starts = seg.find_all(start_elements_unwrapped).elements
|
1447
|
-
else:
|
1448
|
-
seg_starts = [
|
1449
|
-
e for e in start_elements_unwrapped if _element_in_segment(e, seg)
|
1450
|
-
]
|
1451
|
-
|
1452
|
-
if isinstance(end_elements_unwrapped, str):
|
1453
|
-
seg_ends = seg.find_all(end_elements_unwrapped).elements
|
1454
|
-
else:
|
1455
|
-
seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
|
1456
|
-
|
1457
|
-
for elem in seg_starts:
|
1458
|
-
all_start_elements.append((elem, seg_idx))
|
1459
|
-
element_to_segment[id(elem)] = seg_idx
|
1460
|
-
|
1461
|
-
for elem in seg_ends:
|
1462
|
-
all_end_elements.append((elem, seg_idx))
|
1463
|
-
element_to_segment[id(elem)] = seg_idx
|
1464
|
-
|
1465
|
-
# Sort by segment index, then by position within segment
|
1466
|
-
all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
|
1467
|
-
all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
|
1468
|
-
|
1469
|
-
# Look for cross-segment pairs (start in one segment, end in another)
|
1470
|
-
cross_segment_sections = []
|
1471
|
-
used_starts = set()
|
1472
|
-
used_ends = set()
|
1473
|
-
|
1474
|
-
for start_elem, start_seg_idx in all_start_elements:
|
1475
|
-
if id(start_elem) in used_starts:
|
1476
|
-
continue
|
1477
|
-
|
1478
|
-
# Find the next end element that comes after this start
|
1479
|
-
matching_end = None
|
1480
|
-
for end_elem, end_seg_idx in all_end_elements:
|
1481
|
-
if id(end_elem) in used_ends:
|
1482
|
-
continue
|
1483
|
-
|
1484
|
-
# Check if this end comes after the start (by segment order or position)
|
1485
|
-
if end_seg_idx > start_seg_idx or (
|
1486
|
-
end_seg_idx == start_seg_idx
|
1487
|
-
and (
|
1488
|
-
end_elem.top > start_elem.top
|
1489
|
-
or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
|
1490
|
-
)
|
1491
|
-
):
|
1492
|
-
matching_end = (end_elem, end_seg_idx)
|
1493
|
-
break
|
1494
|
-
|
1495
|
-
if matching_end is not None:
|
1496
|
-
end_elem, end_seg_idx = matching_end
|
1497
|
-
|
1498
|
-
# If start and end are in different segments, create FlowRegion
|
1499
|
-
if start_seg_idx != end_seg_idx:
|
1500
|
-
cross_segment_sections.append(
|
1501
|
-
(start_elem, start_seg_idx, end_elem, end_seg_idx)
|
1502
|
-
)
|
1503
|
-
used_starts.add(id(start_elem))
|
1504
|
-
used_ends.add(id(end_elem))
|
1505
|
-
|
1506
|
-
# Create FlowRegions for cross-segment sections
|
1507
|
-
from natural_pdf.elements.region import Region
|
1508
|
-
from natural_pdf.flows.element import FlowElement
|
1509
|
-
from natural_pdf.flows.region import FlowRegion
|
1510
|
-
|
1511
|
-
for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
|
1512
|
-
# Build constituent regions spanning from start segment to end segment
|
1513
|
-
constituent_regions = []
|
1514
|
-
|
1515
|
-
# First segment: from start element to bottom
|
1516
|
-
start_seg = self.segments[start_seg_idx]
|
1517
|
-
first_region = Region(
|
1518
|
-
start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
|
1519
|
-
)
|
1520
|
-
constituent_regions.append(first_region)
|
1521
|
-
|
1522
|
-
# Middle segments: full segments
|
1523
|
-
for seg_idx in range(start_seg_idx + 1, end_seg_idx):
|
1524
|
-
constituent_regions.append(self.segments[seg_idx])
|
1525
|
-
|
1526
|
-
# Last segment: from top to end element
|
1527
|
-
if end_seg_idx != start_seg_idx:
|
1528
|
-
end_seg = self.segments[end_seg_idx]
|
1529
|
-
last_region = Region(
|
1530
|
-
end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
|
1531
|
-
)
|
1532
|
-
constituent_regions.append(last_region)
|
1533
|
-
|
1534
|
-
# Create FlowRegion
|
1535
|
-
flow_element = FlowElement(physical_object=start_elem, flow=self)
|
1536
|
-
flow_region = FlowRegion(
|
1537
|
-
flow=self,
|
1538
|
-
constituent_regions=constituent_regions,
|
1539
|
-
source_flow_element=flow_element,
|
1540
|
-
boundary_element_found=end_elem,
|
1541
|
-
)
|
1542
|
-
|
1543
|
-
# Remove any single-segment sections that are now covered by this FlowRegion
|
1544
|
-
# This prevents duplication of content
|
1545
|
-
aggregated_sections = [
|
1546
|
-
s
|
1547
|
-
for s in aggregated_sections
|
1548
|
-
if not any(
|
1549
|
-
cr.intersects(s)
|
1550
|
-
for cr in constituent_regions
|
1551
|
-
if hasattr(cr, "intersects") and hasattr(s, "intersects")
|
1552
|
-
)
|
1553
|
-
]
|
1554
|
-
|
1555
|
-
aggregated_sections.append(flow_region)
|
1556
|
-
|
1557
|
-
# ------------------------------------------------------------------
|
1558
|
-
# NEW APPROACH: First collect ALL boundary elements across all segments,
|
1559
|
-
# then pair them up to create sections (either single-segment Regions
|
1560
|
-
# or multi-segment FlowRegions).
|
1369
|
+
# For Flow, we need to handle sections that may span segments
|
1370
|
+
# We'll process all segments together, not independently
|
1561
1371
|
# ------------------------------------------------------------------
|
1562
1372
|
from natural_pdf.elements.element_collection import ElementCollection
|
1563
1373
|
from natural_pdf.elements.region import Region
|
1564
1374
|
from natural_pdf.flows.element import FlowElement
|
1565
1375
|
from natural_pdf.flows.region import FlowRegion
|
1566
1376
|
|
1567
|
-
# Helper to
|
1568
|
-
def _element_in_segment(elem,
|
1569
|
-
|
1570
|
-
|
1571
|
-
|
1572
|
-
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
|
1584
|
-
# Find start elements in this segment
|
1377
|
+
# Helper to check if element is in segment
|
1378
|
+
def _element_in_segment(elem, segment):
|
1379
|
+
# Simple bbox check
|
1380
|
+
return (
|
1381
|
+
elem.page == segment.page
|
1382
|
+
and elem.top >= segment.top
|
1383
|
+
and elem.bottom <= segment.bottom
|
1384
|
+
and elem.x0 >= segment.x0
|
1385
|
+
and elem.x1 <= segment.x1
|
1386
|
+
)
|
1387
|
+
|
1388
|
+
# Collect all boundary elements with their segment info
|
1389
|
+
all_starts = []
|
1390
|
+
all_ends = []
|
1391
|
+
|
1392
|
+
for seg_idx, segment in enumerate(self.segments):
|
1393
|
+
# Find starts in this segment
|
1585
1394
|
if isinstance(start_elements_unwrapped, str):
|
1586
|
-
seg_starts =
|
1587
|
-
elif start_elements_unwrapped
|
1588
|
-
seg_starts = [
|
1395
|
+
seg_starts = segment.find_all(start_elements_unwrapped).elements
|
1396
|
+
elif start_elements_unwrapped:
|
1397
|
+
seg_starts = [
|
1398
|
+
e for e in start_elements_unwrapped if _element_in_segment(e, segment)
|
1399
|
+
]
|
1589
1400
|
else:
|
1590
1401
|
seg_starts = []
|
1591
1402
|
|
1592
|
-
|
1593
|
-
|
1594
|
-
logger.debug(
|
1595
|
-
f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
|
1596
|
-
)
|
1597
|
-
|
1598
|
-
logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
|
1599
|
-
for i, elem in enumerate(seg_starts):
|
1600
|
-
logger.debug(
|
1601
|
-
f" Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
|
1602
|
-
)
|
1403
|
+
for elem in seg_starts:
|
1404
|
+
all_starts.append((elem, seg_idx, segment))
|
1603
1405
|
|
1604
|
-
# Find
|
1406
|
+
# Find ends in this segment
|
1605
1407
|
if isinstance(end_elements_unwrapped, str):
|
1606
|
-
seg_ends =
|
1607
|
-
elif end_elements_unwrapped
|
1608
|
-
seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e,
|
1408
|
+
seg_ends = segment.find_all(end_elements_unwrapped).elements
|
1409
|
+
elif end_elements_unwrapped:
|
1410
|
+
seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, segment)]
|
1609
1411
|
else:
|
1610
1412
|
seg_ends = []
|
1611
1413
|
|
1612
|
-
logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
|
1613
|
-
for i, elem in enumerate(seg_ends):
|
1614
|
-
logger.debug(
|
1615
|
-
f" End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
|
1616
|
-
)
|
1617
|
-
|
1618
|
-
# Add to global lists with segment index
|
1619
|
-
for elem in seg_starts:
|
1620
|
-
all_start_elements.append((elem, seg_idx))
|
1621
1414
|
for elem in seg_ends:
|
1622
|
-
|
1415
|
+
all_ends.append((elem, seg_idx, segment))
|
1623
1416
|
|
1624
|
-
# Sort by
|
1625
|
-
|
1626
|
-
|
1417
|
+
# Sort by segment index, then position
|
1418
|
+
all_starts.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
|
1419
|
+
all_ends.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
|
1627
1420
|
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1421
|
+
# If no boundary elements found, return empty collection
|
1422
|
+
if not all_starts and not all_ends:
|
1423
|
+
return ElementCollection([])
|
1631
1424
|
|
1632
|
-
# Pair up start and end elements to create sections
|
1633
1425
|
sections = []
|
1634
|
-
used_starts = set()
|
1635
|
-
used_ends = set()
|
1636
|
-
|
1637
|
-
for start_elem, start_seg_idx in all_start_elements:
|
1638
|
-
if id(start_elem) in used_starts:
|
1639
|
-
continue
|
1640
|
-
|
1641
|
-
logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
|
1642
|
-
logger.debug(
|
1643
|
-
f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
|
1644
|
-
)
|
1645
|
-
|
1646
|
-
# Find the next unused end element that comes after this start
|
1647
|
-
matching_end = None
|
1648
|
-
for end_elem, end_seg_idx in all_end_elements:
|
1649
|
-
if id(end_elem) in used_ends:
|
1650
|
-
continue
|
1651
1426
|
|
1652
|
-
|
1653
|
-
|
1654
|
-
|
1655
|
-
|
1656
|
-
end_elem.top > start_elem.top
|
1657
|
-
or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
|
1658
|
-
)
|
1659
|
-
):
|
1660
|
-
matching_end = (end_elem, end_seg_idx)
|
1661
|
-
break
|
1662
|
-
|
1663
|
-
if matching_end is not None:
|
1664
|
-
end_elem, end_seg_idx = matching_end
|
1665
|
-
used_starts.add(id(start_elem))
|
1666
|
-
used_ends.add(id(end_elem))
|
1667
|
-
|
1668
|
-
logger.debug(f" Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
|
1669
|
-
|
1670
|
-
# Create section based on whether it spans segments
|
1671
|
-
if start_seg_idx == end_seg_idx:
|
1672
|
-
# Single segment section - use Region.get_section_between
|
1673
|
-
seg = self.segments[start_seg_idx]
|
1674
|
-
section = seg.get_section_between(start_elem, end_elem, include_boundaries)
|
1675
|
-
sections.append(section)
|
1676
|
-
logger.debug(f" Created single-segment Region")
|
1677
|
-
else:
|
1678
|
-
# Multi-segment section - create FlowRegion
|
1679
|
-
logger.debug(
|
1680
|
-
f" Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
|
1681
|
-
)
|
1682
|
-
constituent_regions = []
|
1683
|
-
|
1684
|
-
# First segment: from start element to bottom
|
1685
|
-
start_seg = self.segments[start_seg_idx]
|
1686
|
-
if include_boundaries in ["start", "both"]:
|
1687
|
-
first_top = start_elem.top
|
1688
|
-
else:
|
1689
|
-
first_top = start_elem.bottom
|
1690
|
-
first_region = Region(
|
1691
|
-
start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
|
1692
|
-
)
|
1693
|
-
constituent_regions.append(first_region)
|
1427
|
+
# Case 1: Only start elements provided
|
1428
|
+
if all_starts and not all_ends:
|
1429
|
+
for i in range(len(all_starts)):
|
1430
|
+
start_elem, start_seg_idx, start_seg = all_starts[i]
|
1694
1431
|
|
1695
|
-
|
1696
|
-
|
1697
|
-
|
1432
|
+
# Find end (next start or end of flow)
|
1433
|
+
if i + 1 < len(all_starts):
|
1434
|
+
# Section ends at next start
|
1435
|
+
end_elem, end_seg_idx, end_seg = all_starts[i + 1]
|
1698
1436
|
|
1699
|
-
|
1700
|
-
|
1701
|
-
|
1702
|
-
|
1437
|
+
if start_seg_idx == end_seg_idx:
|
1438
|
+
# Same segment - create regular Region
|
1439
|
+
section = start_seg.get_section_between(
|
1440
|
+
start_elem, end_elem, include_boundaries, orientation
|
1441
|
+
)
|
1442
|
+
if section:
|
1443
|
+
sections.append(section)
|
1703
1444
|
else:
|
1704
|
-
|
1705
|
-
|
1706
|
-
end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
|
1707
|
-
)
|
1708
|
-
constituent_regions.append(last_region)
|
1709
|
-
|
1710
|
-
# Create FlowRegion
|
1711
|
-
flow_element = FlowElement(physical_object=start_elem, flow=self)
|
1712
|
-
flow_region = FlowRegion(
|
1713
|
-
flow=self,
|
1714
|
-
constituent_regions=constituent_regions,
|
1715
|
-
source_flow_element=flow_element,
|
1716
|
-
boundary_element_found=end_elem,
|
1717
|
-
)
|
1718
|
-
sections.append(flow_region)
|
1445
|
+
# Cross-segment - create FlowRegion
|
1446
|
+
regions = []
|
1719
1447
|
|
1720
|
-
|
1721
|
-
|
1722
|
-
|
1723
|
-
for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
|
1724
|
-
if id(start_elem) in used_starts:
|
1725
|
-
continue
|
1726
|
-
|
1727
|
-
# Find next start element
|
1728
|
-
next_start = None
|
1729
|
-
if i + 1 < len(all_start_elements):
|
1730
|
-
next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
|
1731
|
-
# Create section from this start to just before next start
|
1732
|
-
if start_seg_idx == next_start_seg_idx:
|
1733
|
-
# Same segment
|
1734
|
-
seg = self.segments[start_seg_idx]
|
1735
|
-
# Find element just before next start
|
1736
|
-
all_elems = seg.get_elements()
|
1737
|
-
all_elems.sort(key=lambda e: (e.top, e.x0))
|
1738
|
-
try:
|
1739
|
-
next_idx = all_elems.index(next_start_elem)
|
1740
|
-
if next_idx > 0:
|
1741
|
-
end_elem = all_elems[next_idx - 1]
|
1742
|
-
section = seg.get_section_between(
|
1743
|
-
start_elem, end_elem, include_boundaries
|
1744
|
-
)
|
1745
|
-
sections.append(section)
|
1746
|
-
except ValueError:
|
1747
|
-
pass
|
1748
|
-
elif next_start_seg_idx == start_seg_idx + 1:
|
1749
|
-
# Next start is in the immediately following segment in the flow
|
1750
|
-
# Create a FlowRegion that spans from current start to just before next start
|
1751
|
-
logger.debug(f" Next start is in next flow segment - creating FlowRegion")
|
1752
|
-
|
1753
|
-
constituent_regions = []
|
1754
|
-
|
1755
|
-
# First segment: from start element to bottom
|
1756
|
-
start_seg = self.segments[start_seg_idx]
|
1757
|
-
if include_boundaries in ["start", "both"]:
|
1758
|
-
first_top = start_elem.top
|
1448
|
+
# First segment: from start to bottom
|
1449
|
+
if include_boundaries in ["both", "start"]:
|
1450
|
+
top = start_elem.top
|
1759
1451
|
else:
|
1760
|
-
|
1761
|
-
|
1762
|
-
|
1763
|
-
|
1452
|
+
top = start_elem.bottom
|
1453
|
+
regions.append(
|
1454
|
+
Region(
|
1455
|
+
start_seg.page, (start_seg.x0, top, start_seg.x1, start_seg.bottom)
|
1456
|
+
)
|
1764
1457
|
)
|
1765
|
-
constituent_regions.append(first_region)
|
1766
|
-
|
1767
|
-
# Next segment: from top to just before next start
|
1768
|
-
next_seg = self.segments[next_start_seg_idx]
|
1769
|
-
# Find element just before next start in the next segment
|
1770
|
-
next_seg_elems = next_seg.get_elements()
|
1771
|
-
next_seg_elems.sort(key=lambda e: (e.top, e.x0))
|
1772
1458
|
|
1773
|
-
|
1774
|
-
|
1775
|
-
|
1776
|
-
if next_idx > 0:
|
1777
|
-
# Use the bottom of the element before next start
|
1778
|
-
prev_elem = next_seg_elems[next_idx - 1]
|
1779
|
-
last_bottom = prev_elem.bottom
|
1780
|
-
except ValueError:
|
1781
|
-
pass
|
1459
|
+
# Middle segments (full)
|
1460
|
+
for idx in range(start_seg_idx + 1, end_seg_idx):
|
1461
|
+
regions.append(self.segments[idx])
|
1782
1462
|
|
1783
|
-
|
1784
|
-
|
1463
|
+
# Last segment: from top to end element
|
1464
|
+
if include_boundaries in ["both", "end"]:
|
1465
|
+
bottom = end_elem.bottom
|
1466
|
+
else:
|
1467
|
+
bottom = end_elem.top
|
1468
|
+
regions.append(
|
1469
|
+
Region(end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, bottom))
|
1785
1470
|
)
|
1786
|
-
constituent_regions.append(last_region)
|
1787
1471
|
|
1788
1472
|
# Create FlowRegion
|
1789
1473
|
flow_element = FlowElement(physical_object=start_elem, flow=self)
|
1790
1474
|
flow_region = FlowRegion(
|
1791
1475
|
flow=self,
|
1792
|
-
constituent_regions=
|
1476
|
+
constituent_regions=regions,
|
1793
1477
|
source_flow_element=flow_element,
|
1794
|
-
boundary_element_found=
|
1478
|
+
boundary_element_found=end_elem,
|
1795
1479
|
)
|
1480
|
+
flow_region.start_element = start_elem
|
1481
|
+
flow_region.end_element = end_elem
|
1482
|
+
flow_region._boundary_exclusions = include_boundaries
|
1796
1483
|
sections.append(flow_region)
|
1797
|
-
|
1798
|
-
|
1484
|
+
else:
|
1485
|
+
# Last section - goes to end of flow
|
1486
|
+
if start_seg_idx == len(self.segments) - 1:
|
1487
|
+
# Within last segment
|
1488
|
+
section = start_seg.get_section_between(
|
1489
|
+
start_elem, None, include_boundaries, orientation
|
1799
1490
|
)
|
1491
|
+
if section:
|
1492
|
+
sections.append(section)
|
1800
1493
|
else:
|
1801
|
-
#
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
1494
|
+
# Spans to end
|
1495
|
+
regions = []
|
1496
|
+
|
1497
|
+
# First segment: from start to bottom
|
1498
|
+
if include_boundaries in ["both", "start"]:
|
1499
|
+
top = start_elem.top
|
1805
1500
|
else:
|
1806
|
-
|
1807
|
-
|
1808
|
-
|
1809
|
-
|
1501
|
+
top = start_elem.bottom
|
1502
|
+
regions.append(
|
1503
|
+
Region(
|
1504
|
+
start_seg.page, (start_seg.x0, top, start_seg.x1, start_seg.bottom)
|
1505
|
+
)
|
1810
1506
|
)
|
1811
|
-
|
1812
|
-
|
1813
|
-
|
1507
|
+
|
1508
|
+
# Remaining segments (full)
|
1509
|
+
for idx in range(start_seg_idx + 1, len(self.segments)):
|
1510
|
+
regions.append(self.segments[idx])
|
1511
|
+
|
1512
|
+
# Create FlowRegion
|
1513
|
+
flow_element = FlowElement(physical_object=start_elem, flow=self)
|
1514
|
+
flow_region = FlowRegion(
|
1515
|
+
flow=self,
|
1516
|
+
constituent_regions=regions,
|
1517
|
+
source_flow_element=flow_element,
|
1518
|
+
boundary_element_found=None,
|
1814
1519
|
)
|
1815
|
-
|
1816
|
-
|
1817
|
-
|
1818
|
-
|
1819
|
-
|
1820
|
-
|
1821
|
-
|
1822
|
-
|
1823
|
-
|
1824
|
-
|
1825
|
-
|
1826
|
-
|
1520
|
+
flow_region.start_element = start_elem
|
1521
|
+
flow_region._boundary_exclusions = include_boundaries
|
1522
|
+
sections.append(flow_region)
|
1523
|
+
|
1524
|
+
# Case 2: Both start and end elements
|
1525
|
+
elif all_starts and all_ends:
|
1526
|
+
# Match starts with ends
|
1527
|
+
used_ends = set()
|
1528
|
+
|
1529
|
+
for start_elem, start_seg_idx, start_seg in all_starts:
|
1530
|
+
# Find matching end
|
1531
|
+
best_end = None
|
1532
|
+
|
1533
|
+
for end_elem, end_seg_idx, end_seg in all_ends:
|
1534
|
+
if id(end_elem) in used_ends:
|
1535
|
+
continue
|
1536
|
+
|
1537
|
+
# End must come after start
|
1538
|
+
if end_seg_idx > start_seg_idx or (
|
1539
|
+
end_seg_idx == start_seg_idx and end_elem.top >= start_elem.bottom
|
1540
|
+
):
|
1541
|
+
best_end = (end_elem, end_seg_idx, end_seg)
|
1542
|
+
break
|
1543
|
+
|
1544
|
+
if best_end:
|
1545
|
+
end_elem, end_seg_idx, end_seg = best_end
|
1546
|
+
used_ends.add(id(end_elem))
|
1547
|
+
|
1548
|
+
if start_seg_idx == end_seg_idx:
|
1549
|
+
# Same segment
|
1550
|
+
section = start_seg.get_section_between(
|
1551
|
+
start_elem, end_elem, include_boundaries, orientation
|
1552
|
+
)
|
1553
|
+
if section:
|
1554
|
+
sections.append(section)
|
1827
1555
|
else:
|
1828
|
-
#
|
1829
|
-
|
1556
|
+
# Cross-segment FlowRegion
|
1557
|
+
regions = []
|
1830
1558
|
|
1831
1559
|
# First segment
|
1832
|
-
|
1833
|
-
|
1834
|
-
first_top = start_elem.top
|
1560
|
+
if include_boundaries in ["both", "start"]:
|
1561
|
+
top = start_elem.top
|
1835
1562
|
else:
|
1836
|
-
|
1837
|
-
|
1838
|
-
|
1839
|
-
|
1563
|
+
top = start_elem.bottom
|
1564
|
+
regions.append(
|
1565
|
+
Region(
|
1566
|
+
start_seg.page, (start_seg.x0, top, start_seg.x1, start_seg.bottom)
|
1567
|
+
)
|
1840
1568
|
)
|
1841
|
-
constituent_regions.append(first_region)
|
1842
1569
|
|
1843
|
-
#
|
1844
|
-
for
|
1845
|
-
|
1570
|
+
# Middle segments
|
1571
|
+
for idx in range(start_seg_idx + 1, end_seg_idx):
|
1572
|
+
regions.append(self.segments[idx])
|
1846
1573
|
|
1574
|
+
# Last segment
|
1575
|
+
if include_boundaries in ["both", "end"]:
|
1576
|
+
bottom = end_elem.bottom
|
1577
|
+
else:
|
1578
|
+
bottom = end_elem.top
|
1579
|
+
regions.append(
|
1580
|
+
Region(end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, bottom))
|
1581
|
+
)
|
1582
|
+
|
1583
|
+
# Create FlowRegion
|
1847
1584
|
flow_element = FlowElement(physical_object=start_elem, flow=self)
|
1848
1585
|
flow_region = FlowRegion(
|
1849
1586
|
flow=self,
|
1850
|
-
constituent_regions=
|
1587
|
+
constituent_regions=regions,
|
1851
1588
|
source_flow_element=flow_element,
|
1852
|
-
boundary_element_found=
|
1589
|
+
boundary_element_found=end_elem,
|
1853
1590
|
)
|
1591
|
+
flow_region.start_element = start_elem
|
1592
|
+
flow_region.end_element = end_elem
|
1593
|
+
flow_region._boundary_exclusions = include_boundaries
|
1854
1594
|
sections.append(flow_region)
|
1855
1595
|
|
1856
|
-
#
|
1857
|
-
|
1858
|
-
|
1859
|
-
|
1860
|
-
and end_elements_unwrapped is None
|
1861
|
-
):
|
1862
|
-
# Each segment becomes its own section
|
1863
|
-
sections = list(self.segments)
|
1864
|
-
|
1865
|
-
# Sort sections by their position in the flow
|
1866
|
-
def _section_sort_key(section):
|
1867
|
-
if hasattr(section, "constituent_regions"):
|
1868
|
-
# FlowRegion - use first constituent region
|
1869
|
-
first_region = (
|
1870
|
-
section.constituent_regions[0] if section.constituent_regions else None
|
1871
|
-
)
|
1872
|
-
if first_region:
|
1873
|
-
# Find which segment this region belongs to
|
1874
|
-
for idx, seg in enumerate(self.segments):
|
1875
|
-
try:
|
1876
|
-
if seg.intersects(first_region):
|
1877
|
-
return (
|
1878
|
-
idx,
|
1879
|
-
getattr(first_region, "top", 0),
|
1880
|
-
getattr(first_region, "x0", 0),
|
1881
|
-
)
|
1882
|
-
except:
|
1883
|
-
pass
|
1884
|
-
else:
|
1885
|
-
# Regular Region
|
1886
|
-
for idx, seg in enumerate(self.segments):
|
1887
|
-
try:
|
1888
|
-
if seg.intersects(section):
|
1889
|
-
return (idx, getattr(section, "top", 0), getattr(section, "x0", 0))
|
1890
|
-
except:
|
1891
|
-
pass
|
1892
|
-
return (float("inf"), 0, 0)
|
1893
|
-
|
1894
|
-
sections.sort(key=_section_sort_key)
|
1895
|
-
|
1896
|
-
logger.debug(f"\n=== Section creation complete ===")
|
1897
|
-
logger.debug(f"Total sections created: {len(sections)}")
|
1898
|
-
for i, section in enumerate(sections):
|
1899
|
-
if hasattr(section, "constituent_regions"):
|
1900
|
-
logger.debug(
|
1901
|
-
f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
|
1902
|
-
)
|
1903
|
-
else:
|
1904
|
-
logger.debug(f"Section {i}: Region with bbox={section.bbox}")
|
1596
|
+
# Case 3: Only end elements (sections from beginning to each end)
|
1597
|
+
elif not all_starts and all_ends:
|
1598
|
+
# TODO: Handle this case if needed
|
1599
|
+
pass
|
1905
1600
|
|
1906
1601
|
return ElementCollection(sections)
|
1907
1602
|
|
1908
|
-
def highlights(self, show: bool = False)
|
1603
|
+
def highlights(self, show: bool = False):
|
1909
1604
|
"""
|
1910
1605
|
Create a highlight context for accumulating highlights.
|
1911
1606
|
|