natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +130 -31
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +172 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -7,6 +7,7 @@ from typing import (
|
|
7
7
|
List,
|
8
8
|
Literal,
|
9
9
|
Optional,
|
10
|
+
Set,
|
10
11
|
Tuple,
|
11
12
|
Union,
|
12
13
|
overload,
|
@@ -346,6 +347,7 @@ class Region(
|
|
346
347
|
include_source: bool = False,
|
347
348
|
until: Optional[str] = None,
|
348
349
|
include_endpoint: bool = True,
|
350
|
+
offset: Optional[float] = None,
|
349
351
|
**kwargs,
|
350
352
|
) -> "Region":
|
351
353
|
"""
|
@@ -357,11 +359,18 @@ class Region(
|
|
357
359
|
include_source: Whether to include this region in the result (default: False)
|
358
360
|
until: Optional selector string to specify an upper boundary element
|
359
361
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
362
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
360
363
|
**kwargs: Additional parameters
|
361
364
|
|
362
365
|
Returns:
|
363
366
|
Region object representing the area above
|
364
367
|
"""
|
368
|
+
# Use global default if offset not provided
|
369
|
+
if offset is None:
|
370
|
+
import natural_pdf
|
371
|
+
|
372
|
+
offset = natural_pdf.options.layout.directional_offset
|
373
|
+
|
365
374
|
return self._direction(
|
366
375
|
direction="above",
|
367
376
|
size=height,
|
@@ -369,6 +378,7 @@ class Region(
|
|
369
378
|
include_source=include_source,
|
370
379
|
until=until,
|
371
380
|
include_endpoint=include_endpoint,
|
381
|
+
offset=offset,
|
372
382
|
**kwargs,
|
373
383
|
)
|
374
384
|
|
@@ -379,6 +389,7 @@ class Region(
|
|
379
389
|
include_source: bool = False,
|
380
390
|
until: Optional[str] = None,
|
381
391
|
include_endpoint: bool = True,
|
392
|
+
offset: Optional[float] = None,
|
382
393
|
**kwargs,
|
383
394
|
) -> "Region":
|
384
395
|
"""
|
@@ -390,11 +401,18 @@ class Region(
|
|
390
401
|
include_source: Whether to include this region in the result (default: False)
|
391
402
|
until: Optional selector string to specify a lower boundary element
|
392
403
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
404
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
393
405
|
**kwargs: Additional parameters
|
394
406
|
|
395
407
|
Returns:
|
396
408
|
Region object representing the area below
|
397
409
|
"""
|
410
|
+
# Use global default if offset not provided
|
411
|
+
if offset is None:
|
412
|
+
import natural_pdf
|
413
|
+
|
414
|
+
offset = natural_pdf.options.layout.directional_offset
|
415
|
+
|
398
416
|
return self._direction(
|
399
417
|
direction="below",
|
400
418
|
size=height,
|
@@ -402,16 +420,18 @@ class Region(
|
|
402
420
|
include_source=include_source,
|
403
421
|
until=until,
|
404
422
|
include_endpoint=include_endpoint,
|
423
|
+
offset=offset,
|
405
424
|
**kwargs,
|
406
425
|
)
|
407
426
|
|
408
427
|
def left(
|
409
428
|
self,
|
410
429
|
width: Optional[float] = None,
|
411
|
-
height: str = "
|
430
|
+
height: str = "element",
|
412
431
|
include_source: bool = False,
|
413
432
|
until: Optional[str] = None,
|
414
433
|
include_endpoint: bool = True,
|
434
|
+
offset: Optional[float] = None,
|
415
435
|
**kwargs,
|
416
436
|
) -> "Region":
|
417
437
|
"""
|
@@ -423,11 +443,18 @@ class Region(
|
|
423
443
|
include_source: Whether to include this region in the result (default: False)
|
424
444
|
until: Optional selector string to specify a left boundary element
|
425
445
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
446
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
426
447
|
**kwargs: Additional parameters
|
427
448
|
|
428
449
|
Returns:
|
429
450
|
Region object representing the area to the left
|
430
451
|
"""
|
452
|
+
# Use global default if offset not provided
|
453
|
+
if offset is None:
|
454
|
+
import natural_pdf
|
455
|
+
|
456
|
+
offset = natural_pdf.options.layout.directional_offset
|
457
|
+
|
431
458
|
return self._direction(
|
432
459
|
direction="left",
|
433
460
|
size=width,
|
@@ -435,16 +462,18 @@ class Region(
|
|
435
462
|
include_source=include_source,
|
436
463
|
until=until,
|
437
464
|
include_endpoint=include_endpoint,
|
465
|
+
offset=offset,
|
438
466
|
**kwargs,
|
439
467
|
)
|
440
468
|
|
441
469
|
def right(
|
442
470
|
self,
|
443
471
|
width: Optional[float] = None,
|
444
|
-
height: str = "
|
472
|
+
height: str = "element",
|
445
473
|
include_source: bool = False,
|
446
474
|
until: Optional[str] = None,
|
447
475
|
include_endpoint: bool = True,
|
476
|
+
offset: Optional[float] = None,
|
448
477
|
**kwargs,
|
449
478
|
) -> "Region":
|
450
479
|
"""
|
@@ -456,11 +485,18 @@ class Region(
|
|
456
485
|
include_source: Whether to include this region in the result (default: False)
|
457
486
|
until: Optional selector string to specify a right boundary element
|
458
487
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
488
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
459
489
|
**kwargs: Additional parameters
|
460
490
|
|
461
491
|
Returns:
|
462
492
|
Region object representing the area to the right
|
463
493
|
"""
|
494
|
+
# Use global default if offset not provided
|
495
|
+
if offset is None:
|
496
|
+
import natural_pdf
|
497
|
+
|
498
|
+
offset = natural_pdf.options.layout.directional_offset
|
499
|
+
|
464
500
|
return self._direction(
|
465
501
|
direction="right",
|
466
502
|
size=width,
|
@@ -468,6 +504,7 @@ class Region(
|
|
468
504
|
include_source=include_source,
|
469
505
|
until=until,
|
470
506
|
include_endpoint=include_endpoint,
|
507
|
+
offset=offset,
|
471
508
|
**kwargs,
|
472
509
|
)
|
473
510
|
|
@@ -638,12 +675,10 @@ class Region(
|
|
638
675
|
Returns:
|
639
676
|
True if the element is in the region, False otherwise
|
640
677
|
"""
|
641
|
-
#
|
642
|
-
|
643
|
-
return False
|
678
|
+
# Use centralized spatial utility for consistency
|
679
|
+
from natural_pdf.utils.spatial import is_element_in_region
|
644
680
|
|
645
|
-
return
|
646
|
-
# return self.intersects(element)
|
681
|
+
return is_element_in_region(element, self, strategy="center", check_page=True)
|
647
682
|
|
648
683
|
def contains(self, element: "Element") -> bool:
|
649
684
|
"""
|
@@ -739,7 +774,12 @@ class Region(
|
|
739
774
|
)
|
740
775
|
|
741
776
|
def exclude(self):
|
742
|
-
|
777
|
+
"""
|
778
|
+
Exclude this region from text extraction and other operations.
|
779
|
+
|
780
|
+
This excludes everything within the region's bounds.
|
781
|
+
"""
|
782
|
+
self.page.add_exclusion(self, method="region")
|
743
783
|
|
744
784
|
def highlight(
|
745
785
|
self,
|
@@ -1224,12 +1264,36 @@ class Region(
|
|
1224
1264
|
selector, apply_exclusions=apply_exclusions, **kwargs
|
1225
1265
|
)
|
1226
1266
|
# Filter those elements to only include ones within this region
|
1227
|
-
|
1267
|
+
elements = [e for e in page_elements if self._is_element_in_region(e)]
|
1228
1268
|
else:
|
1229
1269
|
# Get all elements from the page
|
1230
1270
|
page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
|
1231
1271
|
# Filter to elements in this region
|
1232
|
-
|
1272
|
+
elements = [e for e in page_elements if self._is_element_in_region(e)]
|
1273
|
+
|
1274
|
+
# Apply boundary exclusions if this is a section with boundary settings
|
1275
|
+
if hasattr(self, "_boundary_exclusions") and self._boundary_exclusions != "both":
|
1276
|
+
excluded_ids = set()
|
1277
|
+
|
1278
|
+
if self._boundary_exclusions == "none":
|
1279
|
+
# Exclude both start and end elements
|
1280
|
+
if hasattr(self, "start_element") and self.start_element:
|
1281
|
+
excluded_ids.add(id(self.start_element))
|
1282
|
+
if hasattr(self, "end_element") and self.end_element:
|
1283
|
+
excluded_ids.add(id(self.end_element))
|
1284
|
+
elif self._boundary_exclusions == "start":
|
1285
|
+
# Exclude only end element
|
1286
|
+
if hasattr(self, "end_element") and self.end_element:
|
1287
|
+
excluded_ids.add(id(self.end_element))
|
1288
|
+
elif self._boundary_exclusions == "end":
|
1289
|
+
# Exclude only start element
|
1290
|
+
if hasattr(self, "start_element") and self.start_element:
|
1291
|
+
excluded_ids.add(id(self.start_element))
|
1292
|
+
|
1293
|
+
if excluded_ids:
|
1294
|
+
elements = [e for e in elements if id(e) not in excluded_ids]
|
1295
|
+
|
1296
|
+
return elements
|
1233
1297
|
|
1234
1298
|
def extract_text(
|
1235
1299
|
self,
|
@@ -1300,6 +1364,34 @@ class Region(
|
|
1300
1364
|
elif debug:
|
1301
1365
|
logger.debug(f"Region {self.bbox}: Not applying exclusions (apply_exclusions=False).")
|
1302
1366
|
|
1367
|
+
# Add boundary element exclusions if this is a section with boundary settings
|
1368
|
+
if hasattr(self, "_boundary_exclusions") and self._boundary_exclusions != "both":
|
1369
|
+
boundary_exclusions = []
|
1370
|
+
|
1371
|
+
if self._boundary_exclusions == "none":
|
1372
|
+
# Exclude both start and end elements
|
1373
|
+
if hasattr(self, "start_element") and self.start_element:
|
1374
|
+
boundary_exclusions.append(self.start_element)
|
1375
|
+
if hasattr(self, "end_element") and self.end_element:
|
1376
|
+
boundary_exclusions.append(self.end_element)
|
1377
|
+
elif self._boundary_exclusions == "start":
|
1378
|
+
# Exclude only end element
|
1379
|
+
if hasattr(self, "end_element") and self.end_element:
|
1380
|
+
boundary_exclusions.append(self.end_element)
|
1381
|
+
elif self._boundary_exclusions == "end":
|
1382
|
+
# Exclude only start element
|
1383
|
+
if hasattr(self, "start_element") and self.start_element:
|
1384
|
+
boundary_exclusions.append(self.start_element)
|
1385
|
+
|
1386
|
+
# Add boundary elements as exclusion regions
|
1387
|
+
for elem in boundary_exclusions:
|
1388
|
+
if hasattr(elem, "bbox"):
|
1389
|
+
exclusion_regions.append(elem)
|
1390
|
+
if debug:
|
1391
|
+
logger.debug(
|
1392
|
+
f"Adding boundary exclusion: {elem.extract_text().strip()} at {elem.bbox}"
|
1393
|
+
)
|
1394
|
+
|
1303
1395
|
# 4. Spatially Filter Characters using Utility
|
1304
1396
|
# Pass self as the target_region for precise polygon checks etc.
|
1305
1397
|
filtered_chars = filter_chars_spatially(
|
@@ -1510,6 +1602,49 @@ class Region(
|
|
1510
1602
|
|
1511
1603
|
logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
|
1512
1604
|
|
1605
|
+
# For stream method with text-based edge detection and explicit vertical lines,
|
1606
|
+
# adjust guides to ensure they fall within text bounds for proper intersection
|
1607
|
+
if (
|
1608
|
+
effective_method == "pdfplumber"
|
1609
|
+
and table_settings.get("horizontal_strategy") == "text"
|
1610
|
+
and table_settings.get("vertical_strategy") == "explicit"
|
1611
|
+
and "explicit_vertical_lines" in table_settings
|
1612
|
+
):
|
1613
|
+
|
1614
|
+
text_elements = self.find_all("text", apply_exclusions=apply_exclusions)
|
1615
|
+
if text_elements:
|
1616
|
+
text_bounds = text_elements.merge().bbox
|
1617
|
+
text_left = text_bounds[0]
|
1618
|
+
text_right = text_bounds[2]
|
1619
|
+
|
1620
|
+
# Adjust vertical guides to fall within text bounds
|
1621
|
+
original_verticals = table_settings["explicit_vertical_lines"]
|
1622
|
+
adjusted_verticals = []
|
1623
|
+
|
1624
|
+
for v in original_verticals:
|
1625
|
+
if v < text_left:
|
1626
|
+
# Guide is left of text bounds, clip to text start
|
1627
|
+
adjusted_verticals.append(text_left)
|
1628
|
+
logger.debug(
|
1629
|
+
f"Region {self.bbox}: Adjusted left guide from {v:.1f} to {text_left:.1f}"
|
1630
|
+
)
|
1631
|
+
elif v > text_right:
|
1632
|
+
# Guide is right of text bounds, clip to text end
|
1633
|
+
adjusted_verticals.append(text_right)
|
1634
|
+
logger.debug(
|
1635
|
+
f"Region {self.bbox}: Adjusted right guide from {v:.1f} to {text_right:.1f}"
|
1636
|
+
)
|
1637
|
+
else:
|
1638
|
+
# Guide is within text bounds, keep as is
|
1639
|
+
adjusted_verticals.append(v)
|
1640
|
+
|
1641
|
+
# Update table settings with adjusted guides
|
1642
|
+
table_settings["explicit_vertical_lines"] = adjusted_verticals
|
1643
|
+
logger.debug(
|
1644
|
+
f"Region {self.bbox}: Adjusted {len(original_verticals)} guides for stream extraction. "
|
1645
|
+
f"Text bounds: {text_left:.1f}-{text_right:.1f}"
|
1646
|
+
)
|
1647
|
+
|
1513
1648
|
# Use the selected method
|
1514
1649
|
if effective_method == "tatr":
|
1515
1650
|
table_rows = self._extract_table_tatr(
|
@@ -2765,69 +2900,31 @@ class Region(
|
|
2765
2900
|
if orientation not in ["vertical", "horizontal"]:
|
2766
2901
|
raise ValueError(f"orientation must be 'vertical' or 'horizontal', got '{orientation}'")
|
2767
2902
|
|
2768
|
-
#
|
2769
|
-
|
2770
|
-
|
2771
|
-
|
2772
|
-
|
2773
|
-
|
2774
|
-
|
2775
|
-
|
2776
|
-
|
2777
|
-
|
2778
|
-
|
2779
|
-
|
2780
|
-
|
2781
|
-
|
2782
|
-
|
2783
|
-
|
2784
|
-
|
2785
|
-
top = start_element.bottom # Start at the bottom of start element
|
2786
|
-
bottom = end_element.bottom
|
2787
|
-
else: # "none"
|
2788
|
-
# Exclude both boundary elements
|
2789
|
-
top = start_element.bottom # Start at the bottom of start element
|
2790
|
-
bottom = end_element.top # Stop at the top of end element
|
2791
|
-
|
2792
|
-
# Ensure valid boundaries
|
2793
|
-
if top >= bottom:
|
2794
|
-
logger.debug(f"Invalid section boundaries: top={top} >= bottom={bottom}")
|
2795
|
-
# Return an empty region
|
2796
|
-
return Region(self.page, (x0, top, x0, top))
|
2797
|
-
else: # horizontal
|
2798
|
-
# Use full height of the parent region for horizontal sections
|
2799
|
-
top = self.top # Use parent region's top boundary
|
2800
|
-
bottom = self.bottom # Use parent region's bottom boundary
|
2801
|
-
|
2802
|
-
# Determine horizontal boundaries based on include_boundaries
|
2803
|
-
if include_boundaries == "both":
|
2804
|
-
# Include both boundary elements
|
2805
|
-
x0 = start_element.x0
|
2806
|
-
x1 = end_element.x1
|
2807
|
-
elif include_boundaries == "start":
|
2808
|
-
# Include start element, exclude end element
|
2809
|
-
x0 = start_element.x0
|
2810
|
-
x1 = end_element.x0 # Stop at the left of end element
|
2811
|
-
elif include_boundaries == "end":
|
2812
|
-
# Exclude start element, include end element
|
2813
|
-
x0 = start_element.x1 # Start at the right of start element
|
2814
|
-
x1 = end_element.x1
|
2815
|
-
else: # "none"
|
2816
|
-
# Exclude both boundary elements
|
2817
|
-
x0 = start_element.x1 # Start at the right of start element
|
2818
|
-
x1 = end_element.x0 # Stop at the left of end element
|
2819
|
-
|
2820
|
-
# Ensure valid boundaries
|
2821
|
-
if x0 >= x1:
|
2822
|
-
logger.debug(f"Invalid section boundaries: x0={x0} >= x1={x1}")
|
2823
|
-
# Return an empty region
|
2824
|
-
return Region(self.page, (x0, top, x0, top))
|
2903
|
+
# Use centralized section utilities
|
2904
|
+
from natural_pdf.utils.sections import calculate_section_bounds, validate_section_bounds
|
2905
|
+
|
2906
|
+
# Calculate section boundaries
|
2907
|
+
bounds = calculate_section_bounds(
|
2908
|
+
start_element=start_element,
|
2909
|
+
end_element=end_element,
|
2910
|
+
include_boundaries=include_boundaries,
|
2911
|
+
orientation=orientation,
|
2912
|
+
parent_bounds=self.bbox,
|
2913
|
+
)
|
2914
|
+
|
2915
|
+
# Validate boundaries
|
2916
|
+
if not validate_section_bounds(bounds, orientation):
|
2917
|
+
# Return an empty region at the start position
|
2918
|
+
x0, top, _, _ = bounds
|
2919
|
+
return Region(self.page, (x0, top, x0, top))
|
2825
2920
|
|
2826
2921
|
# Create new region
|
2827
|
-
section = Region(self.page,
|
2828
|
-
|
2922
|
+
section = Region(self.page, bounds)
|
2923
|
+
|
2924
|
+
# Store the original boundary elements and exclusion info
|
2829
2925
|
section.start_element = start_element
|
2830
2926
|
section.end_element = end_element
|
2927
|
+
section._boundary_exclusions = include_boundaries
|
2831
2928
|
|
2832
2929
|
return section
|
2833
2930
|
|
@@ -2851,121 +2948,63 @@ class Region(
|
|
2851
2948
|
List of Region objects representing the extracted sections
|
2852
2949
|
"""
|
2853
2950
|
from natural_pdf.elements.element_collection import ElementCollection
|
2951
|
+
from natural_pdf.utils.sections import extract_sections_from_region
|
2952
|
+
|
2953
|
+
# Use centralized section extraction logic
|
2954
|
+
sections = extract_sections_from_region(
|
2955
|
+
region=self,
|
2956
|
+
start_elements=start_elements,
|
2957
|
+
end_elements=end_elements,
|
2958
|
+
include_boundaries=include_boundaries,
|
2959
|
+
orientation=orientation,
|
2960
|
+
)
|
2854
2961
|
|
2855
|
-
|
2856
|
-
if isinstance(start_elements, str):
|
2857
|
-
start_elements = self.find_all(start_elements) # Use region's find_all
|
2858
|
-
if hasattr(start_elements, "elements"):
|
2859
|
-
start_elements = start_elements.elements
|
2962
|
+
return ElementCollection(sections)
|
2860
2963
|
|
2861
|
-
|
2862
|
-
|
2863
|
-
|
2864
|
-
end_elements = end_elements.elements
|
2964
|
+
def split(self, divider, **kwargs) -> "ElementCollection[Region]":
|
2965
|
+
"""
|
2966
|
+
Divide this region into sections based on the provided divider elements.
|
2865
2967
|
|
2866
|
-
|
2867
|
-
|
2868
|
-
|
2869
|
-
|
2870
|
-
|
2871
|
-
return []
|
2872
|
-
# Ensure end_elements is a list if provided
|
2873
|
-
if end_elements is not None and not hasattr(end_elements, "__iter__"):
|
2874
|
-
logger.warning("end_elements must be iterable if provided. Ignoring.")
|
2875
|
-
end_elements = []
|
2876
|
-
elif end_elements is None:
|
2877
|
-
end_elements = []
|
2878
|
-
|
2879
|
-
# If no start elements found within the region, return empty list
|
2880
|
-
if not start_elements:
|
2881
|
-
return []
|
2968
|
+
Args:
|
2969
|
+
divider: Elements or selector string that mark section boundaries
|
2970
|
+
**kwargs: Additional parameters passed to get_sections()
|
2971
|
+
- include_boundaries: How to include boundary elements (default: 'start')
|
2972
|
+
- orientation: 'vertical' or 'horizontal' (default: 'vertical')
|
2882
2973
|
|
2883
|
-
|
2884
|
-
|
2885
|
-
|
2886
|
-
|
2887
|
-
|
2888
|
-
|
2889
|
-
|
2890
|
-
|
2891
|
-
|
2892
|
-
|
2893
|
-
#
|
2894
|
-
|
2895
|
-
|
2896
|
-
|
2897
|
-
|
2898
|
-
|
2899
|
-
# Add
|
2900
|
-
|
2901
|
-
|
2902
|
-
if
|
2903
|
-
|
2904
|
-
|
2905
|
-
|
2906
|
-
|
2907
|
-
|
2908
|
-
|
2909
|
-
|
2910
|
-
|
2911
|
-
|
2912
|
-
# Sort boundaries by index (document order within the region)
|
2913
|
-
section_boundaries.sort(key=lambda x: x["index"])
|
2914
|
-
|
2915
|
-
# Generate sections
|
2916
|
-
sections = []
|
2917
|
-
current_start_boundary = None
|
2918
|
-
|
2919
|
-
for i, boundary in enumerate(section_boundaries):
|
2920
|
-
# If it's a start boundary and we don't have a current start
|
2921
|
-
if boundary["type"] == "start" and current_start_boundary is None:
|
2922
|
-
current_start_boundary = boundary
|
2923
|
-
|
2924
|
-
# If it's an end boundary and we have a current start
|
2925
|
-
elif boundary["type"] == "end" and current_start_boundary is not None:
|
2926
|
-
# Create a section from current_start to this boundary
|
2927
|
-
start_element = current_start_boundary["element"]
|
2928
|
-
end_element = boundary["element"]
|
2929
|
-
# Use the helper, ensuring elements are from within the region
|
2930
|
-
section = self.get_section_between(
|
2931
|
-
start_element, end_element, include_boundaries, orientation
|
2932
|
-
)
|
2933
|
-
sections.append(section)
|
2934
|
-
current_start_boundary = None # Reset
|
2935
|
-
|
2936
|
-
# If it's another start boundary and we have a current start (split by starts only)
|
2937
|
-
elif (
|
2938
|
-
boundary["type"] == "start"
|
2939
|
-
and current_start_boundary is not None
|
2940
|
-
and not end_elements
|
2941
|
-
):
|
2942
|
-
# End the previous section just before this start boundary
|
2943
|
-
start_element = current_start_boundary["element"]
|
2944
|
-
# Find the element immediately preceding this start in the sorted list
|
2945
|
-
end_idx = boundary["index"] - 1
|
2946
|
-
if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
|
2947
|
-
end_element = all_elements_in_region[end_idx]
|
2948
|
-
section = self.get_section_between(
|
2949
|
-
start_element, end_element, include_boundaries, orientation
|
2974
|
+
Returns:
|
2975
|
+
ElementCollection of Region objects representing the sections
|
2976
|
+
|
2977
|
+
Example:
|
2978
|
+
# Split a region by bold text
|
2979
|
+
sections = region.split("text:bold")
|
2980
|
+
|
2981
|
+
# Split horizontally by vertical lines
|
2982
|
+
sections = region.split("line[orientation=vertical]", orientation="horizontal")
|
2983
|
+
"""
|
2984
|
+
# Default to 'start' boundaries for split (include divider at start of each section)
|
2985
|
+
if "include_boundaries" not in kwargs:
|
2986
|
+
kwargs["include_boundaries"] = "start"
|
2987
|
+
|
2988
|
+
sections = self.get_sections(start_elements=divider, **kwargs)
|
2989
|
+
|
2990
|
+
# Add section before first divider if there's content
|
2991
|
+
if sections and hasattr(sections[0], "start_element"):
|
2992
|
+
first_divider = sections[0].start_element
|
2993
|
+
if first_divider:
|
2994
|
+
# Get all elements before the first divider
|
2995
|
+
all_elements = self.get_elements()
|
2996
|
+
if all_elements and all_elements[0] != first_divider:
|
2997
|
+
# Create section from start to just before first divider
|
2998
|
+
initial_section = self.get_section_between(
|
2999
|
+
start_element=None,
|
3000
|
+
end_element=first_divider,
|
3001
|
+
include_boundaries="none",
|
3002
|
+
orientation=kwargs.get("orientation", "vertical"),
|
2950
3003
|
)
|
2951
|
-
|
2952
|
-
|
2953
|
-
# For now, just reset and start new section
|
2954
|
-
|
2955
|
-
# Start the new section
|
2956
|
-
current_start_boundary = boundary
|
2957
|
-
|
2958
|
-
# Handle the last section if we have a current start
|
2959
|
-
if current_start_boundary is not None:
|
2960
|
-
start_element = current_start_boundary["element"]
|
2961
|
-
# End at the last element within the region
|
2962
|
-
end_element = all_elements_in_region[-1]
|
2963
|
-
section = self.get_section_between(
|
2964
|
-
start_element, end_element, include_boundaries, orientation
|
2965
|
-
)
|
2966
|
-
sections.append(section)
|
3004
|
+
if initial_section and initial_section.get_elements():
|
3005
|
+
sections.insert(0, initial_section)
|
2967
3006
|
|
2968
|
-
return
|
3007
|
+
return sections
|
2969
3008
|
|
2970
3009
|
def create_cells(self):
|
2971
3010
|
"""
|
natural_pdf/elements/text.py
CHANGED
@@ -459,9 +459,11 @@ class TextElement(Element):
|
|
459
459
|
@property
|
460
460
|
def highlight_color(self):
|
461
461
|
"""Return RGB(A) tuple of highlight colour if stored."""
|
462
|
-
|
463
|
-
|
464
|
-
|
462
|
+
# Check _obj first, being careful with falsy values like 0.0
|
463
|
+
if "highlight_color" in self._obj:
|
464
|
+
return self._obj["highlight_color"]
|
465
|
+
# Fall back to metadata
|
466
|
+
return self.metadata.get("decoration", {}).get("highlight_color")
|
465
467
|
|
466
468
|
def __repr__(self) -> str:
|
467
469
|
"""String representation of the text element."""
|
natural_pdf/flows/element.py
CHANGED
@@ -249,6 +249,7 @@ class FlowElement:
|
|
249
249
|
"cross_size": cross_size_for_op,
|
250
250
|
"cross_alignment": cross_alignment, # Pass alignment
|
251
251
|
"include_source": include_source,
|
252
|
+
"_from_flow": True, # Prevent multipage recursion
|
252
253
|
# Pass other relevant kwargs if Region._direction uses them (e.g. strict_type)
|
253
254
|
**{k: v for k, v in kwargs.items() if k in ["strict_type", "first_match_only"]},
|
254
255
|
}
|