natural-pdf 0.2.5__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +94 -42
- natural_pdf/core/page.py +224 -62
- natural_pdf/core/page_collection.py +261 -50
- natural_pdf/core/page_groupby.py +20 -2
- natural_pdf/core/pdf.py +17 -14
- natural_pdf/core/render_spec.py +20 -5
- natural_pdf/describe/base.py +1 -1
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +84 -8
- natural_pdf/elements/element_collection.py +757 -20
- natural_pdf/elements/region.py +181 -48
- natural_pdf/flows/flow.py +3 -0
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/RECORD +20 -19
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -717,14 +717,23 @@ class Page(
|
|
717
717
|
|
718
718
|
# Add PDF-level exclusions if we have a parent PDF
|
719
719
|
if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
|
720
|
+
# Get existing labels to check for duplicates
|
721
|
+
existing_labels = set()
|
722
|
+
for exc in all_exclusions:
|
723
|
+
if len(exc) >= 2 and exc[1]: # Has a label
|
724
|
+
existing_labels.add(exc[1])
|
725
|
+
|
720
726
|
for pdf_exclusion in self._parent._exclusions:
|
721
|
-
# Check if this exclusion is already in our list (avoid duplicates)
|
722
|
-
if pdf_exclusion
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
727
|
+
# Check if this exclusion label is already in our list (avoid duplicates)
|
728
|
+
label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
|
729
|
+
if label and label in existing_labels:
|
730
|
+
continue # Skip this exclusion as it's already been applied
|
731
|
+
|
732
|
+
# Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
|
733
|
+
if len(pdf_exclusion) == 2:
|
734
|
+
# Convert to 3-tuple format with default method
|
735
|
+
pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
|
736
|
+
all_exclusions.append(pdf_exclusion)
|
728
737
|
|
729
738
|
if debug:
|
730
739
|
print(
|
@@ -829,6 +838,36 @@ class Page(
|
|
829
838
|
regions.append(exclusion_item) # Label is already on the Region object
|
830
839
|
if debug:
|
831
840
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
841
|
+
|
842
|
+
# Process string selectors (from PDF-level exclusions)
|
843
|
+
elif isinstance(exclusion_item, str):
|
844
|
+
selector_str = exclusion_item
|
845
|
+
matching_elements = self.find_all(selector_str, apply_exclusions=False)
|
846
|
+
|
847
|
+
if debug:
|
848
|
+
print(
|
849
|
+
f" - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
|
850
|
+
)
|
851
|
+
|
852
|
+
if method == "region":
|
853
|
+
# Convert each matching element to a region
|
854
|
+
for el in matching_elements:
|
855
|
+
try:
|
856
|
+
bbox_coords = (
|
857
|
+
float(el.x0),
|
858
|
+
float(el.top),
|
859
|
+
float(el.x1),
|
860
|
+
float(el.bottom),
|
861
|
+
)
|
862
|
+
region = Region(self, bbox_coords, label=label)
|
863
|
+
regions.append(region)
|
864
|
+
if debug:
|
865
|
+
print(f" ✓ Added region from selector match: {bbox_coords}")
|
866
|
+
except Exception as e:
|
867
|
+
if debug:
|
868
|
+
print(f" ✗ Failed to create region from element: {e}")
|
869
|
+
# If method is "element", it will be handled in _filter_elements_by_exclusions
|
870
|
+
|
832
871
|
# Element-based exclusions are not converted to regions here
|
833
872
|
# They will be handled separately in _filter_elements_by_exclusions
|
834
873
|
|
@@ -852,7 +891,16 @@ class Page(
|
|
852
891
|
Returns:
|
853
892
|
A new list containing only the elements not excluded.
|
854
893
|
"""
|
855
|
-
|
894
|
+
# Check both page-level and PDF-level exclusions
|
895
|
+
has_page_exclusions = bool(self._exclusions)
|
896
|
+
has_pdf_exclusions = (
|
897
|
+
hasattr(self, "_parent")
|
898
|
+
and self._parent
|
899
|
+
and hasattr(self._parent, "_exclusions")
|
900
|
+
and bool(self._parent._exclusions)
|
901
|
+
)
|
902
|
+
|
903
|
+
if not has_page_exclusions and not has_pdf_exclusions:
|
856
904
|
if debug_exclusions:
|
857
905
|
print(
|
858
906
|
f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
|
@@ -865,9 +913,15 @@ class Page(
|
|
865
913
|
)
|
866
914
|
|
867
915
|
# Collect element-based exclusions
|
868
|
-
|
916
|
+
# Store element bboxes for comparison instead of object ids
|
917
|
+
excluded_element_bboxes = set() # Use set for O(1) lookup
|
918
|
+
|
919
|
+
# Process both page-level and PDF-level exclusions
|
920
|
+
all_exclusions = list(self._exclusions) if has_page_exclusions else []
|
921
|
+
if has_pdf_exclusions:
|
922
|
+
all_exclusions.extend(self._parent._exclusions)
|
869
923
|
|
870
|
-
for exclusion_data in
|
924
|
+
for exclusion_data in all_exclusions:
|
871
925
|
# Handle both old format (2-tuple) and new format (3-tuple)
|
872
926
|
if len(exclusion_data) == 2:
|
873
927
|
exclusion_item, label = exclusion_data
|
@@ -883,16 +937,31 @@ class Page(
|
|
883
937
|
if isinstance(exclusion_item, Region):
|
884
938
|
continue
|
885
939
|
|
940
|
+
# Handle string selectors for element-based exclusions
|
941
|
+
if isinstance(exclusion_item, str) and method == "element":
|
942
|
+
selector_str = exclusion_item
|
943
|
+
matching_elements = self.find_all(selector_str, apply_exclusions=False)
|
944
|
+
for el in matching_elements:
|
945
|
+
if hasattr(el, "bbox"):
|
946
|
+
bbox = tuple(el.bbox)
|
947
|
+
excluded_element_bboxes.add(bbox)
|
948
|
+
if debug_exclusions:
|
949
|
+
print(
|
950
|
+
f" - Added element exclusion from selector '{selector_str}': {bbox}"
|
951
|
+
)
|
952
|
+
|
886
953
|
# Handle element-based exclusions
|
887
|
-
|
888
|
-
|
954
|
+
elif method == "element" and hasattr(exclusion_item, "bbox"):
|
955
|
+
# Store bbox tuple for comparison
|
956
|
+
bbox = tuple(exclusion_item.bbox)
|
957
|
+
excluded_element_bboxes.add(bbox)
|
889
958
|
if debug_exclusions:
|
890
|
-
print(f" - Added element exclusion: {exclusion_item}")
|
959
|
+
print(f" - Added element exclusion with bbox {bbox}: {exclusion_item}")
|
891
960
|
|
892
961
|
if debug_exclusions:
|
893
962
|
print(
|
894
963
|
f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
|
895
|
-
f"and {len(
|
964
|
+
f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
|
896
965
|
)
|
897
966
|
|
898
967
|
filtered_elements = []
|
@@ -903,7 +972,7 @@ class Page(
|
|
903
972
|
exclude = False
|
904
973
|
|
905
974
|
# Check element-based exclusions first (faster)
|
906
|
-
if
|
975
|
+
if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
|
907
976
|
exclude = True
|
908
977
|
element_excluded_count += 1
|
909
978
|
if debug_exclusions:
|
@@ -2487,10 +2556,23 @@ class Page(
|
|
2487
2556
|
return self
|
2488
2557
|
|
2489
2558
|
def get_section_between(
|
2490
|
-
self,
|
2559
|
+
self,
|
2560
|
+
start_element=None,
|
2561
|
+
end_element=None,
|
2562
|
+
include_boundaries="both",
|
2563
|
+
orientation="vertical",
|
2491
2564
|
) -> Optional["Region"]: # Return Optional
|
2492
2565
|
"""
|
2493
2566
|
Get a section between two elements on this page.
|
2567
|
+
|
2568
|
+
Args:
|
2569
|
+
start_element: Element marking the start of the section
|
2570
|
+
end_element: Element marking the end of the section
|
2571
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2572
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
2573
|
+
|
2574
|
+
Returns:
|
2575
|
+
Region representing the section
|
2494
2576
|
"""
|
2495
2577
|
# Create a full-page region to operate within
|
2496
2578
|
page_region = self.create_region(0, 0, self.width, self.height)
|
@@ -2501,6 +2583,7 @@ class Page(
|
|
2501
2583
|
start_element=start_element,
|
2502
2584
|
end_element=end_element,
|
2503
2585
|
include_boundaries=include_boundaries,
|
2586
|
+
orientation=orientation,
|
2504
2587
|
)
|
2505
2588
|
except Exception as e:
|
2506
2589
|
logger.error(
|
@@ -2525,11 +2608,20 @@ class Page(
|
|
2525
2608
|
include_boundaries="start",
|
2526
2609
|
y_threshold=5.0,
|
2527
2610
|
bounding_box=None,
|
2611
|
+
orientation="vertical",
|
2528
2612
|
) -> "ElementCollection[Region]":
|
2529
2613
|
"""
|
2530
2614
|
Get sections of a page defined by start/end elements.
|
2531
2615
|
Uses the page-level implementation.
|
2532
2616
|
|
2617
|
+
Args:
|
2618
|
+
start_elements: Elements or selector string that mark the start of sections
|
2619
|
+
end_elements: Elements or selector string that mark the end of sections
|
2620
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2621
|
+
y_threshold: Threshold for vertical alignment (only used for vertical orientation)
|
2622
|
+
bounding_box: Optional bounding box to constrain sections
|
2623
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
2624
|
+
|
2533
2625
|
Returns:
|
2534
2626
|
An ElementCollection containing the found Region objects.
|
2535
2627
|
"""
|
@@ -2566,10 +2658,23 @@ class Page(
|
|
2566
2658
|
if include_boundaries not in valid_inclusions:
|
2567
2659
|
raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
|
2568
2660
|
|
2569
|
-
if not start_elements:
|
2570
|
-
# Return an empty ElementCollection if no
|
2661
|
+
if not start_elements and not end_elements:
|
2662
|
+
# Return an empty ElementCollection if no boundary elements at all
|
2571
2663
|
return ElementCollection([])
|
2572
2664
|
|
2665
|
+
# If we only have end elements, create implicit start elements
|
2666
|
+
if not start_elements and end_elements:
|
2667
|
+
# Delegate to PageCollection implementation for consistency
|
2668
|
+
from natural_pdf.core.page_collection import PageCollection
|
2669
|
+
|
2670
|
+
pages = PageCollection([self])
|
2671
|
+
return pages.get_sections(
|
2672
|
+
start_elements=start_elements,
|
2673
|
+
end_elements=end_elements,
|
2674
|
+
include_boundaries=include_boundaries,
|
2675
|
+
orientation=orientation,
|
2676
|
+
)
|
2677
|
+
|
2573
2678
|
# Combine start and end elements with their type
|
2574
2679
|
all_boundaries = []
|
2575
2680
|
for el in start_elements:
|
@@ -2577,11 +2682,14 @@ class Page(
|
|
2577
2682
|
for el in end_elements:
|
2578
2683
|
all_boundaries.append((el, "end"))
|
2579
2684
|
|
2580
|
-
# Sort all boundary elements
|
2685
|
+
# Sort all boundary elements based on orientation
|
2581
2686
|
try:
|
2582
|
-
|
2687
|
+
if orientation == "vertical":
|
2688
|
+
all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
|
2689
|
+
else: # horizontal
|
2690
|
+
all_boundaries.sort(key=lambda x: (x[0].x0, x[0].top))
|
2583
2691
|
except AttributeError as e:
|
2584
|
-
logger.error(f"Error sorting boundaries: Element missing
|
2692
|
+
logger.error(f"Error sorting boundaries: Element missing position attribute? {e}")
|
2585
2693
|
return ElementCollection([]) # Cannot proceed if elements lack position
|
2586
2694
|
|
2587
2695
|
# Process sorted boundaries to find sections
|
@@ -2593,72 +2701,126 @@ class Page(
|
|
2593
2701
|
# If we have an active section, this start implicitly ends it
|
2594
2702
|
if active_section_started:
|
2595
2703
|
end_boundary_el = element # Use this start as the end boundary
|
2596
|
-
# Determine region boundaries
|
2704
|
+
# Determine region boundaries based on orientation
|
2705
|
+
if orientation == "vertical":
|
2706
|
+
sec_top = (
|
2707
|
+
current_start_element.top
|
2708
|
+
if include_boundaries in ["start", "both"]
|
2709
|
+
else current_start_element.bottom
|
2710
|
+
)
|
2711
|
+
sec_bottom = (
|
2712
|
+
end_boundary_el.top
|
2713
|
+
if include_boundaries not in ["end", "both"]
|
2714
|
+
else end_boundary_el.bottom
|
2715
|
+
)
|
2716
|
+
|
2717
|
+
if sec_top < sec_bottom: # Ensure valid region
|
2718
|
+
x0, _, x1, _ = get_bounds()
|
2719
|
+
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
2720
|
+
region.start_element = current_start_element
|
2721
|
+
region.end_element = end_boundary_el # Mark the element that ended it
|
2722
|
+
region.is_end_next_start = True # Mark how it ended
|
2723
|
+
regions.append(region)
|
2724
|
+
else: # horizontal
|
2725
|
+
sec_left = (
|
2726
|
+
current_start_element.x0
|
2727
|
+
if include_boundaries in ["start", "both"]
|
2728
|
+
else current_start_element.x1
|
2729
|
+
)
|
2730
|
+
sec_right = (
|
2731
|
+
end_boundary_el.x0
|
2732
|
+
if include_boundaries not in ["end", "both"]
|
2733
|
+
else end_boundary_el.x1
|
2734
|
+
)
|
2735
|
+
|
2736
|
+
if sec_left < sec_right: # Ensure valid region
|
2737
|
+
_, y0, _, y1 = get_bounds()
|
2738
|
+
region = self.create_region(sec_left, y0, sec_right, y1)
|
2739
|
+
region.start_element = current_start_element
|
2740
|
+
region.end_element = end_boundary_el # Mark the element that ended it
|
2741
|
+
region.is_end_next_start = True # Mark how it ended
|
2742
|
+
regions.append(region)
|
2743
|
+
active_section_started = False # Reset for the new start
|
2744
|
+
|
2745
|
+
# Set this as the potential start of the next section
|
2746
|
+
current_start_element = element
|
2747
|
+
active_section_started = True
|
2748
|
+
|
2749
|
+
elif element_type == "end" and active_section_started:
|
2750
|
+
# We found an explicit end for the current section
|
2751
|
+
end_boundary_el = element
|
2752
|
+
if orientation == "vertical":
|
2597
2753
|
sec_top = (
|
2598
2754
|
current_start_element.top
|
2599
2755
|
if include_boundaries in ["start", "both"]
|
2600
2756
|
else current_start_element.bottom
|
2601
2757
|
)
|
2602
2758
|
sec_bottom = (
|
2603
|
-
end_boundary_el.
|
2604
|
-
if include_boundaries
|
2605
|
-
else end_boundary_el.
|
2759
|
+
end_boundary_el.bottom
|
2760
|
+
if include_boundaries in ["end", "both"]
|
2761
|
+
else end_boundary_el.top
|
2606
2762
|
)
|
2607
2763
|
|
2608
2764
|
if sec_top < sec_bottom: # Ensure valid region
|
2609
2765
|
x0, _, x1, _ = get_bounds()
|
2610
2766
|
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
2611
2767
|
region.start_element = current_start_element
|
2612
|
-
region.end_element = end_boundary_el
|
2613
|
-
region.is_end_next_start =
|
2768
|
+
region.end_element = end_boundary_el
|
2769
|
+
region.is_end_next_start = False
|
2614
2770
|
regions.append(region)
|
2615
|
-
|
2771
|
+
else: # horizontal
|
2772
|
+
sec_left = (
|
2773
|
+
current_start_element.x0
|
2774
|
+
if include_boundaries in ["start", "both"]
|
2775
|
+
else current_start_element.x1
|
2776
|
+
)
|
2777
|
+
sec_right = (
|
2778
|
+
end_boundary_el.x1
|
2779
|
+
if include_boundaries in ["end", "both"]
|
2780
|
+
else end_boundary_el.x0
|
2781
|
+
)
|
2616
2782
|
|
2617
|
-
|
2618
|
-
|
2619
|
-
|
2783
|
+
if sec_left < sec_right: # Ensure valid region
|
2784
|
+
_, y0, _, y1 = get_bounds()
|
2785
|
+
region = self.create_region(sec_left, y0, sec_right, y1)
|
2786
|
+
region.start_element = current_start_element
|
2787
|
+
region.end_element = end_boundary_el
|
2788
|
+
region.is_end_next_start = False
|
2789
|
+
regions.append(region)
|
2620
2790
|
|
2621
|
-
|
2622
|
-
|
2623
|
-
|
2791
|
+
# Reset: section ended explicitly
|
2792
|
+
current_start_element = None
|
2793
|
+
active_section_started = False
|
2794
|
+
|
2795
|
+
# Handle the last section if it was started but never explicitly ended
|
2796
|
+
if active_section_started:
|
2797
|
+
if orientation == "vertical":
|
2624
2798
|
sec_top = (
|
2625
2799
|
current_start_element.top
|
2626
2800
|
if include_boundaries in ["start", "both"]
|
2627
2801
|
else current_start_element.bottom
|
2628
2802
|
)
|
2629
|
-
|
2630
|
-
|
2631
|
-
|
2632
|
-
|
2803
|
+
x0, _, x1, page_bottom = get_bounds()
|
2804
|
+
if sec_top < page_bottom:
|
2805
|
+
region = self.create_region(x0, sec_top, x1, page_bottom)
|
2806
|
+
region.start_element = current_start_element
|
2807
|
+
region.end_element = None # Ended by page end
|
2808
|
+
region.is_end_next_start = False
|
2809
|
+
regions.append(region)
|
2810
|
+
else: # horizontal
|
2811
|
+
sec_left = (
|
2812
|
+
current_start_element.x0
|
2813
|
+
if include_boundaries in ["start", "both"]
|
2814
|
+
else current_start_element.x1
|
2633
2815
|
)
|
2634
|
-
|
2635
|
-
if
|
2636
|
-
|
2637
|
-
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
2816
|
+
page_left, y0, page_right, y1 = get_bounds()
|
2817
|
+
if sec_left < page_right:
|
2818
|
+
region = self.create_region(sec_left, y0, page_right, y1)
|
2638
2819
|
region.start_element = current_start_element
|
2639
|
-
region.end_element =
|
2820
|
+
region.end_element = None # Ended by page end
|
2640
2821
|
region.is_end_next_start = False
|
2641
2822
|
regions.append(region)
|
2642
2823
|
|
2643
|
-
# Reset: section ended explicitly
|
2644
|
-
current_start_element = None
|
2645
|
-
active_section_started = False
|
2646
|
-
|
2647
|
-
# Handle the last section if it was started but never explicitly ended
|
2648
|
-
if active_section_started:
|
2649
|
-
sec_top = (
|
2650
|
-
current_start_element.top
|
2651
|
-
if include_boundaries in ["start", "both"]
|
2652
|
-
else current_start_element.bottom
|
2653
|
-
)
|
2654
|
-
x0, _, x1, page_bottom = get_bounds()
|
2655
|
-
if sec_top < page_bottom:
|
2656
|
-
region = self.create_region(x0, sec_top, x1, page_bottom)
|
2657
|
-
region.start_element = current_start_element
|
2658
|
-
region.end_element = None # Ended by page end
|
2659
|
-
region.is_end_next_start = False
|
2660
|
-
regions.append(region)
|
2661
|
-
|
2662
2824
|
return ElementCollection(regions)
|
2663
2825
|
|
2664
2826
|
def __repr__(self) -> str:
|