natural-pdf 0.2.5__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -717,14 +717,23 @@ class Page(
717
717
 
718
718
  # Add PDF-level exclusions if we have a parent PDF
719
719
  if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
720
+ # Get existing labels to check for duplicates
721
+ existing_labels = set()
722
+ for exc in all_exclusions:
723
+ if len(exc) >= 2 and exc[1]: # Has a label
724
+ existing_labels.add(exc[1])
725
+
720
726
  for pdf_exclusion in self._parent._exclusions:
721
- # Check if this exclusion is already in our list (avoid duplicates)
722
- if pdf_exclusion not in all_exclusions:
723
- # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
724
- if len(pdf_exclusion) == 2:
725
- # Convert to 3-tuple format with default method
726
- pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
727
- all_exclusions.append(pdf_exclusion)
727
+ # Check if this exclusion label is already in our list (avoid duplicates)
728
+ label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
729
+ if label and label in existing_labels:
730
+ continue # Skip this exclusion as it's already been applied
731
+
732
+ # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
733
+ if len(pdf_exclusion) == 2:
734
+ # Convert to 3-tuple format with default method
735
+ pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
736
+ all_exclusions.append(pdf_exclusion)
728
737
 
729
738
  if debug:
730
739
  print(
@@ -829,6 +838,36 @@ class Page(
829
838
  regions.append(exclusion_item) # Label is already on the Region object
830
839
  if debug:
831
840
  print(f" - Added direct region '{label}': {exclusion_item}")
841
+
842
+ # Process string selectors (from PDF-level exclusions)
843
+ elif isinstance(exclusion_item, str):
844
+ selector_str = exclusion_item
845
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
846
+
847
+ if debug:
848
+ print(
849
+ f" - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
850
+ )
851
+
852
+ if method == "region":
853
+ # Convert each matching element to a region
854
+ for el in matching_elements:
855
+ try:
856
+ bbox_coords = (
857
+ float(el.x0),
858
+ float(el.top),
859
+ float(el.x1),
860
+ float(el.bottom),
861
+ )
862
+ region = Region(self, bbox_coords, label=label)
863
+ regions.append(region)
864
+ if debug:
865
+ print(f" ✓ Added region from selector match: {bbox_coords}")
866
+ except Exception as e:
867
+ if debug:
868
+ print(f" ✗ Failed to create region from element: {e}")
869
+ # If method is "element", it will be handled in _filter_elements_by_exclusions
870
+
832
871
  # Element-based exclusions are not converted to regions here
833
872
  # They will be handled separately in _filter_elements_by_exclusions
834
873
 
@@ -852,7 +891,16 @@ class Page(
852
891
  Returns:
853
892
  A new list containing only the elements not excluded.
854
893
  """
855
- if not self._exclusions:
894
+ # Check both page-level and PDF-level exclusions
895
+ has_page_exclusions = bool(self._exclusions)
896
+ has_pdf_exclusions = (
897
+ hasattr(self, "_parent")
898
+ and self._parent
899
+ and hasattr(self._parent, "_exclusions")
900
+ and bool(self._parent._exclusions)
901
+ )
902
+
903
+ if not has_page_exclusions and not has_pdf_exclusions:
856
904
  if debug_exclusions:
857
905
  print(
858
906
  f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
@@ -865,9 +913,15 @@ class Page(
865
913
  )
866
914
 
867
915
  # Collect element-based exclusions
868
- excluded_elements = set() # Use set for O(1) lookup
916
+ # Store element bboxes for comparison instead of object ids
917
+ excluded_element_bboxes = set() # Use set for O(1) lookup
918
+
919
+ # Process both page-level and PDF-level exclusions
920
+ all_exclusions = list(self._exclusions) if has_page_exclusions else []
921
+ if has_pdf_exclusions:
922
+ all_exclusions.extend(self._parent._exclusions)
869
923
 
870
- for exclusion_data in self._exclusions:
924
+ for exclusion_data in all_exclusions:
871
925
  # Handle both old format (2-tuple) and new format (3-tuple)
872
926
  if len(exclusion_data) == 2:
873
927
  exclusion_item, label = exclusion_data
@@ -883,16 +937,31 @@ class Page(
883
937
  if isinstance(exclusion_item, Region):
884
938
  continue
885
939
 
940
+ # Handle string selectors for element-based exclusions
941
+ if isinstance(exclusion_item, str) and method == "element":
942
+ selector_str = exclusion_item
943
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
944
+ for el in matching_elements:
945
+ if hasattr(el, "bbox"):
946
+ bbox = tuple(el.bbox)
947
+ excluded_element_bboxes.add(bbox)
948
+ if debug_exclusions:
949
+ print(
950
+ f" - Added element exclusion from selector '{selector_str}': {bbox}"
951
+ )
952
+
886
953
  # Handle element-based exclusions
887
- if method == "element" and hasattr(exclusion_item, "bbox"):
888
- excluded_elements.add(id(exclusion_item))
954
+ elif method == "element" and hasattr(exclusion_item, "bbox"):
955
+ # Store bbox tuple for comparison
956
+ bbox = tuple(exclusion_item.bbox)
957
+ excluded_element_bboxes.add(bbox)
889
958
  if debug_exclusions:
890
- print(f" - Added element exclusion: {exclusion_item}")
959
+ print(f" - Added element exclusion with bbox {bbox}: {exclusion_item}")
891
960
 
892
961
  if debug_exclusions:
893
962
  print(
894
963
  f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
895
- f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
964
+ f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
896
965
  )
897
966
 
898
967
  filtered_elements = []
@@ -903,7 +972,7 @@ class Page(
903
972
  exclude = False
904
973
 
905
974
  # Check element-based exclusions first (faster)
906
- if id(element) in excluded_elements:
975
+ if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
907
976
  exclude = True
908
977
  element_excluded_count += 1
909
978
  if debug_exclusions:
@@ -2487,10 +2556,23 @@ class Page(
2487
2556
  return self
2488
2557
 
2489
2558
  def get_section_between(
2490
- self, start_element=None, end_element=None, include_boundaries="both"
2559
+ self,
2560
+ start_element=None,
2561
+ end_element=None,
2562
+ include_boundaries="both",
2563
+ orientation="vertical",
2491
2564
  ) -> Optional["Region"]: # Return Optional
2492
2565
  """
2493
2566
  Get a section between two elements on this page.
2567
+
2568
+ Args:
2569
+ start_element: Element marking the start of the section
2570
+ end_element: Element marking the end of the section
2571
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2572
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
2573
+
2574
+ Returns:
2575
+ Region representing the section
2494
2576
  """
2495
2577
  # Create a full-page region to operate within
2496
2578
  page_region = self.create_region(0, 0, self.width, self.height)
@@ -2501,6 +2583,7 @@ class Page(
2501
2583
  start_element=start_element,
2502
2584
  end_element=end_element,
2503
2585
  include_boundaries=include_boundaries,
2586
+ orientation=orientation,
2504
2587
  )
2505
2588
  except Exception as e:
2506
2589
  logger.error(
@@ -2525,11 +2608,20 @@ class Page(
2525
2608
  include_boundaries="start",
2526
2609
  y_threshold=5.0,
2527
2610
  bounding_box=None,
2611
+ orientation="vertical",
2528
2612
  ) -> "ElementCollection[Region]":
2529
2613
  """
2530
2614
  Get sections of a page defined by start/end elements.
2531
2615
  Uses the page-level implementation.
2532
2616
 
2617
+ Args:
2618
+ start_elements: Elements or selector string that mark the start of sections
2619
+ end_elements: Elements or selector string that mark the end of sections
2620
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2621
+ y_threshold: Threshold for vertical alignment (only used for vertical orientation)
2622
+ bounding_box: Optional bounding box to constrain sections
2623
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
2624
+
2533
2625
  Returns:
2534
2626
  An ElementCollection containing the found Region objects.
2535
2627
  """
@@ -2566,10 +2658,23 @@ class Page(
2566
2658
  if include_boundaries not in valid_inclusions:
2567
2659
  raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
2568
2660
 
2569
- if not start_elements:
2570
- # Return an empty ElementCollection if no start elements
2661
+ if not start_elements and not end_elements:
2662
+ # Return an empty ElementCollection if no boundary elements at all
2571
2663
  return ElementCollection([])
2572
2664
 
2665
+ # If we only have end elements, create implicit start elements
2666
+ if not start_elements and end_elements:
2667
+ # Delegate to PageCollection implementation for consistency
2668
+ from natural_pdf.core.page_collection import PageCollection
2669
+
2670
+ pages = PageCollection([self])
2671
+ return pages.get_sections(
2672
+ start_elements=start_elements,
2673
+ end_elements=end_elements,
2674
+ include_boundaries=include_boundaries,
2675
+ orientation=orientation,
2676
+ )
2677
+
2573
2678
  # Combine start and end elements with their type
2574
2679
  all_boundaries = []
2575
2680
  for el in start_elements:
@@ -2577,11 +2682,14 @@ class Page(
2577
2682
  for el in end_elements:
2578
2683
  all_boundaries.append((el, "end"))
2579
2684
 
2580
- # Sort all boundary elements primarily by top, then x0
2685
+ # Sort all boundary elements based on orientation
2581
2686
  try:
2582
- all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
2687
+ if orientation == "vertical":
2688
+ all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
2689
+ else: # horizontal
2690
+ all_boundaries.sort(key=lambda x: (x[0].x0, x[0].top))
2583
2691
  except AttributeError as e:
2584
- logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
2692
+ logger.error(f"Error sorting boundaries: Element missing position attribute? {e}")
2585
2693
  return ElementCollection([]) # Cannot proceed if elements lack position
2586
2694
 
2587
2695
  # Process sorted boundaries to find sections
@@ -2593,72 +2701,126 @@ class Page(
2593
2701
  # If we have an active section, this start implicitly ends it
2594
2702
  if active_section_started:
2595
2703
  end_boundary_el = element # Use this start as the end boundary
2596
- # Determine region boundaries
2704
+ # Determine region boundaries based on orientation
2705
+ if orientation == "vertical":
2706
+ sec_top = (
2707
+ current_start_element.top
2708
+ if include_boundaries in ["start", "both"]
2709
+ else current_start_element.bottom
2710
+ )
2711
+ sec_bottom = (
2712
+ end_boundary_el.top
2713
+ if include_boundaries not in ["end", "both"]
2714
+ else end_boundary_el.bottom
2715
+ )
2716
+
2717
+ if sec_top < sec_bottom: # Ensure valid region
2718
+ x0, _, x1, _ = get_bounds()
2719
+ region = self.create_region(x0, sec_top, x1, sec_bottom)
2720
+ region.start_element = current_start_element
2721
+ region.end_element = end_boundary_el # Mark the element that ended it
2722
+ region.is_end_next_start = True # Mark how it ended
2723
+ regions.append(region)
2724
+ else: # horizontal
2725
+ sec_left = (
2726
+ current_start_element.x0
2727
+ if include_boundaries in ["start", "both"]
2728
+ else current_start_element.x1
2729
+ )
2730
+ sec_right = (
2731
+ end_boundary_el.x0
2732
+ if include_boundaries not in ["end", "both"]
2733
+ else end_boundary_el.x1
2734
+ )
2735
+
2736
+ if sec_left < sec_right: # Ensure valid region
2737
+ _, y0, _, y1 = get_bounds()
2738
+ region = self.create_region(sec_left, y0, sec_right, y1)
2739
+ region.start_element = current_start_element
2740
+ region.end_element = end_boundary_el # Mark the element that ended it
2741
+ region.is_end_next_start = True # Mark how it ended
2742
+ regions.append(region)
2743
+ active_section_started = False # Reset for the new start
2744
+
2745
+ # Set this as the potential start of the next section
2746
+ current_start_element = element
2747
+ active_section_started = True
2748
+
2749
+ elif element_type == "end" and active_section_started:
2750
+ # We found an explicit end for the current section
2751
+ end_boundary_el = element
2752
+ if orientation == "vertical":
2597
2753
  sec_top = (
2598
2754
  current_start_element.top
2599
2755
  if include_boundaries in ["start", "both"]
2600
2756
  else current_start_element.bottom
2601
2757
  )
2602
2758
  sec_bottom = (
2603
- end_boundary_el.top
2604
- if include_boundaries not in ["end", "both"]
2605
- else end_boundary_el.bottom
2759
+ end_boundary_el.bottom
2760
+ if include_boundaries in ["end", "both"]
2761
+ else end_boundary_el.top
2606
2762
  )
2607
2763
 
2608
2764
  if sec_top < sec_bottom: # Ensure valid region
2609
2765
  x0, _, x1, _ = get_bounds()
2610
2766
  region = self.create_region(x0, sec_top, x1, sec_bottom)
2611
2767
  region.start_element = current_start_element
2612
- region.end_element = end_boundary_el # Mark the element that ended it
2613
- region.is_end_next_start = True # Mark how it ended
2768
+ region.end_element = end_boundary_el
2769
+ region.is_end_next_start = False
2614
2770
  regions.append(region)
2615
- active_section_started = False # Reset for the new start
2771
+ else: # horizontal
2772
+ sec_left = (
2773
+ current_start_element.x0
2774
+ if include_boundaries in ["start", "both"]
2775
+ else current_start_element.x1
2776
+ )
2777
+ sec_right = (
2778
+ end_boundary_el.x1
2779
+ if include_boundaries in ["end", "both"]
2780
+ else end_boundary_el.x0
2781
+ )
2616
2782
 
2617
- # Set this as the potential start of the next section
2618
- current_start_element = element
2619
- active_section_started = True
2783
+ if sec_left < sec_right: # Ensure valid region
2784
+ _, y0, _, y1 = get_bounds()
2785
+ region = self.create_region(sec_left, y0, sec_right, y1)
2786
+ region.start_element = current_start_element
2787
+ region.end_element = end_boundary_el
2788
+ region.is_end_next_start = False
2789
+ regions.append(region)
2620
2790
 
2621
- elif element_type == "end" and active_section_started:
2622
- # We found an explicit end for the current section
2623
- end_boundary_el = element
2791
+ # Reset: section ended explicitly
2792
+ current_start_element = None
2793
+ active_section_started = False
2794
+
2795
+ # Handle the last section if it was started but never explicitly ended
2796
+ if active_section_started:
2797
+ if orientation == "vertical":
2624
2798
  sec_top = (
2625
2799
  current_start_element.top
2626
2800
  if include_boundaries in ["start", "both"]
2627
2801
  else current_start_element.bottom
2628
2802
  )
2629
- sec_bottom = (
2630
- end_boundary_el.bottom
2631
- if include_boundaries in ["end", "both"]
2632
- else end_boundary_el.top
2803
+ x0, _, x1, page_bottom = get_bounds()
2804
+ if sec_top < page_bottom:
2805
+ region = self.create_region(x0, sec_top, x1, page_bottom)
2806
+ region.start_element = current_start_element
2807
+ region.end_element = None # Ended by page end
2808
+ region.is_end_next_start = False
2809
+ regions.append(region)
2810
+ else: # horizontal
2811
+ sec_left = (
2812
+ current_start_element.x0
2813
+ if include_boundaries in ["start", "both"]
2814
+ else current_start_element.x1
2633
2815
  )
2634
-
2635
- if sec_top < sec_bottom: # Ensure valid region
2636
- x0, _, x1, _ = get_bounds()
2637
- region = self.create_region(x0, sec_top, x1, sec_bottom)
2816
+ page_left, y0, page_right, y1 = get_bounds()
2817
+ if sec_left < page_right:
2818
+ region = self.create_region(sec_left, y0, page_right, y1)
2638
2819
  region.start_element = current_start_element
2639
- region.end_element = end_boundary_el
2820
+ region.end_element = None # Ended by page end
2640
2821
  region.is_end_next_start = False
2641
2822
  regions.append(region)
2642
2823
 
2643
- # Reset: section ended explicitly
2644
- current_start_element = None
2645
- active_section_started = False
2646
-
2647
- # Handle the last section if it was started but never explicitly ended
2648
- if active_section_started:
2649
- sec_top = (
2650
- current_start_element.top
2651
- if include_boundaries in ["start", "both"]
2652
- else current_start_element.bottom
2653
- )
2654
- x0, _, x1, page_bottom = get_bounds()
2655
- if sec_top < page_bottom:
2656
- region = self.create_region(x0, sec_top, x1, page_bottom)
2657
- region.start_element = current_start_element
2658
- region.end_element = None # Ended by page end
2659
- region.is_end_next_start = False
2660
- regions.append(region)
2661
-
2662
2824
  return ElementCollection(regions)
2663
2825
 
2664
2826
  def __repr__(self) -> str: