natural-pdf 0.2.5__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +94 -42
- natural_pdf/core/page.py +224 -62
- natural_pdf/core/page_collection.py +261 -50
- natural_pdf/core/page_groupby.py +20 -2
- natural_pdf/core/pdf.py +17 -14
- natural_pdf/core/render_spec.py +20 -5
- natural_pdf/describe/base.py +1 -1
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +84 -8
- natural_pdf/elements/element_collection.py +757 -20
- natural_pdf/elements/region.py +181 -48
- natural_pdf/flows/flow.py +3 -0
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/RECORD +20 -19
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.8.dist-info}/top_level.txt +0 -0
@@ -460,6 +460,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
460
460
|
end_elements=None,
|
461
461
|
new_section_on_page_break=False,
|
462
462
|
include_boundaries="both",
|
463
|
+
orientation="vertical",
|
463
464
|
) -> "ElementCollection[Region]":
|
464
465
|
"""
|
465
466
|
Extract sections from a page collection based on start/end elements.
|
@@ -469,6 +470,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
469
470
|
end_elements: Elements or selector string that mark the end of sections (optional)
|
470
471
|
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
471
472
|
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
473
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
472
474
|
|
473
475
|
Returns:
|
474
476
|
List of Region objects representing the extracted sections
|
@@ -511,6 +513,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
511
513
|
next_page = self.pages[i + 1]
|
512
514
|
top_region = Region(next_page, (0, 0, next_page.width, 1))
|
513
515
|
top_region.is_page_boundary = True # Mark it as a special boundary
|
516
|
+
# If start_elements is None, initialize it as an empty list
|
517
|
+
if start_elements is None:
|
518
|
+
start_elements = []
|
514
519
|
start_elements.append(top_region)
|
515
520
|
|
516
521
|
# Get all elements from all pages and sort them in document order
|
@@ -532,16 +537,23 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
532
537
|
first_page = self.pages[0]
|
533
538
|
first_start = Region(first_page, (0, 0, first_page.width, 1))
|
534
539
|
first_start.is_implicit_start = True
|
540
|
+
# Don't mark this as created from any end element, so it can pair with any end
|
535
541
|
start_elements.append(first_start)
|
536
542
|
|
537
543
|
# For each end element (except the last), add an implicit start after it
|
538
|
-
|
544
|
+
# Sort by page, then top, then bottom (for elements with same top), then x0
|
545
|
+
sorted_end_elements = sorted(
|
546
|
+
end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
|
547
|
+
)
|
539
548
|
for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
|
540
549
|
# Create implicit start element right after this end element
|
541
550
|
implicit_start = Region(
|
542
551
|
end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1)
|
543
552
|
)
|
544
553
|
implicit_start.is_implicit_start = True
|
554
|
+
# Track which end element this implicit start was created from
|
555
|
+
# to avoid pairing them together (which would create zero height)
|
556
|
+
implicit_start.created_from_end = end_elem
|
545
557
|
start_elements.append(implicit_start)
|
546
558
|
|
547
559
|
# Mark section boundaries
|
@@ -606,17 +618,20 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
606
618
|
|
607
619
|
# Sort boundaries by page index, then by actual document position
|
608
620
|
def _sort_key(boundary):
|
609
|
-
"""Sort boundaries by (page_idx,
|
621
|
+
"""Sort boundaries by (page_idx, position, priority)."""
|
610
622
|
page_idx = boundary["page_idx"]
|
611
623
|
element = boundary["element"]
|
612
624
|
|
613
|
-
#
|
614
|
-
|
625
|
+
# Position on the page based on orientation
|
626
|
+
if orientation == "vertical":
|
627
|
+
pos = getattr(element, "top", 0.0)
|
628
|
+
else: # horizontal
|
629
|
+
pos = getattr(element, "x0", 0.0)
|
615
630
|
|
616
631
|
# Ensure starts come before ends at the same coordinate
|
617
632
|
priority = 0 if boundary["type"] == "start" else 1
|
618
633
|
|
619
|
-
return (page_idx,
|
634
|
+
return (page_idx, pos, priority)
|
620
635
|
|
621
636
|
section_boundaries.sort(key=_sort_key)
|
622
637
|
|
@@ -624,10 +639,17 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
624
639
|
sections = []
|
625
640
|
|
626
641
|
# --- Helper: build a FlowRegion spanning multiple pages ---
|
627
|
-
def _build_flow_region(start_el, end_el):
|
628
|
-
"""Return a FlowRegion that covers from *start_el* to *end_el
|
629
|
-
If *end_el* is None, the region continues to the bottom of the last
|
630
|
-
page in this PageCollection.
|
642
|
+
def _build_flow_region(start_el, end_el, include_boundaries="both", orientation="vertical"):
|
643
|
+
"""Return a FlowRegion that covers from *start_el* to *end_el*.
|
644
|
+
If *end_el* is None, the region continues to the bottom/right of the last
|
645
|
+
page in this PageCollection.
|
646
|
+
|
647
|
+
Args:
|
648
|
+
start_el: Start element
|
649
|
+
end_el: End element
|
650
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
651
|
+
orientation: 'vertical' or 'horizontal' - determines section direction
|
652
|
+
"""
|
631
653
|
# Local imports to avoid top-level cycles
|
632
654
|
from natural_pdf.elements.region import Region
|
633
655
|
from natural_pdf.flows.element import FlowElement
|
@@ -639,12 +661,24 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
639
661
|
|
640
662
|
parts: list[Region] = []
|
641
663
|
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
664
|
+
if orientation == "vertical":
|
665
|
+
# Determine the start_top based on include_boundaries
|
666
|
+
start_top = start_el.top
|
667
|
+
if include_boundaries == "none" or include_boundaries == "end":
|
668
|
+
# Exclude start boundary
|
669
|
+
start_top = start_el.bottom if hasattr(start_el, "bottom") else start_el.top
|
670
|
+
|
671
|
+
# Slice of first page beginning at *start_top*
|
672
|
+
parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
|
673
|
+
else: # horizontal
|
674
|
+
# Determine the start_left based on include_boundaries
|
675
|
+
start_left = start_el.x0
|
676
|
+
if include_boundaries == "none" or include_boundaries == "end":
|
677
|
+
# Exclude start boundary
|
678
|
+
start_left = start_el.x1 if hasattr(start_el, "x1") else start_el.x0
|
679
|
+
|
680
|
+
# Slice of first page beginning at *start_left*
|
681
|
+
parts.append(Region(start_pg, (start_left, 0, start_pg.width, start_pg.height)))
|
648
682
|
|
649
683
|
# Full middle pages
|
650
684
|
for pg_idx in range(start_pg.index + 1, end_pg.index):
|
@@ -653,10 +687,32 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
653
687
|
|
654
688
|
# Slice of last page (if distinct)
|
655
689
|
if end_pg is not start_pg:
|
656
|
-
|
657
|
-
|
690
|
+
if orientation == "vertical":
|
691
|
+
# Determine the bottom based on include_boundaries
|
692
|
+
if end_el is not None:
|
693
|
+
if include_boundaries == "none" or include_boundaries == "start":
|
694
|
+
# Exclude end boundary
|
695
|
+
bottom = end_el.top if hasattr(end_el, "top") else end_el.bottom
|
696
|
+
else:
|
697
|
+
# Include end boundary
|
698
|
+
bottom = end_el.bottom
|
699
|
+
else:
|
700
|
+
bottom = end_pg.height
|
701
|
+
parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
|
702
|
+
else: # horizontal
|
703
|
+
# Determine the right based on include_boundaries
|
704
|
+
if end_el is not None:
|
705
|
+
if include_boundaries == "none" or include_boundaries == "start":
|
706
|
+
# Exclude end boundary
|
707
|
+
right = end_el.x0 if hasattr(end_el, "x0") else end_el.x1
|
708
|
+
else:
|
709
|
+
# Include end boundary
|
710
|
+
right = end_el.x1
|
711
|
+
else:
|
712
|
+
right = end_pg.width
|
713
|
+
parts.append(Region(end_pg, (0, 0, right, end_pg.height)))
|
658
714
|
|
659
|
-
flow = Flow(segments=parts, arrangement=
|
715
|
+
flow = Flow(segments=parts, arrangement=orientation)
|
660
716
|
src_fe = FlowElement(physical_object=start_el, flow=flow)
|
661
717
|
return FlowRegion(
|
662
718
|
flow=flow,
|
@@ -680,26 +736,103 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
680
736
|
start_element = current_start["element"]
|
681
737
|
end_element = boundary["element"]
|
682
738
|
|
739
|
+
# Check if this is an implicit start created from this same end element
|
740
|
+
# This would create a zero-height section, so skip this pairing
|
741
|
+
if (
|
742
|
+
hasattr(start_element, "is_implicit_start")
|
743
|
+
and hasattr(start_element, "created_from_end")
|
744
|
+
and start_element.created_from_end is end_element
|
745
|
+
):
|
746
|
+
# Skip this pairing - keep current_start for next end element
|
747
|
+
continue
|
748
|
+
|
683
749
|
# If both elements are on the same page, use the page's get_section_between
|
684
750
|
if start_element.page == end_element.page:
|
685
751
|
# For implicit start elements, create a region from the top of the page
|
686
752
|
if hasattr(start_element, "is_implicit_start"):
|
687
753
|
from natural_pdf.elements.region import Region
|
688
754
|
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
755
|
+
# Adjust boundaries based on include_boundaries parameter and orientation
|
756
|
+
if orientation == "vertical":
|
757
|
+
top = start_element.top
|
758
|
+
bottom = end_element.bottom
|
759
|
+
|
760
|
+
if include_boundaries == "none":
|
761
|
+
# Exclude both boundaries - move past them
|
762
|
+
top = (
|
763
|
+
start_element.bottom
|
764
|
+
if hasattr(start_element, "bottom")
|
765
|
+
else start_element.top
|
766
|
+
)
|
767
|
+
bottom = (
|
768
|
+
end_element.top
|
769
|
+
if hasattr(end_element, "top")
|
770
|
+
else end_element.bottom
|
771
|
+
)
|
772
|
+
elif include_boundaries == "start":
|
773
|
+
# Include start, exclude end
|
774
|
+
bottom = (
|
775
|
+
end_element.top
|
776
|
+
if hasattr(end_element, "top")
|
777
|
+
else end_element.bottom
|
778
|
+
)
|
779
|
+
elif include_boundaries == "end":
|
780
|
+
# Exclude start, include end
|
781
|
+
top = (
|
782
|
+
start_element.bottom
|
783
|
+
if hasattr(start_element, "bottom")
|
784
|
+
else start_element.top
|
785
|
+
)
|
786
|
+
# "both" is default - no adjustment needed
|
787
|
+
|
788
|
+
section = Region(
|
789
|
+
start_element.page,
|
790
|
+
(0, top, start_element.page.width, bottom),
|
791
|
+
)
|
792
|
+
else: # horizontal
|
793
|
+
left = start_element.x0
|
794
|
+
right = end_element.x1
|
795
|
+
|
796
|
+
if include_boundaries == "none":
|
797
|
+
# Exclude both boundaries - move past them
|
798
|
+
left = (
|
799
|
+
start_element.x1
|
800
|
+
if hasattr(start_element, "x1")
|
801
|
+
else start_element.x0
|
802
|
+
)
|
803
|
+
right = (
|
804
|
+
end_element.x0 if hasattr(end_element, "x0") else end_element.x1
|
805
|
+
)
|
806
|
+
elif include_boundaries == "start":
|
807
|
+
# Include start, exclude end
|
808
|
+
right = (
|
809
|
+
end_element.x0 if hasattr(end_element, "x0") else end_element.x1
|
810
|
+
)
|
811
|
+
elif include_boundaries == "end":
|
812
|
+
# Exclude start, include end
|
813
|
+
left = (
|
814
|
+
start_element.x1
|
815
|
+
if hasattr(start_element, "x1")
|
816
|
+
else start_element.x0
|
817
|
+
)
|
818
|
+
# "both" is default - no adjustment needed
|
819
|
+
|
820
|
+
section = Region(
|
821
|
+
start_element.page,
|
822
|
+
(left, 0, right, start_element.page.height),
|
823
|
+
)
|
693
824
|
section.start_element = start_element
|
694
825
|
section.boundary_element_found = end_element
|
695
826
|
else:
|
696
827
|
section = start_element.page.get_section_between(
|
697
|
-
start_element, end_element, include_boundaries
|
828
|
+
start_element, end_element, include_boundaries, orientation
|
698
829
|
)
|
699
830
|
sections.append(section)
|
700
831
|
else:
|
701
832
|
# Create FlowRegion spanning pages
|
702
|
-
flow_region = _build_flow_region(
|
833
|
+
flow_region = _build_flow_region(
|
834
|
+
start_element, end_element, include_boundaries, orientation
|
835
|
+
)
|
703
836
|
sections.append(flow_region)
|
704
837
|
|
705
838
|
current_start = None
|
@@ -709,35 +842,84 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
709
842
|
# Create a section from current_start to just before this boundary
|
710
843
|
start_element = current_start["element"]
|
711
844
|
|
712
|
-
#
|
845
|
+
# Create section from current start to just before this new start
|
713
846
|
if start_element.page == boundary["element"].page:
|
714
|
-
|
715
|
-
page_elements = [e for e in all_elements if e.page == start_element.page]
|
716
|
-
# Sort by position
|
717
|
-
page_elements.sort(key=lambda e: (e.top, e.x0))
|
718
|
-
|
719
|
-
# Find the last element before the boundary
|
720
|
-
end_idx = (
|
721
|
-
page_elements.index(boundary["element"]) - 1
|
722
|
-
if boundary["element"] in page_elements
|
723
|
-
else -1
|
724
|
-
)
|
725
|
-
end_element = page_elements[end_idx] if end_idx >= 0 else None
|
847
|
+
from natural_pdf.elements.region import Region
|
726
848
|
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
849
|
+
next_start = boundary["element"]
|
850
|
+
|
851
|
+
# Create section based on orientation
|
852
|
+
if orientation == "vertical":
|
853
|
+
# Determine vertical bounds
|
854
|
+
if include_boundaries in ["start", "both"]:
|
855
|
+
top = start_element.top
|
856
|
+
else:
|
857
|
+
top = start_element.bottom
|
858
|
+
|
859
|
+
# The section ends just before the next start
|
860
|
+
bottom = next_start.top
|
861
|
+
|
862
|
+
# Create the section with full page width
|
863
|
+
if top < bottom:
|
864
|
+
section = Region(
|
865
|
+
start_element.page, (0, top, start_element.page.width, bottom)
|
866
|
+
)
|
867
|
+
section.start_element = start_element
|
868
|
+
sections.append(section)
|
869
|
+
else: # horizontal
|
870
|
+
# Determine horizontal bounds
|
871
|
+
if include_boundaries in ["start", "both"]:
|
872
|
+
left = start_element.x0
|
873
|
+
else:
|
874
|
+
left = start_element.x1
|
875
|
+
|
876
|
+
# The section ends just before the next start
|
877
|
+
right = next_start.x0
|
878
|
+
|
879
|
+
# Create the section with full page height
|
880
|
+
if left < right:
|
881
|
+
section = Region(
|
882
|
+
start_element.page, (left, 0, right, start_element.page.height)
|
883
|
+
)
|
884
|
+
section.start_element = start_element
|
885
|
+
sections.append(section)
|
732
886
|
else:
|
733
887
|
# Cross-page section - create from current_start to the end of its page
|
734
888
|
from natural_pdf.elements.region import Region
|
735
889
|
|
736
890
|
start_page = start_element.page
|
737
891
|
|
738
|
-
# Handle implicit start elements
|
739
|
-
|
740
|
-
|
892
|
+
# Handle implicit start elements and respect include_boundaries
|
893
|
+
if orientation == "vertical":
|
894
|
+
if include_boundaries in ["none", "end"]:
|
895
|
+
# Exclude start boundary
|
896
|
+
start_top = (
|
897
|
+
start_element.bottom
|
898
|
+
if hasattr(start_element, "bottom")
|
899
|
+
else start_element.top
|
900
|
+
)
|
901
|
+
else:
|
902
|
+
# Include start boundary
|
903
|
+
start_top = start_element.top
|
904
|
+
|
905
|
+
region = Region(
|
906
|
+
start_page, (0, start_top, start_page.width, start_page.height)
|
907
|
+
)
|
908
|
+
else: # horizontal
|
909
|
+
if include_boundaries in ["none", "end"]:
|
910
|
+
# Exclude start boundary
|
911
|
+
start_left = (
|
912
|
+
start_element.x1
|
913
|
+
if hasattr(start_element, "x1")
|
914
|
+
else start_element.x0
|
915
|
+
)
|
916
|
+
else:
|
917
|
+
# Include start boundary
|
918
|
+
start_left = start_element.x0
|
919
|
+
|
920
|
+
region = Region(
|
921
|
+
start_page, (start_left, 0, start_page.width, start_page.height)
|
922
|
+
)
|
741
923
|
region.start_element = start_element
|
742
924
|
sections.append(region)
|
743
925
|
|
@@ -753,19 +935,48 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
753
935
|
# on the last page of the collection
|
754
936
|
last_page = self.pages[-1]
|
755
937
|
last_page_elements = [e for e in all_elements if e.page == last_page]
|
756
|
-
|
938
|
+
if orientation == "vertical":
|
939
|
+
last_page_elements.sort(key=lambda e: (e.top, e.x0))
|
940
|
+
else: # horizontal
|
941
|
+
last_page_elements.sort(key=lambda e: (e.x0, e.top))
|
757
942
|
end_element = last_page_elements[-1] if last_page_elements else None
|
758
943
|
|
759
944
|
# Create FlowRegion spanning multiple pages using helper
|
760
|
-
flow_region = _build_flow_region(
|
945
|
+
flow_region = _build_flow_region(
|
946
|
+
start_element, end_element, include_boundaries, orientation
|
947
|
+
)
|
761
948
|
sections.append(flow_region)
|
762
949
|
else:
|
763
950
|
# With start_elements only, create a section to the end of the current page
|
764
951
|
from natural_pdf.elements.region import Region
|
765
952
|
|
766
|
-
# Handle implicit start elements
|
767
|
-
|
768
|
-
|
953
|
+
# Handle implicit start elements and respect include_boundaries
|
954
|
+
if orientation == "vertical":
|
955
|
+
if include_boundaries in ["none", "end"]:
|
956
|
+
# Exclude start boundary
|
957
|
+
start_top = (
|
958
|
+
start_element.bottom
|
959
|
+
if hasattr(start_element, "bottom")
|
960
|
+
else start_element.top
|
961
|
+
)
|
962
|
+
else:
|
963
|
+
# Include start boundary
|
964
|
+
start_top = start_element.top
|
965
|
+
|
966
|
+
region = Region(start_page, (0, start_top, start_page.width, start_page.height))
|
967
|
+
else: # horizontal
|
968
|
+
if include_boundaries in ["none", "end"]:
|
969
|
+
# Exclude start boundary
|
970
|
+
start_left = (
|
971
|
+
start_element.x1 if hasattr(start_element, "x1") else start_element.x0
|
972
|
+
)
|
973
|
+
else:
|
974
|
+
# Include start boundary
|
975
|
+
start_left = start_element.x0
|
976
|
+
|
977
|
+
region = Region(
|
978
|
+
start_page, (start_left, 0, start_page.width, start_page.height)
|
979
|
+
)
|
769
980
|
region.start_element = start_element
|
770
981
|
sections.append(region)
|
771
982
|
|
natural_pdf/core/page_groupby.py
CHANGED
@@ -7,6 +7,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional,
|
|
7
7
|
|
8
8
|
from tqdm.auto import tqdm
|
9
9
|
|
10
|
+
from natural_pdf.utils.color_utils import format_color_value
|
11
|
+
|
10
12
|
if TYPE_CHECKING:
|
11
13
|
from natural_pdf.core.page import Page
|
12
14
|
from natural_pdf.core.page_collection import PageCollection
|
@@ -201,7 +203,15 @@ class PageGroupBy:
|
|
201
203
|
"""
|
202
204
|
groups = self._compute_groups()
|
203
205
|
for key, pages in groups.items():
|
204
|
-
|
206
|
+
# Format the key for display, converting colors to hex if needed
|
207
|
+
if isinstance(self.by, str):
|
208
|
+
# If grouped by a string selector, check if it's a color attribute
|
209
|
+
formatted_key = format_color_value(key, attr_name=self.by)
|
210
|
+
else:
|
211
|
+
# For callable grouping, try to format as color
|
212
|
+
formatted_key = format_color_value(key)
|
213
|
+
|
214
|
+
print(f"\n--- Group: {formatted_key} ({len(pages)} pages) ---")
|
205
215
|
pages.show(**kwargs)
|
206
216
|
|
207
217
|
def __len__(self) -> int:
|
@@ -220,7 +230,15 @@ class PageGroupBy:
|
|
220
230
|
print("-" * 40)
|
221
231
|
|
222
232
|
for i, (key, pages) in enumerate(groups.items()):
|
223
|
-
|
233
|
+
if key is None:
|
234
|
+
key_display = "None"
|
235
|
+
else:
|
236
|
+
# Format the key for display, converting colors to hex if needed
|
237
|
+
if isinstance(self.by, str):
|
238
|
+
formatted_key = format_color_value(key, attr_name=self.by)
|
239
|
+
else:
|
240
|
+
formatted_key = format_color_value(key)
|
241
|
+
key_display = f"'{formatted_key}'"
|
224
242
|
print(f"[{i}] {key_display}: {len(pages)} pages")
|
225
243
|
|
226
244
|
def __repr__(self) -> str:
|
natural_pdf/core/pdf.py
CHANGED
@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
|
|
252
252
|
logger.warning(f"Failed to apply region to page {cached.number}: {e}")
|
253
253
|
|
254
254
|
self._cache[index] = cached
|
255
|
+
|
256
|
+
# Also cache in the parent PDF's main page list if this is a slice
|
257
|
+
if (
|
258
|
+
hasattr(self._parent_pdf, "_pages")
|
259
|
+
and hasattr(self._parent_pdf._pages, "_cache")
|
260
|
+
and actual_page_index < len(self._parent_pdf._pages._cache)
|
261
|
+
and self._parent_pdf._pages._cache[actual_page_index] is None
|
262
|
+
):
|
263
|
+
self._parent_pdf._pages._cache[actual_page_index] = cached
|
264
|
+
|
255
265
|
return cached
|
256
266
|
|
257
267
|
# Sequence protocol ---------------------------------------------------
|
@@ -720,26 +730,16 @@ class PDF(
|
|
720
730
|
# Store for bookkeeping and lazy application
|
721
731
|
self._exclusions.append((exclusion_func, label))
|
722
732
|
|
723
|
-
#
|
724
|
-
|
725
|
-
if self._pages._cache[i] is not None: # Only apply to existing pages
|
726
|
-
try:
|
727
|
-
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
728
|
-
except Exception as e:
|
729
|
-
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
733
|
+
# Don't modify already-cached pages - they will get PDF-level exclusions
|
734
|
+
# dynamically through _get_exclusion_regions()
|
730
735
|
return self
|
731
736
|
|
732
737
|
# Fallback to original callable / Region behaviour ------------------
|
733
738
|
exclusion_data = (exclusion_func, label)
|
734
739
|
self._exclusions.append(exclusion_data)
|
735
740
|
|
736
|
-
#
|
737
|
-
|
738
|
-
if self._pages._cache[i] is not None: # Only apply to existing pages
|
739
|
-
try:
|
740
|
-
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
741
|
-
except Exception as e:
|
742
|
-
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
741
|
+
# Don't modify already-cached pages - they will get PDF-level exclusions
|
742
|
+
# dynamically through _get_exclusion_regions()
|
743
743
|
|
744
744
|
return self
|
745
745
|
|
@@ -1280,6 +1280,7 @@ class PDF(
|
|
1280
1280
|
end_elements=None,
|
1281
1281
|
new_section_on_page_break=False,
|
1282
1282
|
include_boundaries="both",
|
1283
|
+
orientation="vertical",
|
1283
1284
|
) -> "ElementCollection":
|
1284
1285
|
"""
|
1285
1286
|
Extract sections from the entire PDF based on start/end elements.
|
@@ -1292,6 +1293,7 @@ class PDF(
|
|
1292
1293
|
end_elements: Elements or selector string that mark the end of sections (optional)
|
1293
1294
|
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
1294
1295
|
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
1296
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
1295
1297
|
|
1296
1298
|
Returns:
|
1297
1299
|
ElementCollection of Region objects representing the extracted sections
|
@@ -1328,6 +1330,7 @@ class PDF(
|
|
1328
1330
|
end_elements=end_elements,
|
1329
1331
|
new_section_on_page_break=new_section_on_page_break,
|
1330
1332
|
include_boundaries=include_boundaries,
|
1333
|
+
orientation=orientation,
|
1331
1334
|
)
|
1332
1335
|
|
1333
1336
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
natural_pdf/core/render_spec.py
CHANGED
@@ -196,7 +196,7 @@ class Visualizable:
|
|
196
196
|
columns: Optional[int] = 6, # For grid layout, defaults to 6 columns
|
197
197
|
limit: Optional[int] = 30, # Max pages to show (default 30)
|
198
198
|
# Cropping options
|
199
|
-
crop: Union[bool, Literal["
|
199
|
+
crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
|
200
200
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
201
201
|
**kwargs,
|
202
202
|
) -> Optional["PIL_Image"]:
|
@@ -219,7 +219,12 @@ class Visualizable:
|
|
219
219
|
gap: Pixels between stacked images
|
220
220
|
columns: Number of columns for grid layout (defaults to 6)
|
221
221
|
limit: Maximum number of pages to display (default 30, None for all)
|
222
|
-
crop:
|
222
|
+
crop: Cropping mode:
|
223
|
+
- False: No cropping (default)
|
224
|
+
- True: Tight crop to element bounds
|
225
|
+
- int: Padding in pixels around element
|
226
|
+
- 'wide': Full page width, cropped vertically to element
|
227
|
+
- Region: Crop to the bounds of another region
|
223
228
|
crop_bbox: Explicit crop bounds
|
224
229
|
**kwargs: Additional parameters passed to rendering
|
225
230
|
|
@@ -230,6 +235,11 @@ class Visualizable:
|
|
230
235
|
if isinstance(annotate, str):
|
231
236
|
annotate = [annotate]
|
232
237
|
|
238
|
+
# Handle 'cols' as an alias for 'columns' for backward compatibility
|
239
|
+
if "cols" in kwargs and columns == 6: # Only use cols if columns wasn't explicitly set
|
240
|
+
columns = kwargs.pop("cols")
|
241
|
+
logger.info(f"Using 'cols' parameter as alias for 'columns': {columns}")
|
242
|
+
|
233
243
|
# Pass limit as max_pages to _get_render_specs
|
234
244
|
if limit is not None:
|
235
245
|
kwargs["max_pages"] = limit
|
@@ -283,7 +293,7 @@ class Visualizable:
|
|
283
293
|
gap: int = 5,
|
284
294
|
columns: Optional[int] = None,
|
285
295
|
# Cropping options
|
286
|
-
crop: Union[bool, Literal["
|
296
|
+
crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
|
287
297
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
288
298
|
**kwargs,
|
289
299
|
) -> Optional["PIL_Image"]:
|
@@ -299,13 +309,18 @@ class Visualizable:
|
|
299
309
|
stack_direction: Direction for stack layout
|
300
310
|
gap: Pixels between stacked images
|
301
311
|
columns: Number of columns for grid layout
|
302
|
-
crop:
|
312
|
+
crop: Cropping mode (False, True, int for padding, 'wide', or Region)
|
303
313
|
crop_bbox: Explicit crop bounds
|
304
314
|
**kwargs: Additional parameters passed to rendering
|
305
315
|
|
306
316
|
Returns:
|
307
317
|
PIL Image object or None if nothing to render
|
308
318
|
"""
|
319
|
+
# Handle 'cols' as an alias for 'columns' for backward compatibility
|
320
|
+
if "cols" in kwargs and columns is None: # Only use cols if columns wasn't explicitly set
|
321
|
+
columns = kwargs.pop("cols")
|
322
|
+
logger.info(f"Using 'cols' parameter as alias for 'columns': {columns}")
|
323
|
+
|
309
324
|
specs = self._get_render_specs(mode="render", crop=crop, crop_bbox=crop_bbox, **kwargs)
|
310
325
|
|
311
326
|
if not specs:
|
@@ -353,7 +368,7 @@ class Visualizable:
|
|
353
368
|
stack_direction: Direction for stack layout
|
354
369
|
gap: Pixels between stacked images
|
355
370
|
columns: Number of columns for grid layout
|
356
|
-
crop:
|
371
|
+
crop: Cropping mode (False, True, int for padding, 'wide', or Region)
|
357
372
|
crop_bbox: Explicit crop bounds
|
358
373
|
format: Image format (inferred from path if not specified)
|
359
374
|
**kwargs: Additional parameters passed to rendering
|
natural_pdf/describe/base.py
CHANGED
@@ -344,7 +344,7 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
344
344
|
|
345
345
|
elif column == "highlight":
|
346
346
|
# If element is highlighted, return its colour; otherwise blank
|
347
|
-
if getattr(element, "
|
347
|
+
if getattr(element, "is_highlighted", False):
|
348
348
|
col_val = getattr(element, "highlight_color", None)
|
349
349
|
if col_val is None:
|
350
350
|
return "True" # fallback if colour missing
|
natural_pdf/describe/elements.py
CHANGED
@@ -306,7 +306,7 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
|
306
306
|
styles["strikeout"] += 1
|
307
307
|
if getattr(element, "underline", False):
|
308
308
|
styles["underline"] += 1
|
309
|
-
if getattr(element, "
|
309
|
+
if getattr(element, "is_highlighted", False):
|
310
310
|
styles["highlight"] += 1
|
311
311
|
|
312
312
|
# Color - use TextElement's color property
|