natural-pdf 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +94 -42
- natural_pdf/core/page.py +110 -44
- natural_pdf/core/page_collection.py +223 -34
- natural_pdf/core/page_groupby.py +20 -2
- natural_pdf/core/pdf.py +3 -0
- natural_pdf/core/render_spec.py +20 -5
- natural_pdf/describe/base.py +1 -1
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +84 -8
- natural_pdf/elements/element_collection.py +730 -12
- natural_pdf/elements/region.py +181 -48
- natural_pdf/flows/flow.py +3 -0
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/RECORD +20 -19
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/top_level.txt +0 -0
@@ -460,6 +460,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
460
460
|
end_elements=None,
|
461
461
|
new_section_on_page_break=False,
|
462
462
|
include_boundaries="both",
|
463
|
+
orientation="vertical",
|
463
464
|
) -> "ElementCollection[Region]":
|
464
465
|
"""
|
465
466
|
Extract sections from a page collection based on start/end elements.
|
@@ -469,6 +470,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
469
470
|
end_elements: Elements or selector string that mark the end of sections (optional)
|
470
471
|
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
471
472
|
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
473
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
472
474
|
|
473
475
|
Returns:
|
474
476
|
List of Region objects representing the extracted sections
|
@@ -511,6 +513,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
511
513
|
next_page = self.pages[i + 1]
|
512
514
|
top_region = Region(next_page, (0, 0, next_page.width, 1))
|
513
515
|
top_region.is_page_boundary = True # Mark it as a special boundary
|
516
|
+
# If start_elements is None, initialize it as an empty list
|
517
|
+
if start_elements is None:
|
518
|
+
start_elements = []
|
514
519
|
start_elements.append(top_region)
|
515
520
|
|
516
521
|
# Get all elements from all pages and sort them in document order
|
@@ -542,6 +547,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
542
547
|
end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1)
|
543
548
|
)
|
544
549
|
implicit_start.is_implicit_start = True
|
550
|
+
# Track which end element this implicit start was created from
|
551
|
+
# to avoid pairing them together (which would create zero height)
|
552
|
+
implicit_start.created_from_end = end_elem
|
545
553
|
start_elements.append(implicit_start)
|
546
554
|
|
547
555
|
# Mark section boundaries
|
@@ -606,17 +614,20 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
606
614
|
|
607
615
|
# Sort boundaries by page index, then by actual document position
|
608
616
|
def _sort_key(boundary):
|
609
|
-
"""Sort boundaries by (page_idx,
|
617
|
+
"""Sort boundaries by (page_idx, position, priority)."""
|
610
618
|
page_idx = boundary["page_idx"]
|
611
619
|
element = boundary["element"]
|
612
620
|
|
613
|
-
#
|
614
|
-
|
621
|
+
# Position on the page based on orientation
|
622
|
+
if orientation == "vertical":
|
623
|
+
pos = getattr(element, "top", 0.0)
|
624
|
+
else: # horizontal
|
625
|
+
pos = getattr(element, "x0", 0.0)
|
615
626
|
|
616
627
|
# Ensure starts come before ends at the same coordinate
|
617
628
|
priority = 0 if boundary["type"] == "start" else 1
|
618
629
|
|
619
|
-
return (page_idx,
|
630
|
+
return (page_idx, pos, priority)
|
620
631
|
|
621
632
|
section_boundaries.sort(key=_sort_key)
|
622
633
|
|
@@ -624,10 +635,17 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
624
635
|
sections = []
|
625
636
|
|
626
637
|
# --- Helper: build a FlowRegion spanning multiple pages ---
|
627
|
-
def _build_flow_region(start_el, end_el):
|
628
|
-
"""Return a FlowRegion that covers from *start_el* to *end_el
|
629
|
-
If *end_el* is None, the region continues to the bottom of the last
|
630
|
-
page in this PageCollection.
|
638
|
+
def _build_flow_region(start_el, end_el, include_boundaries="both", orientation="vertical"):
|
639
|
+
"""Return a FlowRegion that covers from *start_el* to *end_el*.
|
640
|
+
If *end_el* is None, the region continues to the bottom/right of the last
|
641
|
+
page in this PageCollection.
|
642
|
+
|
643
|
+
Args:
|
644
|
+
start_el: Start element
|
645
|
+
end_el: End element
|
646
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
647
|
+
orientation: 'vertical' or 'horizontal' - determines section direction
|
648
|
+
"""
|
631
649
|
# Local imports to avoid top-level cycles
|
632
650
|
from natural_pdf.elements.region import Region
|
633
651
|
from natural_pdf.flows.element import FlowElement
|
@@ -639,12 +657,24 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
639
657
|
|
640
658
|
parts: list[Region] = []
|
641
659
|
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
660
|
+
if orientation == "vertical":
|
661
|
+
# Determine the start_top based on include_boundaries
|
662
|
+
start_top = start_el.top
|
663
|
+
if include_boundaries == "none" or include_boundaries == "end":
|
664
|
+
# Exclude start boundary
|
665
|
+
start_top = start_el.bottom if hasattr(start_el, "bottom") else start_el.top
|
666
|
+
|
667
|
+
# Slice of first page beginning at *start_top*
|
668
|
+
parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
|
669
|
+
else: # horizontal
|
670
|
+
# Determine the start_left based on include_boundaries
|
671
|
+
start_left = start_el.x0
|
672
|
+
if include_boundaries == "none" or include_boundaries == "end":
|
673
|
+
# Exclude start boundary
|
674
|
+
start_left = start_el.x1 if hasattr(start_el, "x1") else start_el.x0
|
675
|
+
|
676
|
+
# Slice of first page beginning at *start_left*
|
677
|
+
parts.append(Region(start_pg, (start_left, 0, start_pg.width, start_pg.height)))
|
648
678
|
|
649
679
|
# Full middle pages
|
650
680
|
for pg_idx in range(start_pg.index + 1, end_pg.index):
|
@@ -653,10 +683,32 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
653
683
|
|
654
684
|
# Slice of last page (if distinct)
|
655
685
|
if end_pg is not start_pg:
|
656
|
-
|
657
|
-
|
686
|
+
if orientation == "vertical":
|
687
|
+
# Determine the bottom based on include_boundaries
|
688
|
+
if end_el is not None:
|
689
|
+
if include_boundaries == "none" or include_boundaries == "start":
|
690
|
+
# Exclude end boundary
|
691
|
+
bottom = end_el.top if hasattr(end_el, "top") else end_el.bottom
|
692
|
+
else:
|
693
|
+
# Include end boundary
|
694
|
+
bottom = end_el.bottom
|
695
|
+
else:
|
696
|
+
bottom = end_pg.height
|
697
|
+
parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
|
698
|
+
else: # horizontal
|
699
|
+
# Determine the right based on include_boundaries
|
700
|
+
if end_el is not None:
|
701
|
+
if include_boundaries == "none" or include_boundaries == "start":
|
702
|
+
# Exclude end boundary
|
703
|
+
right = end_el.x0 if hasattr(end_el, "x0") else end_el.x1
|
704
|
+
else:
|
705
|
+
# Include end boundary
|
706
|
+
right = end_el.x1
|
707
|
+
else:
|
708
|
+
right = end_pg.width
|
709
|
+
parts.append(Region(end_pg, (0, 0, right, end_pg.height)))
|
658
710
|
|
659
|
-
flow = Flow(segments=parts, arrangement=
|
711
|
+
flow = Flow(segments=parts, arrangement=orientation)
|
660
712
|
src_fe = FlowElement(physical_object=start_el, flow=flow)
|
661
713
|
return FlowRegion(
|
662
714
|
flow=flow,
|
@@ -680,26 +732,103 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
680
732
|
start_element = current_start["element"]
|
681
733
|
end_element = boundary["element"]
|
682
734
|
|
735
|
+
# Check if this is an implicit start created from this same end element
|
736
|
+
# This would create a zero-height section, so skip this pairing
|
737
|
+
if (
|
738
|
+
hasattr(start_element, "is_implicit_start")
|
739
|
+
and hasattr(start_element, "created_from_end")
|
740
|
+
and start_element.created_from_end is end_element
|
741
|
+
):
|
742
|
+
# Skip this pairing - keep current_start for next end element
|
743
|
+
continue
|
744
|
+
|
683
745
|
# If both elements are on the same page, use the page's get_section_between
|
684
746
|
if start_element.page == end_element.page:
|
685
747
|
# For implicit start elements, create a region from the top of the page
|
686
748
|
if hasattr(start_element, "is_implicit_start"):
|
687
749
|
from natural_pdf.elements.region import Region
|
688
750
|
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
751
|
+
# Adjust boundaries based on include_boundaries parameter and orientation
|
752
|
+
if orientation == "vertical":
|
753
|
+
top = start_element.top
|
754
|
+
bottom = end_element.bottom
|
755
|
+
|
756
|
+
if include_boundaries == "none":
|
757
|
+
# Exclude both boundaries - move past them
|
758
|
+
top = (
|
759
|
+
start_element.bottom
|
760
|
+
if hasattr(start_element, "bottom")
|
761
|
+
else start_element.top
|
762
|
+
)
|
763
|
+
bottom = (
|
764
|
+
end_element.top
|
765
|
+
if hasattr(end_element, "top")
|
766
|
+
else end_element.bottom
|
767
|
+
)
|
768
|
+
elif include_boundaries == "start":
|
769
|
+
# Include start, exclude end
|
770
|
+
bottom = (
|
771
|
+
end_element.top
|
772
|
+
if hasattr(end_element, "top")
|
773
|
+
else end_element.bottom
|
774
|
+
)
|
775
|
+
elif include_boundaries == "end":
|
776
|
+
# Exclude start, include end
|
777
|
+
top = (
|
778
|
+
start_element.bottom
|
779
|
+
if hasattr(start_element, "bottom")
|
780
|
+
else start_element.top
|
781
|
+
)
|
782
|
+
# "both" is default - no adjustment needed
|
783
|
+
|
784
|
+
section = Region(
|
785
|
+
start_element.page,
|
786
|
+
(0, top, start_element.page.width, bottom),
|
787
|
+
)
|
788
|
+
else: # horizontal
|
789
|
+
left = start_element.x0
|
790
|
+
right = end_element.x1
|
791
|
+
|
792
|
+
if include_boundaries == "none":
|
793
|
+
# Exclude both boundaries - move past them
|
794
|
+
left = (
|
795
|
+
start_element.x1
|
796
|
+
if hasattr(start_element, "x1")
|
797
|
+
else start_element.x0
|
798
|
+
)
|
799
|
+
right = (
|
800
|
+
end_element.x0 if hasattr(end_element, "x0") else end_element.x1
|
801
|
+
)
|
802
|
+
elif include_boundaries == "start":
|
803
|
+
# Include start, exclude end
|
804
|
+
right = (
|
805
|
+
end_element.x0 if hasattr(end_element, "x0") else end_element.x1
|
806
|
+
)
|
807
|
+
elif include_boundaries == "end":
|
808
|
+
# Exclude start, include end
|
809
|
+
left = (
|
810
|
+
start_element.x1
|
811
|
+
if hasattr(start_element, "x1")
|
812
|
+
else start_element.x0
|
813
|
+
)
|
814
|
+
# "both" is default - no adjustment needed
|
815
|
+
|
816
|
+
section = Region(
|
817
|
+
start_element.page,
|
818
|
+
(left, 0, right, start_element.page.height),
|
819
|
+
)
|
693
820
|
section.start_element = start_element
|
694
821
|
section.boundary_element_found = end_element
|
695
822
|
else:
|
696
823
|
section = start_element.page.get_section_between(
|
697
|
-
start_element, end_element, include_boundaries
|
824
|
+
start_element, end_element, include_boundaries, orientation
|
698
825
|
)
|
699
826
|
sections.append(section)
|
700
827
|
else:
|
701
828
|
# Create FlowRegion spanning pages
|
702
|
-
flow_region = _build_flow_region(
|
829
|
+
flow_region = _build_flow_region(
|
830
|
+
start_element, end_element, include_boundaries, orientation
|
831
|
+
)
|
703
832
|
sections.append(flow_region)
|
704
833
|
|
705
834
|
current_start = None
|
@@ -713,8 +842,11 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
713
842
|
if start_element.page == boundary["element"].page:
|
714
843
|
# Find elements on this page
|
715
844
|
page_elements = [e for e in all_elements if e.page == start_element.page]
|
716
|
-
# Sort by position
|
717
|
-
|
845
|
+
# Sort by position based on orientation
|
846
|
+
if orientation == "vertical":
|
847
|
+
page_elements.sort(key=lambda e: (e.top, e.x0))
|
848
|
+
else: # horizontal
|
849
|
+
page_elements.sort(key=lambda e: (e.x0, e.top))
|
718
850
|
|
719
851
|
# Find the last element before the boundary
|
720
852
|
end_idx = (
|
@@ -726,7 +858,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
726
858
|
|
727
859
|
# Create the section
|
728
860
|
section = start_element.page.get_section_between(
|
729
|
-
start_element, end_element, include_boundaries
|
861
|
+
start_element, end_element, include_boundaries, orientation
|
730
862
|
)
|
731
863
|
sections.append(section)
|
732
864
|
else:
|
@@ -735,9 +867,37 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
735
867
|
|
736
868
|
start_page = start_element.page
|
737
869
|
|
738
|
-
# Handle implicit start elements
|
739
|
-
|
740
|
-
|
870
|
+
# Handle implicit start elements and respect include_boundaries
|
871
|
+
if orientation == "vertical":
|
872
|
+
if include_boundaries in ["none", "end"]:
|
873
|
+
# Exclude start boundary
|
874
|
+
start_top = (
|
875
|
+
start_element.bottom
|
876
|
+
if hasattr(start_element, "bottom")
|
877
|
+
else start_element.top
|
878
|
+
)
|
879
|
+
else:
|
880
|
+
# Include start boundary
|
881
|
+
start_top = start_element.top
|
882
|
+
|
883
|
+
region = Region(
|
884
|
+
start_page, (0, start_top, start_page.width, start_page.height)
|
885
|
+
)
|
886
|
+
else: # horizontal
|
887
|
+
if include_boundaries in ["none", "end"]:
|
888
|
+
# Exclude start boundary
|
889
|
+
start_left = (
|
890
|
+
start_element.x1
|
891
|
+
if hasattr(start_element, "x1")
|
892
|
+
else start_element.x0
|
893
|
+
)
|
894
|
+
else:
|
895
|
+
# Include start boundary
|
896
|
+
start_left = start_element.x0
|
897
|
+
|
898
|
+
region = Region(
|
899
|
+
start_page, (start_left, 0, start_page.width, start_page.height)
|
900
|
+
)
|
741
901
|
region.start_element = start_element
|
742
902
|
sections.append(region)
|
743
903
|
|
@@ -753,19 +913,48 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
753
913
|
# on the last page of the collection
|
754
914
|
last_page = self.pages[-1]
|
755
915
|
last_page_elements = [e for e in all_elements if e.page == last_page]
|
756
|
-
|
916
|
+
if orientation == "vertical":
|
917
|
+
last_page_elements.sort(key=lambda e: (e.top, e.x0))
|
918
|
+
else: # horizontal
|
919
|
+
last_page_elements.sort(key=lambda e: (e.x0, e.top))
|
757
920
|
end_element = last_page_elements[-1] if last_page_elements else None
|
758
921
|
|
759
922
|
# Create FlowRegion spanning multiple pages using helper
|
760
|
-
flow_region = _build_flow_region(
|
923
|
+
flow_region = _build_flow_region(
|
924
|
+
start_element, end_element, include_boundaries, orientation
|
925
|
+
)
|
761
926
|
sections.append(flow_region)
|
762
927
|
else:
|
763
928
|
# With start_elements only, create a section to the end of the current page
|
764
929
|
from natural_pdf.elements.region import Region
|
765
930
|
|
766
|
-
# Handle implicit start elements
|
767
|
-
|
768
|
-
|
931
|
+
# Handle implicit start elements and respect include_boundaries
|
932
|
+
if orientation == "vertical":
|
933
|
+
if include_boundaries in ["none", "end"]:
|
934
|
+
# Exclude start boundary
|
935
|
+
start_top = (
|
936
|
+
start_element.bottom
|
937
|
+
if hasattr(start_element, "bottom")
|
938
|
+
else start_element.top
|
939
|
+
)
|
940
|
+
else:
|
941
|
+
# Include start boundary
|
942
|
+
start_top = start_element.top
|
943
|
+
|
944
|
+
region = Region(start_page, (0, start_top, start_page.width, start_page.height))
|
945
|
+
else: # horizontal
|
946
|
+
if include_boundaries in ["none", "end"]:
|
947
|
+
# Exclude start boundary
|
948
|
+
start_left = (
|
949
|
+
start_element.x1 if hasattr(start_element, "x1") else start_element.x0
|
950
|
+
)
|
951
|
+
else:
|
952
|
+
# Include start boundary
|
953
|
+
start_left = start_element.x0
|
954
|
+
|
955
|
+
region = Region(
|
956
|
+
start_page, (start_left, 0, start_page.width, start_page.height)
|
957
|
+
)
|
769
958
|
region.start_element = start_element
|
770
959
|
sections.append(region)
|
771
960
|
|
natural_pdf/core/page_groupby.py
CHANGED
@@ -7,6 +7,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional,
|
|
7
7
|
|
8
8
|
from tqdm.auto import tqdm
|
9
9
|
|
10
|
+
from natural_pdf.utils.color_utils import format_color_value
|
11
|
+
|
10
12
|
if TYPE_CHECKING:
|
11
13
|
from natural_pdf.core.page import Page
|
12
14
|
from natural_pdf.core.page_collection import PageCollection
|
@@ -201,7 +203,15 @@ class PageGroupBy:
|
|
201
203
|
"""
|
202
204
|
groups = self._compute_groups()
|
203
205
|
for key, pages in groups.items():
|
204
|
-
|
206
|
+
# Format the key for display, converting colors to hex if needed
|
207
|
+
if isinstance(self.by, str):
|
208
|
+
# If grouped by a string selector, check if it's a color attribute
|
209
|
+
formatted_key = format_color_value(key, attr_name=self.by)
|
210
|
+
else:
|
211
|
+
# For callable grouping, try to format as color
|
212
|
+
formatted_key = format_color_value(key)
|
213
|
+
|
214
|
+
print(f"\n--- Group: {formatted_key} ({len(pages)} pages) ---")
|
205
215
|
pages.show(**kwargs)
|
206
216
|
|
207
217
|
def __len__(self) -> int:
|
@@ -220,7 +230,15 @@ class PageGroupBy:
|
|
220
230
|
print("-" * 40)
|
221
231
|
|
222
232
|
for i, (key, pages) in enumerate(groups.items()):
|
223
|
-
|
233
|
+
if key is None:
|
234
|
+
key_display = "None"
|
235
|
+
else:
|
236
|
+
# Format the key for display, converting colors to hex if needed
|
237
|
+
if isinstance(self.by, str):
|
238
|
+
formatted_key = format_color_value(key, attr_name=self.by)
|
239
|
+
else:
|
240
|
+
formatted_key = format_color_value(key)
|
241
|
+
key_display = f"'{formatted_key}'"
|
224
242
|
print(f"[{i}] {key_display}: {len(pages)} pages")
|
225
243
|
|
226
244
|
def __repr__(self) -> str:
|
natural_pdf/core/pdf.py
CHANGED
@@ -1280,6 +1280,7 @@ class PDF(
|
|
1280
1280
|
end_elements=None,
|
1281
1281
|
new_section_on_page_break=False,
|
1282
1282
|
include_boundaries="both",
|
1283
|
+
orientation="vertical",
|
1283
1284
|
) -> "ElementCollection":
|
1284
1285
|
"""
|
1285
1286
|
Extract sections from the entire PDF based on start/end elements.
|
@@ -1292,6 +1293,7 @@ class PDF(
|
|
1292
1293
|
end_elements: Elements or selector string that mark the end of sections (optional)
|
1293
1294
|
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
1294
1295
|
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
1296
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
1295
1297
|
|
1296
1298
|
Returns:
|
1297
1299
|
ElementCollection of Region objects representing the extracted sections
|
@@ -1328,6 +1330,7 @@ class PDF(
|
|
1328
1330
|
end_elements=end_elements,
|
1329
1331
|
new_section_on_page_break=new_section_on_page_break,
|
1330
1332
|
include_boundaries=include_boundaries,
|
1333
|
+
orientation=orientation,
|
1331
1334
|
)
|
1332
1335
|
|
1333
1336
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
natural_pdf/core/render_spec.py
CHANGED
@@ -196,7 +196,7 @@ class Visualizable:
|
|
196
196
|
columns: Optional[int] = 6, # For grid layout, defaults to 6 columns
|
197
197
|
limit: Optional[int] = 30, # Max pages to show (default 30)
|
198
198
|
# Cropping options
|
199
|
-
crop: Union[bool, Literal["
|
199
|
+
crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
|
200
200
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
201
201
|
**kwargs,
|
202
202
|
) -> Optional["PIL_Image"]:
|
@@ -219,7 +219,12 @@ class Visualizable:
|
|
219
219
|
gap: Pixels between stacked images
|
220
220
|
columns: Number of columns for grid layout (defaults to 6)
|
221
221
|
limit: Maximum number of pages to display (default 30, None for all)
|
222
|
-
crop:
|
222
|
+
crop: Cropping mode:
|
223
|
+
- False: No cropping (default)
|
224
|
+
- True: Tight crop to element bounds
|
225
|
+
- int: Padding in pixels around element
|
226
|
+
- 'wide': Full page width, cropped vertically to element
|
227
|
+
- Region: Crop to the bounds of another region
|
223
228
|
crop_bbox: Explicit crop bounds
|
224
229
|
**kwargs: Additional parameters passed to rendering
|
225
230
|
|
@@ -230,6 +235,11 @@ class Visualizable:
|
|
230
235
|
if isinstance(annotate, str):
|
231
236
|
annotate = [annotate]
|
232
237
|
|
238
|
+
# Handle 'cols' as an alias for 'columns' for backward compatibility
|
239
|
+
if "cols" in kwargs and columns == 6: # Only use cols if columns wasn't explicitly set
|
240
|
+
columns = kwargs.pop("cols")
|
241
|
+
logger.info(f"Using 'cols' parameter as alias for 'columns': {columns}")
|
242
|
+
|
233
243
|
# Pass limit as max_pages to _get_render_specs
|
234
244
|
if limit is not None:
|
235
245
|
kwargs["max_pages"] = limit
|
@@ -283,7 +293,7 @@ class Visualizable:
|
|
283
293
|
gap: int = 5,
|
284
294
|
columns: Optional[int] = None,
|
285
295
|
# Cropping options
|
286
|
-
crop: Union[bool, Literal["
|
296
|
+
crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
|
287
297
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
288
298
|
**kwargs,
|
289
299
|
) -> Optional["PIL_Image"]:
|
@@ -299,13 +309,18 @@ class Visualizable:
|
|
299
309
|
stack_direction: Direction for stack layout
|
300
310
|
gap: Pixels between stacked images
|
301
311
|
columns: Number of columns for grid layout
|
302
|
-
crop:
|
312
|
+
crop: Cropping mode (False, True, int for padding, 'wide', or Region)
|
303
313
|
crop_bbox: Explicit crop bounds
|
304
314
|
**kwargs: Additional parameters passed to rendering
|
305
315
|
|
306
316
|
Returns:
|
307
317
|
PIL Image object or None if nothing to render
|
308
318
|
"""
|
319
|
+
# Handle 'cols' as an alias for 'columns' for backward compatibility
|
320
|
+
if "cols" in kwargs and columns is None: # Only use cols if columns wasn't explicitly set
|
321
|
+
columns = kwargs.pop("cols")
|
322
|
+
logger.info(f"Using 'cols' parameter as alias for 'columns': {columns}")
|
323
|
+
|
309
324
|
specs = self._get_render_specs(mode="render", crop=crop, crop_bbox=crop_bbox, **kwargs)
|
310
325
|
|
311
326
|
if not specs:
|
@@ -353,7 +368,7 @@ class Visualizable:
|
|
353
368
|
stack_direction: Direction for stack layout
|
354
369
|
gap: Pixels between stacked images
|
355
370
|
columns: Number of columns for grid layout
|
356
|
-
crop:
|
371
|
+
crop: Cropping mode (False, True, int for padding, 'wide', or Region)
|
357
372
|
crop_bbox: Explicit crop bounds
|
358
373
|
format: Image format (inferred from path if not specified)
|
359
374
|
**kwargs: Additional parameters passed to rendering
|
natural_pdf/describe/base.py
CHANGED
@@ -344,7 +344,7 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
344
344
|
|
345
345
|
elif column == "highlight":
|
346
346
|
# If element is highlighted, return its colour; otherwise blank
|
347
|
-
if getattr(element, "
|
347
|
+
if getattr(element, "is_highlighted", False):
|
348
348
|
col_val = getattr(element, "highlight_color", None)
|
349
349
|
if col_val is None:
|
350
350
|
return "True" # fallback if colour missing
|
natural_pdf/describe/elements.py
CHANGED
@@ -306,7 +306,7 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
|
306
306
|
styles["strikeout"] += 1
|
307
307
|
if getattr(element, "underline", False):
|
308
308
|
styles["underline"] += 1
|
309
|
-
if getattr(element, "
|
309
|
+
if getattr(element, "is_highlighted", False):
|
310
310
|
styles["highlight"] += 1
|
311
311
|
|
312
312
|
# Color - use TextElement's color property
|