natural-pdf 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -460,6 +460,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
460
460
  end_elements=None,
461
461
  new_section_on_page_break=False,
462
462
  include_boundaries="both",
463
+ orientation="vertical",
463
464
  ) -> "ElementCollection[Region]":
464
465
  """
465
466
  Extract sections from a page collection based on start/end elements.
@@ -469,6 +470,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
469
470
  end_elements: Elements or selector string that mark the end of sections (optional)
470
471
  new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
471
472
  include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
473
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
472
474
 
473
475
  Returns:
474
476
  List of Region objects representing the extracted sections
@@ -511,6 +513,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
511
513
  next_page = self.pages[i + 1]
512
514
  top_region = Region(next_page, (0, 0, next_page.width, 1))
513
515
  top_region.is_page_boundary = True # Mark it as a special boundary
516
+ # If start_elements is None, initialize it as an empty list
517
+ if start_elements is None:
518
+ start_elements = []
514
519
  start_elements.append(top_region)
515
520
 
516
521
  # Get all elements from all pages and sort them in document order
@@ -542,6 +547,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
542
547
  end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1)
543
548
  )
544
549
  implicit_start.is_implicit_start = True
550
+ # Track which end element this implicit start was created from
551
+ # to avoid pairing them together (which would create zero height)
552
+ implicit_start.created_from_end = end_elem
545
553
  start_elements.append(implicit_start)
546
554
 
547
555
  # Mark section boundaries
@@ -606,17 +614,20 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
606
614
 
607
615
  # Sort boundaries by page index, then by actual document position
608
616
  def _sort_key(boundary):
609
- """Sort boundaries by (page_idx, vertical_top, priority)."""
617
+ """Sort boundaries by (page_idx, position, priority)."""
610
618
  page_idx = boundary["page_idx"]
611
619
  element = boundary["element"]
612
620
 
613
- # Vertical position on the page
614
- y_pos = getattr(element, "top", 0.0)
621
+ # Position on the page based on orientation
622
+ if orientation == "vertical":
623
+ pos = getattr(element, "top", 0.0)
624
+ else: # horizontal
625
+ pos = getattr(element, "x0", 0.0)
615
626
 
616
627
  # Ensure starts come before ends at the same coordinate
617
628
  priority = 0 if boundary["type"] == "start" else 1
618
629
 
619
- return (page_idx, y_pos, priority)
630
+ return (page_idx, pos, priority)
620
631
 
621
632
  section_boundaries.sort(key=_sort_key)
622
633
 
@@ -624,10 +635,17 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
624
635
  sections = []
625
636
 
626
637
  # --- Helper: build a FlowRegion spanning multiple pages ---
627
- def _build_flow_region(start_el, end_el):
628
- """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
629
- If *end_el* is None, the region continues to the bottom of the last
630
- page in this PageCollection."""
638
+ def _build_flow_region(start_el, end_el, include_boundaries="both", orientation="vertical"):
639
+ """Return a FlowRegion that covers from *start_el* to *end_el*.
640
+ If *end_el* is None, the region continues to the bottom/right of the last
641
+ page in this PageCollection.
642
+
643
+ Args:
644
+ start_el: Start element
645
+ end_el: End element
646
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
647
+ orientation: 'vertical' or 'horizontal' - determines section direction
648
+ """
631
649
  # Local imports to avoid top-level cycles
632
650
  from natural_pdf.elements.region import Region
633
651
  from natural_pdf.flows.element import FlowElement
@@ -639,12 +657,24 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
639
657
 
640
658
  parts: list[Region] = []
641
659
 
642
- # Use the actual top of the start element (for implicit starts this is
643
- # the bottom of the previous end element) instead of forcing to 0.
644
- start_top = start_el.top
645
-
646
- # Slice of first page beginning at *start_top*
647
- parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
660
+ if orientation == "vertical":
661
+ # Determine the start_top based on include_boundaries
662
+ start_top = start_el.top
663
+ if include_boundaries == "none" or include_boundaries == "end":
664
+ # Exclude start boundary
665
+ start_top = start_el.bottom if hasattr(start_el, "bottom") else start_el.top
666
+
667
+ # Slice of first page beginning at *start_top*
668
+ parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
669
+ else: # horizontal
670
+ # Determine the start_left based on include_boundaries
671
+ start_left = start_el.x0
672
+ if include_boundaries == "none" or include_boundaries == "end":
673
+ # Exclude start boundary
674
+ start_left = start_el.x1 if hasattr(start_el, "x1") else start_el.x0
675
+
676
+ # Slice of first page beginning at *start_left*
677
+ parts.append(Region(start_pg, (start_left, 0, start_pg.width, start_pg.height)))
648
678
 
649
679
  # Full middle pages
650
680
  for pg_idx in range(start_pg.index + 1, end_pg.index):
@@ -653,10 +683,32 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
653
683
 
654
684
  # Slice of last page (if distinct)
655
685
  if end_pg is not start_pg:
656
- bottom = end_el.bottom if end_el is not None else end_pg.height
657
- parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
686
+ if orientation == "vertical":
687
+ # Determine the bottom based on include_boundaries
688
+ if end_el is not None:
689
+ if include_boundaries == "none" or include_boundaries == "start":
690
+ # Exclude end boundary
691
+ bottom = end_el.top if hasattr(end_el, "top") else end_el.bottom
692
+ else:
693
+ # Include end boundary
694
+ bottom = end_el.bottom
695
+ else:
696
+ bottom = end_pg.height
697
+ parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
698
+ else: # horizontal
699
+ # Determine the right based on include_boundaries
700
+ if end_el is not None:
701
+ if include_boundaries == "none" or include_boundaries == "start":
702
+ # Exclude end boundary
703
+ right = end_el.x0 if hasattr(end_el, "x0") else end_el.x1
704
+ else:
705
+ # Include end boundary
706
+ right = end_el.x1
707
+ else:
708
+ right = end_pg.width
709
+ parts.append(Region(end_pg, (0, 0, right, end_pg.height)))
658
710
 
659
- flow = Flow(segments=parts, arrangement="vertical")
711
+ flow = Flow(segments=parts, arrangement=orientation)
660
712
  src_fe = FlowElement(physical_object=start_el, flow=flow)
661
713
  return FlowRegion(
662
714
  flow=flow,
@@ -680,26 +732,103 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
680
732
  start_element = current_start["element"]
681
733
  end_element = boundary["element"]
682
734
 
735
+ # Check if this is an implicit start created from this same end element
736
+ # This would create a zero-height section, so skip this pairing
737
+ if (
738
+ hasattr(start_element, "is_implicit_start")
739
+ and hasattr(start_element, "created_from_end")
740
+ and start_element.created_from_end is end_element
741
+ ):
742
+ # Skip this pairing - keep current_start for next end element
743
+ continue
744
+
683
745
  # If both elements are on the same page, use the page's get_section_between
684
746
  if start_element.page == end_element.page:
685
747
  # For implicit start elements, create a region from the top of the page
686
748
  if hasattr(start_element, "is_implicit_start"):
687
749
  from natural_pdf.elements.region import Region
688
750
 
689
- section = Region(
690
- start_element.page,
691
- (0, start_element.top, start_element.page.width, end_element.bottom),
692
- )
751
+ # Adjust boundaries based on include_boundaries parameter and orientation
752
+ if orientation == "vertical":
753
+ top = start_element.top
754
+ bottom = end_element.bottom
755
+
756
+ if include_boundaries == "none":
757
+ # Exclude both boundaries - move past them
758
+ top = (
759
+ start_element.bottom
760
+ if hasattr(start_element, "bottom")
761
+ else start_element.top
762
+ )
763
+ bottom = (
764
+ end_element.top
765
+ if hasattr(end_element, "top")
766
+ else end_element.bottom
767
+ )
768
+ elif include_boundaries == "start":
769
+ # Include start, exclude end
770
+ bottom = (
771
+ end_element.top
772
+ if hasattr(end_element, "top")
773
+ else end_element.bottom
774
+ )
775
+ elif include_boundaries == "end":
776
+ # Exclude start, include end
777
+ top = (
778
+ start_element.bottom
779
+ if hasattr(start_element, "bottom")
780
+ else start_element.top
781
+ )
782
+ # "both" is default - no adjustment needed
783
+
784
+ section = Region(
785
+ start_element.page,
786
+ (0, top, start_element.page.width, bottom),
787
+ )
788
+ else: # horizontal
789
+ left = start_element.x0
790
+ right = end_element.x1
791
+
792
+ if include_boundaries == "none":
793
+ # Exclude both boundaries - move past them
794
+ left = (
795
+ start_element.x1
796
+ if hasattr(start_element, "x1")
797
+ else start_element.x0
798
+ )
799
+ right = (
800
+ end_element.x0 if hasattr(end_element, "x0") else end_element.x1
801
+ )
802
+ elif include_boundaries == "start":
803
+ # Include start, exclude end
804
+ right = (
805
+ end_element.x0 if hasattr(end_element, "x0") else end_element.x1
806
+ )
807
+ elif include_boundaries == "end":
808
+ # Exclude start, include end
809
+ left = (
810
+ start_element.x1
811
+ if hasattr(start_element, "x1")
812
+ else start_element.x0
813
+ )
814
+ # "both" is default - no adjustment needed
815
+
816
+ section = Region(
817
+ start_element.page,
818
+ (left, 0, right, start_element.page.height),
819
+ )
693
820
  section.start_element = start_element
694
821
  section.boundary_element_found = end_element
695
822
  else:
696
823
  section = start_element.page.get_section_between(
697
- start_element, end_element, include_boundaries
824
+ start_element, end_element, include_boundaries, orientation
698
825
  )
699
826
  sections.append(section)
700
827
  else:
701
828
  # Create FlowRegion spanning pages
702
- flow_region = _build_flow_region(start_element, end_element)
829
+ flow_region = _build_flow_region(
830
+ start_element, end_element, include_boundaries, orientation
831
+ )
703
832
  sections.append(flow_region)
704
833
 
705
834
  current_start = None
@@ -713,8 +842,11 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
713
842
  if start_element.page == boundary["element"].page:
714
843
  # Find elements on this page
715
844
  page_elements = [e for e in all_elements if e.page == start_element.page]
716
- # Sort by position
717
- page_elements.sort(key=lambda e: (e.top, e.x0))
845
+ # Sort by position based on orientation
846
+ if orientation == "vertical":
847
+ page_elements.sort(key=lambda e: (e.top, e.x0))
848
+ else: # horizontal
849
+ page_elements.sort(key=lambda e: (e.x0, e.top))
718
850
 
719
851
  # Find the last element before the boundary
720
852
  end_idx = (
@@ -726,7 +858,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
726
858
 
727
859
  # Create the section
728
860
  section = start_element.page.get_section_between(
729
- start_element, end_element, include_boundaries
861
+ start_element, end_element, include_boundaries, orientation
730
862
  )
731
863
  sections.append(section)
732
864
  else:
@@ -735,9 +867,37 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
735
867
 
736
868
  start_page = start_element.page
737
869
 
738
- # Handle implicit start elements
739
- start_top = start_element.top
740
- region = Region(start_page, (0, start_top, start_page.width, start_page.height))
870
+ # Handle implicit start elements and respect include_boundaries
871
+ if orientation == "vertical":
872
+ if include_boundaries in ["none", "end"]:
873
+ # Exclude start boundary
874
+ start_top = (
875
+ start_element.bottom
876
+ if hasattr(start_element, "bottom")
877
+ else start_element.top
878
+ )
879
+ else:
880
+ # Include start boundary
881
+ start_top = start_element.top
882
+
883
+ region = Region(
884
+ start_page, (0, start_top, start_page.width, start_page.height)
885
+ )
886
+ else: # horizontal
887
+ if include_boundaries in ["none", "end"]:
888
+ # Exclude start boundary
889
+ start_left = (
890
+ start_element.x1
891
+ if hasattr(start_element, "x1")
892
+ else start_element.x0
893
+ )
894
+ else:
895
+ # Include start boundary
896
+ start_left = start_element.x0
897
+
898
+ region = Region(
899
+ start_page, (start_left, 0, start_page.width, start_page.height)
900
+ )
741
901
  region.start_element = start_element
742
902
  sections.append(region)
743
903
 
@@ -753,19 +913,48 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
753
913
  # on the last page of the collection
754
914
  last_page = self.pages[-1]
755
915
  last_page_elements = [e for e in all_elements if e.page == last_page]
756
- last_page_elements.sort(key=lambda e: (e.top, e.x0))
916
+ if orientation == "vertical":
917
+ last_page_elements.sort(key=lambda e: (e.top, e.x0))
918
+ else: # horizontal
919
+ last_page_elements.sort(key=lambda e: (e.x0, e.top))
757
920
  end_element = last_page_elements[-1] if last_page_elements else None
758
921
 
759
922
  # Create FlowRegion spanning multiple pages using helper
760
- flow_region = _build_flow_region(start_element, end_element)
923
+ flow_region = _build_flow_region(
924
+ start_element, end_element, include_boundaries, orientation
925
+ )
761
926
  sections.append(flow_region)
762
927
  else:
763
928
  # With start_elements only, create a section to the end of the current page
764
929
  from natural_pdf.elements.region import Region
765
930
 
766
- # Handle implicit start elements
767
- start_top = start_element.top
768
- region = Region(start_page, (0, start_top, start_page.width, start_page.height))
931
+ # Handle implicit start elements and respect include_boundaries
932
+ if orientation == "vertical":
933
+ if include_boundaries in ["none", "end"]:
934
+ # Exclude start boundary
935
+ start_top = (
936
+ start_element.bottom
937
+ if hasattr(start_element, "bottom")
938
+ else start_element.top
939
+ )
940
+ else:
941
+ # Include start boundary
942
+ start_top = start_element.top
943
+
944
+ region = Region(start_page, (0, start_top, start_page.width, start_page.height))
945
+ else: # horizontal
946
+ if include_boundaries in ["none", "end"]:
947
+ # Exclude start boundary
948
+ start_left = (
949
+ start_element.x1 if hasattr(start_element, "x1") else start_element.x0
950
+ )
951
+ else:
952
+ # Include start boundary
953
+ start_left = start_element.x0
954
+
955
+ region = Region(
956
+ start_page, (start_left, 0, start_page.width, start_page.height)
957
+ )
769
958
  region.start_element = start_element
770
959
  sections.append(region)
771
960
 
@@ -7,6 +7,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional,
7
7
 
8
8
  from tqdm.auto import tqdm
9
9
 
10
+ from natural_pdf.utils.color_utils import format_color_value
11
+
10
12
  if TYPE_CHECKING:
11
13
  from natural_pdf.core.page import Page
12
14
  from natural_pdf.core.page_collection import PageCollection
@@ -201,7 +203,15 @@ class PageGroupBy:
201
203
  """
202
204
  groups = self._compute_groups()
203
205
  for key, pages in groups.items():
204
- print(f"\n--- Group: {key} ({len(pages)} pages) ---")
206
+ # Format the key for display, converting colors to hex if needed
207
+ if isinstance(self.by, str):
208
+ # If grouped by a string selector, check if it's a color attribute
209
+ formatted_key = format_color_value(key, attr_name=self.by)
210
+ else:
211
+ # For callable grouping, try to format as color
212
+ formatted_key = format_color_value(key)
213
+
214
+ print(f"\n--- Group: {formatted_key} ({len(pages)} pages) ---")
205
215
  pages.show(**kwargs)
206
216
 
207
217
  def __len__(self) -> int:
@@ -220,7 +230,15 @@ class PageGroupBy:
220
230
  print("-" * 40)
221
231
 
222
232
  for i, (key, pages) in enumerate(groups.items()):
223
- key_display = f"'{key}'" if key is not None else "None"
233
+ if key is None:
234
+ key_display = "None"
235
+ else:
236
+ # Format the key for display, converting colors to hex if needed
237
+ if isinstance(self.by, str):
238
+ formatted_key = format_color_value(key, attr_name=self.by)
239
+ else:
240
+ formatted_key = format_color_value(key)
241
+ key_display = f"'{formatted_key}'"
224
242
  print(f"[{i}] {key_display}: {len(pages)} pages")
225
243
 
226
244
  def __repr__(self) -> str:
natural_pdf/core/pdf.py CHANGED
@@ -1280,6 +1280,7 @@ class PDF(
1280
1280
  end_elements=None,
1281
1281
  new_section_on_page_break=False,
1282
1282
  include_boundaries="both",
1283
+ orientation="vertical",
1283
1284
  ) -> "ElementCollection":
1284
1285
  """
1285
1286
  Extract sections from the entire PDF based on start/end elements.
@@ -1292,6 +1293,7 @@ class PDF(
1292
1293
  end_elements: Elements or selector string that mark the end of sections (optional)
1293
1294
  new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
1294
1295
  include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
1296
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
1295
1297
 
1296
1298
  Returns:
1297
1299
  ElementCollection of Region objects representing the extracted sections
@@ -1328,6 +1330,7 @@ class PDF(
1328
1330
  end_elements=end_elements,
1329
1331
  new_section_on_page_break=new_section_on_page_break,
1330
1332
  include_boundaries=include_boundaries,
1333
+ orientation=orientation,
1331
1334
  )
1332
1335
 
1333
1336
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -196,7 +196,7 @@ class Visualizable:
196
196
  columns: Optional[int] = 6, # For grid layout, defaults to 6 columns
197
197
  limit: Optional[int] = 30, # Max pages to show (default 30)
198
198
  # Cropping options
199
- crop: Union[bool, Literal["content"]] = False,
199
+ crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
200
200
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
201
201
  **kwargs,
202
202
  ) -> Optional["PIL_Image"]:
@@ -219,7 +219,12 @@ class Visualizable:
219
219
  gap: Pixels between stacked images
220
220
  columns: Number of columns for grid layout (defaults to 6)
221
221
  limit: Maximum number of pages to display (default 30, None for all)
222
- crop: Whether to crop (True, False, or 'content' for bbox of elements)
222
+ crop: Cropping mode:
223
+ - False: No cropping (default)
224
+ - True: Tight crop to element bounds
225
+ - int: Padding in pixels around element
226
+ - 'wide': Full page width, cropped vertically to element
227
+ - Region: Crop to the bounds of another region
223
228
  crop_bbox: Explicit crop bounds
224
229
  **kwargs: Additional parameters passed to rendering
225
230
 
@@ -230,6 +235,11 @@ class Visualizable:
230
235
  if isinstance(annotate, str):
231
236
  annotate = [annotate]
232
237
 
238
+ # Handle 'cols' as an alias for 'columns' for backward compatibility
239
+ if "cols" in kwargs and columns == 6: # Only use cols if columns wasn't explicitly set
240
+ columns = kwargs.pop("cols")
241
+ logger.info(f"Using 'cols' parameter as alias for 'columns': {columns}")
242
+
233
243
  # Pass limit as max_pages to _get_render_specs
234
244
  if limit is not None:
235
245
  kwargs["max_pages"] = limit
@@ -283,7 +293,7 @@ class Visualizable:
283
293
  gap: int = 5,
284
294
  columns: Optional[int] = None,
285
295
  # Cropping options
286
- crop: Union[bool, Literal["content"]] = False,
296
+ crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
287
297
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
288
298
  **kwargs,
289
299
  ) -> Optional["PIL_Image"]:
@@ -299,13 +309,18 @@ class Visualizable:
299
309
  stack_direction: Direction for stack layout
300
310
  gap: Pixels between stacked images
301
311
  columns: Number of columns for grid layout
302
- crop: Whether to crop
312
+ crop: Cropping mode (False, True, int for padding, 'wide', or Region)
303
313
  crop_bbox: Explicit crop bounds
304
314
  **kwargs: Additional parameters passed to rendering
305
315
 
306
316
  Returns:
307
317
  PIL Image object or None if nothing to render
308
318
  """
319
+ # Handle 'cols' as an alias for 'columns' for backward compatibility
320
+ if "cols" in kwargs and columns is None: # Only use cols if columns wasn't explicitly set
321
+ columns = kwargs.pop("cols")
322
+ logger.info(f"Using 'cols' parameter as alias for 'columns': {columns}")
323
+
309
324
  specs = self._get_render_specs(mode="render", crop=crop, crop_bbox=crop_bbox, **kwargs)
310
325
 
311
326
  if not specs:
@@ -353,7 +368,7 @@ class Visualizable:
353
368
  stack_direction: Direction for stack layout
354
369
  gap: Pixels between stacked images
355
370
  columns: Number of columns for grid layout
356
- crop: Whether to crop
371
+ crop: Cropping mode (False, True, int for padding, 'wide', or Region)
357
372
  crop_bbox: Explicit crop bounds
358
373
  format: Image format (inferred from path if not specified)
359
374
  **kwargs: Additional parameters passed to rendering
@@ -344,7 +344,7 @@ def _extract_element_value(element: "Element", column: str) -> Any:
344
344
 
345
345
  elif column == "highlight":
346
346
  # If element is highlighted, return its colour; otherwise blank
347
- if getattr(element, "highlight", False):
347
+ if getattr(element, "is_highlighted", False):
348
348
  col_val = getattr(element, "highlight_color", None)
349
349
  if col_val is None:
350
350
  return "True" # fallback if colour missing
@@ -306,7 +306,7 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
306
306
  styles["strikeout"] += 1
307
307
  if getattr(element, "underline", False):
308
308
  styles["underline"] += 1
309
- if getattr(element, "highlight", False):
309
+ if getattr(element, "is_highlighted", False):
310
310
  styles["highlight"] += 1
311
311
 
312
312
  # Color - use TextElement's color property