natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -789,6 +789,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
789
789
  start_element.page,
790
790
  (0, top, start_element.page.width, bottom),
791
791
  )
792
+ section._boundary_exclusions = include_boundaries
792
793
  else: # horizontal
793
794
  left = start_element.x0
794
795
  right = end_element.x1
@@ -821,6 +822,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
821
822
  start_element.page,
822
823
  (left, 0, right, start_element.page.height),
823
824
  )
825
+ section._boundary_exclusions = include_boundaries
824
826
  section.start_element = start_element
825
827
  section.boundary_element_found = end_element
826
828
  else:
@@ -865,6 +867,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
865
867
  start_element.page, (0, top, start_element.page.width, bottom)
866
868
  )
867
869
  section.start_element = start_element
870
+ section.end_element = (
871
+ next_start # The next start is the end of this section
872
+ )
873
+ section._boundary_exclusions = include_boundaries
868
874
  sections.append(section)
869
875
  else: # horizontal
870
876
  # Determine horizontal bounds
@@ -882,6 +888,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
882
888
  start_element.page, (left, 0, right, start_element.page.height)
883
889
  )
884
890
  section.start_element = start_element
891
+ section.end_element = (
892
+ next_start # The next start is the end of this section
893
+ )
894
+ section._boundary_exclusions = include_boundaries
885
895
  sections.append(section)
886
896
  else:
887
897
  # Cross-page section - create from current_start to the end of its page
@@ -982,6 +992,71 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
982
992
 
983
993
  return ElementCollection(sections)
984
994
 
995
def split(self, divider, **kwargs) -> "ElementCollection[Region]":
    """
    Divide this page collection into sections based on the provided divider elements.

    The dividers are used as section starts via ``get_sections()``. If the
    first element found on the pages is not the first divider, one extra
    leading section (everything before the first divider) is prepended.

    Args:
        divider: Elements or selector string that mark section boundaries.
            ``None`` skips the leading-section logic and returns the result
            of ``get_sections()`` unchanged.
        **kwargs: Additional parameters passed to get_sections()
            - include_boundaries: How to include boundary elements (default: 'start')
            - orientation: 'vertical' or 'horizontal' (default: 'vertical')
            - new_section_on_page_break: Whether to split at page boundaries (default: False)
            Only ``orientation`` is forwarded to the extra call that builds
            the leading section; that call always uses
            ``include_boundaries="none"``.

    Returns:
        ElementCollection of Region objects representing the sections

    Example:
        # Split a PDF by chapter titles
        chapters = pdf.pages.split("text[size>20]:contains('CHAPTER')")

        # Split by page breaks
        page_sections = pdf.pages.split(None, new_section_on_page_break=True)

        # Split multi-page document by section headers
        sections = pdf.pages[10:20].split("text:bold:contains('Section')")
    """
    # Default to 'start' boundaries for split (include divider at start of each section)
    kwargs.setdefault("include_boundaries", "start")

    sections = self.get_sections(start_elements=divider, **kwargs)

    # Nothing to prepend when no sections were produced or no divider was given.
    if not sections or divider is None:
        return sections

    # Only the very first element is needed, so stop at the first page that
    # yields one instead of collecting every element of every page.
    missing = object()
    first_element = missing
    for page in self.pages:
        first_element = next(iter(page.get_elements()), missing)
        if first_element is not missing:
            break
    if first_element is missing:
        return sections

    # Resolve the first divider.
    if isinstance(divider, str):
        # Selector string: take the first match, searching pages in order.
        first_divider = None
        for page in self.pages:
            match = page.find(divider)
            if match:
                first_divider = match
                break
    elif hasattr(divider, "__getitem__"):
        # Indexable collection of elements; an empty one has no first divider.
        try:
            first_divider = divider[0]
        except IndexError:
            first_divider = None
    else:
        # A single element was passed directly.
        first_divider = divider

    if first_divider and first_element != first_divider:
        # There's content before the first divider:
        # get the section from the start up to (not including) that divider.
        initial_sections = self.get_sections(
            start_elements=None,
            end_elements=[first_divider],
            include_boundaries="none",
            orientation=kwargs.get("orientation", "vertical"),
        )
        if initial_sections:
            sections = ElementCollection([initial_sections[0]] + list(sections))

    return sections
1059
+
985
1060
  def _gather_analysis_data(
986
1061
  self,
987
1062
  analysis_keys: List[str],
natural_pdf/core/pdf.py CHANGED
@@ -1333,6 +1333,39 @@ class PDF(
1333
1333
  orientation=orientation,
1334
1334
  )
1335
1335
 
1336
def split(self, divider, **kwargs) -> "ElementCollection":
    """
    Divide the PDF into sections based on the provided divider elements.

    This is a thin wrapper: it forwards ``divider`` and all keyword
    arguments unchanged to ``self.pages.split()`` and returns its result.

    Args:
        divider: Elements or selector string that mark section boundaries.
            ``None`` may be passed when splitting only by page breaks
            (see the last example).
        **kwargs: Additional parameters forwarded to ``self.pages.split()``
            - include_boundaries: How to include boundary elements (default: 'start')
            - orientation: 'vertical' or 'horizontal' (default: 'vertical')
            - new_section_on_page_break: Whether to split at page boundaries (default: False)

    Returns:
        ElementCollection of Region objects representing the sections

    Example:
        # Split a PDF by chapter titles
        chapters = pdf.split("text[size>20]:contains('Chapter')")

        # Export each chapter to a separate file
        # (explicit encoding so the output does not depend on the platform locale)
        for i, chapter in enumerate(chapters):
            chapter_text = chapter.extract_text()
            with open(f"chapter_{i+1}.txt", "w", encoding="utf-8") as f:
                f.write(chapter_text)

        # Split by horizontal rules/lines
        sections = pdf.split("line[orientation=horizontal]")

        # Split only by page breaks (no divider elements)
        pages = pdf.split(None, new_section_on_page_break=True)
    """
    # Delegate to pages collection
    return self.pages.split(divider, **kwargs)
1368
+
1336
1369
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1337
1370
  """
1338
1371
  DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -272,17 +272,12 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
272
272
  "font_family",
273
273
  "font_variant",
274
274
  "size",
275
- "bold",
276
- "italic",
277
- "strike",
278
- "underline",
279
- "highlight",
275
+ "styles",
280
276
  "source",
281
277
  "confidence",
278
+ "color",
282
279
  ]
283
280
  )
284
- # Add foreground text colour too
285
- columns.append("color")
286
281
  elif element_type == "rect":
287
282
  columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
288
283
  elif element_type == "line":
@@ -358,6 +353,52 @@ def _extract_element_value(element: "Element", column: str) -> Any:
358
353
  return str(col_val)
359
354
  return ""
360
355
 
356
+ elif column == "styles":
357
+ # Collect all active text decorations
358
+ styles = []
359
+
360
+ if getattr(element, "bold", False):
361
+ styles.append("bold")
362
+ if getattr(element, "italic", False):
363
+ styles.append("italic")
364
+ if getattr(element, "strike", False):
365
+ styles.append("strike")
366
+ if getattr(element, "underline", False):
367
+ styles.append("underline")
368
+
369
+ # Handle highlight specially - include color if not default yellow
370
+ if getattr(element, "is_highlighted", False):
371
+ highlight_color = getattr(element, "highlight_color", None)
372
+ if highlight_color is not None:
373
+ # Convert color to hex if needed
374
+ if isinstance(highlight_color, (tuple, list)) and len(highlight_color) >= 3:
375
+ try:
376
+ r, g, b = [
377
+ int(v * 255) if v <= 1 else int(v) for v in highlight_color[:3]
378
+ ]
379
+ hex_color = f"#{r:02x}{g:02x}{b:02x}"
380
+ styles.append(f"highlight({hex_color})")
381
+ except Exception:
382
+ styles.append("highlight")
383
+ elif isinstance(highlight_color, (int, float)):
384
+ # Grayscale value
385
+ try:
386
+ gray = (
387
+ int(highlight_color * 255)
388
+ if highlight_color <= 1
389
+ else int(highlight_color)
390
+ )
391
+ hex_color = f"#{gray:02x}{gray:02x}{gray:02x}"
392
+ styles.append(f"highlight({hex_color})")
393
+ except Exception:
394
+ styles.append("highlight")
395
+ else:
396
+ styles.append("highlight")
397
+ else:
398
+ styles.append("highlight")
399
+
400
+ return ", ".join(styles) if styles else ""
401
+
361
402
  elif column in ["stroke", "fill", "color"]:
362
403
  value = getattr(element, column, None)
363
404
  # If already a string (e.g. '#ff00aa' or 'red') return as is