natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -814,6 +814,7 @@ class Element(DirectionalMixin):
814
814
  legend_position: str = "right",
815
815
  color: Optional[Union[Tuple, str]] = "red", # Default color for single element
816
816
  label: Optional[str] = None,
817
+ width: Optional[int] = None, # Add width parameter
817
818
  ) -> Optional["Image.Image"]:
818
819
  """
819
820
  Show the page with only this element highlighted temporarily.
@@ -824,6 +825,7 @@ class Element(DirectionalMixin):
824
825
  legend_position: Position of the legend
825
826
  color: Color to highlight this element (default: red)
826
827
  label: Optional label for this element in the legend
828
+ width: Optional width for the output image in pixels
827
829
 
828
830
  Returns:
829
831
  PIL Image of the page with only this element highlighted, or None if error.
@@ -861,6 +863,7 @@ class Element(DirectionalMixin):
861
863
  page_index=self.page.index,
862
864
  temporary_highlights=[temp_highlight_data],
863
865
  scale=scale,
866
+ width=width, # Pass the width parameter
864
867
  labels=labels,
865
868
  legend_position=legend_position,
866
869
  )
@@ -898,6 +901,7 @@ class Element(DirectionalMixin):
898
901
  self,
899
902
  *,
900
903
  text: str,
904
+ contains: str = "all",
901
905
  apply_exclusions: bool = True,
902
906
  regex: bool = False,
903
907
  case: bool = True,
@@ -909,6 +913,7 @@ class Element(DirectionalMixin):
909
913
  self,
910
914
  selector: str,
911
915
  *,
916
+ contains: str = "all",
912
917
  apply_exclusions: bool = True,
913
918
  regex: bool = False,
914
919
  case: bool = True,
@@ -920,6 +925,7 @@ class Element(DirectionalMixin):
920
925
  selector: Optional[str] = None,
921
926
  *,
922
927
  text: Optional[str] = None,
928
+ contains: str = "all",
923
929
  apply_exclusions: bool = True,
924
930
  regex: bool = False,
925
931
  case: bool = True,
@@ -934,6 +940,9 @@ class Element(DirectionalMixin):
934
940
  Args:
935
941
  selector: CSS-like selector string.
936
942
  text: Text content to search for (equivalent to 'text:contains(...)').
943
+ contains: How to determine if elements are inside: 'all' (fully inside),
944
+ 'any' (any overlap), or 'center' (center point inside).
945
+ (default: "all")
937
946
  apply_exclusions: Whether to apply exclusion regions (default: True).
938
947
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
939
948
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -950,6 +959,7 @@ class Element(DirectionalMixin):
950
959
  return temp_region.find(
951
960
  selector=selector,
952
961
  text=text,
962
+ contains=contains,
953
963
  apply_exclusions=apply_exclusions,
954
964
  regex=regex,
955
965
  case=case,
@@ -961,6 +971,7 @@ class Element(DirectionalMixin):
961
971
  self,
962
972
  *,
963
973
  text: str,
974
+ contains: str = "all",
964
975
  apply_exclusions: bool = True,
965
976
  regex: bool = False,
966
977
  case: bool = True,
@@ -972,6 +983,7 @@ class Element(DirectionalMixin):
972
983
  self,
973
984
  selector: str,
974
985
  *,
986
+ contains: str = "all",
975
987
  apply_exclusions: bool = True,
976
988
  regex: bool = False,
977
989
  case: bool = True,
@@ -983,6 +995,7 @@ class Element(DirectionalMixin):
983
995
  selector: Optional[str] = None,
984
996
  *,
985
997
  text: Optional[str] = None,
998
+ contains: str = "all",
986
999
  apply_exclusions: bool = True,
987
1000
  regex: bool = False,
988
1001
  case: bool = True,
@@ -997,6 +1010,9 @@ class Element(DirectionalMixin):
997
1010
  Args:
998
1011
  selector: CSS-like selector string.
999
1012
  text: Text content to search for (equivalent to 'text:contains(...)').
1013
+ contains: How to determine if elements are inside: 'all' (fully inside),
1014
+ 'any' (any overlap), or 'center' (center point inside).
1015
+ (default: "all")
1000
1016
  apply_exclusions: Whether to apply exclusion regions (default: True).
1001
1017
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1002
1018
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1013,6 +1029,7 @@ class Element(DirectionalMixin):
1013
1029
  return temp_region.find_all(
1014
1030
  selector=selector,
1015
1031
  text=text,
1032
+ contains=contains,
1016
1033
  apply_exclusions=apply_exclusions,
1017
1034
  regex=regex,
1018
1035
  case=case,
@@ -20,10 +20,10 @@ from typing import (
20
20
  )
21
21
 
22
22
  from pdfplumber.utils.geometry import objects_to_bbox
23
- from PIL import Image, ImageDraw, ImageFont
24
23
 
25
24
  # New Imports
26
25
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
26
+ from PIL import Image, ImageDraw, ImageFont
27
27
  from tqdm.auto import tqdm
28
28
 
29
29
  from natural_pdf.classification.manager import ClassificationManager
@@ -38,10 +38,31 @@ from natural_pdf.ocr import OCROptions
38
38
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
39
39
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
40
40
 
41
+ # Potentially lazy imports for optional dependencies needed in save_pdf
42
+ try:
43
+ import pikepdf
44
+ except ImportError:
45
+ pikepdf = None
46
+
47
+ try:
48
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
49
+
50
+ pass
51
+ except ImportError:
52
+ create_searchable_pdf = None
53
+
54
+ # ---> ADDED Import for the new exporter
55
+ try:
56
+ from natural_pdf.exporters.original_pdf import create_original_pdf
57
+ except ImportError:
58
+ create_original_pdf = None
59
+ # <--- END ADDED
60
+
41
61
  logger = logging.getLogger(__name__)
42
62
 
43
63
  if TYPE_CHECKING:
44
64
  from natural_pdf.core.page import Page
65
+ from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
45
66
  from natural_pdf.elements.region import Region
46
67
 
47
68
  T = TypeVar("T")
@@ -820,6 +841,7 @@ class ElementCollection(
820
841
  labels: bool = True, # Use 'labels' consistent with service
821
842
  legend_position: str = "right",
822
843
  render_ocr: bool = False,
844
+ width: Optional[int] = None, # Add width parameter
823
845
  ) -> Optional["Image.Image"]:
824
846
  """
825
847
  Generates a temporary preview image highlighting elements in this collection
@@ -842,6 +864,7 @@ class ElementCollection(
842
864
  labels: Whether to include a legend for the temporary highlights.
843
865
  legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
844
866
  render_ocr: Whether to render OCR text.
867
+ width: Optional width for the output image in pixels.
845
868
 
846
869
  Returns:
847
870
  PIL Image object of the temporary preview, or None if rendering fails or
@@ -902,6 +925,7 @@ class ElementCollection(
902
925
  page_index=page.index,
903
926
  temporary_highlights=highlight_data_list,
904
927
  scale=scale,
928
+ width=width, # Pass the width parameter
905
929
  labels=labels, # Use 'labels'
906
930
  legend_position=legend_position,
907
931
  render_ocr=render_ocr,
@@ -1139,10 +1163,96 @@ class ElementCollection(
1139
1163
 
1140
1164
  Args:
1141
1165
  selector: CSS-like selector string
1166
+ contains: How to determine if elements are inside: 'all' (fully inside),
1167
+ 'any' (any overlap), or 'center' (center point inside).
1168
+ (default: "all")
1142
1169
  apply_exclusions: Whether to exclude elements in exclusion regions
1143
1170
  """
1144
1171
  return self.apply(lambda element: element.find(selector, **kwargs))
1145
1172
 
1173
+ @overload
1174
+ def find_all(
1175
+ self,
1176
+ *,
1177
+ text: str,
1178
+ contains: str = "all",
1179
+ apply_exclusions: bool = True,
1180
+ regex: bool = False,
1181
+ case: bool = True,
1182
+ **kwargs,
1183
+ ) -> "ElementCollection": ...
1184
+
1185
+ @overload
1186
+ def find_all(
1187
+ self,
1188
+ selector: str,
1189
+ *,
1190
+ contains: str = "all",
1191
+ apply_exclusions: bool = True,
1192
+ regex: bool = False,
1193
+ case: bool = True,
1194
+ **kwargs,
1195
+ ) -> "ElementCollection": ...
1196
+
1197
+ def find_all(
1198
+ self,
1199
+ selector: Optional[str] = None,
1200
+ *,
1201
+ text: Optional[str] = None,
1202
+ contains: str = "all",
1203
+ apply_exclusions: bool = True,
1204
+ regex: bool = False,
1205
+ case: bool = True,
1206
+ **kwargs,
1207
+ ) -> "ElementCollection":
1208
+ """
1209
+ Find all elements within each element of this collection matching the selector OR text,
1210
+ and return a flattened collection of all found sub-elements.
1211
+
1212
+ Provide EITHER `selector` OR `text`, but not both.
1213
+
1214
+ Args:
1215
+ selector: CSS-like selector string.
1216
+ text: Text content to search for (equivalent to 'text:contains(...)').
1217
+ contains: How to determine if elements are inside: 'all' (fully inside),
1218
+ 'any' (any overlap), or 'center' (center point inside).
1219
+ (default: "all")
1220
+ apply_exclusions: Whether to apply exclusion regions (default: True).
1221
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1222
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1223
+ **kwargs: Additional parameters for element filtering.
1224
+
1225
+ Returns:
1226
+ A new ElementCollection containing all matching sub-elements from all elements
1227
+ in this collection.
1228
+ """
1229
+ if selector is None and text is None:
1230
+ raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
1231
+ if selector is not None and text is not None:
1232
+ raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")
1233
+
1234
+ all_found_elements: List[Element] = []
1235
+ for element in self._elements:
1236
+ if hasattr(element, "find_all") and callable(element.find_all):
1237
+ # Element.find_all returns an ElementCollection
1238
+ found_in_element: "ElementCollection" = element.find_all(
1239
+ selector=selector,
1240
+ text=text,
1241
+ contains=contains,
1242
+ apply_exclusions=apply_exclusions,
1243
+ regex=regex,
1244
+ case=case,
1245
+ **kwargs,
1246
+ )
1247
+ if found_in_element and found_in_element.elements:
1248
+ all_found_elements.extend(found_in_element.elements)
1249
+ # else:
1250
+ # Elements in the collection are expected to support find_all.
1251
+ # If an element type doesn't, an AttributeError will naturally occur,
1252
+ # or a more specific check/handling could be added here if needed.
1253
+
1254
+ return ElementCollection(all_found_elements)
1255
+
1146
1256
  def extract_each_text(self, **kwargs) -> List[str]:
1147
1257
  """
1148
1258
  Extract text from each element in this region.
@@ -1613,6 +1723,7 @@ class PageCollection(Generic[P], ApplyMixin):
1613
1723
  self,
1614
1724
  *,
1615
1725
  text: str,
1726
+ contains: str = "all",
1616
1727
  apply_exclusions: bool = True,
1617
1728
  regex: bool = False,
1618
1729
  case: bool = True,
@@ -1624,6 +1735,7 @@ class PageCollection(Generic[P], ApplyMixin):
1624
1735
  self,
1625
1736
  selector: str,
1626
1737
  *,
1738
+ contains: str = "all",
1627
1739
  apply_exclusions: bool = True,
1628
1740
  regex: bool = False,
1629
1741
  case: bool = True,
@@ -1635,6 +1747,7 @@ class PageCollection(Generic[P], ApplyMixin):
1635
1747
  selector: Optional[str] = None,
1636
1748
  *,
1637
1749
  text: Optional[str] = None,
1750
+ contains: str = "all",
1638
1751
  apply_exclusions: bool = True,
1639
1752
  regex: bool = False,
1640
1753
  case: bool = True,
@@ -1648,6 +1761,9 @@ class PageCollection(Generic[P], ApplyMixin):
1648
1761
  Args:
1649
1762
  selector: CSS-like selector string.
1650
1763
  text: Text content to search for (equivalent to 'text:contains(...)').
1764
+ contains: How to determine if elements are inside: 'all' (fully inside),
1765
+ 'any' (any overlap), or 'center' (center point inside).
1766
+ (default: "all")
1651
1767
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1652
1768
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1653
1769
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1661,6 +1777,7 @@ class PageCollection(Generic[P], ApplyMixin):
1661
1777
  element = page.find(
1662
1778
  selector=selector,
1663
1779
  text=text,
1780
+ contains=contains,
1664
1781
  apply_exclusions=apply_exclusions,
1665
1782
  regex=regex,
1666
1783
  case=case,
@@ -1675,6 +1792,7 @@ class PageCollection(Generic[P], ApplyMixin):
1675
1792
  self,
1676
1793
  *,
1677
1794
  text: str,
1795
+ contains: str = "all",
1678
1796
  apply_exclusions: bool = True,
1679
1797
  regex: bool = False,
1680
1798
  case: bool = True,
@@ -1686,6 +1804,7 @@ class PageCollection(Generic[P], ApplyMixin):
1686
1804
  self,
1687
1805
  selector: str,
1688
1806
  *,
1807
+ contains: str = "all",
1689
1808
  apply_exclusions: bool = True,
1690
1809
  regex: bool = False,
1691
1810
  case: bool = True,
@@ -1697,6 +1816,7 @@ class PageCollection(Generic[P], ApplyMixin):
1697
1816
  selector: Optional[str] = None,
1698
1817
  *,
1699
1818
  text: Optional[str] = None,
1819
+ contains: str = "all",
1700
1820
  apply_exclusions: bool = True,
1701
1821
  regex: bool = False,
1702
1822
  case: bool = True,
@@ -1710,6 +1830,9 @@ class PageCollection(Generic[P], ApplyMixin):
1710
1830
  Args:
1711
1831
  selector: CSS-like selector string.
1712
1832
  text: Text content to search for (equivalent to 'text:contains(...)').
1833
+ contains: How to determine if elements are inside: 'all' (fully inside),
1834
+ 'any' (any overlap), or 'center' (center point inside).
1835
+ (default: "all")
1713
1836
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1714
1837
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1715
1838
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1724,6 +1847,7 @@ class PageCollection(Generic[P], ApplyMixin):
1724
1847
  elements = page.find_all(
1725
1848
  selector=selector,
1726
1849
  text=text,
1850
+ contains=contains,
1727
1851
  apply_exclusions=apply_exclusions,
1728
1852
  regex=regex,
1729
1853
  case=case,
@@ -2290,6 +2414,15 @@ class PageCollection(Generic[P], ApplyMixin):
2290
2414
  Returns:
2291
2415
  PIL Image of the page grid or None if no pages
2292
2416
  """
2417
+ # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
2418
+ try:
2419
+ from PIL import Image, ImageDraw, ImageFont
2420
+ except ImportError:
2421
+ logger.error(
2422
+ "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
2423
+ )
2424
+ return None
2425
+
2293
2426
  if not self.pages:
2294
2427
  logger.warning("Cannot generate image for empty PageCollection")
2295
2428
  return None
@@ -2298,64 +2431,144 @@ class PageCollection(Generic[P], ApplyMixin):
2298
2431
  pages_to_render = self.pages[:max_pages] if max_pages else self.pages
2299
2432
 
2300
2433
  # Load font once outside the loop
2301
- font = ImageFont.load_default(16) if add_labels else None
2434
+ font = None
2435
+ if add_labels:
2436
+ try:
2437
+ # Try loading a commonly available font first
2438
+ font = ImageFont.truetype("DejaVuSans.ttf", 16)
2439
+ except IOError:
2440
+ try:
2441
+ font = ImageFont.load_default(16)
2442
+ except IOError:
2443
+ logger.warning("Default font not found. Labels cannot be added.")
2444
+ add_labels = False # Disable if no font
2302
2445
 
2303
2446
  # Render individual page images
2304
2447
  page_images = []
2305
2448
  for page in pages_to_render:
2306
- img = page.to_image(width=page_width)
2449
+ try:
2450
+ # Assume page.to_image returns a PIL Image or None
2451
+ img = page.to_image(
2452
+ width=page_width, include_highlights=True
2453
+ ) # Render with highlights for visual context
2454
+ if img is None:
2455
+ logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
2456
+ continue
2457
+ except Exception as img_err:
2458
+ logger.error(
2459
+ f"Error generating image for page {page.number}: {img_err}", exc_info=True
2460
+ )
2461
+ continue
2307
2462
 
2308
2463
  # Add page number label
2309
- if add_labels and font: # Check if font was loaded
2464
+ if add_labels and font:
2310
2465
  draw = ImageDraw.Draw(img)
2311
- pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
2312
- label_text = f"p{page.number} - {pdf_name}"
2466
+ pdf_name = (
2467
+ Path(page.pdf.path).stem
2468
+ if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
2469
+ else ""
2470
+ )
2471
+ label_text = f"p{page.number}"
2472
+ if pdf_name:
2473
+ label_text += f" - {pdf_name}"
2313
2474
 
2314
2475
  # Add category if requested and available
2315
2476
  if show_category:
2316
- category = getattr(page, "category", None)
2317
- confidence = getattr(page, "category_confidence", None)
2318
- if category is not None and confidence is not None:
2319
- category_str = f"{category} {confidence:.3f}"
2320
- label_text += f"\n{category_str}"
2477
+ # Placeholder logic - adjust based on how classification results are stored
2478
+ category = None
2479
+ confidence = None
2480
+ if (
2481
+ hasattr(page, "analyses")
2482
+ and page.analyses
2483
+ and "classification" in page.analyses
2484
+ ):
2485
+ result = page.analyses["classification"]
2486
+ # Adapt based on actual structure of classification result
2487
+ category = (
2488
+ getattr(result, "label", None) or result.get("label", None)
2489
+ if isinstance(result, dict)
2490
+ else None
2491
+ )
2492
+ confidence = (
2493
+ getattr(result, "score", None) or result.get("score", None)
2494
+ if isinstance(result, dict)
2495
+ else None
2496
+ )
2321
2497
 
2322
- # Calculate bounding box for multi-line text
2323
- # Use (5, 5) as top-left anchor for textbbox calculation for padding
2324
- # Use multiline_textbbox for accurate bounds with newlines
2325
- bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
2326
- # Add padding to the calculated bbox for the white background
2327
- bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
2498
+ if category is not None and confidence is not None:
2499
+ try:
2500
+ category_str = f"{category} ({confidence:.2f})" # Format confidence
2501
+ label_text += f"\\n{category_str}"
2502
+ except (TypeError, ValueError):
2503
+ pass # Ignore formatting errors
2328
2504
 
2329
- # Draw white background rectangle
2330
- draw.rectangle(bg_rect, fill=(255, 255, 255))
2505
+ # Calculate bounding box for multi-line text and draw background/text
2506
+ try:
2507
+ # Using textbbox for potentially better accuracy with specific fonts
2508
+ # Note: textbbox needs Pillow 8+
2509
+ bbox = draw.textbbox(
2510
+ (5, 5), label_text, font=font, spacing=2
2511
+ ) # Use textbbox if available
2512
+ bg_rect = (
2513
+ max(0, bbox[0] - 2),
2514
+ max(0, bbox[1] - 2),
2515
+ min(img.width, bbox[2] + 2),
2516
+ min(img.height, bbox[3] + 2),
2517
+ )
2331
2518
 
2332
- # Draw the potentially multi-line text using multiline_text
2333
- draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
2519
+ # Draw semi-transparent background
2520
+ overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
2521
+ draw_overlay = ImageDraw.Draw(overlay)
2522
+ draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
2523
+ img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
2524
+ draw = ImageDraw.Draw(img) # Recreate draw object
2525
+
2526
+ # Draw the potentially multi-line text
2527
+ draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
2528
+ except AttributeError: # Fallback for older Pillow without textbbox
2529
+ # Approximate size and draw
2530
+ # This might not be perfectly aligned
2531
+ draw.rectangle(
2532
+ (2, 2, 150, 40), fill=(255, 255, 255, 180)
2533
+ ) # Simple fixed background
2534
+ draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
2535
+ except Exception as draw_err:
2536
+ logger.error(
2537
+ f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
2538
+ )
2334
2539
 
2335
2540
  page_images.append(img)
2336
2541
 
2542
+ if not page_images:
2543
+ logger.warning("No page images were successfully rendered for the grid.")
2544
+ return None
2545
+
2337
2546
  # Calculate grid dimensions if not provided
2547
+ num_images = len(page_images)
2338
2548
  if not rows and not cols:
2339
- # Default to a square-ish grid
2340
- cols = min(4, int(len(page_images) ** 0.5) + 1)
2341
- rows = (len(page_images) + cols - 1) // cols
2549
+ cols = min(4, int(num_images**0.5) + 1)
2550
+ rows = (num_images + cols - 1) // cols
2342
2551
  elif rows and not cols:
2343
- cols = (len(page_images) + rows - 1) // rows
2552
+ cols = (num_images + rows - 1) // rows
2344
2553
  elif cols and not rows:
2345
- rows = (len(page_images) + cols - 1) // cols
2554
+ rows = (num_images + cols - 1) // cols
2555
+ cols = max(1, cols if cols else 1) # Ensure at least 1
2556
+ rows = max(1, rows if rows else 1)
2346
2557
 
2347
2558
  # Get maximum dimensions for consistent grid cells
2348
- max_width = max(img.width for img in page_images)
2349
- max_height = max(img.height for img in page_images)
2559
+ max_width = max(img.width for img in page_images) if page_images else 1
2560
+ max_height = max(img.height for img in page_images) if page_images else 1
2350
2561
 
2351
2562
  # Create grid image
2352
2563
  grid_width = cols * max_width + (cols + 1) * spacing
2353
2564
  grid_height = rows * max_height + (rows + 1) * spacing
2354
- grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
2565
+ grid_img = Image.new(
2566
+ "RGB", (grid_width, grid_height), (220, 220, 220)
2567
+ ) # Lighter gray background
2355
2568
 
2356
2569
  # Place images in grid
2357
2570
  for i, img in enumerate(page_images):
2358
- if i >= rows * cols:
2571
+ if i >= rows * cols: # Ensure we don't exceed grid capacity
2359
2572
  break
2360
2573
 
2361
2574
  row = i // cols
@@ -2367,3 +2580,134 @@ class PageCollection(Generic[P], ApplyMixin):
2367
2580
  grid_img.paste(img, (x, y))
2368
2581
 
2369
2582
  return grid_img
2583
+
2584
+ def save_pdf(
2585
+ self,
2586
+ output_path: Union[str, Path],
2587
+ ocr: bool = False,
2588
+ original: bool = False,
2589
+ dpi: int = 300,
2590
+ ):
2591
+ """
2592
+ Saves the pages in this collection to a new PDF file.
2593
+
2594
+ Choose one saving mode:
2595
+ - `ocr=True`: Creates a new, image-based PDF using OCR results. This
2596
+ makes the text generated during the natural-pdf session searchable,
2597
+ but loses original vector content. Requires 'ocr-export' extras.
2598
+ - `original=True`: Extracts the original pages from the source PDF,
2599
+ preserving all vector content, fonts, and annotations. OCR results
2600
+ from the natural-pdf session are NOT included. Requires 'ocr-export' extras.
2601
+
2602
+ Args:
2603
+ output_path: Path to save the new PDF file.
2604
+ ocr: If True, save as a searchable, image-based PDF using OCR data.
2605
+ original: If True, save the original, vector-based pages.
2606
+ dpi: Resolution (dots per inch) used only when ocr=True for
2607
+ rendering page images and aligning the text layer.
2608
+
2609
+ Raises:
2610
+ ValueError: If the collection is empty, if neither or both 'ocr'
2611
+ and 'original' are True, or if 'original=True' and
2612
+ pages originate from different PDFs.
2613
+ ImportError: If required libraries ('pikepdf', 'Pillow')
2614
+ are not installed for the chosen mode.
2615
+ RuntimeError: If an unexpected error occurs during saving.
2616
+ """
2617
+ if not self.pages:
2618
+ raise ValueError("Cannot save an empty PageCollection.")
2619
+
2620
+ if not (ocr ^ original): # XOR: exactly one must be true
2621
+ raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
2622
+
2623
+ output_path_obj = Path(output_path)
2624
+ output_path_str = str(output_path_obj)
2625
+
2626
+ if ocr:
2627
+ if create_searchable_pdf is None:
2628
+ raise ImportError(
2629
+ "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
2630
+ 'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
2631
+ )
2632
+
2633
+ # Check for non-OCR vector elements (provide a warning)
2634
+ has_vector_elements = False
2635
+ for page in self.pages:
2636
+ # Simplified check for common vector types or non-OCR chars/words
2637
+ if (
2638
+ hasattr(page, "rects")
2639
+ and page.rects
2640
+ or hasattr(page, "lines")
2641
+ and page.lines
2642
+ or hasattr(page, "curves")
2643
+ and page.curves
2644
+ or (
2645
+ hasattr(page, "chars")
2646
+ and any(getattr(el, "source", None) != "ocr" for el in page.chars)
2647
+ )
2648
+ or (
2649
+ hasattr(page, "words")
2650
+ and any(getattr(el, "source", None) != "ocr" for el in page.words)
2651
+ )
2652
+ ):
2653
+ has_vector_elements = True
2654
+ break
2655
+ if has_vector_elements:
2656
+ logger.warning(
2657
+ "Warning: Saving with ocr=True creates an image-based PDF. "
2658
+ "Original vector elements (rects, lines, non-OCR text/chars) "
2659
+ "on selected pages will not be preserved in the output file."
2660
+ )
2661
+
2662
+ logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
2663
+ try:
2664
+ # Delegate to the searchable PDF exporter function
2665
+ # Pass `self` (the PageCollection instance) as the source
2666
+ create_searchable_pdf(self, output_path_str, dpi=dpi)
2667
+ # Success log is now inside create_searchable_pdf if needed, or keep here
2668
+ # logger.info(f"Successfully saved searchable PDF to: {output_path_str}")
2669
+ except Exception as e:
2670
+ logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
2671
+ # Re-raise as RuntimeError for consistency, potentially handled in exporter too
2672
+ raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
2673
+
2674
+ elif original:
2675
+ # ---> MODIFIED: Call the new exporter
2676
+ if create_original_pdf is None:
2677
+ raise ImportError(
2678
+ "Saving with original=True requires 'pikepdf'. "
2679
+ 'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
2680
+ )
2681
+
2682
+ # Check for OCR elements (provide a warning) - keep this check here
2683
+ has_ocr_elements = False
2684
+ for page in self.pages:
2685
+ # Use find_all which returns a collection; check if it's non-empty
2686
+ if hasattr(page, "find_all"):
2687
+ ocr_text_elements = page.find_all("text[source=ocr]")
2688
+ if ocr_text_elements: # Check truthiness of collection
2689
+ has_ocr_elements = True
2690
+ break
2691
+ elif hasattr(page, "words"): # Fallback check if find_all isn't present?
2692
+ if any(getattr(el, "source", None) == "ocr" for el in page.words):
2693
+ has_ocr_elements = True
2694
+ break
2695
+
2696
+ if has_ocr_elements:
2697
+ logger.warning(
2698
+ "Warning: Saving with original=True preserves original page content. "
2699
+ "OCR text generated in this session will not be included in the saved file."
2700
+ )
2701
+
2702
+ logger.info(f"Saving original pages PDF to: {output_path_str}")
2703
+ try:
2704
+ # Delegate to the original PDF exporter function
2705
+ # Pass `self` (the PageCollection instance) as the source
2706
+ create_original_pdf(self, output_path_str)
2707
+ # Success log is now inside create_original_pdf
2708
+ # logger.info(f"Successfully saved original pages PDF to: {output_path_str}")
2709
+ except Exception as e:
2710
+ # Error logging is handled within create_original_pdf
2711
+ # Re-raise the exception caught from the exporter
2712
+ raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
2713
+ # <--- END MODIFIED