natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +29 -40
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +20 -18
- natural_pdf/core/pdf.py +146 -13
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +374 -30
- natural_pdf/elements/region.py +45 -14
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +519 -0
- natural_pdf/exporters/hocr_font.py +136 -0
- natural_pdf/exporters/original_pdf.py +127 -0
- natural_pdf/exporters/searchable_pdf.py +2 -12
- natural_pdf/ocr/engine_surya.py +1 -1
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -814,6 +814,7 @@ class Element(DirectionalMixin):
|
|
814
814
|
legend_position: str = "right",
|
815
815
|
color: Optional[Union[Tuple, str]] = "red", # Default color for single element
|
816
816
|
label: Optional[str] = None,
|
817
|
+
width: Optional[int] = None, # Add width parameter
|
817
818
|
) -> Optional["Image.Image"]:
|
818
819
|
"""
|
819
820
|
Show the page with only this element highlighted temporarily.
|
@@ -824,6 +825,7 @@ class Element(DirectionalMixin):
|
|
824
825
|
legend_position: Position of the legend
|
825
826
|
color: Color to highlight this element (default: red)
|
826
827
|
label: Optional label for this element in the legend
|
828
|
+
width: Optional width for the output image in pixels
|
827
829
|
|
828
830
|
Returns:
|
829
831
|
PIL Image of the page with only this element highlighted, or None if error.
|
@@ -861,6 +863,7 @@ class Element(DirectionalMixin):
|
|
861
863
|
page_index=self.page.index,
|
862
864
|
temporary_highlights=[temp_highlight_data],
|
863
865
|
scale=scale,
|
866
|
+
width=width, # Pass the width parameter
|
864
867
|
labels=labels,
|
865
868
|
legend_position=legend_position,
|
866
869
|
)
|
@@ -898,6 +901,7 @@ class Element(DirectionalMixin):
|
|
898
901
|
self,
|
899
902
|
*,
|
900
903
|
text: str,
|
904
|
+
contains: str = "all",
|
901
905
|
apply_exclusions: bool = True,
|
902
906
|
regex: bool = False,
|
903
907
|
case: bool = True,
|
@@ -909,6 +913,7 @@ class Element(DirectionalMixin):
|
|
909
913
|
self,
|
910
914
|
selector: str,
|
911
915
|
*,
|
916
|
+
contains: str = "all",
|
912
917
|
apply_exclusions: bool = True,
|
913
918
|
regex: bool = False,
|
914
919
|
case: bool = True,
|
@@ -920,6 +925,7 @@ class Element(DirectionalMixin):
|
|
920
925
|
selector: Optional[str] = None,
|
921
926
|
*,
|
922
927
|
text: Optional[str] = None,
|
928
|
+
contains: str = "all",
|
923
929
|
apply_exclusions: bool = True,
|
924
930
|
regex: bool = False,
|
925
931
|
case: bool = True,
|
@@ -934,6 +940,9 @@ class Element(DirectionalMixin):
|
|
934
940
|
Args:
|
935
941
|
selector: CSS-like selector string.
|
936
942
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
943
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
944
|
+
'any' (any overlap), or 'center' (center point inside).
|
945
|
+
(default: "all")
|
937
946
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
938
947
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
939
948
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -950,6 +959,7 @@ class Element(DirectionalMixin):
|
|
950
959
|
return temp_region.find(
|
951
960
|
selector=selector,
|
952
961
|
text=text,
|
962
|
+
contains=contains,
|
953
963
|
apply_exclusions=apply_exclusions,
|
954
964
|
regex=regex,
|
955
965
|
case=case,
|
@@ -961,6 +971,7 @@ class Element(DirectionalMixin):
|
|
961
971
|
self,
|
962
972
|
*,
|
963
973
|
text: str,
|
974
|
+
contains: str = "all",
|
964
975
|
apply_exclusions: bool = True,
|
965
976
|
regex: bool = False,
|
966
977
|
case: bool = True,
|
@@ -972,6 +983,7 @@ class Element(DirectionalMixin):
|
|
972
983
|
self,
|
973
984
|
selector: str,
|
974
985
|
*,
|
986
|
+
contains: str = "all",
|
975
987
|
apply_exclusions: bool = True,
|
976
988
|
regex: bool = False,
|
977
989
|
case: bool = True,
|
@@ -983,6 +995,7 @@ class Element(DirectionalMixin):
|
|
983
995
|
selector: Optional[str] = None,
|
984
996
|
*,
|
985
997
|
text: Optional[str] = None,
|
998
|
+
contains: str = "all",
|
986
999
|
apply_exclusions: bool = True,
|
987
1000
|
regex: bool = False,
|
988
1001
|
case: bool = True,
|
@@ -997,6 +1010,9 @@ class Element(DirectionalMixin):
|
|
997
1010
|
Args:
|
998
1011
|
selector: CSS-like selector string.
|
999
1012
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1013
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1014
|
+
'any' (any overlap), or 'center' (center point inside).
|
1015
|
+
(default: "all")
|
1000
1016
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1001
1017
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1002
1018
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1013,6 +1029,7 @@ class Element(DirectionalMixin):
|
|
1013
1029
|
return temp_region.find_all(
|
1014
1030
|
selector=selector,
|
1015
1031
|
text=text,
|
1032
|
+
contains=contains,
|
1016
1033
|
apply_exclusions=apply_exclusions,
|
1017
1034
|
regex=regex,
|
1018
1035
|
case=case,
|
@@ -20,10 +20,10 @@ from typing import (
|
|
20
20
|
)
|
21
21
|
|
22
22
|
from pdfplumber.utils.geometry import objects_to_bbox
|
23
|
-
from PIL import Image, ImageDraw, ImageFont
|
24
23
|
|
25
24
|
# New Imports
|
26
25
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
26
|
+
from PIL import Image, ImageDraw, ImageFont
|
27
27
|
from tqdm.auto import tqdm
|
28
28
|
|
29
29
|
from natural_pdf.classification.manager import ClassificationManager
|
@@ -38,10 +38,31 @@ from natural_pdf.ocr import OCROptions
|
|
38
38
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
39
39
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
40
40
|
|
41
|
+
# Potentially lazy imports for optional dependencies needed in save_pdf
|
42
|
+
try:
|
43
|
+
import pikepdf
|
44
|
+
except ImportError:
|
45
|
+
pikepdf = None
|
46
|
+
|
47
|
+
try:
|
48
|
+
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
49
|
+
|
50
|
+
pass
|
51
|
+
except ImportError:
|
52
|
+
create_searchable_pdf = None
|
53
|
+
|
54
|
+
# ---> ADDED Import for the new exporter
|
55
|
+
try:
|
56
|
+
from natural_pdf.exporters.original_pdf import create_original_pdf
|
57
|
+
except ImportError:
|
58
|
+
create_original_pdf = None
|
59
|
+
# <--- END ADDED
|
60
|
+
|
41
61
|
logger = logging.getLogger(__name__)
|
42
62
|
|
43
63
|
if TYPE_CHECKING:
|
44
64
|
from natural_pdf.core.page import Page
|
65
|
+
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
45
66
|
from natural_pdf.elements.region import Region
|
46
67
|
|
47
68
|
T = TypeVar("T")
|
@@ -820,6 +841,7 @@ class ElementCollection(
|
|
820
841
|
labels: bool = True, # Use 'labels' consistent with service
|
821
842
|
legend_position: str = "right",
|
822
843
|
render_ocr: bool = False,
|
844
|
+
width: Optional[int] = None, # Add width parameter
|
823
845
|
) -> Optional["Image.Image"]:
|
824
846
|
"""
|
825
847
|
Generates a temporary preview image highlighting elements in this collection
|
@@ -842,6 +864,7 @@ class ElementCollection(
|
|
842
864
|
labels: Whether to include a legend for the temporary highlights.
|
843
865
|
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
844
866
|
render_ocr: Whether to render OCR text.
|
867
|
+
width: Optional width for the output image in pixels.
|
845
868
|
|
846
869
|
Returns:
|
847
870
|
PIL Image object of the temporary preview, or None if rendering fails or
|
@@ -902,6 +925,7 @@ class ElementCollection(
|
|
902
925
|
page_index=page.index,
|
903
926
|
temporary_highlights=highlight_data_list,
|
904
927
|
scale=scale,
|
928
|
+
width=width, # Pass the width parameter
|
905
929
|
labels=labels, # Use 'labels'
|
906
930
|
legend_position=legend_position,
|
907
931
|
render_ocr=render_ocr,
|
@@ -1139,10 +1163,96 @@ class ElementCollection(
|
|
1139
1163
|
|
1140
1164
|
Args:
|
1141
1165
|
selector: CSS-like selector string
|
1166
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1167
|
+
'any' (any overlap), or 'center' (center point inside).
|
1168
|
+
(default: "all")
|
1142
1169
|
apply_exclusions: Whether to exclude elements in exclusion regions
|
1143
1170
|
"""
|
1144
1171
|
return self.apply(lambda element: element.find(selector, **kwargs))
|
1145
1172
|
|
1173
|
+
@overload
def find_all(
    self,
    *,
    text: str,
    contains: str = "all",
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> "ElementCollection": ...

@overload
def find_all(
    self,
    selector: str,
    *,
    contains: str = "all",
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> "ElementCollection": ...

def find_all(
    self,
    selector: Optional[str] = None,
    *,
    text: Optional[str] = None,
    contains: str = "all",
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> "ElementCollection":
    """
    Find all elements within each element of this collection matching the selector OR text,
    and return a flattened collection of all found sub-elements.

    Provide EITHER `selector` OR `text`, but not both.

    Args:
        selector: CSS-like selector string.
        text: Text content to search for (equivalent to 'text:contains(...)').
        contains: How to determine if elements are inside: 'all' (fully inside),
                  'any' (any overlap), or 'center' (center point inside).
                  (default: "all")
        apply_exclusions: Whether to apply exclusion regions (default: True).
        regex: Whether to use regex for text search (`selector` or `text`) (default: False).
        case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
        **kwargs: Additional parameters for element filtering; forwarded unchanged
                  to each element's own ``find_all``.

    Returns:
        A new ElementCollection containing all matching sub-elements from all elements
        in this collection, in the order the parent elements are stored. Results are
        concatenated as-is (no de-duplication is performed here).

    Raises:
        ValueError: If neither or both of `selector` and `text` are provided.
    """
    if selector is None and text is None:
        raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
    if selector is not None and text is not None:
        raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")

    all_found_elements: List["Element"] = []
    for element in self._elements:
        finder = getattr(element, "find_all", None)
        if not callable(finder):
            # Elements without a usable find_all are skipped rather than raising;
            # leave a trace so an unexpectedly empty result can be diagnosed.
            logger.debug(
                "Skipping %s in ElementCollection.find_all: no callable find_all().",
                type(element).__name__,
            )
            continue

        # Each element's find_all returns an ElementCollection of its matches.
        found_in_element = finder(
            selector=selector,
            text=text,
            contains=contains,
            apply_exclusions=apply_exclusions,
            regex=regex,
            case=case,
            **kwargs,
        )
        if found_in_element:
            all_found_elements.extend(found_in_element.elements)

    return ElementCollection(all_found_elements)
|
1255
|
+
|
1146
1256
|
def extract_each_text(self, **kwargs) -> List[str]:
|
1147
1257
|
"""
|
1148
1258
|
Extract text from each element in this region.
|
@@ -1613,6 +1723,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1613
1723
|
self,
|
1614
1724
|
*,
|
1615
1725
|
text: str,
|
1726
|
+
contains: str = "all",
|
1616
1727
|
apply_exclusions: bool = True,
|
1617
1728
|
regex: bool = False,
|
1618
1729
|
case: bool = True,
|
@@ -1624,6 +1735,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1624
1735
|
self,
|
1625
1736
|
selector: str,
|
1626
1737
|
*,
|
1738
|
+
contains: str = "all",
|
1627
1739
|
apply_exclusions: bool = True,
|
1628
1740
|
regex: bool = False,
|
1629
1741
|
case: bool = True,
|
@@ -1635,6 +1747,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1635
1747
|
selector: Optional[str] = None,
|
1636
1748
|
*,
|
1637
1749
|
text: Optional[str] = None,
|
1750
|
+
contains: str = "all",
|
1638
1751
|
apply_exclusions: bool = True,
|
1639
1752
|
regex: bool = False,
|
1640
1753
|
case: bool = True,
|
@@ -1648,6 +1761,9 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1648
1761
|
Args:
|
1649
1762
|
selector: CSS-like selector string.
|
1650
1763
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1764
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1765
|
+
'any' (any overlap), or 'center' (center point inside).
|
1766
|
+
(default: "all")
|
1651
1767
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1652
1768
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1653
1769
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1661,6 +1777,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1661
1777
|
element = page.find(
|
1662
1778
|
selector=selector,
|
1663
1779
|
text=text,
|
1780
|
+
contains=contains,
|
1664
1781
|
apply_exclusions=apply_exclusions,
|
1665
1782
|
regex=regex,
|
1666
1783
|
case=case,
|
@@ -1675,6 +1792,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1675
1792
|
self,
|
1676
1793
|
*,
|
1677
1794
|
text: str,
|
1795
|
+
contains: str = "all",
|
1678
1796
|
apply_exclusions: bool = True,
|
1679
1797
|
regex: bool = False,
|
1680
1798
|
case: bool = True,
|
@@ -1686,6 +1804,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1686
1804
|
self,
|
1687
1805
|
selector: str,
|
1688
1806
|
*,
|
1807
|
+
contains: str = "all",
|
1689
1808
|
apply_exclusions: bool = True,
|
1690
1809
|
regex: bool = False,
|
1691
1810
|
case: bool = True,
|
@@ -1697,6 +1816,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1697
1816
|
selector: Optional[str] = None,
|
1698
1817
|
*,
|
1699
1818
|
text: Optional[str] = None,
|
1819
|
+
contains: str = "all",
|
1700
1820
|
apply_exclusions: bool = True,
|
1701
1821
|
regex: bool = False,
|
1702
1822
|
case: bool = True,
|
@@ -1710,6 +1830,9 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1710
1830
|
Args:
|
1711
1831
|
selector: CSS-like selector string.
|
1712
1832
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1833
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1834
|
+
'any' (any overlap), or 'center' (center point inside).
|
1835
|
+
(default: "all")
|
1713
1836
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1714
1837
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1715
1838
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1724,6 +1847,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1724
1847
|
elements = page.find_all(
|
1725
1848
|
selector=selector,
|
1726
1849
|
text=text,
|
1850
|
+
contains=contains,
|
1727
1851
|
apply_exclusions=apply_exclusions,
|
1728
1852
|
regex=regex,
|
1729
1853
|
case=case,
|
@@ -2290,6 +2414,15 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2290
2414
|
Returns:
|
2291
2415
|
PIL Image of the page grid or None if no pages
|
2292
2416
|
"""
|
2417
|
+
# Ensure PIL is imported, handle potential ImportError if not done globally/lazily
|
2418
|
+
try:
|
2419
|
+
from PIL import Image, ImageDraw, ImageFont
|
2420
|
+
except ImportError:
|
2421
|
+
logger.error(
|
2422
|
+
"Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
|
2423
|
+
)
|
2424
|
+
return None
|
2425
|
+
|
2293
2426
|
if not self.pages:
|
2294
2427
|
logger.warning("Cannot generate image for empty PageCollection")
|
2295
2428
|
return None
|
@@ -2298,64 +2431,144 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2298
2431
|
pages_to_render = self.pages[:max_pages] if max_pages else self.pages
|
2299
2432
|
|
2300
2433
|
# Load font once outside the loop
|
2301
|
-
font =
|
2434
|
+
font = None
|
2435
|
+
if add_labels:
|
2436
|
+
try:
|
2437
|
+
# Try loading a commonly available font first
|
2438
|
+
font = ImageFont.truetype("DejaVuSans.ttf", 16)
|
2439
|
+
except IOError:
|
2440
|
+
try:
|
2441
|
+
font = ImageFont.load_default(16)
|
2442
|
+
except IOError:
|
2443
|
+
logger.warning("Default font not found. Labels cannot be added.")
|
2444
|
+
add_labels = False # Disable if no font
|
2302
2445
|
|
2303
2446
|
# Render individual page images
|
2304
2447
|
page_images = []
|
2305
2448
|
for page in pages_to_render:
|
2306
|
-
|
2449
|
+
try:
|
2450
|
+
# Assume page.to_image returns a PIL Image or None
|
2451
|
+
img = page.to_image(
|
2452
|
+
width=page_width, include_highlights=True
|
2453
|
+
) # Render with highlights for visual context
|
2454
|
+
if img is None:
|
2455
|
+
logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
|
2456
|
+
continue
|
2457
|
+
except Exception as img_err:
|
2458
|
+
logger.error(
|
2459
|
+
f"Error generating image for page {page.number}: {img_err}", exc_info=True
|
2460
|
+
)
|
2461
|
+
continue
|
2307
2462
|
|
2308
2463
|
# Add page number label
|
2309
|
-
if add_labels and font:
|
2464
|
+
if add_labels and font:
|
2310
2465
|
draw = ImageDraw.Draw(img)
|
2311
|
-
pdf_name =
|
2312
|
-
|
2466
|
+
pdf_name = (
|
2467
|
+
Path(page.pdf.path).stem
|
2468
|
+
if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
|
2469
|
+
else ""
|
2470
|
+
)
|
2471
|
+
label_text = f"p{page.number}"
|
2472
|
+
if pdf_name:
|
2473
|
+
label_text += f" - {pdf_name}"
|
2313
2474
|
|
2314
2475
|
# Add category if requested and available
|
2315
2476
|
if show_category:
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2319
|
-
|
2320
|
-
|
2477
|
+
# Placeholder logic - adjust based on how classification results are stored
|
2478
|
+
category = None
|
2479
|
+
confidence = None
|
2480
|
+
if (
|
2481
|
+
hasattr(page, "analyses")
|
2482
|
+
and page.analyses
|
2483
|
+
and "classification" in page.analyses
|
2484
|
+
):
|
2485
|
+
result = page.analyses["classification"]
|
2486
|
+
# Adapt based on actual structure of classification result
|
2487
|
+
category = (
|
2488
|
+
getattr(result, "label", None) or result.get("label", None)
|
2489
|
+
if isinstance(result, dict)
|
2490
|
+
else None
|
2491
|
+
)
|
2492
|
+
confidence = (
|
2493
|
+
getattr(result, "score", None) or result.get("score", None)
|
2494
|
+
if isinstance(result, dict)
|
2495
|
+
else None
|
2496
|
+
)
|
2321
2497
|
|
2322
|
-
|
2323
|
-
|
2324
|
-
|
2325
|
-
|
2326
|
-
|
2327
|
-
|
2498
|
+
if category is not None and confidence is not None:
|
2499
|
+
try:
|
2500
|
+
category_str = f"{category} ({confidence:.2f})" # Format confidence
|
2501
|
+
label_text += f"\\n{category_str}"
|
2502
|
+
except (TypeError, ValueError):
|
2503
|
+
pass # Ignore formatting errors
|
2328
2504
|
|
2329
|
-
#
|
2330
|
-
|
2505
|
+
# Calculate bounding box for multi-line text and draw background/text
|
2506
|
+
try:
|
2507
|
+
# Using textbbox for potentially better accuracy with specific fonts
|
2508
|
+
# Note: textbbox needs Pillow 8+
|
2509
|
+
bbox = draw.textbbox(
|
2510
|
+
(5, 5), label_text, font=font, spacing=2
|
2511
|
+
) # Use textbbox if available
|
2512
|
+
bg_rect = (
|
2513
|
+
max(0, bbox[0] - 2),
|
2514
|
+
max(0, bbox[1] - 2),
|
2515
|
+
min(img.width, bbox[2] + 2),
|
2516
|
+
min(img.height, bbox[3] + 2),
|
2517
|
+
)
|
2331
2518
|
|
2332
|
-
|
2333
|
-
|
2519
|
+
# Draw semi-transparent background
|
2520
|
+
overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
|
2521
|
+
draw_overlay = ImageDraw.Draw(overlay)
|
2522
|
+
draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
|
2523
|
+
img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
|
2524
|
+
draw = ImageDraw.Draw(img) # Recreate draw object
|
2525
|
+
|
2526
|
+
# Draw the potentially multi-line text
|
2527
|
+
draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
|
2528
|
+
except AttributeError: # Fallback for older Pillow without textbbox
|
2529
|
+
# Approximate size and draw
|
2530
|
+
# This might not be perfectly aligned
|
2531
|
+
draw.rectangle(
|
2532
|
+
(2, 2, 150, 40), fill=(255, 255, 255, 180)
|
2533
|
+
) # Simple fixed background
|
2534
|
+
draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
|
2535
|
+
except Exception as draw_err:
|
2536
|
+
logger.error(
|
2537
|
+
f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
|
2538
|
+
)
|
2334
2539
|
|
2335
2540
|
page_images.append(img)
|
2336
2541
|
|
2542
|
+
if not page_images:
|
2543
|
+
logger.warning("No page images were successfully rendered for the grid.")
|
2544
|
+
return None
|
2545
|
+
|
2337
2546
|
# Calculate grid dimensions if not provided
|
2547
|
+
num_images = len(page_images)
|
2338
2548
|
if not rows and not cols:
|
2339
|
-
|
2340
|
-
|
2341
|
-
rows = (len(page_images) + cols - 1) // cols
|
2549
|
+
cols = min(4, int(num_images**0.5) + 1)
|
2550
|
+
rows = (num_images + cols - 1) // cols
|
2342
2551
|
elif rows and not cols:
|
2343
|
-
cols = (
|
2552
|
+
cols = (num_images + rows - 1) // rows
|
2344
2553
|
elif cols and not rows:
|
2345
|
-
rows = (
|
2554
|
+
rows = (num_images + cols - 1) // cols
|
2555
|
+
cols = max(1, cols if cols else 1) # Ensure at least 1
|
2556
|
+
rows = max(1, rows if rows else 1)
|
2346
2557
|
|
2347
2558
|
# Get maximum dimensions for consistent grid cells
|
2348
|
-
max_width = max(img.width for img in page_images)
|
2349
|
-
max_height = max(img.height for img in page_images)
|
2559
|
+
max_width = max(img.width for img in page_images) if page_images else 1
|
2560
|
+
max_height = max(img.height for img in page_images) if page_images else 1
|
2350
2561
|
|
2351
2562
|
# Create grid image
|
2352
2563
|
grid_width = cols * max_width + (cols + 1) * spacing
|
2353
2564
|
grid_height = rows * max_height + (rows + 1) * spacing
|
2354
|
-
grid_img = Image.new(
|
2565
|
+
grid_img = Image.new(
|
2566
|
+
"RGB", (grid_width, grid_height), (220, 220, 220)
|
2567
|
+
) # Lighter gray background
|
2355
2568
|
|
2356
2569
|
# Place images in grid
|
2357
2570
|
for i, img in enumerate(page_images):
|
2358
|
-
if i >= rows * cols:
|
2571
|
+
if i >= rows * cols: # Ensure we don't exceed grid capacity
|
2359
2572
|
break
|
2360
2573
|
|
2361
2574
|
row = i // cols
|
@@ -2367,3 +2580,134 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2367
2580
|
grid_img.paste(img, (x, y))
|
2368
2581
|
|
2369
2582
|
return grid_img
|
2583
|
+
|
2584
|
+
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the pages in this collection to a new PDF file.

    Choose one saving mode:
    - `ocr=True`: Creates a new, image-based PDF using OCR results. This
      makes the text generated during the natural-pdf session searchable,
      but loses original vector content. Requires 'ocr-export' extras.
    - `original=True`: Extracts the original pages from the source PDF,
      preserving all vector content, fonts, and annotations. OCR results
      from the natural-pdf session are NOT included. Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original, vector-based pages.
        dpi: Resolution (dots per inch) used only when ocr=True for
             rendering page images and aligning the text layer.

    Raises:
        ValueError: If the collection is empty, if neither or both 'ocr'
                    and 'original' are True, or if 'original=True' and
                    pages originate from different PDFs.
        ImportError: If required libraries ('pikepdf', 'Pillow')
                     are not installed for the chosen mode.
        RuntimeError: If the searchable (ocr=True) export fails. Errors raised
                      by the original-pages exporter propagate unchanged.
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PageCollection.")

    # Compare as booleans: a bitwise XOR on the raw arguments would accept
    # e.g. ocr=2, original=1 and raise TypeError for None.
    if bool(ocr) == bool(original):
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_str = str(Path(output_path))

    if ocr:
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        def _has_non_ocr(elements) -> bool:
            """Return True if any element does not come from OCR."""
            return any(getattr(el, "source", None) != "ocr" for el in elements)

        # Warn when the rasterised output will drop vector shapes or native text.
        has_vector_elements = any(
            bool(getattr(page, "rects", None))
            or bool(getattr(page, "lines", None))
            or bool(getattr(page, "curves", None))
            or (hasattr(page, "chars") and _has_non_ocr(page.chars))
            or (hasattr(page, "words") and _has_non_ocr(page.words))
            for page in self.pages
        )
        if has_vector_elements:
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "on selected pages will not be preserved in the output file."
            )

        logger.info("Saving searchable PDF (OCR text layer) to: %s", output_path_str)
        try:
            # The exporter receives this PageCollection itself as the page source.
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            logger.error("Failed to create searchable PDF: %s", e, exc_info=True)
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
        return

    # original=True (guaranteed by the exactly-one-flag check above).
    if create_original_pdf is None:
        raise ImportError(
            "Saving with original=True requires 'pikepdf'. "
            'Install with: pip install "natural-pdf[ocr-export]"'
        )

    # Warn when session OCR text exists, since it is not written in this mode.
    has_ocr_elements = False
    for page in self.pages:
        if hasattr(page, "find_all"):
            # find_all returns a collection; a non-empty one is truthy.
            if page.find_all("text[source=ocr]"):
                has_ocr_elements = True
                break
        elif hasattr(page, "words"):
            if any(getattr(el, "source", None) == "ocr" for el in page.words):
                has_ocr_elements = True
                break

    if has_ocr_elements:
        logger.warning(
            "Warning: Saving with original=True preserves original page content. "
            "OCR text generated in this session will not be included in the saved file."
        )

    logger.info("Saving original pages PDF to: %s", output_path_str)
    # Exporter exceptions are intentionally left to propagate with their
    # original type and traceback.
    create_original_pdf(self, output_path_str)
|