natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +24 -40
- natural_pdf/core/page.py +17 -17
- natural_pdf/core/pdf.py +130 -12
- natural_pdf/elements/collections.py +229 -29
- natural_pdf/elements/region.py +2 -3
- natural_pdf/exporters/hocr.py +540 -0
- natural_pdf/exporters/hocr_font.py +142 -0
- natural_pdf/exporters/original_pdf.py +130 -0
- natural_pdf/exporters/searchable_pdf.py +3 -3
- natural_pdf/ocr/engine_surya.py +1 -1
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +15 -12
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
|
|
1
|
+
"""
|
2
|
+
Module for exporting original PDF pages without modification.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
import os
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import TYPE_CHECKING, List, Set, Union
|
9
|
+
|
10
|
+
# Lazy import for optional dependency
|
11
|
+
try:
|
12
|
+
import pikepdf
|
13
|
+
except ImportError:
|
14
|
+
pikepdf = None
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from natural_pdf.core.page import Page
|
18
|
+
from natural_pdf.core.pdf import PDF
|
19
|
+
from natural_pdf.elements.collections import PageCollection
|
20
|
+
|
21
|
+
logger = logging.getLogger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
def create_original_pdf(
    source: Union["Page", "PageCollection", "PDF"], output_path: Union[str, Path]
):
    """
    Creates a new PDF file containing only the original, unmodified pages
    specified by the source object.

    The pages are copied straight from the source PDF file on disk. Each
    requested page is written once, in ascending page-index order, regardless
    of the order (or repetition) of pages in ``source``.

    Requires 'pikepdf'. Install with: pip install "natural-pdf[ocr-export]"

    Args:
        source: The Page, PageCollection, or PDF object indicating which pages to include.
        output_path: The path to save the resulting PDF file.

    Raises:
        ImportError: If 'pikepdf' is not installed.
        TypeError: If the source is neither an object exposing a list-valued
                   ``pages`` attribute nor a page-like object (one with
                   ``page`` and ``number`` attributes).
        ValueError: If the source object is empty, pages are from different PDFs,
                    or the source PDF path cannot be determined.
        RuntimeError: If pikepdf fails to open the source (including when the
                      source PDF is password-protected), if none of the
                      requested page indices exist in the source file, or if
                      saving the output fails.
    """
    if pikepdf is None:
        raise ImportError(
            "Saving original PDF pages requires 'pikepdf'. "
            "Install with: pip install \"natural-pdf[ocr-export]\""
        )

    output_path_str = str(output_path)
    pages_to_extract: List["Page"] = []

    # Determine the list of pages to extract.
    # NOTE(review): only a plain ``list`` in ``source.pages`` is accepted here.
    # A source whose ``pages`` is some other container falls through to the
    # single-page check below and ends in TypeError -- confirm that every
    # supported source type really exposes a list.
    if hasattr(source, "pages") and isinstance(source.pages, list):  # PDF or PageCollection
        if not source.pages:
            raise ValueError("Cannot save an empty collection/PDF.")
        pages_to_extract = source.pages
    elif hasattr(source, "page") and hasattr(source, "number"):  # Single Page object
        # Duck-typed check: the page must point back at a PDF that knows its path.
        if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
            pages_to_extract = [source]
        else:
            raise ValueError("Input Page object does not have a valid PDF reference with a path.")
    else:
        raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")

    # Both accepting branches above guarantee a non-empty list, so no further
    # emptiness check is needed before indexing the first page.

    # Verify all pages come from the same PDF and get its path.
    first_page_pdf_path = None
    if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
        first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)

    if not first_page_pdf_path:
        raise ValueError(
            "Cannot save original pages: Source PDF path not found for the first page."
        )

    # A set de-duplicates pages that appear more than once in the source.
    page_indices_set: Set[int] = set()
    for page in pages_to_extract:
        page_pdf_path = getattr(getattr(page, "pdf", None), "path", None)
        if not page_pdf_path or page_pdf_path != first_page_pdf_path:
            raise ValueError(
                "Cannot save original pages: All pages must belong to the same source PDF document."
            )
        page_indices_set.add(page.index)  # 0-based index

    sorted_indices = sorted(page_indices_set)

    logger.info(
        f"Extracting original pages {sorted_indices} from '{first_page_pdf_path}' to '{output_path_str}'"
    )

    try:
        # Both documents are context-managed so they are closed even when
        # copying or saving fails. The save happens while the source is still
        # open, since the appended pages originate from it.
        with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc, \
                pikepdf.Pdf.new() as target_pikepdf_doc:
            source_page_count = len(source_pikepdf_doc.pages)

            for page_index in sorted_indices:
                if 0 <= page_index < source_page_count:
                    # This correctly appends the pikepdf.Page object
                    target_pikepdf_doc.pages.append(source_pikepdf_doc.pages[page_index])
                else:
                    logger.warning(
                        f"Page index {page_index} out of bounds for source PDF '{first_page_pdf_path}'. Skipping."
                    )

            if not target_pikepdf_doc.pages:
                # Raised inside the try on purpose: the generic handler below
                # logs it and re-raises it as the documented RuntimeError.
                raise RuntimeError("No valid pages found to save from source PDF.")

            target_pikepdf_doc.save(output_path_str)
            logger.info(
                f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
            )

    except pikepdf.PasswordError:
        logger.error(
            f"Failed to open password-protected source PDF: {first_page_pdf_path}"
        )
        raise RuntimeError(
            f"Source PDF '{first_page_pdf_path}' is password-protected."
        ) from None  # Raise specific error without chaining the generic Exception
    except Exception as e:
        logger.error(
            f"Failed to save original pages PDF to '{output_path_str}': {e}",
            exc_info=True,
        )
        # Re-raise as RuntimeError for consistent API error handling
        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
|
@@ -22,7 +22,7 @@ except ImportError:
|
|
22
22
|
pikepdf = None # type: ignore
|
23
23
|
|
24
24
|
try:
|
25
|
-
from
|
25
|
+
from natural_pdf.exporters.hocr import HocrTransform
|
26
26
|
except ImportError:
|
27
27
|
HocrTransform = None # type: ignore
|
28
28
|
|
@@ -310,7 +310,7 @@ def create_searchable_pdf(
|
|
310
310
|
"""
|
311
311
|
Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
|
312
312
|
|
313
|
-
Relies on
|
313
|
+
Relies on pikepdf for saving the PDF.
|
314
314
|
|
315
315
|
Args:
|
316
316
|
source: The natural_pdf.PDF, PageCollection, or Page object
|
@@ -323,7 +323,7 @@ def create_searchable_pdf(
|
|
323
323
|
# This should ideally not happen if dependencies are in main install,
|
324
324
|
# but serves as a safeguard during development or if install is broken.
|
325
325
|
raise ImportError(
|
326
|
-
"Required dependencies (Pillow, pikepdf
|
326
|
+
"Required dependencies (Pillow, pikepdf) are missing. "
|
327
327
|
"Please ensure natural-pdf is installed correctly with all dependencies."
|
328
328
|
)
|
329
329
|
# --- End Safeguard Check ---
|
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -59,7 +59,7 @@ class SuryaOCREngine(OCREngine):
|
|
59
59
|
|
60
60
|
# Store languages instance variable during initialization to use here
|
61
61
|
langs = (
|
62
|
-
[
|
62
|
+
[self._langs] # Send all languages together in one list per image
|
63
63
|
if hasattr(self, "_langs")
|
64
64
|
else [[self.DEFAULT_LANGUAGES[0]]]
|
65
65
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.11
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -91,7 +91,6 @@ Requires-Dist: torchvision; extra == "core-ml"
|
|
91
91
|
Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
|
92
92
|
Requires-Dist: huggingface_hub; extra == "core-ml"
|
93
93
|
Provides-Extra: ocr-export
|
94
|
-
Requires-Dist: ocrmypdf; extra == "ocr-export"
|
95
94
|
Requires-Dist: pikepdf; extra == "ocr-export"
|
96
95
|
Provides-Extra: export-extras
|
97
96
|
Requires-Dist: jupytext; extra == "export-extras"
|
@@ -1,4 +1,4 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=
|
1
|
+
natural_pdf/__init__.py,sha256=HIYdzHD7QBRssIseUX_oDJYvVJs646tNSYhKHqk0HeA,2495
|
2
2
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
3
3
|
natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
|
4
4
|
natural_pdf/analyzers/text_structure.py,sha256=Uhxc7aYB1jddkiwRTEPOg_Te2HfOua4z_OtgP1m3org,12794
|
@@ -23,20 +23,23 @@ natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg
|
|
23
23
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
24
24
|
natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
|
25
25
|
natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
|
26
|
-
natural_pdf/core/page.py,sha256=
|
27
|
-
natural_pdf/core/pdf.py,sha256=
|
26
|
+
natural_pdf/core/page.py,sha256=S7Uj3DVksX7o3Qg7hpNulYuxHmqzSJIJ0yXVytPhFqY,105158
|
27
|
+
natural_pdf/core/pdf.py,sha256=qpZx5LXZ5Oq1fZ4mzDXBDOIcsApRinMEH0CjVY6jNvM,69273
|
28
28
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
29
29
|
natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
|
30
|
-
natural_pdf/elements/collections.py,sha256=
|
30
|
+
natural_pdf/elements/collections.py,sha256=HsNt_4x-yqNI_bDGeNEiih3hotAfrbppmp_O7rq9HGs,107141
|
31
31
|
natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
|
32
32
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
33
|
-
natural_pdf/elements/region.py,sha256=
|
33
|
+
natural_pdf/elements/region.py,sha256=XYWUym7hgkzMMfmXw0hEz_iGJ6Sdyf6DRz6XjgMVwN0,97250
|
34
34
|
natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
|
35
35
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
36
36
|
natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
|
37
37
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
38
|
+
natural_pdf/exporters/hocr.py,sha256=wilmVyBgmBNp2ZEdbKijk9ag8E1AGMMl6rBtsAOzp-Y,20201
|
39
|
+
natural_pdf/exporters/hocr_font.py,sha256=e9QdxeCExxpY_dpzwGxFlT_3TcvNejw9qpkNc1NVa4Y,4612
|
40
|
+
natural_pdf/exporters/original_pdf.py,sha256=vZeqBsCZh3JRRWwtfHzM78fxvhKkAI4QK3LLkeXidUM,5082
|
38
41
|
natural_pdf/exporters/paddleocr.py,sha256=BYpdtJI7S8rBkI2dkRESx2epVAZOTfzqU-rjJnUQ5jQ,16249
|
39
|
-
natural_pdf/exporters/searchable_pdf.py,sha256
|
42
|
+
natural_pdf/exporters/searchable_pdf.py,sha256=-sbjjM4oV2YCiJaVKcUIPXjAs94ouXSyOSlAzv_qM7I,16815
|
40
43
|
natural_pdf/extraction/manager.py,sha256=mUBbfgLG5Pl31wmajXwyipdEJb_dZ5I-y8GnWw7IzGo,4969
|
41
44
|
natural_pdf/extraction/mixin.py,sha256=eKbr70VibpbtfjvCE80lTFuYHzq_BoVtOHjznL_GMRA,11719
|
42
45
|
natural_pdf/extraction/result.py,sha256=c1vLguCR6l95cvg-BJJmZvL_MPg2McJaczge55bKZMg,934
|
@@ -45,7 +48,7 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
|
|
45
48
|
natural_pdf/ocr/engine_doctr.py,sha256=519WpvSHgwP6Hv24tci_YHFX7XPlaxOnlREN_YG-Yys,16331
|
46
49
|
natural_pdf/ocr/engine_easyocr.py,sha256=9TbxJjmhWFrzM8mcNnZjoRtIDr6gwpuwKm4-Zfub2-8,9281
|
47
50
|
natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
|
48
|
-
natural_pdf/ocr/engine_surya.py,sha256=
|
51
|
+
natural_pdf/ocr/engine_surya.py,sha256=CQHpPecCYsJsr7pEvEyubAf5FJFs7vFHAm_0cGGr-A4,4839
|
49
52
|
natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
|
50
53
|
natural_pdf/ocr/ocr_manager.py,sha256=f0q68ynGYVPkF4D3WnufxmHWD5R1jW5Z_1czTEi9JVU,13931
|
51
54
|
natural_pdf/ocr/ocr_options.py,sha256=ZvtnFn1kPkFEoWveQ13uy6B-ofquP0gHEi4tBHrjqCE,6438
|
@@ -73,8 +76,8 @@ natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7t
|
|
73
76
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
74
77
|
natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
|
75
78
|
natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
|
76
|
-
natural_pdf-0.1.
|
77
|
-
natural_pdf-0.1.
|
78
|
-
natural_pdf-0.1.
|
79
|
-
natural_pdf-0.1.
|
80
|
-
natural_pdf-0.1.
|
79
|
+
natural_pdf-0.1.11.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
80
|
+
natural_pdf-0.1.11.dist-info/METADATA,sha256=HBEH41sOW2opbRoN_yUq8iw3jB2fvdOXEDj0ZGfmw8g,7354
|
81
|
+
natural_pdf-0.1.11.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
82
|
+
natural_pdf-0.1.11.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
83
|
+
natural_pdf-0.1.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|