natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,130 @@
1
+ """
2
+ Module for exporting original PDF pages without modification.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, List, Set, Union
9
+
10
+ # Lazy import for optional dependency
11
+ try:
12
+ import pikepdf
13
+ except ImportError:
14
+ pikepdf = None
15
+
16
+ if TYPE_CHECKING:
17
+ from natural_pdf.core.page import Page
18
+ from natural_pdf.core.pdf import PDF
19
+ from natural_pdf.elements.collections import PageCollection
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def create_original_pdf(
    source: Union["Page", "PageCollection", "PDF"], output_path: Union[str, Path]
) -> None:
    """
    Create a new PDF file containing only the original, unmodified pages
    specified by the source object.

    The selected pages are de-duplicated and written in ascending page-index
    order, regardless of the order in which they appear in ``source``.

    Requires 'pikepdf'. Install with: pip install "natural-pdf[ocr-export]"

    Args:
        source: The Page, PageCollection, or PDF object indicating which pages to include.
        output_path: The path to save the resulting PDF file.

    Raises:
        ImportError: If 'pikepdf' is not installed.
        TypeError: If the source is neither an object exposing a ``pages`` list
            nor a single page-like object.
        ValueError: If the source object is empty, pages are from different PDFs,
            or the source PDF path cannot be determined.
        RuntimeError: If the source PDF is password-protected, if none of the
            requested page indices exist in the source PDF, or if pikepdf fails
            to open the source or save the output.
    """
    if pikepdf is None:
        raise ImportError(
            "Saving original PDF pages requires 'pikepdf'. "
            "Install with: pip install \"natural-pdf[ocr-export]\""
        )

    output_path_str = str(output_path)
    pages_to_extract: List["Page"] = []

    # Determine the list of pages via duck typing (project types are only
    # imported under TYPE_CHECKING, so isinstance checks are not available).
    if hasattr(source, "pages") and isinstance(source.pages, list):  # PDF or PageCollection
        if not source.pages:
            raise ValueError("Cannot save an empty collection/PDF.")
        pages_to_extract = source.pages
    elif hasattr(source, "page") and hasattr(source, "number"):  # Single Page object
        # A single page is only usable if it points back at a PDF with a path.
        if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
            pages_to_extract = [source]
        else:
            raise ValueError("Input Page object does not have a valid PDF reference with a path.")
    else:
        raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")

    # Every branch above either raised or produced a non-empty list, so
    # indexing the first page here is safe.
    first_page_pdf_path = None
    if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
        first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)

    if not first_page_pdf_path:
        raise ValueError(
            "Cannot save original pages: Source PDF path not found for the first page."
        )

    # Verify all pages come from the same PDF and collect their unique indices.
    page_indices_set: Set[int] = set()
    for page in pages_to_extract:
        page_pdf_path = getattr(getattr(page, "pdf", None), "path", None)
        if not page_pdf_path or page_pdf_path != first_page_pdf_path:
            raise ValueError(
                "Cannot save original pages: All pages must belong to the same source PDF document."
            )
        page_indices_set.add(page.index)  # 0-based index

    sorted_indices = sorted(page_indices_set)

    logger.info(
        "Extracting original pages %s from '%s' to '%s'",
        sorted_indices,
        first_page_pdf_path,
        output_path_str,
    )

    saved_page_count = 0
    try:
        # Both documents are context-managed so their handles are released
        # even on failure. The source must stay open until the target has been
        # saved, because the appended pages still reference it.
        with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc:
            with pikepdf.Pdf.new() as target_pikepdf_doc:
                source_page_count = len(source_pikepdf_doc.pages)

                for page_index in sorted_indices:
                    if 0 <= page_index < source_page_count:
                        target_pikepdf_doc.pages.append(source_pikepdf_doc.pages[page_index])
                    else:
                        logger.warning(
                            "Page index %s out of bounds for source PDF '%s'. Skipping.",
                            page_index,
                            first_page_pdf_path,
                        )

                saved_page_count = len(target_pikepdf_doc.pages)
                if saved_page_count:
                    target_pikepdf_doc.save(output_path_str)

    except pikepdf.PasswordError:
        logger.error("Failed to open password-protected source PDF: %s", first_page_pdf_path)
        # Surface a clear, library-agnostic error without the pikepdf traceback.
        raise RuntimeError(
            f"Source PDF '{first_page_pdf_path}' is password-protected."
        ) from None
    except Exception as e:
        logger.error(
            f"Failed to save original pages PDF to '{output_path_str}': {e}",
            exc_info=True,
        )
        # Re-raise as RuntimeError for consistent API error handling
        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e

    # Raised outside the try block so this validation failure is not caught by
    # the broad handler above and misreported as a save failure.
    if not saved_page_count:
        raise RuntimeError("No valid pages found to save from source PDF.")

    logger.info(
        "Successfully saved original pages PDF (%d pages) to: %s",
        saved_page_count,
        output_path_str,
    )
@@ -22,7 +22,7 @@ except ImportError:
22
22
  pikepdf = None # type: ignore
23
23
 
24
24
  try:
25
- from ocrmypdf.hocrtransform import HocrTransform
25
+ from natural_pdf.exporters.hocr import HocrTransform
26
26
  except ImportError:
27
27
  HocrTransform = None # type: ignore
28
28
 
@@ -310,7 +310,7 @@ def create_searchable_pdf(
310
310
  """
311
311
  Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
312
312
 
313
- Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
313
+ Relies on pikepdf for saving the PDF.
314
314
 
315
315
  Args:
316
316
  source: The natural_pdf.PDF, PageCollection, or Page object
@@ -323,7 +323,7 @@ def create_searchable_pdf(
323
323
  # This should ideally not happen if dependencies are in main install,
324
324
  # but serves as a safeguard during development or if install is broken.
325
325
  raise ImportError(
326
- "Required dependencies (Pillow, pikepdf, ocrmypdf) are missing. "
326
+ "Required dependencies (Pillow, pikepdf) are missing. "
327
327
  "Please ensure natural-pdf is installed correctly with all dependencies."
328
328
  )
329
329
  # --- End Safeguard Check ---
@@ -59,7 +59,7 @@ class SuryaOCREngine(OCREngine):
59
59
 
60
60
  # Store languages instance variable during initialization to use here
61
61
  langs = (
62
- [[lang] for lang in self._langs]
62
+ [self._langs] # Send all languages together in one list per image
63
63
  if hasattr(self, "_langs")
64
64
  else [[self.DEFAULT_LANGUAGES[0]]]
65
65
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.9
3
+ Version: 0.1.11
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -91,7 +91,6 @@ Requires-Dist: torchvision; extra == "core-ml"
91
91
  Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
92
92
  Requires-Dist: huggingface_hub; extra == "core-ml"
93
93
  Provides-Extra: ocr-export
94
- Requires-Dist: ocrmypdf; extra == "ocr-export"
95
94
  Requires-Dist: pikepdf; extra == "ocr-export"
96
95
  Provides-Extra: export-extras
97
96
  Requires-Dist: jupytext; extra == "export-extras"
@@ -1,4 +1,4 @@
1
- natural_pdf/__init__.py,sha256=LBrQcFOGooaUsTSAk6zrPCQqu0IM-ClvJLasexEk64k,2728
1
+ natural_pdf/__init__.py,sha256=HIYdzHD7QBRssIseUX_oDJYvVJs646tNSYhKHqk0HeA,2495
2
2
  natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
3
3
  natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
4
4
  natural_pdf/analyzers/text_structure.py,sha256=Uhxc7aYB1jddkiwRTEPOg_Te2HfOua4z_OtgP1m3org,12794
@@ -15,28 +15,31 @@ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh
15
15
  natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
16
16
  natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
17
17
  natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
18
- natural_pdf/classification/manager.py,sha256=CvZd3-lN3fEhcaLXr8gYfrdBGoBgzkIeE14EqjrOAzU,17730
19
- natural_pdf/classification/mixin.py,sha256=llari9AIMNGy9sTaR7y1g5vtVNUwuCutbKnjbJRMYx4,6903
20
- natural_pdf/classification/results.py,sha256=Ia26BQxObL5sURpFmg66bfjFPCxjcO_jeP2G-S9wRgo,2289
21
- natural_pdf/collections/mixins.py,sha256=ufetdzHmd2_WLGBPW4eBQrzZTFpjXyVsVwBquIE47zw,4476
22
- natural_pdf/collections/pdf_collection.py,sha256=JnsJugE-vxYsW1ZJWmMlVv_jbyG37X-9rZK1RQyKWAY,30020
18
+ natural_pdf/classification/manager.py,sha256=RxJch8xVu8Me6_T2Kh7ZqUNaAKlXvfyCZD0hRc4Hk6w,17929
19
+ natural_pdf/classification/mixin.py,sha256=hhX9qWPShpOq_-mgoEq0GUWnutBnNMo3YdUlxwyNWMA,6781
20
+ natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
21
+ natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
22
+ natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg_G0goBHlrFJI,32018
23
23
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
24
24
  natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
25
25
  natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
26
- natural_pdf/core/page.py,sha256=icJLu6jRbkD3iOE8r60XPkQZ8FN3ZcKo5TT5MVGkGl0,105122
27
- natural_pdf/core/pdf.py,sha256=Vw-L5149wO6RSfvb9sAfPDLqd9M1TdYoPHNEePh65y8,61201
26
+ natural_pdf/core/page.py,sha256=S7Uj3DVksX7o3Qg7hpNulYuxHmqzSJIJ0yXVytPhFqY,105158
27
+ natural_pdf/core/pdf.py,sha256=qpZx5LXZ5Oq1fZ4mzDXBDOIcsApRinMEH0CjVY6jNvM,69273
28
28
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
29
29
  natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
30
- natural_pdf/elements/collections.py,sha256=YRaJxNbJrBjgwzwuSoOtEotOKh6RaTi7NRCqKiGl514,92955
30
+ natural_pdf/elements/collections.py,sha256=HsNt_4x-yqNI_bDGeNEiih3hotAfrbppmp_O7rq9HGs,107141
31
31
  natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
32
32
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
33
- natural_pdf/elements/region.py,sha256=LfyB_9DCw5Tzn_G9xsjFz2FfKBOHRqGIND4DQWoA7KM,97324
33
+ natural_pdf/elements/region.py,sha256=XYWUym7hgkzMMfmXw0hEz_iGJ6Sdyf6DRz6XjgMVwN0,97250
34
34
  natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
35
35
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
36
36
  natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
37
37
  natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
38
+ natural_pdf/exporters/hocr.py,sha256=wilmVyBgmBNp2ZEdbKijk9ag8E1AGMMl6rBtsAOzp-Y,20201
39
+ natural_pdf/exporters/hocr_font.py,sha256=e9QdxeCExxpY_dpzwGxFlT_3TcvNejw9qpkNc1NVa4Y,4612
40
+ natural_pdf/exporters/original_pdf.py,sha256=vZeqBsCZh3JRRWwtfHzM78fxvhKkAI4QK3LLkeXidUM,5082
38
41
  natural_pdf/exporters/paddleocr.py,sha256=BYpdtJI7S8rBkI2dkRESx2epVAZOTfzqU-rjJnUQ5jQ,16249
39
- natural_pdf/exporters/searchable_pdf.py,sha256=qsaPsnbOOaZHA_aplfZbwQnBoK9KghWm-wzbyRRomeY,16859
42
+ natural_pdf/exporters/searchable_pdf.py,sha256=-sbjjM4oV2YCiJaVKcUIPXjAs94ouXSyOSlAzv_qM7I,16815
40
43
  natural_pdf/extraction/manager.py,sha256=mUBbfgLG5Pl31wmajXwyipdEJb_dZ5I-y8GnWw7IzGo,4969
41
44
  natural_pdf/extraction/mixin.py,sha256=eKbr70VibpbtfjvCE80lTFuYHzq_BoVtOHjznL_GMRA,11719
42
45
  natural_pdf/extraction/result.py,sha256=c1vLguCR6l95cvg-BJJmZvL_MPg2McJaczge55bKZMg,934
@@ -45,7 +48,7 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
45
48
  natural_pdf/ocr/engine_doctr.py,sha256=519WpvSHgwP6Hv24tci_YHFX7XPlaxOnlREN_YG-Yys,16331
46
49
  natural_pdf/ocr/engine_easyocr.py,sha256=9TbxJjmhWFrzM8mcNnZjoRtIDr6gwpuwKm4-Zfub2-8,9281
47
50
  natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
48
- natural_pdf/ocr/engine_surya.py,sha256=iySjG-Dahgh0cLICfbMtOcwUpRFcZjo-5Ed5Zwz-o5Y,4805
51
+ natural_pdf/ocr/engine_surya.py,sha256=CQHpPecCYsJsr7pEvEyubAf5FJFs7vFHAm_0cGGr-A4,4839
49
52
  natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
50
53
  natural_pdf/ocr/ocr_manager.py,sha256=f0q68ynGYVPkF4D3WnufxmHWD5R1jW5Z_1czTEi9JVU,13931
51
54
  natural_pdf/ocr/ocr_options.py,sha256=ZvtnFn1kPkFEoWveQ13uy6B-ofquP0gHEi4tBHrjqCE,6438
@@ -73,8 +76,8 @@ natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7t
73
76
  natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
74
77
  natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
75
78
  natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
76
- natural_pdf-0.1.9.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
77
- natural_pdf-0.1.9.dist-info/METADATA,sha256=10GX2Qesem-n8sPem4lls2EEQen4KyJVdcmQf1mt9mI,7400
78
- natural_pdf-0.1.9.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
79
- natural_pdf-0.1.9.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
80
- natural_pdf-0.1.9.dist-info/RECORD,,
79
+ natural_pdf-0.1.11.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
80
+ natural_pdf-0.1.11.dist-info/METADATA,sha256=HBEH41sOW2opbRoN_yUq8iw3jB2fvdOXEDj0ZGfmw8g,7354
81
+ natural_pdf-0.1.11.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
82
+ natural_pdf-0.1.11.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
83
+ natural_pdf-0.1.11.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.1.0)
2
+ Generator: setuptools (80.3.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5