PyPI - natural-pdf - Versions diffs - 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

natural_pdf/__init__.py +24 -40
natural_pdf/core/page.py +17 -17
natural_pdf/core/pdf.py +130 -12
natural_pdf/elements/collections.py +229 -29
natural_pdf/elements/region.py +2 -3
natural_pdf/exporters/hocr.py +540 -0
natural_pdf/exporters/hocr_font.py +142 -0
natural_pdf/exporters/original_pdf.py +130 -0
natural_pdf/exporters/searchable_pdf.py +3 -3
natural_pdf/ocr/engine_surya.py +1 -1
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +15 -12
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0

natural_pdf/exporters/original_pdf.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""
+Module for exporting original PDF pages without modification.
+"""
+import logging
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Set, Union
+# Lazy import for optional dependency
+try:
+    import pikepdf
+except ImportError:
+    pikepdf = None
+if TYPE_CHECKING:
+    from natural_pdf.core.page import Page
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.elements.collections import PageCollection
+logger = logging.getLogger(__name__)
+def create_original_pdf(
+    source: Union["Page", "PageCollection", "PDF"], output_path: Union[str, Path]
+):
+    """
+    Creates a new PDF file containing only the original, unmodified pages
+    specified by the source object.
+    Requires 'pikepdf'. Install with: pip install "natural-pdf[ocr-export]"
+    Args:
+        source: The Page, PageCollection, or PDF object indicating which pages to include.
+        output_path: The path to save the resulting PDF file.
+    Raises:
+        ImportError: If 'pikepdf' is not installed.
+        ValueError: If the source object is empty, pages are from different PDFs,
+                    or the source PDF path cannot be determined.
+        RuntimeError: If pikepdf fails to open the source or save the output.
+        pikepdf.PasswordError: If the source PDF is password-protected.
+    """
+    if pikepdf is None:
+        raise ImportError(
+            "Saving original PDF pages requires 'pikepdf'. "
+            "Install with: pip install \"natural-pdf[ocr-export]\""
+        )
+    output_path_str = str(output_path)
+    pages_to_extract: List["Page"] = []
+    # Determine the list of pages and the source PDF path
+    if hasattr(source, "pages") and isinstance(source.pages, list):  # PDF or PageCollection
+        if not source.pages:
+            raise ValueError("Cannot save an empty collection/PDF.")
+        pages_to_extract = source.pages
+    elif hasattr(source, "page") and hasattr(source, "number"): # Single Page object
+         # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
+         if hasattr(source, 'pdf') and source.pdf and hasattr(source.pdf, 'path'):
+             pages_to_extract = [source]
+         else:
+            raise ValueError("Input Page object does not have a valid PDF reference with a path.")
+    else:
+        raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")
+    if not pages_to_extract:
+         raise ValueError("No valid pages found in the source object.")
+    # Verify all pages come from the same PDF and get path
+    first_page_pdf_path = None
+    if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
+        first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)
+    if not first_page_pdf_path:
+        raise ValueError(
+            "Cannot save original pages: Source PDF path not found for the first page."
+        )
+    page_indices_set: Set[int] = set()
+    for page in pages_to_extract:
+        page_pdf_path = getattr(getattr(page, "pdf", None), "path", None)
+        if not page_pdf_path or page_pdf_path != first_page_pdf_path:
+            raise ValueError(
+                "Cannot save original pages: All pages must belong to the same source PDF document."
+            )
+        page_indices_set.add(page.index)  # 0-based index
+    sorted_indices = sorted(list(page_indices_set))
+    logger.info(
+        f"Extracting original pages {sorted_indices} from '{first_page_pdf_path}' to '{output_path_str}'"
+    )
+    try:
+        with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc:
+            target_pikepdf_doc = pikepdf.Pdf.new()
+            for page_index in sorted_indices:
+                if 0 <= page_index < len(source_pikepdf_doc.pages):
+                    # This correctly appends the pikepdf.Page object
+                    target_pikepdf_doc.pages.append(source_pikepdf_doc.pages[page_index])
+                else:
+                    logger.warning(
+                        f"Page index {page_index} out of bounds for source PDF '{first_page_pdf_path}'. Skipping."
+                    )
+            if not target_pikepdf_doc.pages:
+                raise RuntimeError(f"No valid pages found to save from source PDF.")
+            target_pikepdf_doc.save(output_path_str)
+            logger.info(
+                f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
+            )
+    except pikepdf.PasswordError:
+        logger.error(
+            f"Failed to open password-protected source PDF: {first_page_pdf_path}"
+        )
+        raise RuntimeError(
+            f"Source PDF '{first_page_pdf_path}' is password-protected."
+        ) from None # Raise specific error without chaining the generic Exception
+    except Exception as e:
+        logger.error(
+            f"Failed to save original pages PDF to '{output_path_str}': {e}",
+            exc_info=True,
+        )
+        # Re-raise as RuntimeError for consistent API error handling
+        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e

natural_pdf/exporters/searchable_pdf.py CHANGED Viewed

@@ -22,7 +22,7 @@ except ImportError:
     pikepdf = None  # type: ignore
 try:
-    from ocrmypdf.hocrtransform import HocrTransform
+    from natural_pdf.exporters.hocr import HocrTransform
 except ImportError:
     HocrTransform = None  # type: ignore
@@ -310,7 +310,7 @@ def create_searchable_pdf(
     """
     Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
-    Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
+    Relies on pikepdf for saving the PDF.
     Args:
         source: The natural_pdf.PDF, PageCollection, or Page object
@@ -323,7 +323,7 @@ def create_searchable_pdf(
         # This should ideally not happen if dependencies are in main install,
         # but serves as a safeguard during development or if install is broken.
         raise ImportError(
-            "Required dependencies (Pillow, pikepdf, ocrmypdf) are missing. "
+            "Required dependencies (Pillow, pikepdf) are missing. "
             "Please ensure natural-pdf is installed correctly with all dependencies."
         )
     # --- End Safeguard Check ---

natural_pdf/ocr/engine_surya.py CHANGED Viewed

@@ -59,7 +59,7 @@ class SuryaOCREngine(OCREngine):
         # Store languages instance variable during initialization to use here
         langs = (
-            [[lang] for lang in self._langs]
+            [self._langs]  # Send all languages together in one list per image
             if hasattr(self, "_langs")
             else [[self.DEFAULT_LANGUAGES[0]]]
         )

{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.10
+Version: 0.1.11
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -91,7 +91,6 @@ Requires-Dist: torchvision; extra == "core-ml"
 Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
 Requires-Dist: huggingface_hub; extra == "core-ml"
 Provides-Extra: ocr-export
-Requires-Dist: ocrmypdf; extra == "ocr-export"
 Requires-Dist: pikepdf; extra == "ocr-export"
 Provides-Extra: export-extras
 Requires-Dist: jupytext; extra == "export-extras"

{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-natural_pdf/__init__.py,sha256=LBrQcFOGooaUsTSAk6zrPCQqu0IM-ClvJLasexEk64k,2728
+natural_pdf/__init__.py,sha256=HIYdzHD7QBRssIseUX_oDJYvVJs646tNSYhKHqk0HeA,2495
 natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
 natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
 natural_pdf/analyzers/text_structure.py,sha256=Uhxc7aYB1jddkiwRTEPOg_Te2HfOua4z_OtgP1m3org,12794
@@ -23,20 +23,23 @@ natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
 natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
-natural_pdf/core/page.py,sha256=icJLu6jRbkD3iOE8r60XPkQZ8FN3ZcKo5TT5MVGkGl0,105122
-natural_pdf/core/pdf.py,sha256=gOvLumJZaHXdDwpxbX9HcC_Rea4HaYMemBdYg5GX7gQ,63837
+natural_pdf/core/page.py,sha256=S7Uj3DVksX7o3Qg7hpNulYuxHmqzSJIJ0yXVytPhFqY,105158
+natural_pdf/core/pdf.py,sha256=qpZx5LXZ5Oq1fZ4mzDXBDOIcsApRinMEH0CjVY6jNvM,69273
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
-natural_pdf/elements/collections.py,sha256=AN0WrrQYfCmcRS0-PHP4RQHxxdpcWnDuH2cWnmqtDE0,97184
+natural_pdf/elements/collections.py,sha256=HsNt_4x-yqNI_bDGeNEiih3hotAfrbppmp_O7rq9HGs,107141
 natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
-natural_pdf/elements/region.py,sha256=LfyB_9DCw5Tzn_G9xsjFz2FfKBOHRqGIND4DQWoA7KM,97324
+natural_pdf/elements/region.py,sha256=XYWUym7hgkzMMfmXw0hEz_iGJ6Sdyf6DRz6XjgMVwN0,97250
 natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
 natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
+natural_pdf/exporters/hocr.py,sha256=wilmVyBgmBNp2ZEdbKijk9ag8E1AGMMl6rBtsAOzp-Y,20201
+natural_pdf/exporters/hocr_font.py,sha256=e9QdxeCExxpY_dpzwGxFlT_3TcvNejw9qpkNc1NVa4Y,4612
+natural_pdf/exporters/original_pdf.py,sha256=vZeqBsCZh3JRRWwtfHzM78fxvhKkAI4QK3LLkeXidUM,5082
 natural_pdf/exporters/paddleocr.py,sha256=BYpdtJI7S8rBkI2dkRESx2epVAZOTfzqU-rjJnUQ5jQ,16249
-natural_pdf/exporters/searchable_pdf.py,sha256=qsaPsnbOOaZHA_aplfZbwQnBoK9KghWm-wzbyRRomeY,16859
+natural_pdf/exporters/searchable_pdf.py,sha256=-sbjjM4oV2YCiJaVKcUIPXjAs94ouXSyOSlAzv_qM7I,16815
 natural_pdf/extraction/manager.py,sha256=mUBbfgLG5Pl31wmajXwyipdEJb_dZ5I-y8GnWw7IzGo,4969
 natural_pdf/extraction/mixin.py,sha256=eKbr70VibpbtfjvCE80lTFuYHzq_BoVtOHjznL_GMRA,11719
 natural_pdf/extraction/result.py,sha256=c1vLguCR6l95cvg-BJJmZvL_MPg2McJaczge55bKZMg,934
@@ -45,7 +48,7 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
 natural_pdf/ocr/engine_doctr.py,sha256=519WpvSHgwP6Hv24tci_YHFX7XPlaxOnlREN_YG-Yys,16331
 natural_pdf/ocr/engine_easyocr.py,sha256=9TbxJjmhWFrzM8mcNnZjoRtIDr6gwpuwKm4-Zfub2-8,9281
 natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
-natural_pdf/ocr/engine_surya.py,sha256=iySjG-Dahgh0cLICfbMtOcwUpRFcZjo-5Ed5Zwz-o5Y,4805
+natural_pdf/ocr/engine_surya.py,sha256=CQHpPecCYsJsr7pEvEyubAf5FJFs7vFHAm_0cGGr-A4,4839
 natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
 natural_pdf/ocr/ocr_manager.py,sha256=f0q68ynGYVPkF4D3WnufxmHWD5R1jW5Z_1czTEi9JVU,13931
 natural_pdf/ocr/ocr_options.py,sha256=ZvtnFn1kPkFEoWveQ13uy6B-ofquP0gHEi4tBHrjqCE,6438
@@ -73,8 +76,8 @@ natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7t
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
 natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
-natural_pdf-0.1.10.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-0.1.10.dist-info/METADATA,sha256=gjUsfmnbqrdiHcaH6L1qiw6VX4MBlWjVj5HqlDnhuQY,7401
-natural_pdf-0.1.10.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
-natural_pdf-0.1.10.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
-natural_pdf-0.1.10.dist-info/RECORD,,
+natural_pdf-0.1.11.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.11.dist-info/METADATA,sha256=HBEH41sOW2opbRoN_yUq8iw3jB2fvdOXEDj0ZGfmw8g,7354
+natural_pdf-0.1.11.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+natural_pdf-0.1.11.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.11.dist-info/RECORD,,

{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.1.0)
+Generator: setuptools (80.3.1)
 Root-Is-Purelib: true
 Tag: py3-none-any

{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl