PyPI - natural-pdf - Versions diffs - 25.3.16__py3-none-any.whl → 25.3.16.2__py3-none-any.whl - Mend

natural-pdf 25.3.16__py3-none-any.whl → 25.3.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

examples/url_pdf_example.py ADDED Viewed

@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating loading a PDF from a URL.
+"""
+import sys
+import os
+import argparse
+# Add the parent directory to the path so we can import the natural_pdf package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+def main():
+    parser = argparse.ArgumentParser(description="Example of loading a PDF from a URL")
+    parser.add_argument('url', nargs='?',
+                      default="https://arxiv.org/pdf/2103.14749.pdf",
+                      help="URL to a PDF document (default: an arXiv paper)")
+    args = parser.parse_args()
+    print(f"Loading PDF from URL: {args.url}")
+    # Open the PDF from URL
+    with PDF(args.url) as pdf:
+        # Display basic document info
+        print(f"Document loaded successfully: {len(pdf)} pages")
+        # Extract text from the first page
+        if len(pdf) > 0:
+            page = pdf.pages[0]
+            # Get the title (usually large text on the first page)
+            title = page.find_all('text[size>=12]')
+            if title:
+                print("\nTitle candidates:")
+                for i, t in enumerate(title[:3], 1):  # Show top 3 candidates
+                    print(f"{i}. {t.text}")
+            # Extract the first 200 characters of text
+            text = page.extract_text()
+            preview = text[:200] + "..." if len(text) > 200 else text
+            print(f"\nText preview:\n{preview}")
+if __name__ == "__main__":
+    main()

natural_pdf/core/pdf.py CHANGED Viewed

@@ -1,5 +1,9 @@
 import pdfplumber
 import logging
+import tempfile
+import os
+import re
+import urllib.request
 from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
 from natural_pdf.core.page import Page
@@ -28,7 +32,7 @@ class PDF:
     with improved selection, navigation, and extraction capabilities.
     """
-    def __init__(self, path: str, reading_order: bool = True,
+    def __init__(self, path_or_url: str, reading_order: bool = True,
                  ocr: Optional[Union[bool, str, List, Dict]] = None,
                  ocr_engine: Optional[Union[str, Any]] = None,
                  font_attrs: Optional[List[str]] = None,
@@ -37,7 +41,7 @@ class PDF:
         Initialize the enhanced PDF object.
         Args:
-            path: Path to the PDF file
+            path_or_url: Path to the PDF file or a URL to a PDF
             reading_order: Whether to use natural reading order
             ocr: OCR configuration:
                  - None or False: OCR disabled
@@ -58,6 +62,40 @@ class PDF:
                        True: Spaces are part of words, better for multi-word searching
                        False: Break text at spaces, each word is separate (legacy behavior)
         """
+        # Check if the input is a URL
+        is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
+        # Initialize path-related attributes
+        self._original_path = path_or_url
+        self._temp_file = None
+        if is_url:
+            logger.info(f"Downloading PDF from URL: {path_or_url}")
+            try:
+                # Create a temporary file to store the downloaded PDF
+                self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+                # Download the PDF
+                with urllib.request.urlopen(path_or_url) as response:
+                    self._temp_file.write(response.read())
+                    self._temp_file.flush()
+                    self._temp_file.close()
+                # Use the temporary file path
+                path = self._temp_file.name
+                logger.info(f"PDF downloaded to temporary file: {path}")
+            except Exception as e:
+                if self._temp_file and hasattr(self._temp_file, 'name'):
+                    try:
+                        os.unlink(self._temp_file.name)
+                    except:
+                        pass
+                logger.error(f"Failed to download PDF from URL: {e}")
+                raise ValueError(f"Failed to download PDF from URL: {e}")
+        else:
+            # Use the provided path directly
+            path = path_or_url
         logger.info(f"Initializing PDF from {path}")
         logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
@@ -558,10 +596,21 @@ class PDF:
         return self.pages[key]
     def close(self):
-        """Close the underlying PDF file."""
+        """Close the underlying PDF file and clean up any temporary files."""
         if hasattr(self, '_pdf') and self._pdf is not None:
             self._pdf.close()
             self._pdf = None
+        # Clean up temporary file if it exists
+        if hasattr(self, '_temp_file') and self._temp_file is not None:
+            try:
+                if os.path.exists(self._temp_file.name):
+                    os.unlink(self._temp_file.name)
+                    logger.debug(f"Removed temporary PDF file: {self._temp_file.name}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary PDF file: {e}")
+            finally:
+                self._temp_file = None
     def __enter__(self):
         """Context manager entry."""

{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: natural-pdf
-Version: 25.3.16
+Version: 25.3.16.2
 Summary: A more intuitive interface for working with PDFs
 Home-page: https://github.com/jsoma/natural-pdf
 Author: Jonathan Soma
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
 Requires-Dist: Pillow>=8.0.0
 Requires-Dist: colour>=0.1.5
 Requires-Dist: numpy>=1.20.0
+Requires-Dist: urllib3>=1.26.0
 Requires-Dist: doclayout_yolo>=0.0.3
 Requires-Dist: torch>=2.0.0
 Requires-Dist: torchvision>=0.15.0
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
 Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
-[Complete documentation here](https://jsoma.github.io/natural-pdf)
+- [Complete documentation here](https://jsoma.github.io/natural-pdf)
+- [Live demo here](https://colab.research.google.com/github/jsoma/)
 ## Features
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
 ```python
 from natural_pdf import PDF
-# Open a PDF
+# Open a local PDF
 pdf = PDF('document.pdf')
+# Or open a PDF from a URL
+pdf = PDF('https://example.com/document.pdf')
 # Get the first page
 page = pdf.pages[0]

{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/RECORD RENAMED Viewed

@@ -70,6 +70,7 @@ examples/text_style_example.py,sha256=pgLb9rbBrERj18icRsrSIgs0ndKuKcBWycwVPAAynr
 examples/tiny-text.py,sha256=X_SDThngugL8q-XvcKlRe8iNUqouxT3rYCDFqK2MRyo,1693
 examples/until_boundaries_example.py,sha256=rN8YrwDM7rBb1PuxYkpszYVWT-tNMSD-aKF-2zLBoHo,6461
 examples/until_example.py,sha256=4b3GyQxaayGfODiXaCgxcq_92gUgpqaJ0_lMWQnYD_0,4559
+examples/url_pdf_example.py,sha256=WjZMlKyIlcXhJNdSe9uMmWdTHixnRfeacnp64gQkNAo,1545
 examples/very_basics.py,sha256=cNLnr1z701ri0LgE1cVM4gfMMND0C9UnvvWybnwum6g,418
 natural_pdf/__init__.py,sha256=kKHL7SWzk0_ydDDX12X5W3s9-vEKgVYOBubXzp_SCdM,1784
 natural_pdf/analyzers/__init__.py,sha256=XhxlbwiqbGpeIlS88c4P2t7-MLP98U3CcIr-3nGp488,188
@@ -77,7 +78,7 @@ natural_pdf/analyzers/document_layout.py,sha256=VHZTiiEZByhYUBHqOqmNIgj59zhc3ZNK
 natural_pdf/analyzers/text_structure.py,sha256=ZmUsBMNBENjEYcABHqwziDXIHyCVYdUaEyAW0Ohagzc,5208
 natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
 natural_pdf/core/page.py,sha256=1DqF6mvpFKqsmRnFFrByV6C-MqGJ2MWyOQIwWiCs8PA,106040
-natural_pdf/core/pdf.py,sha256=zeLiOJ8XZ8jPaCR9mhurCiQ6kzesN6lcTXlBjTowi_4,23363
+natural_pdf/core/pdf.py,sha256=sGhn0OaadU74Ozoa9QA_HsAZikUKOncyCSME0mImdLo,25475
 natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
 natural_pdf/elements/base.py,sha256=QJmhk6sYDKErLGrQ5VYhloytuntufxiP6wTzGfZ__9w,22754
 natural_pdf/elements/collections.py,sha256=vFFeMS0XiBL3p9PyNmwXndKMlPhlwp8os3xKLveN_8k,31558
@@ -102,8 +103,8 @@ natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6T
 natural_pdf/utils/visualization.py,sha256=Dujxp5xKbEap6UpoVEpArpkHChJLa_Je7FGz2S3Iwvw,5403
 tests/__init__.py,sha256=34RJiJqy8uDxasGCbzXIaJlHQklHprscPcA4xp2s97g,30
 tests/test_pdf.py,sha256=Ud-DI-GHAvnSJGMJewM_EwHtI_UgWTi7Gn9uIwQcpfE,1001
-natural_pdf-25.3.16.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-25.3.16.dist-info/METADATA,sha256=NzJ4A929jjoBvej1FByUxC9NhI5qGqlbIFos4gElEqQ,8365
-natural_pdf-25.3.16.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-natural_pdf-25.3.16.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
-natural_pdf-25.3.16.dist-info/RECORD,,
+natural_pdf-25.3.16.2.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-25.3.16.2.dist-info/METADATA,sha256=3pd3i8n5ZtwJ2oGLSkJJJ61lXKaK93ALGuY9HvENhLg,8548
+natural_pdf-25.3.16.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+natural_pdf-25.3.16.2.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
+natural_pdf-25.3.16.2.dist-info/RECORD,,

{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 25.3.16__py3-none-any.whl → 25.3.16.2__py3-none-any.whl

natural-pdf 25.3.16__py3-none-any.whl → 25.3.16.2__py3-none-any.whl