natural-pdf 25.3.16__py3-none-any.whl → 25.3.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example demonstrating loading a PDF from a URL.
4
+ """
5
+ import sys
6
+ import os
7
+ import argparse
8
+
9
+ # Add the parent directory to the path so we can import the natural_pdf package
10
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
11
+
12
+ from natural_pdf import PDF
13
+
14
def main():
    """Load a PDF from a URL and print title candidates and a text preview.

    The URL is taken from the optional positional command-line argument;
    when it is omitted, a default arXiv paper URL is used.
    """
    cli = argparse.ArgumentParser(description="Example of loading a PDF from a URL")
    cli.add_argument(
        'url',
        nargs='?',
        default="https://arxiv.org/pdf/2103.14749.pdf",
        help="URL to a PDF document (default: an arXiv paper)",
    )
    args = cli.parse_args()

    print(f"Loading PDF from URL: {args.url}")

    # The URL is handed to PDF exactly as a local path would be.
    with PDF(args.url) as pdf:
        print(f"Document loaded successfully: {len(pdf)} pages")

        # Nothing further to show for a document without pages.
        if len(pdf) == 0:
            return

        first_page = pdf.pages[0]

        # Text of size 12 or larger is treated as a title candidate.
        title_candidates = first_page.find_all('text[size>=12]')
        if title_candidates:
            print("\nTitle candidates:")
            # Only the first three matches are listed, numbered from 1.
            for i, t in enumerate(title_candidates[:3], 1):
                print(f"{i}. {t.text}")

        # Preview at most the first 200 characters; mark truncation with "...".
        page_text = first_page.extract_text()
        if len(page_text) > 200:
            preview = page_text[:200] + "..."
        else:
            preview = page_text
        print(f"\nText preview:\n{preview}")


if __name__ == "__main__":
    main()
natural_pdf/core/pdf.py CHANGED
@@ -1,5 +1,9 @@
1
1
  import pdfplumber
2
2
  import logging
3
+ import tempfile
4
+ import os
5
+ import re
6
+ import urllib.request
3
7
  from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
4
8
 
5
9
  from natural_pdf.core.page import Page
@@ -28,7 +32,7 @@ class PDF:
28
32
  with improved selection, navigation, and extraction capabilities.
29
33
  """
30
34
 
31
- def __init__(self, path: str, reading_order: bool = True,
35
+ def __init__(self, path_or_url: str, reading_order: bool = True,
32
36
  ocr: Optional[Union[bool, str, List, Dict]] = None,
33
37
  ocr_engine: Optional[Union[str, Any]] = None,
34
38
  font_attrs: Optional[List[str]] = None,
@@ -37,7 +41,7 @@ class PDF:
37
41
  Initialize the enhanced PDF object.
38
42
 
39
43
  Args:
40
- path: Path to the PDF file
44
+ path_or_url: Path to the PDF file or a URL to a PDF
41
45
  reading_order: Whether to use natural reading order
42
46
  ocr: OCR configuration:
43
47
  - None or False: OCR disabled
@@ -58,6 +62,40 @@ class PDF:
58
62
  True: Spaces are part of words, better for multi-word searching
59
63
  False: Break text at spaces, each word is separate (legacy behavior)
60
64
  """
65
+ # Check if the input is a URL
66
+ is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
67
+
68
+ # Initialize path-related attributes
69
+ self._original_path = path_or_url
70
+ self._temp_file = None
71
+
72
+ if is_url:
73
+ logger.info(f"Downloading PDF from URL: {path_or_url}")
74
+ try:
75
+ # Create a temporary file to store the downloaded PDF
76
+ self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
77
+
78
+ # Download the PDF
79
+ with urllib.request.urlopen(path_or_url) as response:
80
+ self._temp_file.write(response.read())
81
+ self._temp_file.flush()
82
+ self._temp_file.close()
83
+
84
+ # Use the temporary file path
85
+ path = self._temp_file.name
86
+ logger.info(f"PDF downloaded to temporary file: {path}")
87
+ except Exception as e:
88
+ if self._temp_file and hasattr(self._temp_file, 'name'):
89
+ try:
90
+ os.unlink(self._temp_file.name)
91
+ except:
92
+ pass
93
+ logger.error(f"Failed to download PDF from URL: {e}")
94
+ raise ValueError(f"Failed to download PDF from URL: {e}")
95
+ else:
96
+ # Use the provided path directly
97
+ path = path_or_url
98
+
61
99
  logger.info(f"Initializing PDF from {path}")
62
100
  logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
63
101
 
@@ -558,10 +596,21 @@ class PDF:
558
596
  return self.pages[key]
559
597
 
560
598
  def close(self):
561
- """Close the underlying PDF file."""
599
+ """Close the underlying PDF file and clean up any temporary files."""
562
600
  if hasattr(self, '_pdf') and self._pdf is not None:
563
601
  self._pdf.close()
564
602
  self._pdf = None
603
+
604
+ # Clean up temporary file if it exists
605
+ if hasattr(self, '_temp_file') and self._temp_file is not None:
606
+ try:
607
+ if os.path.exists(self._temp_file.name):
608
+ os.unlink(self._temp_file.name)
609
+ logger.debug(f"Removed temporary PDF file: {self._temp_file.name}")
610
+ except Exception as e:
611
+ logger.warning(f"Failed to clean up temporary PDF file: {e}")
612
+ finally:
613
+ self._temp_file = None
565
614
 
566
615
  def __enter__(self):
567
616
  """Context manager entry."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: natural-pdf
3
- Version: 25.3.16
3
+ Version: 25.3.16.2
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Home-page: https://github.com/jsoma/natural-pdf
6
6
  Author: Jonathan Soma
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
15
15
  Requires-Dist: Pillow>=8.0.0
16
16
  Requires-Dist: colour>=0.1.5
17
17
  Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: urllib3>=1.26.0
18
19
  Requires-Dist: doclayout_yolo>=0.0.3
19
20
  Requires-Dist: torch>=2.0.0
20
21
  Requires-Dist: torchvision>=0.15.0
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
58
59
 
59
60
  Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
60
61
 
61
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
62
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
63
+ - [Live demo here](https://colab.research.google.com/github/jsoma/)
62
64
 
63
65
  ## Features
64
66
 
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
96
98
  ```python
97
99
  from natural_pdf import PDF
98
100
 
99
- # Open a PDF
101
+ # Open a local PDF
100
102
  pdf = PDF('document.pdf')
101
103
 
104
+ # Or open a PDF from a URL
105
+ pdf = PDF('https://example.com/document.pdf')
106
+
102
107
  # Get the first page
103
108
  page = pdf.pages[0]
104
109
 
@@ -70,6 +70,7 @@ examples/text_style_example.py,sha256=pgLb9rbBrERj18icRsrSIgs0ndKuKcBWycwVPAAynr
70
70
  examples/tiny-text.py,sha256=X_SDThngugL8q-XvcKlRe8iNUqouxT3rYCDFqK2MRyo,1693
71
71
  examples/until_boundaries_example.py,sha256=rN8YrwDM7rBb1PuxYkpszYVWT-tNMSD-aKF-2zLBoHo,6461
72
72
  examples/until_example.py,sha256=4b3GyQxaayGfODiXaCgxcq_92gUgpqaJ0_lMWQnYD_0,4559
73
+ examples/url_pdf_example.py,sha256=WjZMlKyIlcXhJNdSe9uMmWdTHixnRfeacnp64gQkNAo,1545
73
74
  examples/very_basics.py,sha256=cNLnr1z701ri0LgE1cVM4gfMMND0C9UnvvWybnwum6g,418
74
75
  natural_pdf/__init__.py,sha256=kKHL7SWzk0_ydDDX12X5W3s9-vEKgVYOBubXzp_SCdM,1784
75
76
  natural_pdf/analyzers/__init__.py,sha256=XhxlbwiqbGpeIlS88c4P2t7-MLP98U3CcIr-3nGp488,188
@@ -77,7 +78,7 @@ natural_pdf/analyzers/document_layout.py,sha256=VHZTiiEZByhYUBHqOqmNIgj59zhc3ZNK
77
78
  natural_pdf/analyzers/text_structure.py,sha256=ZmUsBMNBENjEYcABHqwziDXIHyCVYdUaEyAW0Ohagzc,5208
78
79
  natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
79
80
  natural_pdf/core/page.py,sha256=1DqF6mvpFKqsmRnFFrByV6C-MqGJ2MWyOQIwWiCs8PA,106040
80
- natural_pdf/core/pdf.py,sha256=zeLiOJ8XZ8jPaCR9mhurCiQ6kzesN6lcTXlBjTowi_4,23363
81
+ natural_pdf/core/pdf.py,sha256=sGhn0OaadU74Ozoa9QA_HsAZikUKOncyCSME0mImdLo,25475
81
82
  natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
82
83
  natural_pdf/elements/base.py,sha256=QJmhk6sYDKErLGrQ5VYhloytuntufxiP6wTzGfZ__9w,22754
83
84
  natural_pdf/elements/collections.py,sha256=vFFeMS0XiBL3p9PyNmwXndKMlPhlwp8os3xKLveN_8k,31558
@@ -102,8 +103,8 @@ natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6T
102
103
  natural_pdf/utils/visualization.py,sha256=Dujxp5xKbEap6UpoVEpArpkHChJLa_Je7FGz2S3Iwvw,5403
103
104
  tests/__init__.py,sha256=34RJiJqy8uDxasGCbzXIaJlHQklHprscPcA4xp2s97g,30
104
105
  tests/test_pdf.py,sha256=Ud-DI-GHAvnSJGMJewM_EwHtI_UgWTi7Gn9uIwQcpfE,1001
105
- natural_pdf-25.3.16.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
106
- natural_pdf-25.3.16.dist-info/METADATA,sha256=NzJ4A929jjoBvej1FByUxC9NhI5qGqlbIFos4gElEqQ,8365
107
- natural_pdf-25.3.16.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
108
- natural_pdf-25.3.16.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
109
- natural_pdf-25.3.16.dist-info/RECORD,,
106
+ natural_pdf-25.3.16.2.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
107
+ natural_pdf-25.3.16.2.dist-info/METADATA,sha256=3pd3i8n5ZtwJ2oGLSkJJJ61lXKaK93ALGuY9HvENhLg,8548
108
+ natural_pdf-25.3.16.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
109
+ natural_pdf-25.3.16.2.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
110
+ natural_pdf-25.3.16.2.dist-info/RECORD,,