natural-pdf 25.3.16__py3-none-any.whl → 25.3.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/url_pdf_example.py +45 -0
- natural_pdf/core/pdf.py +52 -3
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/METADATA +8 -3
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/RECORD +7 -6
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/LICENSE +0 -0
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/WHEEL +0 -0
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.16.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Example demonstrating loading a PDF from a URL.
|
4
|
+
"""
|
5
|
+
import sys
|
6
|
+
import os
|
7
|
+
import argparse
|
8
|
+
|
9
|
+
# Add the parent directory to the path so we can import the natural_pdf package
|
10
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
def main():
    """Load a PDF from a URL given on the command line and print a short summary.

    The summary consists of the page count, up to three "title candidate"
    text elements (size >= 12) from the first page, and a preview of the
    first page's text limited to 200 characters.
    """
    arg_parser = argparse.ArgumentParser(description="Example of loading a PDF from a URL")
    arg_parser.add_argument(
        'url',
        nargs='?',
        default="https://arxiv.org/pdf/2103.14749.pdf",
        help="URL to a PDF document (default: an arXiv paper)",
    )
    args = arg_parser.parse_args()

    print(f"Loading PDF from URL: {args.url}")

    # PDF accepts a URL as well as a local path; the context manager closes it.
    with PDF(args.url) as pdf:
        page_count = len(pdf)
        print(f"Document loaded successfully: {page_count} pages")

        # Nothing more to show for an empty document.
        if page_count <= 0:
            return

        first_page = pdf.pages[0]

        # Larger text on the first page is usually the title.
        candidates = first_page.find_all('text[size>=12]')
        if candidates:
            print("\nTitle candidates:")
            # Only the first three candidates are shown, numbered from 1.
            for rank, element in enumerate(candidates[:3], 1):
                print(f"{rank}. {element.text}")

        # Truncate the page text to 200 characters, marking the cut with "...".
        page_text = first_page.extract_text()
        if len(page_text) > 200:
            preview = page_text[:200] + "..."
        else:
            preview = page_text
        print(f"\nText preview:\n{preview}")


if __name__ == "__main__":
    main()
|
natural_pdf/core/pdf.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
import pdfplumber
|
2
2
|
import logging
|
3
|
+
import tempfile
|
4
|
+
import os
|
5
|
+
import re
|
6
|
+
import urllib.request
|
3
7
|
from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
|
4
8
|
|
5
9
|
from natural_pdf.core.page import Page
|
@@ -28,7 +32,7 @@ class PDF:
|
|
28
32
|
with improved selection, navigation, and extraction capabilities.
|
29
33
|
"""
|
30
34
|
|
31
|
-
def __init__(self,
|
35
|
+
def __init__(self, path_or_url: str, reading_order: bool = True,
|
32
36
|
ocr: Optional[Union[bool, str, List, Dict]] = None,
|
33
37
|
ocr_engine: Optional[Union[str, Any]] = None,
|
34
38
|
font_attrs: Optional[List[str]] = None,
|
@@ -37,7 +41,7 @@ class PDF:
|
|
37
41
|
Initialize the enhanced PDF object.
|
38
42
|
|
39
43
|
Args:
|
40
|
-
|
44
|
+
path_or_url: Path to the PDF file or a URL to a PDF
|
41
45
|
reading_order: Whether to use natural reading order
|
42
46
|
ocr: OCR configuration:
|
43
47
|
- None or False: OCR disabled
|
@@ -58,6 +62,40 @@ class PDF:
|
|
58
62
|
True: Spaces are part of words, better for multi-word searching
|
59
63
|
False: Break text at spaces, each word is separate (legacy behavior)
|
60
64
|
"""
|
65
|
+
# Check if the input is a URL
|
66
|
+
is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
|
67
|
+
|
68
|
+
# Initialize path-related attributes
|
69
|
+
self._original_path = path_or_url
|
70
|
+
self._temp_file = None
|
71
|
+
|
72
|
+
if is_url:
|
73
|
+
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
74
|
+
try:
|
75
|
+
# Create a temporary file to store the downloaded PDF
|
76
|
+
self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
77
|
+
|
78
|
+
# Download the PDF
|
79
|
+
with urllib.request.urlopen(path_or_url) as response:
|
80
|
+
self._temp_file.write(response.read())
|
81
|
+
self._temp_file.flush()
|
82
|
+
self._temp_file.close()
|
83
|
+
|
84
|
+
# Use the temporary file path
|
85
|
+
path = self._temp_file.name
|
86
|
+
logger.info(f"PDF downloaded to temporary file: {path}")
|
87
|
+
except Exception as e:
|
88
|
+
if self._temp_file and hasattr(self._temp_file, 'name'):
|
89
|
+
try:
|
90
|
+
os.unlink(self._temp_file.name)
|
91
|
+
except:
|
92
|
+
pass
|
93
|
+
logger.error(f"Failed to download PDF from URL: {e}")
|
94
|
+
raise ValueError(f"Failed to download PDF from URL: {e}")
|
95
|
+
else:
|
96
|
+
# Use the provided path directly
|
97
|
+
path = path_or_url
|
98
|
+
|
61
99
|
logger.info(f"Initializing PDF from {path}")
|
62
100
|
logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
|
63
101
|
|
@@ -558,10 +596,21 @@ class PDF:
|
|
558
596
|
return self.pages[key]
|
559
597
|
|
560
598
|
def close(self):
    """Close the underlying PDF file and clean up any temporary files.

    Both steps are attempted even if the first one fails: when closing the
    underlying PDF raises, the temporary file (present when the PDF was
    downloaded from a URL) is still removed before the exception propagates.
    Calling this method more than once is harmless.

    Raises:
        Whatever ``self._pdf.close()`` raises. Failures while removing the
        temporary file are logged as warnings and never raised.
    """
    try:
        if getattr(self, '_pdf', None) is not None:
            try:
                self._pdf.close()
            finally:
                # Drop the reference even on failure so a retry does not
                # attempt to close the same handle twice.
                self._pdf = None
    finally:
        temp_file = getattr(self, '_temp_file', None)
        if temp_file is not None:
            try:
                # Unlink directly instead of checking existence first; the
                # check-then-delete sequence is racy.
                os.unlink(temp_file.name)
            except FileNotFoundError:
                # Already gone: nothing to clean up.
                pass
            except OSError as e:
                logger.warning(f"Failed to clean up temporary PDF file: {e}")
            else:
                logger.debug(f"Removed temporary PDF file: {temp_file.name}")
            finally:
                self._temp_file = None
|
565
614
|
|
566
615
|
def __enter__(self):
|
567
616
|
"""Context manager entry."""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 25.3.16
|
3
|
+
Version: 25.3.16.2
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Home-page: https://github.com/jsoma/natural-pdf
|
6
6
|
Author: Jonathan Soma
|
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
|
|
15
15
|
Requires-Dist: Pillow>=8.0.0
|
16
16
|
Requires-Dist: colour>=0.1.5
|
17
17
|
Requires-Dist: numpy>=1.20.0
|
18
|
+
Requires-Dist: urllib3>=1.26.0
|
18
19
|
Requires-Dist: doclayout_yolo>=0.0.3
|
19
20
|
Requires-Dist: torch>=2.0.0
|
20
21
|
Requires-Dist: torchvision>=0.15.0
|
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
58
59
|
|
59
60
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
60
61
|
|
61
|
-
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
62
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
63
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/)
|
62
64
|
|
63
65
|
## Features
|
64
66
|
|
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
|
|
96
98
|
```python
|
97
99
|
from natural_pdf import PDF
|
98
100
|
|
99
|
-
# Open a PDF
|
101
|
+
# Open a local PDF
|
100
102
|
pdf = PDF('document.pdf')
|
101
103
|
|
104
|
+
# Or open a PDF from a URL
|
105
|
+
pdf = PDF('https://example.com/document.pdf')
|
106
|
+
|
102
107
|
# Get the first page
|
103
108
|
page = pdf.pages[0]
|
104
109
|
|
@@ -70,6 +70,7 @@ examples/text_style_example.py,sha256=pgLb9rbBrERj18icRsrSIgs0ndKuKcBWycwVPAAynr
|
|
70
70
|
examples/tiny-text.py,sha256=X_SDThngugL8q-XvcKlRe8iNUqouxT3rYCDFqK2MRyo,1693
|
71
71
|
examples/until_boundaries_example.py,sha256=rN8YrwDM7rBb1PuxYkpszYVWT-tNMSD-aKF-2zLBoHo,6461
|
72
72
|
examples/until_example.py,sha256=4b3GyQxaayGfODiXaCgxcq_92gUgpqaJ0_lMWQnYD_0,4559
|
73
|
+
examples/url_pdf_example.py,sha256=WjZMlKyIlcXhJNdSe9uMmWdTHixnRfeacnp64gQkNAo,1545
|
73
74
|
examples/very_basics.py,sha256=cNLnr1z701ri0LgE1cVM4gfMMND0C9UnvvWybnwum6g,418
|
74
75
|
natural_pdf/__init__.py,sha256=kKHL7SWzk0_ydDDX12X5W3s9-vEKgVYOBubXzp_SCdM,1784
|
75
76
|
natural_pdf/analyzers/__init__.py,sha256=XhxlbwiqbGpeIlS88c4P2t7-MLP98U3CcIr-3nGp488,188
|
@@ -77,7 +78,7 @@ natural_pdf/analyzers/document_layout.py,sha256=VHZTiiEZByhYUBHqOqmNIgj59zhc3ZNK
|
|
77
78
|
natural_pdf/analyzers/text_structure.py,sha256=ZmUsBMNBENjEYcABHqwziDXIHyCVYdUaEyAW0Ohagzc,5208
|
78
79
|
natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
|
79
80
|
natural_pdf/core/page.py,sha256=1DqF6mvpFKqsmRnFFrByV6C-MqGJ2MWyOQIwWiCs8PA,106040
|
80
|
-
natural_pdf/core/pdf.py,sha256=
|
81
|
+
natural_pdf/core/pdf.py,sha256=sGhn0OaadU74Ozoa9QA_HsAZikUKOncyCSME0mImdLo,25475
|
81
82
|
natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
|
82
83
|
natural_pdf/elements/base.py,sha256=QJmhk6sYDKErLGrQ5VYhloytuntufxiP6wTzGfZ__9w,22754
|
83
84
|
natural_pdf/elements/collections.py,sha256=vFFeMS0XiBL3p9PyNmwXndKMlPhlwp8os3xKLveN_8k,31558
|
@@ -102,8 +103,8 @@ natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6T
|
|
102
103
|
natural_pdf/utils/visualization.py,sha256=Dujxp5xKbEap6UpoVEpArpkHChJLa_Je7FGz2S3Iwvw,5403
|
103
104
|
tests/__init__.py,sha256=34RJiJqy8uDxasGCbzXIaJlHQklHprscPcA4xp2s97g,30
|
104
105
|
tests/test_pdf.py,sha256=Ud-DI-GHAvnSJGMJewM_EwHtI_UgWTi7Gn9uIwQcpfE,1001
|
105
|
-
natural_pdf-25.3.16.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
106
|
-
natural_pdf-25.3.16.dist-info/METADATA,sha256=
|
107
|
-
natural_pdf-25.3.16.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
108
|
-
natural_pdf-25.3.16.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
|
109
|
-
natural_pdf-25.3.16.dist-info/RECORD,,
|
106
|
+
natural_pdf-25.3.16.2.dist-info/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
107
|
+
natural_pdf-25.3.16.2.dist-info/METADATA,sha256=3pd3i8n5ZtwJ2oGLSkJJJ61lXKaK93ALGuY9HvENhLg,8548
|
108
|
+
natural_pdf-25.3.16.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
109
|
+
natural_pdf-25.3.16.2.dist-info/top_level.txt,sha256=2AueS3xkctrmlcDA_te2-_WG0A0wGhS0UQNwnr_cbFQ,27
|
110
|
+
natural_pdf-25.3.16.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|