pypaperretriever 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypaperretriever-1.0.1/LICENSE +9 -0
- pypaperretriever-1.0.1/PKG-INFO +190 -0
- pypaperretriever-1.0.1/README.md +150 -0
- pypaperretriever-1.0.1/pypaperretriever/__init__.py +20 -0
- pypaperretriever-1.0.1/pypaperretriever/http_client.py +556 -0
- pypaperretriever-1.0.1/pypaperretriever/image_extractor.py +289 -0
- pypaperretriever-1.0.1/pypaperretriever/paper_retriever.py +569 -0
- pypaperretriever-1.0.1/pypaperretriever/paper_tracker.py +268 -0
- pypaperretriever-1.0.1/pypaperretriever/pubmed_searcher.py +584 -0
- pypaperretriever-1.0.1/pypaperretriever/reference_retriever.py +503 -0
- pypaperretriever-1.0.1/pypaperretriever/utils.py +139 -0
- pypaperretriever-1.0.1/pypaperretriever.egg-info/PKG-INFO +190 -0
- pypaperretriever-1.0.1/pypaperretriever.egg-info/SOURCES.txt +22 -0
- pypaperretriever-1.0.1/pypaperretriever.egg-info/dependency_links.txt +1 -0
- pypaperretriever-1.0.1/pypaperretriever.egg-info/entry_points.txt +2 -0
- pypaperretriever-1.0.1/pypaperretriever.egg-info/requires.txt +13 -0
- pypaperretriever-1.0.1/pypaperretriever.egg-info/top_level.txt +1 -0
- pypaperretriever-1.0.1/pyproject.toml +58 -0
- pypaperretriever-1.0.1/setup.cfg +4 -0
- pypaperretriever-1.0.1/tests/test_http_client.py +1105 -0
- pypaperretriever-1.0.1/tests/test_image_extractor.py +54 -0
- pypaperretriever-1.0.1/tests/test_paper_retriever.py +166 -0
- pypaperretriever-1.0.1/tests/test_pubmed_searcher.py +52 -0
- pypaperretriever-1.0.1/tests/test_utils.py +471 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Joseph Turner
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pypaperretriever
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: A tool to retrieve and process academic papers
|
|
5
|
+
Author-email: Joseph Turner <josephisaacturner@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/josephisaacturner/pypaperretriever
|
|
8
|
+
Project-URL: Documentation, https://josephiturner.com/pypaperretriever/
|
|
9
|
+
Project-URL: Repository, https://github.com/josephisaacturner/pypaperretriever
|
|
10
|
+
Keywords: pubmed,doi,pmid,academic-papers,downloader,image-extraction,citation-network
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: pandas<3.0,>=2.2.3
|
|
28
|
+
Requires-Dist: numpy<3.0,>=1.26
|
|
29
|
+
Requires-Dist: tqdm<5.0,>=4.64.1
|
|
30
|
+
Requires-Dist: biopython<2.0,>=1.83
|
|
31
|
+
Requires-Dist: lxml>=5.2.0
|
|
32
|
+
Requires-Dist: requests<3.0,>=2.31.0
|
|
33
|
+
Requires-Dist: PyMuPDF<2.0,>=1.24.1
|
|
34
|
+
Requires-Dist: pdf2image<2.0,>=1.17.0
|
|
35
|
+
Requires-Dist: beautifulsoup4<5.0,>=4.12.3
|
|
36
|
+
Requires-Dist: opencv-python<5.0,>=4.11.0.86
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest<9.0,>=8.3.4; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# PyPaperRetriever
|
|
42
|
+
|
|
43
|
+
<img src="logo.png" width="200">
|
|
44
|
+
|
|
45
|
+
[Documentation](https://josephiturner.com/pypaperretriever/)
|
|
46
|
+
[Quickstart](https://josephiturner.com/pypaperretriever/quickstart/)
|
|
47
|
+
|
|
48
|
+
👉 **[Full Documentation](https://josephiturner.com/pypaperretriever/)**
|
|
49
|
+
👉 **[Start with the Quickstart](https://josephiturner.com/pypaperretriever/quickstart/)**
|
|
50
|
+
|
|
51
|
+
**A Python package for retrieving scientific papers from the web.** Inspired by PyPaperBot (https://github.com/ferru97/PyPaperBot), but with improved flexibility and extensibility. It prefers open-access sources; users can opt in to Sci-Hub as a fallback, depending on their ethical considerations and local laws.
|
|
52
|
+
|
|
53
|
+
### Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install pypaperretriever
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Or install the latest development version directly from GitHub:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install git+https://github.com/josephisaacturner/pypaperretriever.git
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Features
|
|
66
|
+
|
|
67
|
+
- Download papers using DOI or PubMed ID (PMID)
|
|
68
|
+
- Search PubMed programmatically with advanced query options
|
|
69
|
+
- Track citation networks (both upstream and downstream) for papers of interest
|
|
70
|
+
- Extract images from downloaded PDFs
|
|
71
|
+
- Find all available sources via Unpaywall, with optional Sci-Hub integration
|
|
72
|
+
- Keep track of sources used via JSON sidecar files for each download
|
|
73
|
+
- Avoid duplicate downloads with intelligent checking
|
|
74
|
+
- BIDS-compatible file naming convention
|
|
75
|
+
- Both command-line and Python API interfaces
|
|
76
|
+
- Advanced search capabilities with customizable filters
|
|
77
|
+
- Citation network analysis tools
|
|
78
|
+
|
|
79
|
+
### Ethical and legal note on Sci-Hub
|
|
80
|
+
Use of Sci-Hub is disabled by default and clearly labeled. Institutions and researchers differ in policy and legal context; PyPaperRetriever exposes an opt-in flag so users can comply with local rules while retaining a complete pipeline for contexts where such access is permitted. The authors of PyPaperRetriever do not endorse or encourage the use of Sci-Hub in violation of local laws or institutional policies. Users are responsible for ensuring compliance with all applicable laws and ethical guidelines when using this tool.
|
|
81
|
+
|
|
82
|
+
### Usage Examples
|
|
83
|
+
|
|
84
|
+
For complete examples, see [examples.ipynb](examples.ipynb) in the repository.
|
|
85
|
+
|
|
86
|
+
#### 1. Download Using DOI
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from pypaperretriever import PaperRetriever
|
|
90
|
+
|
|
91
|
+
retriever = PaperRetriever(
|
|
92
|
+
email="your.email@gmail.com",
|
|
93
|
+
doi="10.7759/cureus.76081",
|
|
94
|
+
download_directory='PDFs'
|
|
95
|
+
)
|
|
96
|
+
retriever.download()
|
|
97
|
+
|
|
98
|
+
# Command-line alternative (run this in a shell, not in Python):
|
|
99
|
+
pypaperretriever --doi 10.7759/cureus.76081 --email your.email@gmail.com --dwn-dir PDFs
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
#### 2. Download Using PubMed ID
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from pypaperretriever import PaperRetriever
|
|
106
|
+
|
|
107
|
+
retriever = PaperRetriever(
|
|
108
|
+
email="your.email@gmail.com",
|
|
109
|
+
pmid="33813262",
|
|
110
|
+
download_directory='PDFs'
|
|
111
|
+
)
|
|
112
|
+
retriever.download()
|
|
113
|
+
|
|
114
|
+
# Command-line alternative (run this in a shell, not in Python):
|
|
115
|
+
pypaperretriever --pmid 33813262 --email your.email@gmail.com --dwn-dir PDFs
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
#### 3. Control Sci-Hub Access
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
retriever = PaperRetriever(
|
|
122
|
+
email="your.email@gmail.com",
|
|
123
|
+
doi="10.1016/j.revmed.2011.10.009",
|
|
124
|
+
download_directory='PDFs',
|
|
125
|
+
allow_scihub=False # Set to True to enable Sci-Hub
|
|
126
|
+
)
|
|
127
|
+
retriever.download()
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
#### 4. Extract Images from PDFs
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from pypaperretriever import ImageExtractor
|
|
134
|
+
|
|
135
|
+
extractor = ImageExtractor('path/to/your/paper.pdf')
|
|
136
|
+
extractor.extract_images()
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
#### 5. Search PubMed Programmatically
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from pypaperretriever import PubMedSearcher
|
|
143
|
+
|
|
144
|
+
search_query = """("brain lesions"[MeSH Terms] OR "brain lesion"[Title/Abstract] OR
|
|
145
|
+
"cerebral lesion"[Title/Abstract]) AND (case reports[Publication Type])"""
|
|
146
|
+
|
|
147
|
+
searcher = PubMedSearcher(search_string=search_query, email="your.email@gmail.com")
|
|
148
|
+
|
|
149
|
+
results = searcher.search(
|
|
150
|
+
count=10,
|
|
151
|
+
order_by='relevance', # or 'chronological'
|
|
152
|
+
only_open_access=False,
|
|
153
|
+
only_case_reports=False
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# Download found articles
|
|
157
|
+
searcher.download_articles(download_directory='PDFs', allow_scihub=True)  # allow_scihub=True opts in to Sci-Hub; pass False to skip it
|
|
158
|
+
|
|
159
|
+
# Extract images from downloaded articles
|
|
160
|
+
searcher.extract_images()
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
#### 6. Track Citation Networks
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from pypaperretriever import PaperTracker
|
|
167
|
+
|
|
168
|
+
tracker = PaperTracker(
|
|
169
|
+
email="your.email@gmail.com",
|
|
170
|
+
doi='10.1097/RLU.0000000000001894',
|
|
171
|
+
max_upstream_generations=1, # Papers referenced by your paper
|
|
172
|
+
max_downstream_generations=1 # Papers that cite your paper
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
results = tracker.track_paper()
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Contributing
|
|
179
|
+
|
|
180
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
181
|
+
|
|
182
|
+
### License
|
|
183
|
+
|
|
184
|
+
MIT License
|
|
185
|
+
|
|
186
|
+
### Citation
|
|
187
|
+
|
|
188
|
+
If you use PyPaperRetriever in your research, please cite:
|
|
189
|
+
|
|
190
|
+
Turner et al., (2025). PyPaperRetriever: A Python Tool for Finding and Downloading Scientific Literature. Journal of Open Source Software, 10(113), 8135, https://doi.org/10.21105/joss.08135
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# PyPaperRetriever
|
|
2
|
+
|
|
3
|
+
<img src="logo.png" width="200">
|
|
4
|
+
|
|
5
|
+
[Documentation](https://josephiturner.com/pypaperretriever/)
|
|
6
|
+
[Quickstart](https://josephiturner.com/pypaperretriever/quickstart/)
|
|
7
|
+
|
|
8
|
+
👉 **[Full Documentation](https://josephiturner.com/pypaperretriever/)**
|
|
9
|
+
👉 **[Start with the Quickstart](https://josephiturner.com/pypaperretriever/quickstart/)**
|
|
10
|
+
|
|
11
|
+
**A Python package for retrieving scientific papers from the web.** Inspired by PyPaperBot (https://github.com/ferru97/PyPaperBot), but with improved flexibility and extensibility. It prefers open-access sources; users can opt in to Sci-Hub as a fallback, depending on their ethical considerations and local laws.
|
|
12
|
+
|
|
13
|
+
### Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install pypaperretriever
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or install the latest development version directly from GitHub:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install git+https://github.com/josephisaacturner/pypaperretriever.git
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Features
|
|
26
|
+
|
|
27
|
+
- Download papers using DOI or PubMed ID (PMID)
|
|
28
|
+
- Search PubMed programmatically with advanced query options
|
|
29
|
+
- Track citation networks (both upstream and downstream) for papers of interest
|
|
30
|
+
- Extract images from downloaded PDFs
|
|
31
|
+
- Find all available sources via Unpaywall, with optional Sci-Hub integration
|
|
32
|
+
- Keep track of sources used via JSON sidecar files for each download
|
|
33
|
+
- Avoid duplicate downloads with intelligent checking
|
|
34
|
+
- BIDS-compatible file naming convention
|
|
35
|
+
- Both command-line and Python API interfaces
|
|
36
|
+
- Advanced search capabilities with customizable filters
|
|
37
|
+
- Citation network analysis tools
|
|
38
|
+
|
|
39
|
+
### Ethical and legal note on Sci-Hub
|
|
40
|
+
Use of Sci-Hub is disabled by default and clearly labeled. Institutions and researchers differ in policy and legal context; PyPaperRetriever exposes an opt-in flag so users can comply with local rules while retaining a complete pipeline for contexts where such access is permitted. The authors of PyPaperRetriever do not endorse or encourage the use of Sci-Hub in violation of local laws or institutional policies. Users are responsible for ensuring compliance with all applicable laws and ethical guidelines when using this tool.
|
|
41
|
+
|
|
42
|
+
### Usage Examples
|
|
43
|
+
|
|
44
|
+
For complete examples, see [examples.ipynb](examples.ipynb) in the repository.
|
|
45
|
+
|
|
46
|
+
#### 1. Download Using DOI
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from pypaperretriever import PaperRetriever
|
|
50
|
+
|
|
51
|
+
retriever = PaperRetriever(
|
|
52
|
+
email="your.email@gmail.com",
|
|
53
|
+
doi="10.7759/cureus.76081",
|
|
54
|
+
download_directory='PDFs'
|
|
55
|
+
)
|
|
56
|
+
retriever.download()
|
|
57
|
+
|
|
58
|
+
# Command-line alternative (run this in a shell, not in Python):
|
|
59
|
+
pypaperretriever --doi 10.7759/cureus.76081 --email your.email@gmail.com --dwn-dir PDFs
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
#### 2. Download Using PubMed ID
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from pypaperretriever import PaperRetriever
|
|
66
|
+
|
|
67
|
+
retriever = PaperRetriever(
|
|
68
|
+
email="your.email@gmail.com",
|
|
69
|
+
pmid="33813262",
|
|
70
|
+
download_directory='PDFs'
|
|
71
|
+
)
|
|
72
|
+
retriever.download()
|
|
73
|
+
|
|
74
|
+
# Command-line alternative (run this in a shell, not in Python):
|
|
75
|
+
pypaperretriever --pmid 33813262 --email your.email@gmail.com --dwn-dir PDFs
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
#### 3. Control Sci-Hub Access
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
retriever = PaperRetriever(
|
|
82
|
+
email="your.email@gmail.com",
|
|
83
|
+
doi="10.1016/j.revmed.2011.10.009",
|
|
84
|
+
download_directory='PDFs',
|
|
85
|
+
allow_scihub=False # Set to True to enable Sci-Hub
|
|
86
|
+
)
|
|
87
|
+
retriever.download()
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
#### 4. Extract Images from PDFs
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from pypaperretriever import ImageExtractor
|
|
94
|
+
|
|
95
|
+
extractor = ImageExtractor('path/to/your/paper.pdf')
|
|
96
|
+
extractor.extract_images()
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
#### 5. Search PubMed Programmatically
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from pypaperretriever import PubMedSearcher
|
|
103
|
+
|
|
104
|
+
search_query = """("brain lesions"[MeSH Terms] OR "brain lesion"[Title/Abstract] OR
|
|
105
|
+
"cerebral lesion"[Title/Abstract]) AND (case reports[Publication Type])"""
|
|
106
|
+
|
|
107
|
+
searcher = PubMedSearcher(search_string=search_query, email="your.email@gmail.com")
|
|
108
|
+
|
|
109
|
+
results = searcher.search(
|
|
110
|
+
count=10,
|
|
111
|
+
order_by='relevance', # or 'chronological'
|
|
112
|
+
only_open_access=False,
|
|
113
|
+
only_case_reports=False
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# Download found articles
|
|
117
|
+
searcher.download_articles(download_directory='PDFs', allow_scihub=True)  # allow_scihub=True opts in to Sci-Hub; pass False to skip it
|
|
118
|
+
|
|
119
|
+
# Extract images from downloaded articles
|
|
120
|
+
searcher.extract_images()
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
#### 6. Track Citation Networks
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from pypaperretriever import PaperTracker
|
|
127
|
+
|
|
128
|
+
tracker = PaperTracker(
|
|
129
|
+
email="your.email@gmail.com",
|
|
130
|
+
doi='10.1097/RLU.0000000000001894',
|
|
131
|
+
max_upstream_generations=1, # Papers referenced by your paper
|
|
132
|
+
max_downstream_generations=1 # Papers that cite your paper
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
results = tracker.track_paper()
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Contributing
|
|
139
|
+
|
|
140
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
141
|
+
|
|
142
|
+
### License
|
|
143
|
+
|
|
144
|
+
MIT License
|
|
145
|
+
|
|
146
|
+
### Citation
|
|
147
|
+
|
|
148
|
+
If you use PyPaperRetriever in your research, please cite:
|
|
149
|
+
|
|
150
|
+
Turner et al., (2025). PyPaperRetriever: A Python Tool for Finding and Downloading Scientific Literature. Journal of Open Source Software, 10(113), 8135, https://doi.org/10.21105/joss.08135
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Convenience imports for the public pypaperretriever API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .http_client import HttpClient
|
|
6
|
+
from .paper_retriever import PaperRetriever
|
|
7
|
+
from .pubmed_searcher import PubMedSearcher
|
|
8
|
+
from .reference_retriever import ReferenceRetriever
|
|
9
|
+
from .image_extractor import ImageExtractor
|
|
10
|
+
from .paper_tracker import PaperTracker
|
|
11
|
+
from .utils import decode_doi, doi_to_pmid, encode_doi, pmid_to_doi
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"HttpClient",
|
|
15
|
+
"PaperRetriever",
|
|
16
|
+
"PubMedSearcher",
|
|
17
|
+
"ReferenceRetriever",
|
|
18
|
+
"ImageExtractor",
|
|
19
|
+
"PaperTracker",
|
|
20
|
+
]
|