paperscraper 0.2.16__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.2.16 → paperscraper-0.3.0}/PKG-INFO +97 -34
- {paperscraper-0.2.16 → paperscraper-0.3.0}/README.md +93 -32
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/__init__.py +1 -1
- paperscraper-0.3.0/paperscraper/citations/__init__.py +3 -0
- paperscraper-0.3.0/paperscraper/citations/citations.py +63 -0
- paperscraper-0.3.0/paperscraper/citations/tests/test_citations.py +19 -0
- paperscraper-0.3.0/paperscraper/pdf.py +527 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/scholar/scholar.py +1 -28
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/scholar/tests/test_scholar.py +2 -6
- paperscraper-0.3.0/paperscraper/tests/test_pdf.py +302 -0
- paperscraper-0.3.0/paperscraper/xrxiv/tests/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/PKG-INFO +97 -34
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/SOURCES.txt +3 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/requires.txt +1 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/setup.py +1 -0
- paperscraper-0.2.16/paperscraper/citations/__init__.py +0 -2
- paperscraper-0.2.16/paperscraper/pdf.py +0 -164
- paperscraper-0.2.16/paperscraper/tests/test_pdf.py +0 -161
- {paperscraper-0.2.16 → paperscraper-0.3.0}/LICENSE +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/arxiv/arxiv.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/arxiv/utils.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/core.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/core.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/paper.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/entity/researcher.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/self_citations.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/self_references.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/tests/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/tests/test_self_references.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/citations/utils.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/arxiv.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/biorxiv.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/chemrxiv.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/medrxiv.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/get_dumps/utils/chemrxiv/utils.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/impact.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/load_dumps.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/plotting.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/postprocessing.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/pubmed.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/tests/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/pubmed/utils.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/scholar/__init__.py +0 -0
- /paperscraper-0.2.16/paperscraper/scholar/tests/__init__.py → /paperscraper-0.3.0/paperscraper/scholar/core.py +0 -0
- {paperscraper-0.2.16/paperscraper → paperscraper-0.3.0/paperscraper/scholar}/tests/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/server_dumps/__init__.py +0 -0
- {paperscraper-0.2.16/paperscraper/xrxiv → paperscraper-0.3.0/paperscraper}/tests/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/tests/test_dump.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/tests/test_impactor.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/utils.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/xrxiv_api.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.2.16 → paperscraper-0.3.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.2.16
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Home-page: https://github.com/jannisborn/paperscraper
|
|
6
6
|
Author: Jannis Born, Matteo Manica
|
|
@@ -34,6 +34,7 @@ Requires-Dist: thefuzz
|
|
|
34
34
|
Requires-Dist: pytest
|
|
35
35
|
Requires-Dist: tldextract
|
|
36
36
|
Requires-Dist: semanticscholar
|
|
37
|
+
Requires-Dist: pydantic
|
|
37
38
|
Dynamic: author
|
|
38
39
|
Dynamic: author-email
|
|
39
40
|
Dynamic: classifier
|
|
@@ -42,6 +43,7 @@ Dynamic: description-content-type
|
|
|
42
43
|
Dynamic: home-page
|
|
43
44
|
Dynamic: keywords
|
|
44
45
|
Dynamic: license
|
|
46
|
+
Dynamic: license-file
|
|
45
47
|
Dynamic: requires-dist
|
|
46
48
|
Dynamic: summary
|
|
47
49
|
|
|
@@ -56,12 +58,27 @@ MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.or
|
|
|
56
58
|
[](https://codecov.io/github/jannisborn/paperscraper)
|
|
57
59
|
# paperscraper
|
|
58
60
|
|
|
59
|
-
`paperscraper` is a `python` package for scraping publication metadata or full
|
|
61
|
+
`paperscraper` is a `python` package for scraping publication metadata or full text files (PDF or XML) from
|
|
60
62
|
**PubMed** or preprint servers such as **arXiv**, **medRxiv**, **bioRxiv** and **chemRxiv**.
|
|
61
63
|
It provides a streamlined interface to scrape metadata, lets you retrieve citation counts
|
|
62
64
|
from Google Scholar, impact factors from journals and comes with simple postprocessing functions
|
|
63
65
|
and plotting routines for meta-analysis.
|
|
64
66
|
|
|
67
|
+
## Table of Contents
|
|
68
|
+
|
|
69
|
+
1. [Getting Started](#getting-started)
|
|
70
|
+
- [Download X-rxiv Dumps](#download-x-rxiv-dumps)
|
|
71
|
+
- [Arxiv Local Dump](#arxiv-local-dump)
|
|
72
|
+
2. [Examples](#examples)
|
|
73
|
+
- [Publication Keyword Search](#publication-keyword-search)
|
|
74
|
+
- [Full-Text Retrieval (PDFs & XMLs)](#full-text-retrieval-pdfs--xmls)
|
|
75
|
+
- [Citation Search](#citation-search)
|
|
76
|
+
- [Journal Impact Factor](#journal-impact-factor)
|
|
77
|
+
3. [Plotting](#plotting)
|
|
78
|
+
- [Barplots](#barplots)
|
|
79
|
+
- [Venn Diagrams](#venn-diagrams)
|
|
80
|
+
4. [Citation](#citation)
|
|
81
|
+
5. [Contributions](#contributions)
|
|
65
82
|
|
|
66
83
|
## Getting started
|
|
67
84
|
|
|
@@ -90,6 +107,21 @@ medrxiv(start_date="2023-04-01", end_date="2023-04-08")
|
|
|
90
107
|
```
|
|
91
108
|
But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
|
|
92
109
|
|
|
110
|
+
#### Arxiv local dump
|
|
111
|
+
If you prefer local search rather than using the arxiv API:
|
|
112
|
+
|
|
113
|
+
```py
|
|
114
|
+
from paperscraper.get_dumps import arxiv
|
|
115
|
+
arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
|
|
119
|
+
The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
|
|
120
|
+
backend directly in the `get_and_dump_arxiv_papers` function:
|
|
121
|
+
```py
|
|
122
|
+
from paperscraper.arxiv import get_and_dump_arxiv_papers
|
|
123
|
+
get_and_dump_arxiv_papers(..., backend='local')
|
|
124
|
+
```
|
|
93
125
|
|
|
94
126
|
## Examples
|
|
95
127
|
|
|
@@ -158,10 +190,15 @@ from paperscraper.scholar import get_and_dump_scholar_papers
|
|
|
158
190
|
topic = 'Machine Learning'
|
|
159
191
|
get_and_dump_scholar_papers(topic)
|
|
160
192
|
```
|
|
193
|
+
*NOTE*: The scholar endpoint does not require authentication, but since it regularly prompts with captchas, it is difficult to apply at large scale.
|
|
194
|
+
|
|
195
|
+
### Full-Text Retrieval (PDFs & XMLs)
|
|
161
196
|
|
|
162
|
-
|
|
197
|
+
`paperscraper` allows you to download full text of publications using DOIs. The basic functionality works reliably for preprint servers (arXiv, bioRxiv, medRxiv, chemRxiv), but retrieving papers from PubMed dumps is more challenging due to publisher restrictions and paywalls.
|
|
163
198
|
|
|
164
|
-
|
|
199
|
+
#### Standard Usage
|
|
200
|
+
|
|
201
|
+
The main download functions work for all paper types with automatic fallbacks:
|
|
165
202
|
|
|
166
203
|
```py
|
|
167
204
|
from paperscraper.pdf import save_pdf
|
|
@@ -169,31 +206,71 @@ paper_data = {'doi': "10.48550/arXiv.2207.03928"}
|
|
|
169
206
|
save_pdf(paper_data, filepath='gt4sd_paper.pdf')
|
|
170
207
|
```
|
|
171
208
|
|
|
172
|
-
|
|
173
|
-
Here we scrape the PDFs for the metadata obtained in the previous example.
|
|
209
|
+
To batch download full texts from your metadata search results:
|
|
174
210
|
|
|
175
211
|
```py
|
|
176
212
|
from paperscraper.pdf import save_pdf_from_dump
|
|
177
213
|
|
|
178
|
-
# Save PDFs in current folder and name the files by their DOI
|
|
214
|
+
# Save PDFs/XMLs in current folder and name the files by their DOI
|
|
179
215
|
save_pdf_from_dump('medrxiv_covid_ai_imaging.jsonl', pdf_path='.', key_to_save='doi')
|
|
180
216
|
```
|
|
181
|
-
|
|
182
|
-
|
|
217
|
+
|
|
218
|
+
#### Automatic Fallback Mechanisms
|
|
219
|
+
|
|
220
|
+
When the standard text retrieval fails, `paperscraper` automatically tries these fallbacks:
|
|
221
|
+
|
|
222
|
+
- **BioC-PMC**: For biomedical papers in [PubMed Central](https://pmc.ncbi.nlm.nih.gov/) (open-access repository), it retrieves open-access full-text XML from the [BioC-PMC API](https://www.ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/).
|
|
223
|
+
- **eLife Papers**: For [eLife](https://elifesciences.org/) journal papers, it fetches XML files from eLife's open [GitHub repository](https://github.com/elifesciences/elife-article-xml).
|
|
224
|
+
|
|
225
|
+
These fallbacks are tried automatically without requiring any additional configuration.
|
|
226
|
+
|
|
227
|
+
#### Enhanced Retrieval with Publisher APIs
|
|
228
|
+
|
|
229
|
+
For more comprehensive access to papers from major publishers, you can provide API keys for:
|
|
230
|
+
|
|
231
|
+
- **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
|
|
232
|
+
- **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
|
|
233
|
+
|
|
234
|
+
To use publisher APIs:
|
|
235
|
+
|
|
236
|
+
1. Create a file with your API keys:
|
|
237
|
+
```
|
|
238
|
+
WILEY_TDM_API_TOKEN=your_wiley_token_here
|
|
239
|
+
ELSEVIER_TDM_API_KEY=your_elsevier_key_here
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
2. Pass the file path when calling retrieval functions:
|
|
243
|
+
|
|
244
|
+
```py
|
|
245
|
+
from paperscraper.pdf import save_pdf_from_dump
|
|
246
|
+
|
|
247
|
+
save_pdf_from_dump(
|
|
248
|
+
'pubmed_query_results.jsonl',
|
|
249
|
+
pdf_path='./papers',
|
|
250
|
+
key_to_save='doi',
|
|
251
|
+
api_keys='path/to/your/api_keys.txt'
|
|
252
|
+
)
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
For obtaining API keys:
|
|
256
|
+
- Wiley TDM API: Visit [Wiley Text and Data Mining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) (free for academic users with institutional subscription)
|
|
257
|
+
- Elsevier TDM API: Visit [Elsevier's Text and Data Mining](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) (free for academic users with institutional subscription)
|
|
258
|
+
|
|
259
|
+
*NOTE*: While these fallback mechanisms improve retrieval success rates, they cannot guarantee access to all papers due to various access restrictions.
|
|
183
260
|
|
|
184
261
|
|
|
185
262
|
### Citation search
|
|
186
263
|
|
|
187
|
-
|
|
264
|
+
You can fetch the number of citations of a paper from its title or DOI:
|
|
188
265
|
|
|
189
266
|
```py
|
|
190
|
-
from paperscraper.
|
|
267
|
+
from paperscraper.citations import get_citations_from_title, get_citations_by_doi
|
|
191
268
|
title = 'Über formal unentscheidbare Sätze der Principia Mathematica und verwandter Systeme I.'
|
|
192
|
-
get_citations_from_title(title)
|
|
193
|
-
```
|
|
269
|
+
print(get_citations_from_title(title))
|
|
194
270
|
|
|
195
|
-
|
|
196
|
-
|
|
271
|
+
doi = '10.1021/acs.jcim.3c00132'
|
|
272
|
+
get_citations_by_doi(doi)
|
|
273
|
+
```
|
|
197
274
|
|
|
198
275
|
### Journal impact factor
|
|
199
276
|
|
|
@@ -231,28 +308,13 @@ i.search("quantum information", threshold=90, return_all=True)
|
|
|
231
308
|
# ]
|
|
232
309
|
```
|
|
233
310
|
|
|
234
|
-
## Arxiv local dump
|
|
235
|
-
If you prefer local search rather than using the arxiv API:
|
|
236
|
-
|
|
237
|
-
```py
|
|
238
|
-
from paperscraper.get_dumps import arxiv
|
|
239
|
-
arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
|
|
240
|
-
```
|
|
241
|
-
|
|
242
|
-
Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
|
|
243
|
-
The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
|
|
244
|
-
backend directly in the `get_and_dump_arxiv_papers` function:
|
|
245
|
-
```py
|
|
246
|
-
from paperscraper.arxiv import get_and_dump_arxiv_papers
|
|
247
|
-
get_and_dump_arxiv_papers(..., backend='local')
|
|
248
|
-
```
|
|
249
311
|
|
|
250
|
-
|
|
312
|
+
## Plotting
|
|
251
313
|
|
|
252
314
|
When multiple query searches are performed, two types of plots can be generated
|
|
253
315
|
automatically: Venn diagrams and bar plots.
|
|
254
316
|
|
|
255
|
-
|
|
317
|
+
### Barplots
|
|
256
318
|
|
|
257
319
|
Compare the temporal evolution of different queries across different servers.
|
|
258
320
|
|
|
@@ -310,7 +372,7 @@ plot_comparison(
|
|
|
310
372
|

|
|
311
373
|
|
|
312
374
|
|
|
313
|
-
|
|
375
|
+
### Venn Diagrams
|
|
314
376
|
|
|
315
377
|
```py
|
|
316
378
|
from paperscraper.plotting import (
|
|
@@ -369,6 +431,7 @@ If you use `paperscraper`, please cite a paper that motivated our development of
|
|
|
369
431
|
|
|
370
432
|
## Contributions
|
|
371
433
|
Thanks to the following contributors:
|
|
434
|
+
- [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
|
|
372
435
|
- [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
|
|
373
436
|
- [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
|
|
374
437
|
- [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
|
|
@@ -9,12 +9,27 @@ MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.or
|
|
|
9
9
|
[](https://codecov.io/github/jannisborn/paperscraper)
|
|
10
10
|
# paperscraper
|
|
11
11
|
|
|
12
|
-
`paperscraper` is a `python` package for scraping publication metadata or full
|
|
12
|
+
`paperscraper` is a `python` package for scraping publication metadata or full text files (PDF or XML) from
|
|
13
13
|
**PubMed** or preprint servers such as **arXiv**, **medRxiv**, **bioRxiv** and **chemRxiv**.
|
|
14
14
|
It provides a streamlined interface to scrape metadata, lets you retrieve citation counts
|
|
15
15
|
from Google Scholar, impact factors from journals and comes with simple postprocessing functions
|
|
16
16
|
and plotting routines for meta-analysis.
|
|
17
17
|
|
|
18
|
+
## Table of Contents
|
|
19
|
+
|
|
20
|
+
1. [Getting Started](#getting-started)
|
|
21
|
+
- [Download X-rxiv Dumps](#download-x-rxiv-dumps)
|
|
22
|
+
- [Arxiv Local Dump](#arxiv-local-dump)
|
|
23
|
+
2. [Examples](#examples)
|
|
24
|
+
- [Publication Keyword Search](#publication-keyword-search)
|
|
25
|
+
- [Full-Text Retrieval (PDFs & XMLs)](#full-text-retrieval-pdfs--xmls)
|
|
26
|
+
- [Citation Search](#citation-search)
|
|
27
|
+
- [Journal Impact Factor](#journal-impact-factor)
|
|
28
|
+
3. [Plotting](#plotting)
|
|
29
|
+
- [Barplots](#barplots)
|
|
30
|
+
- [Venn Diagrams](#venn-diagrams)
|
|
31
|
+
4. [Citation](#citation)
|
|
32
|
+
5. [Contributions](#contributions)
|
|
18
33
|
|
|
19
34
|
## Getting started
|
|
20
35
|
|
|
@@ -43,6 +58,21 @@ medrxiv(start_date="2023-04-01", end_date="2023-04-08")
|
|
|
43
58
|
```
|
|
44
59
|
But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
|
|
45
60
|
|
|
61
|
+
#### Arxiv local dump
|
|
62
|
+
If you prefer local search rather than using the arxiv API:
|
|
63
|
+
|
|
64
|
+
```py
|
|
65
|
+
from paperscraper.get_dumps import arxiv
|
|
66
|
+
arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
|
|
70
|
+
The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
|
|
71
|
+
backend directly in the `get_and_dump_arxiv_papers` function:
|
|
72
|
+
```py
|
|
73
|
+
from paperscraper.arxiv import get_and_dump_arxiv_papers
|
|
74
|
+
get_and_dump_arxiv_papers(..., backend='local')
|
|
75
|
+
```
|
|
46
76
|
|
|
47
77
|
## Examples
|
|
48
78
|
|
|
@@ -111,10 +141,15 @@ from paperscraper.scholar import get_and_dump_scholar_papers
|
|
|
111
141
|
topic = 'Machine Learning'
|
|
112
142
|
get_and_dump_scholar_papers(topic)
|
|
113
143
|
```
|
|
144
|
+
*NOTE*: The scholar endpoint does not require authentication, but since it regularly prompts with captchas, it is difficult to apply at large scale.
|
|
145
|
+
|
|
146
|
+
### Full-Text Retrieval (PDFs & XMLs)
|
|
114
147
|
|
|
115
|
-
|
|
148
|
+
`paperscraper` allows you to download full text of publications using DOIs. The basic functionality works reliably for preprint servers (arXiv, bioRxiv, medRxiv, chemRxiv), but retrieving papers from PubMed dumps is more challenging due to publisher restrictions and paywalls.
|
|
116
149
|
|
|
117
|
-
|
|
150
|
+
#### Standard Usage
|
|
151
|
+
|
|
152
|
+
The main download functions work for all paper types with automatic fallbacks:
|
|
118
153
|
|
|
119
154
|
```py
|
|
120
155
|
from paperscraper.pdf import save_pdf
|
|
@@ -122,31 +157,71 @@ paper_data = {'doi': "10.48550/arXiv.2207.03928"}
|
|
|
122
157
|
save_pdf(paper_data, filepath='gt4sd_paper.pdf')
|
|
123
158
|
```
|
|
124
159
|
|
|
125
|
-
|
|
126
|
-
Here we scrape the PDFs for the metadata obtained in the previous example.
|
|
160
|
+
To batch download full texts from your metadata search results:
|
|
127
161
|
|
|
128
162
|
```py
|
|
129
163
|
from paperscraper.pdf import save_pdf_from_dump
|
|
130
164
|
|
|
131
|
-
# Save PDFs in current folder and name the files by their DOI
|
|
165
|
+
# Save PDFs/XMLs in current folder and name the files by their DOI
|
|
132
166
|
save_pdf_from_dump('medrxiv_covid_ai_imaging.jsonl', pdf_path='.', key_to_save='doi')
|
|
133
167
|
```
|
|
134
|
-
|
|
135
|
-
|
|
168
|
+
|
|
169
|
+
#### Automatic Fallback Mechanisms
|
|
170
|
+
|
|
171
|
+
When the standard text retrieval fails, `paperscraper` automatically tries these fallbacks:
|
|
172
|
+
|
|
173
|
+
- **BioC-PMC**: For biomedical papers in [PubMed Central](https://pmc.ncbi.nlm.nih.gov/) (open-access repository), it retrieves open-access full-text XML from the [BioC-PMC API](https://www.ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/).
|
|
174
|
+
- **eLife Papers**: For [eLife](https://elifesciences.org/) journal papers, it fetches XML files from eLife's open [GitHub repository](https://github.com/elifesciences/elife-article-xml).
|
|
175
|
+
|
|
176
|
+
These fallbacks are tried automatically without requiring any additional configuration.
|
|
177
|
+
|
|
178
|
+
#### Enhanced Retrieval with Publisher APIs
|
|
179
|
+
|
|
180
|
+
For more comprehensive access to papers from major publishers, you can provide API keys for:
|
|
181
|
+
|
|
182
|
+
- **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
|
|
183
|
+
- **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
|
|
184
|
+
|
|
185
|
+
To use publisher APIs:
|
|
186
|
+
|
|
187
|
+
1. Create a file with your API keys:
|
|
188
|
+
```
|
|
189
|
+
WILEY_TDM_API_TOKEN=your_wiley_token_here
|
|
190
|
+
ELSEVIER_TDM_API_KEY=your_elsevier_key_here
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
2. Pass the file path when calling retrieval functions:
|
|
194
|
+
|
|
195
|
+
```py
|
|
196
|
+
from paperscraper.pdf import save_pdf_from_dump
|
|
197
|
+
|
|
198
|
+
save_pdf_from_dump(
|
|
199
|
+
'pubmed_query_results.jsonl',
|
|
200
|
+
pdf_path='./papers',
|
|
201
|
+
key_to_save='doi',
|
|
202
|
+
api_keys='path/to/your/api_keys.txt'
|
|
203
|
+
)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
For obtaining API keys:
|
|
207
|
+
- Wiley TDM API: Visit [Wiley Text and Data Mining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) (free for academic users with institutional subscription)
|
|
208
|
+
- Elsevier TDM API: Visit [Elsevier's Text and Data Mining](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) (free for academic users with institutional subscription)
|
|
209
|
+
|
|
210
|
+
*NOTE*: While these fallback mechanisms improve retrieval success rates, they cannot guarantee access to all papers due to various access restrictions.
|
|
136
211
|
|
|
137
212
|
|
|
138
213
|
### Citation search
|
|
139
214
|
|
|
140
|
-
|
|
215
|
+
You can fetch the number of citations of a paper from its title or DOI:
|
|
141
216
|
|
|
142
217
|
```py
|
|
143
|
-
from paperscraper.
|
|
218
|
+
from paperscraper.citations import get_citations_from_title, get_citations_by_doi
|
|
144
219
|
title = 'Über formal unentscheidbare Sätze der Principia Mathematica und verwandter Systeme I.'
|
|
145
|
-
get_citations_from_title(title)
|
|
146
|
-
```
|
|
220
|
+
print(get_citations_from_title(title))
|
|
147
221
|
|
|
148
|
-
|
|
149
|
-
|
|
222
|
+
doi = '10.1021/acs.jcim.3c00132'
|
|
223
|
+
get_citations_by_doi(doi)
|
|
224
|
+
```
|
|
150
225
|
|
|
151
226
|
### Journal impact factor
|
|
152
227
|
|
|
@@ -184,28 +259,13 @@ i.search("quantum information", threshold=90, return_all=True)
|
|
|
184
259
|
# ]
|
|
185
260
|
```
|
|
186
261
|
|
|
187
|
-
## Arxiv local dump
|
|
188
|
-
If you prefer local search rather than using the arxiv API:
|
|
189
|
-
|
|
190
|
-
```py
|
|
191
|
-
from paperscraper.get_dumps import arxiv
|
|
192
|
-
arxiv(start_date='2024-01-01', end_date=None) # scrapes all metadata from 2024 until today.
|
|
193
|
-
```
|
|
194
|
-
|
|
195
|
-
Afterwards you can search the local arxiv dump just like the other x-rxiv dumps.
|
|
196
|
-
The direct endpoint is `paperscraper.arxiv.get_arxiv_papers_local`. You can also specify the
|
|
197
|
-
backend directly in the `get_and_dump_arxiv_papers` function:
|
|
198
|
-
```py
|
|
199
|
-
from paperscraper.arxiv import get_and_dump_arxiv_papers
|
|
200
|
-
get_and_dump_arxiv_papers(..., backend='local')
|
|
201
|
-
```
|
|
202
262
|
|
|
203
|
-
|
|
263
|
+
## Plotting
|
|
204
264
|
|
|
205
265
|
When multiple query searches are performed, two types of plots can be generated
|
|
206
266
|
automatically: Venn diagrams and bar plots.
|
|
207
267
|
|
|
208
|
-
|
|
268
|
+
### Barplots
|
|
209
269
|
|
|
210
270
|
Compare the temporal evolution of different queries across different servers.
|
|
211
271
|
|
|
@@ -263,7 +323,7 @@ plot_comparison(
|
|
|
263
323
|

|
|
264
324
|
|
|
265
325
|
|
|
266
|
-
|
|
326
|
+
### Venn Diagrams
|
|
267
327
|
|
|
268
328
|
```py
|
|
269
329
|
from paperscraper.plotting import (
|
|
@@ -322,6 +382,7 @@ If you use `paperscraper`, please cite a paper that motivated our development of
|
|
|
322
382
|
|
|
323
383
|
## Contributions
|
|
324
384
|
Thanks to the following contributors:
|
|
385
|
+
- [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
|
|
325
386
|
- [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
|
|
326
387
|
- [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
|
|
327
388
|
- [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from time import sleep
|
|
4
|
+
|
|
5
|
+
from scholarly import scholarly
|
|
6
|
+
from semanticscholar import SemanticScholar, SemanticScholarException
|
|
7
|
+
|
|
8
|
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
sch = SemanticScholar()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_citations_by_doi(doi: str) -> int:
    """
    Get the number of citations of a paper according to semantic scholar.

    Args:
        doi: the DOI of the paper.

    Returns:
        The number of citations, or 0 if semantic scholar cannot find the
        paper.
    """
    try:
        return len(sch.get_paper(doi)["citations"])
    except SemanticScholarException.ObjectNotFoundException:
        logger.warning(f"Could not find paper {doi}, assuming 0 citation.")
        return 0
    except ConnectionRefusedError as e:
        logger.warning(f"Waiting for 10 sec since {doi} gave: {e}")
        sleep(10)
        # Single retry after the pause. Each branch returns directly rather
        # than from a `finally` clause, so a failure here (or any other
        # unhandled error above) reaches the caller as-is instead of being
        # masked by an UnboundLocalError on an unassigned result variable.
        return len(sch.get_paper(doi)["citations"])
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_citations_from_title(title: str) -> int:
    """
    Get the number of citations of a paper by searching its title on Scholar.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Number of citations of the first match, or 0 if nothing matched.
    """
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Search for exact match
    title = '"' + title.strip() + '"'

    matches = scholarly.search_pubs(title)
    # All matches are consumed because the warning below reports their count.
    counts = [int(p["num_citations"]) for p in matches]
    if not counts:
        logger.warning(f"Found no match for {title}.")
        return 0
    if len(counts) > 1:
        logger.warning(f"Found {len(counts)} matches for {title}, returning first one.")
    return counts[0]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from paperscraper.citations import get_citations_by_doi
|
|
4
|
+
|
|
5
|
+
logging.disable(logging.INFO)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestCitations:
    """Tests for `get_citations_by_doi`."""

    def test_citations(self):
        """A known DOI gives an int count above 50; an invalid DOI gives 0."""
        num = get_citations_by_doi("10.1038/s42256-023-00639-z")
        assert isinstance(num, int) and num > 50

        # Try invalid DOI
        num = get_citations_by_doi("10.1035348/s42256-023-00639-z")
        assert isinstance(num, int) and num == 0
|