paperscraper 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.3.2 → paperscraper-0.3.4}/PKG-INFO +53 -34
- {paperscraper-0.3.2 → paperscraper-0.3.4}/README.md +34 -8
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/__init__.py +1 -1
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/arxiv/arxiv.py +4 -5
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/arxiv/utils.py +2 -2
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/async_utils.py +36 -9
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/citations.py +2 -1
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/entity/core.py +6 -5
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/entity/paper.py +17 -15
- paperscraper-0.3.4/paperscraper/citations/entity/researcher.py +221 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/self_citations.py +5 -2
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/self_references.py +68 -42
- paperscraper-0.3.4/paperscraper/citations/tests/test_citations.py +32 -0
- paperscraper-0.3.4/paperscraper/citations/tests/test_self_citations.py +147 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/tests/test_self_references.py +49 -4
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/utils.py +99 -51
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/arxiv.py +2 -2
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/biorxiv.py +2 -2
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/chemrxiv.py +2 -3
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/medrxiv.py +2 -2
- paperscraper-0.3.4/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +253 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/utils/chemrxiv/utils.py +33 -18
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/load_dumps.py +2 -3
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pdf/fallbacks.py +138 -56
- paperscraper-0.3.4/paperscraper/pdf/pdf.py +442 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pdf/utils.py +21 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/plotting.py +17 -23
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/postprocessing.py +2 -3
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/pubmed.py +22 -14
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/scholar.py +6 -10
- paperscraper-0.3.4/paperscraper/server_dumps/__init__.py +4 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/test_dump.py +8 -2
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/test_impactor.py +23 -4
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/test_pdf.py +0 -5
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/utils.py +6 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/PKG-INFO +53 -34
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/SOURCES.txt +1 -1
- paperscraper-0.3.4/paperscraper.egg-info/requires.txt +19 -0
- paperscraper-0.3.4/pyproject.toml +90 -0
- paperscraper-0.3.2/paperscraper/citations/entity/researcher.py +0 -90
- paperscraper-0.3.2/paperscraper/citations/tests/test_citations.py +0 -18
- paperscraper-0.3.2/paperscraper/citations/tests/test_self_citations.py +0 -72
- paperscraper-0.3.2/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -144
- paperscraper-0.3.2/paperscraper/pdf/pdf.py +0 -250
- paperscraper-0.3.2/paperscraper/server_dumps/__init__.py +0 -1
- paperscraper-0.3.2/paperscraper.egg-info/requires.txt +0 -19
- paperscraper-0.3.2/setup.py +0 -77
- {paperscraper-0.3.2 → paperscraper-0.3.4}/LICENSE +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/core.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/entity/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/orcid.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/tests/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/tests/test_paper.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/impact.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pdf/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/tests/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/utils.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/core.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/tests/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/tests/test_scholar.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/tests/__init__.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/xrxiv_api.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.3.2 → paperscraper-0.3.4}/setup.cfg +0 -0
|
@@ -1,62 +1,55 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
|
-
|
|
6
|
-
Author: Jannis Born, Matteo Manica
|
|
7
|
-
Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
|
|
5
|
+
Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
|
|
8
6
|
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jannisborn/paperscraper
|
|
8
|
+
Project-URL: Documentation, https://jannisborn.github.io/paperscraper/
|
|
9
|
+
Project-URL: Repository, https://github.com/jannisborn/paperscraper
|
|
9
10
|
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
|
|
10
11
|
Classifier: Development Status :: 3 - Alpha
|
|
11
12
|
Classifier: Intended Audience :: Developers
|
|
12
13
|
Classifier: Intended Audience :: Science/Research
|
|
13
14
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
15
|
Classifier: Programming Language :: Python :: 3
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
21
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.9
|
|
20
23
|
Description-Content-Type: text/markdown
|
|
21
24
|
License-File: LICENSE
|
|
22
|
-
Requires-Dist: arxiv>=1.4.
|
|
25
|
+
Requires-Dist: arxiv>=1.4.7
|
|
23
26
|
Requires-Dist: pymed-paperscraper>=1.0.4
|
|
24
|
-
Requires-Dist: pandas
|
|
25
|
-
Requires-Dist: requests
|
|
26
|
-
Requires-Dist: tqdm
|
|
27
|
+
Requires-Dist: pandas>=1.0.4
|
|
28
|
+
Requires-Dist: requests==2.32.0
|
|
29
|
+
Requires-Dist: tqdm>=4.51.0
|
|
27
30
|
Requires-Dist: scholarly>=1.0.0
|
|
28
|
-
Requires-Dist: seaborn
|
|
29
|
-
Requires-Dist: matplotlib
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist: bs4
|
|
32
|
-
Requires-Dist: impact-factor>=1.1.
|
|
33
|
-
Requires-Dist: thefuzz
|
|
31
|
+
Requires-Dist: seaborn>=0.11.0
|
|
32
|
+
Requires-Dist: matplotlib>=3.3.2
|
|
33
|
+
Requires-Dist: matplotlib-venn>=0.11.5
|
|
34
|
+
Requires-Dist: bs4>=0.0.1
|
|
35
|
+
Requires-Dist: impact-factor>=1.1.3
|
|
36
|
+
Requires-Dist: thefuzz>=0.20.0
|
|
34
37
|
Requires-Dist: pytest
|
|
35
38
|
Requires-Dist: tldextract
|
|
36
|
-
Requires-Dist: semanticscholar
|
|
39
|
+
Requires-Dist: semanticscholar>=0.8.4
|
|
37
40
|
Requires-Dist: pydantic
|
|
38
41
|
Requires-Dist: unidecode
|
|
39
42
|
Requires-Dist: dotenv
|
|
40
43
|
Requires-Dist: boto3
|
|
41
|
-
Dynamic: author
|
|
42
|
-
Dynamic: author-email
|
|
43
|
-
Dynamic: classifier
|
|
44
|
-
Dynamic: description
|
|
45
|
-
Dynamic: description-content-type
|
|
46
|
-
Dynamic: home-page
|
|
47
|
-
Dynamic: keywords
|
|
48
|
-
Dynamic: license
|
|
49
44
|
Dynamic: license-file
|
|
50
|
-
Dynamic: requires-dist
|
|
51
|
-
Dynamic: summary
|
|
52
45
|
|
|
53
46
|
[](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
|
|
54
47
|
[](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
|
|
48
|
+
[](https://jannisborn.github.io/paperscraper/)
|
|
55
49
|
[](https://opensource.org/licenses/MIT)
|
|
57
51
|
[](https://badge.fury.io/py/paperscraper)
|
|
58
52
|
[](https://pepy.tech/project/paperscraper)
|
|
59
|
-
[](https://github.com/psf/black)
|
|
60
53
|
[](https://codecov.io/github/jannisborn/paperscraper)
|
|
61
54
|
# paperscraper
|
|
62
55
|
|
|
@@ -66,6 +59,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
|
|
|
66
59
|
from Google Scholar, impact factors from journals and comes with simple postprocessing functions
|
|
67
60
|
and plotting routines for meta-analysis.
|
|
68
61
|
|
|
62
|
+
|
|
69
63
|
## Table of Contents
|
|
70
64
|
|
|
71
65
|
1. [Getting Started](#getting-started)
|
|
@@ -90,18 +84,30 @@ pip install paperscraper
|
|
|
90
84
|
|
|
91
85
|
This is enough to query PubMed, arXiv or Google Scholar.
|
|
92
86
|
|
|
87
|
+
### Local development
|
|
88
|
+
|
|
89
|
+
```console
|
|
90
|
+
uv sync
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
|
|
94
|
+
|
|
95
|
+
```console
|
|
96
|
+
uv run python -c "import paperscraper"
|
|
97
|
+
```
|
|
98
|
+
|
|
93
99
|
#### Download X-rxiv Dumps
|
|
94
100
|
|
|
95
|
-
However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire
|
|
101
|
+
However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
|
|
96
102
|
|
|
97
103
|
```py
|
|
98
104
|
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
105
|
+
chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
|
|
106
|
+
medrxiv() # Takes <1h -> +90K papers (~200 MB file)
|
|
107
|
+
biorxiv() # Up to 6h -> +400K papers (~800 MB file)
|
|
102
108
|
```
|
|
103
109
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
|
|
104
|
-
*NOTE*: If you experience API connection issues
|
|
110
|
+
*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
|
|
105
111
|
|
|
106
112
|
Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
|
|
107
113
|
```py
|
|
@@ -279,6 +285,13 @@ doi = '10.1021/acs.jcim.3c00132'
|
|
|
279
285
|
get_citations_by_doi(doi)
|
|
280
286
|
```
|
|
281
287
|
|
|
288
|
+
NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial), which is bandwidth-limited. If you have an API key, set it via:
|
|
289
|
+
```sh
|
|
290
|
+
export SS_API_KEY=YOUR_API_KEY
|
|
291
|
+
```
|
|
292
|
+
This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
|
|
293
|
+
|
|
294
|
+
|
|
282
295
|
### Journal impact factor
|
|
283
296
|
|
|
284
297
|
You can also retrieve the impact factor for all journals:
|
|
@@ -423,7 +436,7 @@ plot_multiple_venn(
|
|
|
423
436
|
## Citation
|
|
424
437
|
If you use `paperscraper`, please cite a paper that motivated our development of this tool.
|
|
425
438
|
|
|
426
|
-
```
|
|
439
|
+
```bibtex
|
|
427
440
|
@article{born2021trends,
|
|
428
441
|
title={Trends in Deep Learning for Property-driven Drug Design},
|
|
429
442
|
author={Born, Jannis and Manica, Matteo},
|
|
@@ -439,9 +452,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
|
|
|
439
452
|
## Contributions
|
|
440
453
|
Thanks to the following contributors:
|
|
441
454
|
- [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
|
|
455
|
+
|
|
442
456
|
- [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
|
|
457
|
+
|
|
443
458
|
- [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
|
|
459
|
+
|
|
444
460
|
- [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
|
|
461
|
+
|
|
445
462
|
- [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
|
|
446
|
-
|
|
463
|
+
|
|
464
|
+
- [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
|
|
465
|
+
|
|
447
466
|
- [@juliusbierk](https://github.com/juliusbierk): Bugfixes
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
|
|
2
2
|
[](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
|
|
3
|
+
[](https://jannisborn.github.io/paperscraper/)
|
|
3
4
|
[](https://opensource.org/licenses/MIT)
|
|
5
6
|
[](https://badge.fury.io/py/paperscraper)
|
|
6
7
|
[](https://pepy.tech/project/paperscraper)
|
|
7
|
-
[](https://github.com/psf/black)
|
|
8
8
|
[](https://codecov.io/github/jannisborn/paperscraper)
|
|
9
9
|
# paperscraper
|
|
10
10
|
|
|
@@ -14,6 +14,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
|
|
|
14
14
|
from Google Scholar, impact factors from journals and comes with simple postprocessing functions
|
|
15
15
|
and plotting routines for meta-analysis.
|
|
16
16
|
|
|
17
|
+
|
|
17
18
|
## Table of Contents
|
|
18
19
|
|
|
19
20
|
1. [Getting Started](#getting-started)
|
|
@@ -38,18 +39,30 @@ pip install paperscraper
|
|
|
38
39
|
|
|
39
40
|
This is enough to query PubMed, arXiv or Google Scholar.
|
|
40
41
|
|
|
42
|
+
### Local development
|
|
43
|
+
|
|
44
|
+
```console
|
|
45
|
+
uv sync
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
|
|
49
|
+
|
|
50
|
+
```console
|
|
51
|
+
uv run python -c "import paperscraper"
|
|
52
|
+
```
|
|
53
|
+
|
|
41
54
|
#### Download X-rxiv Dumps
|
|
42
55
|
|
|
43
|
-
However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire
|
|
56
|
+
However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
|
|
44
57
|
|
|
45
58
|
```py
|
|
46
59
|
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
60
|
+
chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
|
|
61
|
+
medrxiv() # Takes <1h -> +90K papers (~200 MB file)
|
|
62
|
+
biorxiv() # Up to 6h -> +400K papers (~800 MB file)
|
|
50
63
|
```
|
|
51
64
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
|
|
52
|
-
*NOTE*: If you experience API connection issues
|
|
65
|
+
*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
|
|
53
66
|
|
|
54
67
|
Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
|
|
55
68
|
```py
|
|
@@ -227,6 +240,13 @@ doi = '10.1021/acs.jcim.3c00132'
|
|
|
227
240
|
get_citations_by_doi(doi)
|
|
228
241
|
```
|
|
229
242
|
|
|
243
|
+
NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial), which is bandwidth-limited. If you have an API key, set it via:
|
|
244
|
+
```sh
|
|
245
|
+
export SS_API_KEY=YOUR_API_KEY
|
|
246
|
+
```
|
|
247
|
+
This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
|
|
248
|
+
|
|
249
|
+
|
|
230
250
|
### Journal impact factor
|
|
231
251
|
|
|
232
252
|
You can also retrieve the impact factor for all journals:
|
|
@@ -371,7 +391,7 @@ plot_multiple_venn(
|
|
|
371
391
|
## Citation
|
|
372
392
|
If you use `paperscraper`, please cite a paper that motivated our development of this tool.
|
|
373
393
|
|
|
374
|
-
```
|
|
394
|
+
```bibtex
|
|
375
395
|
@article{born2021trends,
|
|
376
396
|
title={Trends in Deep Learning for Property-driven Drug Design},
|
|
377
397
|
author={Born, Jannis and Manica, Matteo},
|
|
@@ -387,9 +407,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
|
|
|
387
407
|
## Contributions
|
|
388
408
|
Thanks to the following contributors:
|
|
389
409
|
- [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
|
|
410
|
+
|
|
390
411
|
- [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
|
|
412
|
+
|
|
391
413
|
- [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
|
|
414
|
+
|
|
392
415
|
- [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
|
|
416
|
+
|
|
393
417
|
- [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
|
|
394
|
-
|
|
418
|
+
|
|
419
|
+
- [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
|
|
420
|
+
|
|
395
421
|
- [@juliusbierk](https://github.com/juliusbierk): Bugfixes
|
|
@@ -6,17 +6,16 @@ from typing import Dict, List, Literal, Union
|
|
|
6
6
|
|
|
7
7
|
import arxiv
|
|
8
8
|
import pandas as pd
|
|
9
|
-
import pkg_resources
|
|
10
9
|
from tqdm import tqdm
|
|
11
10
|
|
|
12
|
-
from ..utils import dump_papers
|
|
11
|
+
from ..utils import dump_papers, get_server_dumps_dir
|
|
13
12
|
from ..xrxiv.xrxiv_query import XRXivQuery
|
|
14
13
|
from .utils import get_query_from_keywords, infer_backend
|
|
15
14
|
|
|
16
15
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
18
17
|
|
|
19
|
-
dump_root =
|
|
18
|
+
dump_root = get_server_dumps_dir()
|
|
20
19
|
|
|
21
20
|
global ARXIV_QUERIER
|
|
22
21
|
ARXIV_QUERIER = None
|
|
@@ -94,7 +93,7 @@ def get_arxiv_papers_api(
|
|
|
94
93
|
fields as desired.
|
|
95
94
|
|
|
96
95
|
Args:
|
|
97
|
-
query Query to arxiv API. Needs to match the arxiv API notation.
|
|
96
|
+
query: Query to arxiv API. Needs to match the arxiv API notation.
|
|
98
97
|
fields: List of strings with fields to keep in output.
|
|
99
98
|
max_results: Maximal number of results, defaults to 99999.
|
|
100
99
|
client_options: Optional arguments for `arxiv.Client`. E.g.:
|
|
@@ -144,7 +143,7 @@ def get_and_dump_arxiv_papers(
|
|
|
144
143
|
keywords: List of keywords for arxiv search.
|
|
145
144
|
The outer list level will be considered as AND separated keys, the
|
|
146
145
|
inner level as OR separated.
|
|
147
|
-
|
|
146
|
+
output_filepath: Path where the dump will be saved.
|
|
148
147
|
fields: List of strings with fields to keep in output.
|
|
149
148
|
Defaults to ['title', 'authors', 'date', 'abstract',
|
|
150
149
|
'journal', 'doi'].
|
|
@@ -3,7 +3,7 @@ import os
|
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from typing import List, Union
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
from ..utils import get_server_dumps_dir
|
|
7
7
|
|
|
8
8
|
finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
|
|
9
9
|
finalize_conjunction = lambda x: x[:-5]
|
|
@@ -59,6 +59,6 @@ def get_query_from_keywords(
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
def infer_backend():
|
|
62
|
-
dump_root =
|
|
62
|
+
dump_root = get_server_dumps_dir()
|
|
63
63
|
dump_paths = glob.glob(os.path.join(dump_root, "arxiv" + "*"))
|
|
64
64
|
return "api" if not dump_paths else "local"
|
|
@@ -49,14 +49,20 @@ def optional_async(
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def retry_with_exponential_backoff(
|
|
52
|
-
*,
|
|
52
|
+
*,
|
|
53
|
+
max_retries: int = 5,
|
|
54
|
+
base_delay: float = 1.0,
|
|
55
|
+
factor: float = 1.3,
|
|
56
|
+
constant_delay: float = 0.2,
|
|
53
57
|
) -> Callable[[F], F]:
|
|
54
58
|
"""
|
|
55
59
|
Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
|
|
56
60
|
|
|
57
61
|
Args:
|
|
58
62
|
max_retries: how many times to retry before giving up.
|
|
59
|
-
base_delay: initial delay in seconds; next delays will be
|
|
63
|
+
base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
|
|
64
|
+
factor: multiplier for delay after each retry.
|
|
65
|
+
constant_delay: fixed delay before each attempt.
|
|
60
66
|
|
|
61
67
|
Usage:
|
|
62
68
|
|
|
@@ -70,18 +76,39 @@ def retry_with_exponential_backoff(
|
|
|
70
76
|
@wraps(func)
|
|
71
77
|
async def wrapper(*args, **kwargs) -> Any:
|
|
72
78
|
delay = base_delay
|
|
73
|
-
|
|
79
|
+
last_exception: BaseException | None = None
|
|
80
|
+
for attempt in range(1, max_retries + 1):
|
|
81
|
+
await asyncio.sleep(constant_delay)
|
|
74
82
|
try:
|
|
75
83
|
return await func(*args, **kwargs)
|
|
76
84
|
except httpx.HTTPStatusError as e:
|
|
77
|
-
# only retry on 429
|
|
78
85
|
status = e.response.status_code if e.response is not None else None
|
|
79
|
-
if status != 429
|
|
86
|
+
if status != 429:
|
|
80
87
|
raise
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
88
|
+
last_exception = e
|
|
89
|
+
sleep_for = delay
|
|
90
|
+
if e.response is not None:
|
|
91
|
+
ra = e.response.headers.get("Retry-After")
|
|
92
|
+
if ra is not None:
|
|
93
|
+
try:
|
|
94
|
+
sleep_for = float(ra)
|
|
95
|
+
except ValueError:
|
|
96
|
+
pass
|
|
97
|
+
delay *= factor
|
|
98
|
+
|
|
99
|
+
except httpx.ReadError as e:
|
|
100
|
+
last_exception = e
|
|
101
|
+
sleep_for = delay
|
|
102
|
+
delay *= factor
|
|
103
|
+
|
|
104
|
+
if attempt == max_retries:
|
|
105
|
+
msg = (
|
|
106
|
+
f"{func.__name__} failed after {attempt} attempts with "
|
|
107
|
+
f"last delay {sleep_for:.2f}s"
|
|
108
|
+
)
|
|
109
|
+
raise RuntimeError(msg) from last_exception
|
|
110
|
+
|
|
111
|
+
await asyncio.sleep(sleep_for)
|
|
85
112
|
|
|
86
113
|
return wrapper
|
|
87
114
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import os
|
|
2
3
|
import sys
|
|
3
4
|
from time import sleep
|
|
4
5
|
|
|
@@ -7,7 +8,7 @@ from semanticscholar import SemanticScholar, SemanticScholarException
|
|
|
7
8
|
|
|
8
9
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
9
10
|
logger = logging.getLogger(__name__)
|
|
10
|
-
sch = SemanticScholar()
|
|
11
|
+
sch = SemanticScholar(api_key=os.getenv("SS_API_KEY"))
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def get_citations_by_doi(doi: str) -> int:
|
|
@@ -5,14 +5,15 @@ from pydantic import BaseModel
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class EntityResult(BaseModel):
|
|
8
|
-
num_citations: int
|
|
9
|
-
num_references: int
|
|
10
|
-
# keys are authors or papers and values are absolute self links
|
|
11
|
-
self_citations: Dict[str, int] = {}
|
|
12
|
-
self_references: Dict[str, int] = {}
|
|
13
8
|
# aggregated results
|
|
14
9
|
self_citation_ratio: float = 0
|
|
15
10
|
self_reference_ratio: float = 0
|
|
11
|
+
# total number of author citations/references
|
|
12
|
+
num_citations: int
|
|
13
|
+
num_references: int
|
|
14
|
+
# keys are papers and values are percentage of self citations/references
|
|
15
|
+
self_citations: Dict[str, float] = {}
|
|
16
|
+
self_references: Dict[str, float] = {}
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
class Entity:
|
|
@@ -68,14 +68,14 @@ class Paper(Entity):
|
|
|
68
68
|
Extracts the self references of a paper, for each author.
|
|
69
69
|
"""
|
|
70
70
|
if isinstance(self.doi, str):
|
|
71
|
-
self.
|
|
71
|
+
self.self_ref: ReferenceResult = self_references_paper(self.doi)
|
|
72
72
|
|
|
73
73
|
def self_citations(self):
|
|
74
74
|
"""
|
|
75
75
|
Extracts the self citations of a paper, for each author.
|
|
76
76
|
"""
|
|
77
77
|
if isinstance(self.doi, str):
|
|
78
|
-
self.
|
|
78
|
+
self.self_cite: CitationResult = self_citations_paper(self.doi)
|
|
79
79
|
|
|
80
80
|
def get_result(self) -> Optional[PaperResult]:
|
|
81
81
|
"""
|
|
@@ -83,18 +83,20 @@ class Paper(Entity):
|
|
|
83
83
|
|
|
84
84
|
Returns: PaperResult if available.
|
|
85
85
|
"""
|
|
86
|
-
if not hasattr(self, "
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
)
|
|
90
|
-
return
|
|
91
|
-
elif not hasattr(self, "citation_result"):
|
|
92
|
-
logger.warning(
|
|
93
|
-
f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
|
|
94
|
-
)
|
|
95
|
-
return
|
|
96
|
-
ref_result = self.ref_result.model_dump()
|
|
97
|
-
ref_result.pop("ssid", None)
|
|
86
|
+
if not hasattr(self, "self_ref"):
|
|
87
|
+
self.self_references()
|
|
88
|
+
if not hasattr(self, "self_cite"):
|
|
89
|
+
self.self_citations()
|
|
98
90
|
return PaperResult(
|
|
99
|
-
title=self.title,
|
|
91
|
+
title=self.title,
|
|
92
|
+
**{
|
|
93
|
+
k: v
|
|
94
|
+
for k, v in self.self_ref.model_dump().items()
|
|
95
|
+
if k not in ["ssid", "title"]
|
|
96
|
+
},
|
|
97
|
+
**{
|
|
98
|
+
k: v
|
|
99
|
+
for k, v in self.self_cite.model_dump().items()
|
|
100
|
+
if k not in ["title"]
|
|
101
|
+
},
|
|
100
102
|
)
|