paperscraper 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. {paperscraper-0.3.2 → paperscraper-0.3.4}/PKG-INFO +53 -34
  2. {paperscraper-0.3.2 → paperscraper-0.3.4}/README.md +34 -8
  3. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/__init__.py +1 -1
  4. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/arxiv/arxiv.py +4 -5
  5. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/arxiv/utils.py +2 -2
  6. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/async_utils.py +36 -9
  7. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/citations.py +2 -1
  8. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/entity/core.py +6 -5
  9. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/entity/paper.py +17 -15
  10. paperscraper-0.3.4/paperscraper/citations/entity/researcher.py +221 -0
  11. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/self_citations.py +5 -2
  12. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/self_references.py +68 -42
  13. paperscraper-0.3.4/paperscraper/citations/tests/test_citations.py +32 -0
  14. paperscraper-0.3.4/paperscraper/citations/tests/test_self_citations.py +147 -0
  15. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/tests/test_self_references.py +49 -4
  16. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/utils.py +99 -51
  17. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/arxiv.py +2 -2
  18. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/biorxiv.py +2 -2
  19. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/chemrxiv.py +2 -3
  20. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/medrxiv.py +2 -2
  21. paperscraper-0.3.4/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +253 -0
  22. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/utils/chemrxiv/utils.py +33 -18
  23. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/load_dumps.py +2 -3
  24. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pdf/fallbacks.py +138 -56
  25. paperscraper-0.3.4/paperscraper/pdf/pdf.py +442 -0
  26. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pdf/utils.py +21 -0
  27. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/plotting.py +17 -23
  28. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/postprocessing.py +2 -3
  29. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/pubmed.py +22 -14
  30. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/scholar.py +6 -10
  31. paperscraper-0.3.4/paperscraper/server_dumps/__init__.py +4 -0
  32. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/test_dump.py +8 -2
  33. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/test_impactor.py +23 -4
  34. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/test_pdf.py +0 -5
  35. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/utils.py +6 -0
  36. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/PKG-INFO +53 -34
  37. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/SOURCES.txt +1 -1
  38. paperscraper-0.3.4/paperscraper.egg-info/requires.txt +19 -0
  39. paperscraper-0.3.4/pyproject.toml +90 -0
  40. paperscraper-0.3.2/paperscraper/citations/entity/researcher.py +0 -90
  41. paperscraper-0.3.2/paperscraper/citations/tests/test_citations.py +0 -18
  42. paperscraper-0.3.2/paperscraper/citations/tests/test_self_citations.py +0 -72
  43. paperscraper-0.3.2/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -144
  44. paperscraper-0.3.2/paperscraper/pdf/pdf.py +0 -250
  45. paperscraper-0.3.2/paperscraper/server_dumps/__init__.py +0 -1
  46. paperscraper-0.3.2/paperscraper.egg-info/requires.txt +0 -19
  47. paperscraper-0.3.2/setup.py +0 -77
  48. {paperscraper-0.3.2 → paperscraper-0.3.4}/LICENSE +0 -0
  49. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/arxiv/__init__.py +0 -0
  50. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/__init__.py +0 -0
  51. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/core.py +0 -0
  52. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/entity/__init__.py +0 -0
  53. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/orcid.py +0 -0
  54. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/tests/__init__.py +0 -0
  55. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/citations/tests/test_paper.py +0 -0
  56. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/__init__.py +0 -0
  57. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/utils/__init__.py +0 -0
  58. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  59. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/impact.py +0 -0
  60. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pdf/__init__.py +0 -0
  61. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/__init__.py +0 -0
  62. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/tests/__init__.py +0 -0
  63. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
  64. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/pubmed/utils.py +0 -0
  65. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/__init__.py +0 -0
  66. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/core.py +0 -0
  67. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/tests/__init__.py +0 -0
  68. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/scholar/tests/test_scholar.py +0 -0
  69. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/tests/__init__.py +0 -0
  70. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/__init__.py +0 -0
  71. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/tests/__init__.py +0 -0
  72. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
  73. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/xrxiv_api.py +0 -0
  74. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  75. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/dependency_links.txt +0 -0
  76. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/not-zip-safe +0 -0
  77. {paperscraper-0.3.2 → paperscraper-0.3.4}/paperscraper.egg-info/top_level.txt +0 -0
  78. {paperscraper-0.3.2 → paperscraper-0.3.4}/setup.cfg +0 -0
@@ -1,62 +1,55 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: paperscraper: Package to scrape papers.
5
- Home-page: https://github.com/jannisborn/paperscraper
6
- Author: Jannis Born, Matteo Manica
7
- Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
5
+ Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
8
6
  License: MIT
7
+ Project-URL: Homepage, https://github.com/jannisborn/paperscraper
8
+ Project-URL: Documentation, https://jannisborn.github.io/paperscraper/
9
+ Project-URL: Repository, https://github.com/jannisborn/paperscraper
9
10
  Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
10
11
  Classifier: Development Status :: 3 - Alpha
11
12
  Classifier: Intended Audience :: Developers
12
13
  Classifier: Intended Audience :: Science/Research
13
14
  Classifier: License :: OSI Approved :: MIT License
14
15
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.8
16
16
  Classifier: Programming Language :: Python :: 3.9
17
17
  Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
19
21
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.9
20
23
  Description-Content-Type: text/markdown
21
24
  License-File: LICENSE
22
- Requires-Dist: arxiv>=1.4.2
25
+ Requires-Dist: arxiv>=1.4.7
23
26
  Requires-Dist: pymed-paperscraper>=1.0.4
24
- Requires-Dist: pandas
25
- Requires-Dist: requests
26
- Requires-Dist: tqdm
27
+ Requires-Dist: pandas>=1.0.4
28
+ Requires-Dist: requests==2.32.0
29
+ Requires-Dist: tqdm>=4.51.0
27
30
  Requires-Dist: scholarly>=1.0.0
28
- Requires-Dist: seaborn
29
- Requires-Dist: matplotlib
30
- Requires-Dist: matplotlib_venn
31
- Requires-Dist: bs4
32
- Requires-Dist: impact-factor>=1.1.1
33
- Requires-Dist: thefuzz
31
+ Requires-Dist: seaborn>=0.11.0
32
+ Requires-Dist: matplotlib>=3.3.2
33
+ Requires-Dist: matplotlib-venn>=0.11.5
34
+ Requires-Dist: bs4>=0.0.1
35
+ Requires-Dist: impact-factor>=1.1.3
36
+ Requires-Dist: thefuzz>=0.20.0
34
37
  Requires-Dist: pytest
35
38
  Requires-Dist: tldextract
36
- Requires-Dist: semanticscholar
39
+ Requires-Dist: semanticscholar>=0.8.4
37
40
  Requires-Dist: pydantic
38
41
  Requires-Dist: unidecode
39
42
  Requires-Dist: dotenv
40
43
  Requires-Dist: boto3
41
- Dynamic: author
42
- Dynamic: author-email
43
- Dynamic: classifier
44
- Dynamic: description
45
- Dynamic: description-content-type
46
- Dynamic: home-page
47
- Dynamic: keywords
48
- Dynamic: license
49
44
  Dynamic: license-file
50
- Dynamic: requires-dist
51
- Dynamic: summary
52
45
 
53
46
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
54
47
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
48
+ [![build](https://github.com/jannisborn/paperscraper/actions/workflows/docs.yml/badge.svg?branch=main)](https://jannisborn.github.io/paperscraper/)
55
49
  [![License:
56
50
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
57
51
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
58
52
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
59
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
60
53
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
61
54
  # paperscraper
62
55
 
@@ -66,6 +59,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
66
59
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
67
60
  and plotting routines for meta-analysis.
68
61
 
62
+
69
63
  ## Table of Contents
70
64
 
71
65
  1. [Getting Started](#getting-started)
@@ -90,18 +84,30 @@ pip install paperscraper
90
84
 
91
85
  This is enough to query PubMed, arXiv or Google Scholar.
92
86
 
87
+ ### Local development
88
+
89
+ ```console
90
+ uv sync
91
+ ```
92
+
93
+ This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
94
+
95
+ ```console
96
+ uv run python -c "import paperscraper"
97
+ ```
98
+
93
99
  #### Download X-rxiv Dumps
94
100
 
95
- However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
101
+ However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
96
102
 
97
103
  ```py
98
104
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
99
- medrxiv() # Takes ~30min and should result in ~35 MB file
100
- biorxiv() # Takes ~1h and should result in ~350 MB file
101
- chemrxiv() # Takes ~45min and should result in ~20 MB file
105
+ chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
106
+ medrxiv() # Takes <1h -> +90K papers (~200 MB file)
107
+ biorxiv() # Up to 6h -> +400K papers (~800 MB file)
102
108
  ```
103
109
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
104
- *NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
110
+ *NOTE*: If you experience API connection issues: since v0.2.12, failed requests are retried automatically. You can raise the number of retries from the default of 10, e.g., `biorxiv(max_retries=20)`.
105
111
 
106
112
  Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
107
113
  ```py
@@ -279,6 +285,13 @@ doi = '10.1021/acs.jcim.3c00132'
279
285
  get_citations_by_doi(doi)
280
286
  ```
281
287
 
288
+ NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial), which is rate-limited. If you have an API key, set it via:
289
+ ```sh
290
+ export SS_API_KEY=YOUR_API_KEY
291
+ ```
292
+ This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
293
+
294
+
282
295
  ### Journal impact factor
283
296
 
284
297
  You can also retrieve the impact factor for all journals:
@@ -423,7 +436,7 @@ plot_multiple_venn(
423
436
  ## Citation
424
437
  If you use `paperscraper`, please cite a paper that motivated our development of this tool.
425
438
 
426
- ```bib
439
+ ```bibtex
427
440
  @article{born2021trends,
428
441
  title={Trends in Deep Learning for Property-driven Drug Design},
429
442
  author={Born, Jannis and Manica, Matteo},
@@ -439,9 +452,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
439
452
  ## Contributions
440
453
  Thanks to the following contributors:
441
454
  - [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
455
+
442
456
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
457
+
443
458
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
459
+
444
460
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
461
+
445
462
  - [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
446
- - [@lukasschwab](https://github.com/lukasschwab): Bumped `arxiv` dependency to >`1.4.2` in paperscraper `v0.1.0`.
463
+
464
+ - [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
465
+
447
466
  - [@juliusbierk](https://github.com/juliusbierk): Bugfixes
@@ -1,10 +1,10 @@
1
1
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
2
2
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
3
+ [![build](https://github.com/jannisborn/paperscraper/actions/workflows/docs.yml/badge.svg?branch=main)](https://jannisborn.github.io/paperscraper/)
3
4
  [![License:
4
5
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
6
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
6
7
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
7
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
8
8
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
9
9
  # paperscraper
10
10
 
@@ -14,6 +14,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
14
14
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
15
15
  and plotting routines for meta-analysis.
16
16
 
17
+
17
18
  ## Table of Contents
18
19
 
19
20
  1. [Getting Started](#getting-started)
@@ -38,18 +39,30 @@ pip install paperscraper
38
39
 
39
40
  This is enough to query PubMed, arXiv or Google Scholar.
40
41
 
42
+ ### Local development
43
+
44
+ ```console
45
+ uv sync
46
+ ```
47
+
48
+ This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
49
+
50
+ ```console
51
+ uv run python -c "import paperscraper"
52
+ ```
53
+
41
54
  #### Download X-rxiv Dumps
42
55
 
43
- However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
56
+ However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
44
57
 
45
58
  ```py
46
59
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
47
- medrxiv() # Takes ~30min and should result in ~35 MB file
48
- biorxiv() # Takes ~1h and should result in ~350 MB file
49
- chemrxiv() # Takes ~45min and should result in ~20 MB file
60
+ chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
61
+ medrxiv() # Takes <1h -> +90K papers (~200 MB file)
62
+ biorxiv() # Up to 6h -> +400K papers (~800 MB file)
50
63
  ```
51
64
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
52
- *NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
65
+ *NOTE*: If you experience API connection issues: since v0.2.12, failed requests are retried automatically. You can raise the number of retries from the default of 10, e.g., `biorxiv(max_retries=20)`.
53
66
 
54
67
  Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
55
68
  ```py
@@ -227,6 +240,13 @@ doi = '10.1021/acs.jcim.3c00132'
227
240
  get_citations_by_doi(doi)
228
241
  ```
229
242
 
243
+ NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial), which is rate-limited. If you have an API key, set it via:
244
+ ```sh
245
+ export SS_API_KEY=YOUR_API_KEY
246
+ ```
247
+ This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
248
+
249
+
230
250
  ### Journal impact factor
231
251
 
232
252
  You can also retrieve the impact factor for all journals:
@@ -371,7 +391,7 @@ plot_multiple_venn(
371
391
  ## Citation
372
392
  If you use `paperscraper`, please cite a paper that motivated our development of this tool.
373
393
 
374
- ```bib
394
+ ```bibtex
375
395
  @article{born2021trends,
376
396
  title={Trends in Deep Learning for Property-driven Drug Design},
377
397
  author={Born, Jannis and Manica, Matteo},
@@ -387,9 +407,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
387
407
  ## Contributions
388
408
  Thanks to the following contributors:
389
409
  - [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
410
+
390
411
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
412
+
391
413
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
414
+
392
415
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
416
+
393
417
  - [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
394
- - [@lukasschwab](https://github.com/lukasschwab): Bumped `arxiv` dependency to >`1.4.2` in paperscraper `v0.1.0`.
418
+
419
+ - [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
420
+
395
421
  - [@juliusbierk](https://github.com/juliusbierk): Bugfixes
@@ -1,7 +1,7 @@
1
1
  """Initialize the module."""
2
2
 
3
3
  __name__ = "paperscraper"
4
- __version__ = "0.3.2"
4
+ __version__ = "0.3.4"
5
5
 
6
6
  import logging
7
7
  import os
@@ -6,17 +6,16 @@ from typing import Dict, List, Literal, Union
6
6
 
7
7
  import arxiv
8
8
  import pandas as pd
9
- import pkg_resources
10
9
  from tqdm import tqdm
11
10
 
12
- from ..utils import dump_papers
11
+ from ..utils import dump_papers, get_server_dumps_dir
13
12
  from ..xrxiv.xrxiv_query import XRXivQuery
14
13
  from .utils import get_query_from_keywords, infer_backend
15
14
 
16
15
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
17
16
  logger = logging.getLogger(__name__)
18
17
 
19
- dump_root = pkg_resources.resource_filename("paperscraper", "server_dumps")
18
+ dump_root = get_server_dumps_dir()
20
19
 
21
20
  global ARXIV_QUERIER
22
21
  ARXIV_QUERIER = None
@@ -94,7 +93,7 @@ def get_arxiv_papers_api(
94
93
  fields as desired.
95
94
 
96
95
  Args:
97
- query Query to arxiv API. Needs to match the arxiv API notation.
96
+ query: Query to arxiv API. Needs to match the arxiv API notation.
98
97
  fields: List of strings with fields to keep in output.
99
98
  max_results: Maximal number of results, defaults to 99999.
100
99
  client_options: Optional arguments for `arxiv.Client`. E.g.:
@@ -144,7 +143,7 @@ def get_and_dump_arxiv_papers(
144
143
  keywords: List of keywords for arxiv search.
145
144
  The outer list level will be considered as AND separated keys, the
146
145
  inner level as OR separated.
147
- filepath: Path where the dump will be saved.
146
+ output_filepath: Path where the dump will be saved.
148
147
  fields: List of strings with fields to keep in output.
149
148
  Defaults to ['title', 'authors', 'date', 'abstract',
150
149
  'journal', 'doi'].
@@ -3,7 +3,7 @@ import os
3
3
  from datetime import datetime
4
4
  from typing import List, Union
5
5
 
6
- import pkg_resources
6
+ from ..utils import get_server_dumps_dir
7
7
 
8
8
  finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
9
9
  finalize_conjunction = lambda x: x[:-5]
@@ -59,6 +59,6 @@ def get_query_from_keywords(
59
59
 
60
60
 
61
61
  def infer_backend():
62
- dump_root = pkg_resources.resource_filename("paperscraper", "server_dumps")
62
+ dump_root = get_server_dumps_dir()
63
63
  dump_paths = glob.glob(os.path.join(dump_root, "arxiv" + "*"))
64
64
  return "api" if not dump_paths else "local"
@@ -49,14 +49,20 @@ def optional_async(
49
49
 
50
50
 
51
51
  def retry_with_exponential_backoff(
52
- *, max_retries: int = 5, base_delay: float = 1.0
52
+ *,
53
+ max_retries: int = 5,
54
+ base_delay: float = 1.0,
55
+ factor: float = 1.3,
56
+ constant_delay: float = 0.2,
53
57
  ) -> Callable[[F], F]:
54
58
  """
55
59
  Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
56
60
 
57
61
  Args:
58
62
  max_retries: how many times to retry before giving up.
59
- base_delay: initial delay in seconds; next delays will be duplication of previous.
63
+ base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
64
+ factor: multiplier for delay after each retry.
65
+ constant_delay: fixed delay before each attempt.
60
66
 
61
67
  Usage:
62
68
 
@@ -70,18 +76,39 @@ def retry_with_exponential_backoff(
70
76
  @wraps(func)
71
77
  async def wrapper(*args, **kwargs) -> Any:
72
78
  delay = base_delay
73
- for attempt in range(max_retries):
79
+ last_exception: BaseException | None = None
80
+ for attempt in range(1, max_retries + 1):
81
+ await asyncio.sleep(constant_delay)
74
82
  try:
75
83
  return await func(*args, **kwargs)
76
84
  except httpx.HTTPStatusError as e:
77
- # only retry on 429
78
85
  status = e.response.status_code if e.response is not None else None
79
- if status != 429 or attempt == max_retries - 1:
86
+ if status != 429:
80
87
  raise
81
- # backoff
82
- await asyncio.sleep(delay)
83
- delay *= 2
84
- # in theory we never reach here
88
+ last_exception = e
89
+ sleep_for = delay
90
+ if e.response is not None:
91
+ ra = e.response.headers.get("Retry-After")
92
+ if ra is not None:
93
+ try:
94
+ sleep_for = float(ra)
95
+ except ValueError:
96
+ pass
97
+ delay *= factor
98
+
99
+ except httpx.ReadError as e:
100
+ last_exception = e
101
+ sleep_for = delay
102
+ delay *= factor
103
+
104
+ if attempt == max_retries:
105
+ msg = (
106
+ f"{func.__name__} failed after {attempt} attempts with "
107
+ f"last delay {sleep_for:.2f}s"
108
+ )
109
+ raise RuntimeError(msg) from last_exception
110
+
111
+ await asyncio.sleep(sleep_for)
85
112
 
86
113
  return wrapper
87
114
 
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import os
2
3
  import sys
3
4
  from time import sleep
4
5
 
@@ -7,7 +8,7 @@ from semanticscholar import SemanticScholar, SemanticScholarException
7
8
 
8
9
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
9
10
  logger = logging.getLogger(__name__)
10
- sch = SemanticScholar()
11
+ sch = SemanticScholar(api_key=os.getenv("SS_API_KEY"))
11
12
 
12
13
 
13
14
  def get_citations_by_doi(doi: str) -> int:
@@ -5,14 +5,15 @@ from pydantic import BaseModel
5
5
 
6
6
 
7
7
  class EntityResult(BaseModel):
8
- num_citations: int
9
- num_references: int
10
- # keys are authors or papers and values are absolute self links
11
- self_citations: Dict[str, int] = {}
12
- self_references: Dict[str, int] = {}
13
8
  # aggregated results
14
9
  self_citation_ratio: float = 0
15
10
  self_reference_ratio: float = 0
11
+ # total number of author citations/references
12
+ num_citations: int
13
+ num_references: int
14
+ # keys are papers and values are percentage of self citations/references
15
+ self_citations: Dict[str, float] = {}
16
+ self_references: Dict[str, float] = {}
16
17
 
17
18
 
18
19
  class Entity:
@@ -68,14 +68,14 @@ class Paper(Entity):
68
68
  Extracts the self references of a paper, for each author.
69
69
  """
70
70
  if isinstance(self.doi, str):
71
- self.ref_result: ReferenceResult = self_references_paper(self.doi)
71
+ self.self_ref: ReferenceResult = self_references_paper(self.doi)
72
72
 
73
73
  def self_citations(self):
74
74
  """
75
75
  Extracts the self citations of a paper, for each author.
76
76
  """
77
77
  if isinstance(self.doi, str):
78
- self.citation_result: CitationResult = self_citations_paper(self.doi)
78
+ self.self_cite: CitationResult = self_citations_paper(self.doi)
79
79
 
80
80
  def get_result(self) -> Optional[PaperResult]:
81
81
  """
@@ -83,18 +83,20 @@ class Paper(Entity):
83
83
 
84
84
  Returns: PaperResult if available.
85
85
  """
86
- if not hasattr(self, "ref_result"):
87
- logger.warning(
88
- f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
89
- )
90
- return
91
- elif not hasattr(self, "citation_result"):
92
- logger.warning(
93
- f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
94
- )
95
- return
96
- ref_result = self.ref_result.model_dump()
97
- ref_result.pop("ssid", None)
86
+ if not hasattr(self, "self_ref"):
87
+ self.self_references()
88
+ if not hasattr(self, "self_cite"):
89
+ self.self_citations()
98
90
  return PaperResult(
99
- title=self.title, **ref_result, **self.citation_result.model_dump()
91
+ title=self.title,
92
+ **{
93
+ k: v
94
+ for k, v in self.self_ref.model_dump().items()
95
+ if k not in ["ssid", "title"]
96
+ },
97
+ **{
98
+ k: v
99
+ for k, v in self.self_cite.model_dump().items()
100
+ if k not in ["title"]
101
+ },
100
102
  )