paperscraper 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {paperscraper-0.3.2 → paperscraper-0.3.3}/PKG-INFO +16 -9
  2. {paperscraper-0.3.2 → paperscraper-0.3.3}/README.md +15 -8
  3. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/__init__.py +1 -1
  4. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/arxiv/arxiv.py +2 -2
  5. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/tests/test_self_citations.py +1 -2
  6. paperscraper-0.3.3/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +216 -0
  7. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/utils/chemrxiv/utils.py +21 -14
  8. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pdf/fallbacks.py +6 -3
  9. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/plotting.py +17 -23
  10. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/postprocessing.py +2 -3
  11. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pubmed/pubmed.py +12 -12
  12. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/scholar/scholar.py +6 -10
  13. paperscraper-0.3.3/paperscraper/server_dumps/__init__.py +4 -0
  14. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper.egg-info/PKG-INFO +16 -9
  15. paperscraper-0.3.2/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -144
  16. paperscraper-0.3.2/paperscraper/server_dumps/__init__.py +0 -1
  17. {paperscraper-0.3.2 → paperscraper-0.3.3}/LICENSE +0 -0
  18. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/arxiv/__init__.py +0 -0
  19. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/arxiv/utils.py +0 -0
  20. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/async_utils.py +0 -0
  21. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/__init__.py +0 -0
  22. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/citations.py +0 -0
  23. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/core.py +0 -0
  24. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/entity/__init__.py +0 -0
  25. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/entity/core.py +0 -0
  26. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/entity/paper.py +0 -0
  27. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/entity/researcher.py +0 -0
  28. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/orcid.py +0 -0
  29. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/self_citations.py +0 -0
  30. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/self_references.py +0 -0
  31. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/tests/__init__.py +0 -0
  32. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/tests/test_citations.py +0 -0
  33. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/tests/test_paper.py +0 -0
  34. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/tests/test_self_references.py +0 -0
  35. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/citations/utils.py +0 -0
  36. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/__init__.py +0 -0
  37. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/arxiv.py +0 -0
  38. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/biorxiv.py +0 -0
  39. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/chemrxiv.py +0 -0
  40. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/medrxiv.py +0 -0
  41. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/utils/__init__.py +0 -0
  42. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  43. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/impact.py +0 -0
  44. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/load_dumps.py +0 -0
  45. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pdf/__init__.py +0 -0
  46. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pdf/pdf.py +0 -0
  47. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pdf/utils.py +0 -0
  48. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pubmed/__init__.py +0 -0
  49. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pubmed/tests/__init__.py +0 -0
  50. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
  51. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/pubmed/utils.py +0 -0
  52. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/scholar/__init__.py +0 -0
  53. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/scholar/core.py +0 -0
  54. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/scholar/tests/__init__.py +0 -0
  55. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/scholar/tests/test_scholar.py +0 -0
  56. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/tests/__init__.py +0 -0
  57. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/tests/test_dump.py +0 -0
  58. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/tests/test_impactor.py +0 -0
  59. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/tests/test_pdf.py +0 -0
  60. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/utils.py +0 -0
  61. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/xrxiv/__init__.py +0 -0
  62. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/xrxiv/tests/__init__.py +0 -0
  63. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
  64. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/xrxiv/xrxiv_api.py +0 -0
  65. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  66. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper.egg-info/SOURCES.txt +0 -0
  67. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper.egg-info/dependency_links.txt +0 -0
  68. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper.egg-info/not-zip-safe +0 -0
  69. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper.egg-info/requires.txt +0 -0
  70. {paperscraper-0.3.2 → paperscraper-0.3.3}/paperscraper.egg-info/top_level.txt +0 -0
  71. {paperscraper-0.3.2 → paperscraper-0.3.3}/setup.cfg +0 -0
  72. {paperscraper-0.3.2 → paperscraper-0.3.3}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/jannisborn/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
@@ -52,11 +52,11 @@ Dynamic: summary
52
52
 
53
53
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
54
54
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
55
+ [![build](https://github.com/jannisborn/paperscraper/actions/workflows/docs.yml/badge.svg?branch=main)](https://jannisborn.github.io/paperscraper/)
55
56
  [![License:
56
57
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
57
58
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
58
59
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
59
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
60
60
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
61
61
  # paperscraper
62
62
 
@@ -66,6 +66,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
66
66
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
67
67
  and plotting routines for meta-analysis.
68
68
 
69
+
69
70
  ## Table of Contents
70
71
 
71
72
  1. [Getting Started](#getting-started)
@@ -92,16 +93,16 @@ This is enough to query PubMed, arXiv or Google Scholar.
92
93
 
93
94
  #### Download X-rxiv Dumps
94
95
 
95
- However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
96
+ However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
96
97
 
97
98
  ```py
98
99
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
99
- medrxiv() # Takes ~30min and should result in ~35 MB file
100
- biorxiv() # Takes ~1h and should result in ~350 MB file
101
- chemrxiv() # Takes ~45min and should result in ~20 MB file
100
+ chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
101
+ medrxiv() # Takes <1h -> +90K papers (~200 MB file)
102
+ biorxiv() # Up to 6h -> +400K papers (~800 MB file)
102
103
  ```
103
104
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
104
- *NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
105
+ *NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
105
106
 
106
107
  Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
107
108
  ```py
@@ -423,7 +424,7 @@ plot_multiple_venn(
423
424
  ## Citation
424
425
  If you use `paperscraper`, please cite a paper that motivated our development of this tool.
425
426
 
426
- ```bib
427
+ ```bibtex
427
428
  @article{born2021trends,
428
429
  title={Trends in Deep Learning for Property-driven Drug Design},
429
430
  author={Born, Jannis and Manica, Matteo},
@@ -439,9 +440,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
439
440
  ## Contributions
440
441
  Thanks to the following contributors:
441
442
  - [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
443
+
442
444
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
445
+
443
446
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
447
+
444
448
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
449
+
445
450
  - [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
446
- - [@lukasschwab](https://github.com/lukasschwab): Bumped `arxiv` dependency to >`1.4.2` in paperscraper `v0.1.0`.
451
+
452
+ - [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
453
+
447
454
  - [@juliusbierk](https://github.com/juliusbierk): Bugfixes
@@ -1,10 +1,10 @@
1
1
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
2
2
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
3
+ [![build](https://github.com/jannisborn/paperscraper/actions/workflows/docs.yml/badge.svg?branch=main)](https://jannisborn.github.io/paperscraper/)
3
4
  [![License:
4
5
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
6
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
6
7
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
7
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
8
8
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
9
9
  # paperscraper
10
10
 
@@ -14,6 +14,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
14
14
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
15
15
  and plotting routines for meta-analysis.
16
16
 
17
+
17
18
  ## Table of Contents
18
19
 
19
20
  1. [Getting Started](#getting-started)
@@ -40,16 +41,16 @@ This is enough to query PubMed, arXiv or Google Scholar.
40
41
 
41
42
  #### Download X-rxiv Dumps
42
43
 
43
- However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
44
+ However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
44
45
 
45
46
  ```py
46
47
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
47
- medrxiv() # Takes ~30min and should result in ~35 MB file
48
- biorxiv() # Takes ~1h and should result in ~350 MB file
49
- chemrxiv() # Takes ~45min and should result in ~20 MB file
48
+ chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
49
+ medrxiv() # Takes <1h -> +90K papers (~200 MB file)
50
+ biorxiv() # Up to 6h -> +400K papers (~800 MB file)
50
51
  ```
51
52
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
52
- *NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
53
+ *NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
53
54
 
54
55
  Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
55
56
  ```py
@@ -371,7 +372,7 @@ plot_multiple_venn(
371
372
  ## Citation
372
373
  If you use `paperscraper`, please cite a paper that motivated our development of this tool.
373
374
 
374
- ```bib
375
+ ```bibtex
375
376
  @article{born2021trends,
376
377
  title={Trends in Deep Learning for Property-driven Drug Design},
377
378
  author={Born, Jannis and Manica, Matteo},
@@ -387,9 +388,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
387
388
  ## Contributions
388
389
  Thanks to the following contributors:
389
390
  - [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
391
+
390
392
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
393
+
391
394
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
395
+
392
396
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
397
+
393
398
  - [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
394
- - [@lukasschwab](https://github.com/lukasschwab): Bumped `arxiv` dependency to >`1.4.2` in paperscraper `v0.1.0`.
399
+
400
+ - [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
401
+
395
402
  - [@juliusbierk](https://github.com/juliusbierk): Bugfixes
@@ -1,7 +1,7 @@
1
1
  """Initialize the module."""
2
2
 
3
3
  __name__ = "paperscraper"
4
- __version__ = "0.3.2"
4
+ __version__ = "0.3.3"
5
5
 
6
6
  import logging
7
7
  import os
@@ -94,7 +94,7 @@ def get_arxiv_papers_api(
94
94
  fields as desired.
95
95
 
96
96
  Args:
97
- query Query to arxiv API. Needs to match the arxiv API notation.
97
+ query: Query to arxiv API. Needs to match the arxiv API notation.
98
98
  fields: List of strings with fields to keep in output.
99
99
  max_results: Maximal number of results, defaults to 99999.
100
100
  client_options: Optional arguments for `arxiv.Client`. E.g.:
@@ -144,7 +144,7 @@ def get_and_dump_arxiv_papers(
144
144
  keywords: List of keywords for arxiv search.
145
145
  The outer list level will be considered as AND separated keys, the
146
146
  inner level as OR separated.
147
- filepath: Path where the dump will be saved.
147
+ output_filepath: Path where the dump will be saved.
148
148
  fields: List of strings with fields to keep in output.
149
149
  Defaults to ['title', 'authors', 'date', 'abstract',
150
150
  'journal', 'doi'].
@@ -62,8 +62,7 @@ class TestSelfCitations:
62
62
  f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
63
63
  )
64
64
 
65
- # Assert that async execution (batch) is faster or at least not slower
66
- assert 0.9 * async_duration <= sync_duration, (
65
+ assert 0.1 * async_duration <= sync_duration, (
67
66
  f"Async execution ({async_duration:.2f}s) is slower than sync execution "
68
67
  f"({sync_duration:.2f}s)"
69
68
  )
@@ -0,0 +1,216 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from datetime import datetime
5
+ from time import sleep
6
+ from typing import Dict, Optional
7
+ from urllib.parse import urljoin
8
+
9
+ import requests
10
+ from requests.exceptions import (
11
+ ChunkedEncodingError,
12
+ ConnectionError,
13
+ ContentDecodingError,
14
+ JSONDecodeError,
15
+ ReadTimeout,
16
+ )
17
+ from urllib3.exceptions import DecodeError
18
+
19
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+ now_datetime = datetime.now()
23
+ launch_dates = {"chemrxiv": "2017-01-01"}
24
+
25
+
26
+ class ChemrxivAPI:
27
+ """Handle OpenEngage API requests, using access.
28
+ Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
29
+ """
30
+
31
+ base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1/"
32
+
33
+ def __init__(
34
+ self,
35
+ start_date: Optional[str] = None,
36
+ end_date: Optional[str] = None,
37
+ page_size: Optional[int] = None,
38
+ max_retries: int = 10,
39
+ ):
40
+ """
41
+ Initialize API class.
42
+
43
+ Args:
44
+ start_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
45
+ Defaults to None.
46
+ end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
47
+ Defaults to None.
48
+ page_size (int, optional): The batch size used to fetch the records from chemrxiv.
49
+ max_retries (int): Number of retries in case of error
50
+ """
51
+
52
+ self.page_size = page_size or 50
53
+ self.max_retries = max_retries
54
+
55
+ # Begin Date and End Date of the search
56
+ launch_date = launch_dates["chemrxiv"]
57
+ launch_datetime = datetime.fromisoformat(launch_date)
58
+
59
+ if start_date:
60
+ start_datetime = datetime.fromisoformat(start_date)
61
+ if start_datetime < launch_datetime:
62
+ self.start_date = launch_date
63
+ logger.warning(
64
+ f"Begin date {start_date} is before chemrxiv launch date. Will use {launch_date} instead."
65
+ )
66
+ else:
67
+ self.start_date = start_date
68
+ else:
69
+ self.start_date = launch_date
70
+ if end_date:
71
+ end_datetime = datetime.fromisoformat(end_date)
72
+ if end_datetime > now_datetime:
73
+ logger.warning(
74
+ f"End date {end_date} is in the future. Will use {now_datetime} instead."
75
+ )
76
+ self.end_date = now_datetime.strftime("%Y-%m-%d")
77
+ else:
78
+ self.end_date = end_date
79
+ else:
80
+ self.end_date = now_datetime.strftime("%Y-%m-%d")
81
+
82
+ def request(self, url, method, params=None, parse_json: bool = False):
83
+ """Send an API request to open Engage."""
84
+
85
+ headers = {"Accept-Encoding": "identity", "Accept": "application/json"}
86
+ retryable = (
87
+ ChunkedEncodingError,
88
+ ContentDecodingError,
89
+ DecodeError,
90
+ ReadTimeout,
91
+ ConnectionError,
92
+ )
93
+ transient_status = {429, 500, 502, 503, 504}
94
+ backoff = 0.1
95
+
96
+ for attempt in range(self.max_retries):
97
+ try:
98
+ if method.casefold() == "get":
99
+ response = requests.get(
100
+ url, params=params, headers=headers, timeout=(5, 30)
101
+ )
102
+ elif method.casefold() == "post":
103
+ response = requests.post(
104
+ url, json=params, headers=headers, timeout=(5, 30)
105
+ )
106
+ else:
107
+ raise ConnectionError(f"Unknown method for query: {method}")
108
+ if response.status_code in transient_status:
109
+ logger.warning(
110
+ f"{response.status_code} for {url} (attempt {attempt + 1}/{self.max_retries}); retrying in {backoff:.1f}s"
111
+ )
112
+ if attempt + 1 == self.max_retries:
113
+ response.raise_for_status()
114
+ sleep(backoff)
115
+ backoff = min(60.0, backoff * 2)
116
+ continue
117
+ elif 400 <= response.status_code < 500:
118
+ response.raise_for_status()
119
+ if not parse_json:
120
+ return response
121
+
122
+ try:
123
+ return response.json()
124
+ except JSONDecodeError:
125
+ logger.warning(
126
+ f"JSONDecodeError for {response.url} "
127
+ f"(attempt {attempt + 1}/{self.max_retries}); retrying in {backoff:.1f}s"
128
+ )
129
+ if attempt + 1 == self.max_retries:
130
+ raise
131
+ sleep(backoff)
132
+ backoff = min(60.0, backoff * 2)
133
+ continue
134
+
135
+ except retryable as e:
136
+ logger.warning(
137
+ f"{e.__class__.__name__} for {url} (attempt {attempt + 1}/{self.max_retries}); "
138
+ f"retrying in {backoff:.1f}s"
139
+ )
140
+ if attempt + 1 == self.max_retries:
141
+ raise
142
+ sleep(backoff)
143
+ backoff = min(60.0, backoff * 2)
144
+
145
+ def query(self, query, method="get", params=None):
146
+ """Perform a direct query."""
147
+
148
+ return self.request(
149
+ urljoin(self.base, query), method, params=params, parse_json=True
150
+ )
151
+
152
+ def query_generator(
153
+ self, query, method: str = "get", params: Optional[Dict] = None
154
+ ):
155
+ """Query for a list of items, with paging. Returns a generator."""
156
+
157
+ start_datetime = datetime.fromisoformat(self.start_date)
158
+ end_datetime = datetime.fromisoformat(self.end_date)
159
+
160
+ def year_windows():
161
+ year = start_datetime.year
162
+ while year <= end_datetime.year:
163
+ year_start = datetime(year, 1, 1)
164
+ year_end = datetime(year, 12, 31)
165
+ win_start = max(start_datetime, year_start)
166
+ win_end = min(end_datetime, year_end)
167
+ yield win_start.strftime("%Y-%m-%d"), win_end.strftime("%Y-%m-%d")
168
+ year += 1
169
+
170
+ params = (params or {}).copy()
171
+
172
+ for year_from, year_to in year_windows():
173
+ logger.info(f"Starting to scrape data from {year_from} to {year_to}")
174
+ page = 0
175
+ while True:
176
+ params.update(
177
+ {
178
+ "limit": self.page_size,
179
+ "skip": page * self.page_size,
180
+ "searchDateFrom": year_from,
181
+ "searchDateTo": year_to,
182
+ }
183
+ )
184
+ try:
185
+ data = self.request(
186
+ urljoin(self.base, query),
187
+ method,
188
+ params=params,
189
+ parse_json=True,
190
+ )
191
+ except requests.HTTPError as e:
192
+ status = getattr(e.response, "status_code", None)
193
+ logger.warning(
194
+ f"Stopping year window {year_from}..{year_to} at skip={page * self.page_size} "
195
+ f"due to HTTPError {status}"
196
+ )
197
+ break
198
+ items = data.get("itemHits", [])
199
+ if not items:
200
+ break
201
+ for item in items:
202
+ yield item
203
+ page += 1
204
+
205
+ def all_preprints(self):
206
+ """Return a generator to all the chemRxiv articles."""
207
+ return self.query_generator("items")
208
+
209
+ def preprint(self, article_id):
210
+ """Information on a given preprint.
211
+ .. seealso:: https://docs.figshare.com/#public_article
212
+ """
213
+ return self.query(os.path.join("items", article_id))
214
+
215
+ def number_of_preprints(self):
216
+ return self.query("items")["totalCount"]
@@ -7,9 +7,15 @@ import sys
7
7
  from datetime import datetime
8
8
  from typing import Dict, List, Optional
9
9
 
10
- from requests.exceptions import SSLError
10
+ from requests.exceptions import (
11
+ ChunkedEncodingError,
12
+ ContentDecodingError,
13
+ JSONDecodeError,
14
+ SSLError,
15
+ )
11
16
  from requests.models import HTTPError
12
17
  from tqdm import tqdm
18
+ from urllib3.exceptions import DecodeError
13
19
 
14
20
  from .chemrxiv_api import ChemrxivAPI
15
21
 
@@ -49,7 +55,7 @@ def get_date(datestring: str) -> str:
49
55
  """Get the date of a chemrxiv dump entry.
50
56
 
51
57
  Args:
52
- date (str): String in the format: 2021-10-15T05:12:32.356Z
58
+ datestring: String in the format: 2021-10-15T05:12:32.356Z
53
59
 
54
60
  Returns:
55
61
  str: Date in the format: YYYY-MM-DD.
@@ -84,7 +90,7 @@ def parse_dump(source_path: str, target_path: str) -> None:
84
90
  NOTE: This is a lazy parser trying to store all data in memory.
85
91
 
86
92
  Args:
87
- path (str): Path to the source dump
93
+ source_path: Path to the source dump
88
94
  """
89
95
 
90
96
  dump = []
@@ -127,20 +133,21 @@ def parse_dump(source_path: str, target_path: str) -> None:
127
133
  def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:
128
134
  if api is None:
129
135
  api = ChemrxivAPI()
130
-
131
136
  os.makedirs(save_dir, exist_ok=True)
137
+
132
138
  for preprint in tqdm(api.all_preprints()):
133
- path = os.path.join(save_dir, f"{preprint['item']['id']}.json")
139
+ item = preprint["item"]
140
+ path = os.path.join(save_dir, f"{item['id']}.json")
134
141
  if os.path.exists(path):
135
142
  continue
136
- preprint = preprint["item"]
137
- preprint_id = preprint["id"]
138
- try:
139
- preprint = api.preprint(preprint_id)
140
- except HTTPError:
141
- logger.warning(f"HTTP API Client error for ID: {preprint_id}")
142
- except SSLError:
143
- logger.warning(f"SSLError for ID: {preprint_id}")
143
+
144
+ if not item.get("title") or "authors" not in item:
145
+ try:
146
+ item = api.preprint(item["id"])
147
+ except Exception as e:
148
+ logger.warning(
149
+ f"Enrich failed for {item['id']}: {e}; writing listing payload"
150
+ )
144
151
 
145
152
  with open(path, "w") as file:
146
- json.dump(preprint, file, indent=2)
153
+ json.dump(item, file, indent=2)
@@ -14,6 +14,7 @@ from typing import Any, Callable, Dict, Union
14
14
 
15
15
  import boto3
16
16
  import requests
17
+ from botocore.client import BaseClient
17
18
  from lxml import etree
18
19
  from tqdm import tqdm
19
20
 
@@ -323,7 +324,7 @@ def month_folder(doi: str) -> str:
323
324
  return date.strftime("%B_%Y")
324
325
 
325
326
 
326
- def list_meca_keys(s3_client, bucket: str, prefix: str) -> list:
327
+ def list_meca_keys(s3_client: BaseClient, bucket: str, prefix: str) -> list:
327
328
  """
328
329
  List all .meca object keys under a given prefix in a requester-pays bucket.
329
330
 
@@ -346,7 +347,9 @@ def list_meca_keys(s3_client, bucket: str, prefix: str) -> list:
346
347
  return keys
347
348
 
348
349
 
349
- def find_meca_for_doi(s3_client, bucket: str, key: str, doi_token: str) -> bool:
350
+ def find_meca_for_doi(
351
+ s3_client: BaseClient, bucket: str, key: str, doi_token: str
352
+ ) -> bool:
350
353
  """
351
354
  Efficiently inspect manifest.xml within a .meca zip by fetching only necessary bytes.
352
355
  Parse via ZipFile to read manifest.xml and match DOI token.
@@ -375,7 +378,7 @@ def find_meca_for_doi(s3_client, bucket: str, key: str, doi_token: str) -> bool:
375
378
  manifest = z.read("manifest.xml")
376
379
 
377
380
  # Extract the last part of the DOI (newer DOIs that contain date fail otherwise)
378
- doi_token = doi_token.split('.')[-1]
381
+ doi_token = doi_token.split(".")[-1]
379
382
  return doi_token.encode("utf-8") in manifest.lower()
380
383
 
381
384
 
@@ -1,7 +1,6 @@
1
1
  import logging
2
2
  import math
3
- import os
4
- from typing import Iterable, List
3
+ from typing import Iterable, List, Optional
5
4
 
6
5
  import matplotlib.pyplot as plt
7
6
  import numpy as np
@@ -19,13 +18,13 @@ def plot_comparison(
19
18
  x_ticks: List[str] = ["2015", "2016", "2017", "2018", "2019", "2020"],
20
19
  show_preprint: bool = False,
21
20
  title_text: str = "",
22
- keyword_text=None,
21
+ keyword_text: Optional[List[str]] = None,
23
22
  figpath: str = "comparison_plot.pdf",
24
23
  ) -> None:
25
24
  """Plot temporal evolution of number of papers per keyword
26
25
 
27
26
  Args:
28
- data_dict (dict): A dictionary with keywords as keys. Each value should be a
27
+ data_dict: A dictionary with keywords as keys. Each value should be a
29
28
  dictionary itself, with keys for the different APIs. For example
30
29
  data_dict = {
31
30
  'covid_19.jsonl': {
@@ -39,18 +38,15 @@ def plot_comparison(
39
38
  ...
40
39
  }
41
40
  }
42
- keys (List[str]): List of keys which should be plotted. This has to be a
43
- subset of data_dict.keys().
44
- x_ticks (List[str]): List of strings to be used for the x-ticks. Should have
45
- same length as data_dict[key][database]. Defaults to ['2015', '2016',
46
- '2017', '2018', '2019', '2020'], meaning that papers are aggregated per
47
- year.
48
- show_preprint (bool, optional): Whether preprint servers are aggregated or not.
41
+ keys: List of keys which should be plotted. This has to be a subset of data_dict.keys().
42
+ x_ticks: List of strings to be used for the x-ticks. Should have same length as
43
+ data_dict[key][database]. Defaults to ['2015', '2016', '2017', '2018', '2019', '2020'],
44
+ meaning that papers are aggregated per year.
45
+ show_preprint: Whether preprint servers are aggregated or not.
49
46
  Defaults to False.
50
- title_text (str, optional): Title for the produced figure. Defaults to ''.
51
- keyword_text ([type], optional): Figure caption per keyword. Defaults to None,
52
- i.e. empty strings will be used.
53
- figpath (str, optional): Name under which figure is saved. Relative or absolute
47
+ title_text: Title for the produced figure. Defaults to ''.
48
+ keyword_text: Figure caption per keyword. Defaults to None, i.e. empty strings will be used.
49
+ figpath: Name under which figure is saved. Relative or absolute
54
50
  paths can be given. Defaults to 'comparison_plot.pdf'.
55
51
 
56
52
  Raises:
@@ -184,12 +180,12 @@ def plot_single(
184
180
  show_preprint: bool = False,
185
181
  title_text: str = "",
186
182
  figpath: str = "comparison_plot.pdf",
187
- logscale=False,
183
+ logscale: bool = False,
188
184
  ) -> None:
189
185
  """Plot temporal evolution of number of papers per keyword
190
186
 
191
187
  Args:
192
- data_dict (dict): A dictionary with keywords as keys. Each value should be a
188
+ data_dict: A dictionary with keywords as keys. Each value should be a
193
189
  dictionary itself, with keys for the different APIs. For example
194
190
  data_dict = {
195
191
  'covid_19.jsonl': {
@@ -203,19 +199,17 @@ def plot_single(
203
199
  ...
204
200
  }
205
201
  }
206
- keys (str): A key which should be plotted. This has to be a
207
- subset of data_dict.keys().
202
+ keys: A key which should be plotted. This has to be a subset of data_dict.keys().
208
203
  x_ticks (List[str]): List of strings to be used for the x-ticks. Should have
209
204
  same length as data_dict[key][database]. Defaults to ['2015', '2016',
210
205
  '2017', '2018', '2019', '2020'], meaning that papers are aggregated per
211
206
  year.
212
- show_preprint (bool, optional): Whether preprint servers are aggregated or not.
207
+ show_preprint: Whether preprint servers are aggregated or not.
213
208
  Defaults to False.
214
- title_text (str, optional): Title for the produced figure. Defaults to ''.
209
+ title_text: Title for the produced figure. Defaults to ''.
215
210
  figpath (str, optional): Name under which figure is saved. Relative or absolute
216
211
  paths can be given. Defaults to 'comparison_plot.pdf'.
217
- logscale (bool, optional): Whether y-axis is plotted on logscale. Defaults
218
- to False.
212
+ logscale: Whether y-axis is plotted on logscale. Defaults to False.
219
213
 
220
214
  Raises:
221
215
  KeyError: If a database is missing in data_dict.
@@ -37,7 +37,7 @@ def aggregate_paper(
37
37
  title or abstract. Only applies if filtering is True.
38
38
  return_filtered (bool, optional): Whether the filtered matches are also
39
39
  returned. Only applies if filtering is True. Defaults to False.
40
- filer_abstract (bool, optional): Whether the keyword is searched in the abstract
40
+ filter_abstract (bool, optional): Whether the keyword is searched in the abstract
41
41
  or not. Defaults to True.
42
42
  last_year (int, optional): Most recent year for the aggregation. Defaults
43
43
  to current year. All newer entries are discarded.
@@ -112,8 +112,7 @@ def aggregate_paper(
112
112
 
113
113
  if len(date.split("-")) < 2:
114
114
  logger.warning(
115
- f"Paper without month {date}, randomly assigned month."
116
- f"{paper['title']}"
115
+ f"Paper without month {date}, randomly assigned month.{paper['title']}"
117
116
  )
118
117
  month = np.random.choice(12)
119
118
  else:
@@ -42,15 +42,15 @@ def get_pubmed_papers(
42
42
  fields as desired.
43
43
 
44
44
  Args:
45
- query (str): Query to PubMed API. Needs to match PubMed API notation.
46
- fields (list[str]): List of strings with fields to keep in output.
45
+ query: Query to PubMed API. Needs to match PubMed API notation.
46
+ fields: List of strings with fields to keep in output.
47
47
  NOTE: If 'emails' is passed, an attempt is made to extract author mail
48
48
  addresses.
49
- max_results (int): Maximal number of results retrieved from DB. Defaults
49
+ max_results: Maximal number of results retrieved from DB. Defaults
50
50
  to 9998, higher values likely raise problems due to PubMedAPI, see:
51
51
  https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit
52
-
53
- NOTE: *args, **kwargs are additional arguments for pubmed.query
52
+ args: additional arguments for pubmed.query
53
+ kwargs: additional arguments for pubmed.query
54
54
 
55
55
  Returns:
56
56
  pd.DataFrame. One paper per row.
@@ -100,19 +100,19 @@ def get_and_dump_pubmed_papers(
100
100
  Combines get_pubmed_papers and dump_papers.
101
101
 
102
102
  Args:
103
- keywords (List[Union[str, List[str]]]): List of keywords to request
104
- pubmed API. The outer list level will be considered as AND
105
- separated keys, the inner level as OR separated.
106
- filepath (str): Path where the dump will be saved.
107
- fields (List, optional): List of strings with fields to keep in output.
103
+ keywords: List of keywords to request pubmed API.
104
+ The outer list level will be considered as AND separated keys.
105
+ The inner level as OR separated.
106
+ output_filepath: Path where the dump will be saved.
107
+ fields: List of strings with fields to keep in output.
108
108
  Defaults to ['title', 'authors', 'date', 'abstract',
109
109
  'journal', 'doi'].
110
110
  NOTE: If 'emails' is passed, an attempt is made to extract author mail
111
111
  addresses.
112
- start_date (str): Start date for the search. Needs to be in format:
112
+ start_date: Start date for the search. Needs to be in format:
113
113
  YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
114
114
  dates are used.
115
- end_date (str): End date for the search. Same notation as start_date.
115
+ end_date: End date for the search. Same notation as start_date.
116
116
  """
117
117
  # Translate keywords into query.
118
118
  query = get_query_from_keywords_and_date(
@@ -28,12 +28,12 @@ def get_scholar_papers(
28
28
  **kwargs,
29
29
  ) -> pd.DataFrame:
30
30
  """
31
- Performs Google Scholar API request of a given query and returns list of papers with
31
+ Performs Google Scholar API request of a given title and returns list of papers with
32
32
  fields as desired.
33
33
 
34
34
  Args:
35
- query (str): Query to arxiv API. Needs to match the arxiv API notation.
36
- fields (list[str]): List of strings with fields to keep in output.
35
+ title: Title of the paper to search for on Google Scholar.
36
+ fields: List of strings with fields to keep in output.
37
37
 
38
38
  Returns:
39
39
  pd.DataFrame. One paper per row.
@@ -74,13 +74,9 @@ def get_and_dump_scholar_papers(
74
74
  Combines get_scholar_papers and dump_papers.
75
75
 
76
76
  Args:
77
- keywords (List[str, List[str]]): List of keywords to request arxiv API.
78
- The outer list level will be considered as AND separated keys, the
79
- inner level as OR separated.
80
- filepath (str): Path where the dump will be saved.
81
- fields (List, optional): List of strings with fields to keep in output.
82
- Defaults to ['title', 'authors', 'date', 'abstract',
83
- 'journal', 'doi'].
77
+ title: Paper to search for on Google Scholar.
78
+ output_filepath: Path where the dump will be saved.
79
+ fields: List of strings with fields to keep in output.
84
80
  """
85
81
  papers = get_scholar_papers(title, fields)
86
82
  dump_papers(papers, output_filepath)
@@ -0,0 +1,4 @@
1
+ """
2
+ Folder for the metadata dumps from biorxiv, medrxiv and chemrxiv API.
3
+ No code here but will be populated with your local `.jsonl` files.
4
+ """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/jannisborn/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
@@ -52,11 +52,11 @@ Dynamic: summary
52
52
 
53
53
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
54
54
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
55
+ [![build](https://github.com/jannisborn/paperscraper/actions/workflows/docs.yml/badge.svg?branch=main)](https://jannisborn.github.io/paperscraper/)
55
56
  [![License:
56
57
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
57
58
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
58
59
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
59
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
60
60
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
61
61
  # paperscraper
62
62
 
@@ -66,6 +66,7 @@ It provides a streamlined interface to scrape metadata, allows to retrieve citat
66
66
  from Google Scholar, impact factors from journals and comes with simple postprocessing functions
67
67
  and plotting routines for meta-analysis.
68
68
 
69
+
69
70
  ## Table of Contents
70
71
 
71
72
  1. [Getting Started](#getting-started)
@@ -92,16 +93,16 @@ This is enough to query PubMed, arXiv or Google Scholar.
92
93
 
93
94
  #### Download X-rxiv Dumps
94
95
 
95
- However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
96
+ However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
96
97
 
97
98
  ```py
98
99
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
99
- medrxiv() # Takes ~30min and should result in ~35 MB file
100
- biorxiv() # Takes ~1h and should result in ~350 MB file
101
- chemrxiv() # Takes ~45min and should result in ~20 MB file
100
+ chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
101
+ medrxiv() # Takes <1h -> +90K papers (~200 MB file)
102
+ biorxiv() # Up to 6h -> +400K papers (~800 MB file)
102
103
  ```
103
104
  *NOTE*: Once the dumps are stored, please make sure to restart the Python interpreter so that the changes take effect.
104
- *NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
105
+ *NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
105
106
 
106
107
  Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
107
108
  ```py
@@ -423,7 +424,7 @@ plot_multiple_venn(
423
424
  ## Citation
424
425
  If you use `paperscraper`, please cite a paper that motivated our development of this tool.
425
426
 
426
- ```bib
427
+ ```bibtex
427
428
  @article{born2021trends,
428
429
  title={Trends in Deep Learning for Property-driven Drug Design},
429
430
  author={Born, Jannis and Manica, Matteo},
@@ -439,9 +440,15 @@ If you use `paperscraper`, please cite a paper that motivated our development of
439
440
  ## Contributions
440
441
  Thanks to the following contributors:
441
442
  - [@mathinic](https://github.com/mathinic): Since `v0.3.0` improved PubMed full text retrieval with additional fallback mechanisms (BioC-PMC, eLife and optional Wiley/Elsevier APIs).
443
+
442
444
  - [@memray](https://github.com/memray): Since `v0.2.12` there are automatic retries when downloading the {med/bio/chem}rxiv dumps.
445
+
443
446
  - [@achouhan93](https://github.com/achouhan93): Since `v0.2.5` {med/bio/chem}rxiv can be scraped for specific dates!
447
+
444
448
  - [@daenuprobst](https://github.com/daenuprobst): Since `v0.2.4` PDF files can be scraped directly (`paperscraper.pdf.save_pdf`)
449
+
445
450
  - [@oppih](https://github.com/oppih): Since `v0.2.3` chemRxiv API also provides DOI and URL if available
446
- - [@lukasschwab](https://github.com/lukasschwab): Bumped `arxiv` dependency to >`1.4.2` in paperscraper `v0.1.0`.
451
+
452
+ - [@lukasschwab](https://github.com/lukasschwab): Enabled support for `arxiv` >`1.4.2` in paperscraper `v0.1.0`.
453
+
447
454
  - [@juliusbierk](https://github.com/juliusbierk): Bugfixes
@@ -1,144 +0,0 @@
1
- import logging
2
- import os
3
- import sys
4
- from datetime import datetime
5
- from time import time
6
- from typing import Dict, Optional
7
- from urllib.parse import urljoin
8
-
9
- import requests
10
- from requests.exceptions import ChunkedEncodingError
11
-
12
# Route log records of level INFO and above to stdout (configured at import time).
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Timestamp captured once, when this module is imported.
now_datetime = datetime.now()
# Earliest date (YYYY-MM-DD) used as the lower bound for chemrxiv queries.
launch_dates = dict(chemrxiv="2017-01-01")
17
-
18
-
19
class ChemrxivAPI:
    """Client for the chemRxiv (Open Engage) public API.

    Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
    """

    base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1/"

    def __init__(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        page_size: Optional[int] = None,
        max_retries: int = 10,
    ):
        """
        Initialize API class.

        Args:
            start_date: Begin date expressed as YYYY-MM-DD. A date before the
                chemrxiv launch date is replaced by the launch date (with a
                warning). Defaults to None, i.e. the launch date is used.
            end_date: End date expressed as YYYY-MM-DD. A date later than the
                module-level `now_datetime` is replaced by that date (with a
                warning). Defaults to None, i.e. `now_datetime` is used.
            page_size: The batch size used to fetch the records from chemrxiv.
                Falsy values (None, 0) fall back to 50.
            max_retries: Number of attempts per request when a
                ChunkedEncodingError occurs.
        """
        self.page_size = page_size or 50
        self.max_retries = max_retries

        # Clamp the search window to [launch date, now].
        launch_date = launch_dates["chemrxiv"]
        launch_datetime = datetime.fromisoformat(launch_date)

        self.start_date = launch_date
        if start_date:
            if datetime.fromisoformat(start_date) < launch_datetime:
                logger.warning(
                    f"Begin date {start_date} is before chemrxiv launch date. Will use {launch_date} instead."
                )
            else:
                self.start_date = start_date

        self.end_date = now_datetime.strftime("%Y-%m-%d")
        if end_date:
            if datetime.fromisoformat(end_date) > now_datetime:
                logger.warning(
                    f"End date {end_date} is in the future. Will use {now_datetime} instead."
                )
            else:
                self.end_date = end_date

    def request(self, url, method, params=None):
        """Send an API request to Open Engage, retrying on chunked-encoding errors.

        Args:
            url: Full URL to request.
            method: HTTP method, 'get' or 'post' (case-insensitive).
            params: Sent as query parameters for GET and as JSON body for POST.

        Returns:
            The response object returned by `requests`.

        Raises:
            ConnectionError: If `method` is neither 'get' nor 'post', or if
                `max_retries` allows no attempt at all.
            ChunkedEncodingError: If the final attempt still fails.
        """
        # The module does `from time import time`, so `time.sleep` would fail
        # with AttributeError; import `sleep` explicitly instead.
        from time import sleep

        verb = method.casefold()
        if verb not in ("get", "post"):
            raise ConnectionError(f"Unknown method for query: {method}")

        for attempt in range(self.max_retries):
            try:
                if verb == "get":
                    return requests.get(url, params=params, timeout=10)
                return requests.post(url, json=params, timeout=10)
            except ChunkedEncodingError as e:
                logger.warning(f"ChunkedEncodingError occurred for {url}: {e}")
                if attempt + 1 == self.max_retries:
                    raise
                sleep(3)

        # Only reachable when max_retries <= 0: fail loudly instead of returning None.
        raise ConnectionError(
            f"No request was sent to {url}: max_retries={self.max_retries}"
        )

    def query(self, query, method="get", params=None):
        """Perform a direct query.

        Args:
            query: Endpoint path, joined onto `base`.
            method: HTTP method, 'get' or 'post'.
            params: Optional parameters forwarded to `request`.

        Returns:
            The parsed JSON body of the response.

        Raises:
            requests.HTTPError: Via `raise_for_status` on an error status code.
        """
        response = self.request(urljoin(self.base, query), method, params=params)
        response.raise_for_status()
        return response.json()

    def query_generator(
        self, query, method: str = "get", params: Optional[Dict] = None
    ):
        """Query for a list of items, with paging. Returns a generator.

        Args:
            query: Endpoint path, joined onto `base`.
            method: HTTP method, 'get' or 'post'.
            params: Extra request parameters. The paging and date keys
                ('limit', 'skip', 'searchDateFrom', 'searchDateTo') are
                overwritten. The passed dict is not modified.

        Yields:
            The entries of the 'itemHits' list of each page.

        Raises:
            ValueError: If the API answers with status 400; carries the
                'message' field of the response.
        """
        # Copy so that neither the caller's dict nor a shared default is mutated.
        params = dict(params) if params else {}

        try:
            total = self.number_of_preprints()
        except Exception:
            total = float("inf")  # fallback if that call fails

        url = urljoin(self.base, query)
        page = 0
        while page * self.page_size <= total:
            params.update(
                {
                    "limit": self.page_size,
                    "skip": page * self.page_size,
                    "searchDateFrom": self.start_date,
                    "searchDateTo": self.end_date,
                }
            )
            response = self.request(url, method, params=params)
            if response.status_code == 400:
                raise ValueError(response.json()["message"])
            response.raise_for_status()
            hits = response.json()["itemHits"]

            # If we have no more results, bail out
            if not hits:
                return

            yield from hits
            page += 1

    def all_preprints(self):
        """Return a generator to all the chemRxiv articles."""
        return self.query_generator("items")

    def preprint(self, article_id):
        """Query the `items/<article_id>` endpoint and return the parsed JSON."""
        # Build the URL path with '/' explicitly; os.path.join would use the
        # OS-specific separator (a backslash on Windows).
        return self.query(f"items/{article_id}")

    def number_of_preprints(self):
        """Return the 'totalCount' field reported by the `items` endpoint."""
        return self.query("items")["totalCount"]
@@ -1 +0,0 @@
1
- """Folder for the metadata dumps from biorxiv, medrxiv and chemrxiv API"""
File without changes
File without changes
File without changes