paperscraper 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.2.4 → paperscraper-0.2.6}/PKG-INFO +7 -5
- {paperscraper-0.2.4 → paperscraper-0.2.6}/README.md +6 -1
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/__init__.py +1 -1
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/arxiv.py +6 -3
- paperscraper-0.2.6/paperscraper/get_dumps/biorxiv.py +47 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/chemrxiv.py +15 -4
- paperscraper-0.2.6/paperscraper/get_dumps/medrxiv.py +44 -0
- paperscraper-0.2.6/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +125 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/chemrxiv/utils.py +2 -4
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/postprocessing.py +1 -1
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/utils.py +2 -1
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/xrxiv_api.py +2 -2
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/PKG-INFO +7 -5
- paperscraper-0.2.4/paperscraper/get_dumps/biorxiv.py +0 -34
- paperscraper-0.2.4/paperscraper/get_dumps/medrxiv.py +0 -31
- paperscraper-0.2.4/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -67
- {paperscraper-0.2.4 → paperscraper-0.2.6}/LICENSE +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/utils.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/__init__.py +1 -1
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +1 -1
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/journal_if.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/load_dumps.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pdf.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/plotting.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/pubmed.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/scholar/__init__.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/scholar/scholar.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/server_dumps/__init__.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/utils.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/SOURCES.txt +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/requires.txt +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/setup.cfg +0 -0
- {paperscraper-0.2.4 → paperscraper-0.2.6}/setup.py +0 -0
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
|
|
6
6
|
Author: Jannis Born, Matteo Manica
|
|
7
7
|
Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
|
|
8
8
|
License: MIT
|
|
9
9
|
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
|
|
10
|
-
Platform: UNKNOWN
|
|
11
10
|
Classifier: Development Status :: 3 - Alpha
|
|
12
11
|
Classifier: Intended Audience :: Developers
|
|
13
12
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -56,10 +55,15 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
|
|
|
56
55
|
biorxiv() # Takes ~1h and should result in ~350 MB file
|
|
57
56
|
chemrxiv() # Takes ~45min and should result in ~20 MB file
|
|
58
57
|
```
|
|
59
|
-
|
|
60
58
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
|
|
61
59
|
so that the changes take effect.
|
|
62
60
|
|
|
61
|
+
Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
|
|
62
|
+
```py
|
|
63
|
+
medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
|
|
64
|
+
```
|
|
65
|
+
But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
|
|
66
|
+
|
|
63
67
|
## Examples
|
|
64
68
|
|
|
65
69
|
`paperscraper` is build on top of the packages [pymed](https://pypi.org/project/pymed/),
|
|
@@ -305,5 +309,3 @@ If you use `paperscraper`, please cite the papers that motivated our development
|
|
|
305
309
|
author = {Jannis Born and David Beymer and Deepta Rajan and Adam Coy and Vandana V. Mukherjee and Matteo Manica and Prasanth Prasanna and Deddeh Ballah and Michal Guindy and Dorith Shaham and Pallav L. Shah and Emmanouil Karteris and Jan L. Robertus and Maria Gabrani and Michal Rosen-Zvi}
|
|
306
310
|
}
|
|
307
311
|
```
|
|
308
|
-
|
|
309
|
-
|
|
@@ -35,10 +35,15 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
|
|
|
35
35
|
biorxiv() # Takes ~1h and should result in ~350 MB file
|
|
36
36
|
chemrxiv() # Takes ~45min and should result in ~20 MB file
|
|
37
37
|
```
|
|
38
|
-
|
|
39
38
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
|
|
40
39
|
so that the changes take effect.
|
|
41
40
|
|
|
41
|
+
Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
|
|
42
|
+
```py
|
|
43
|
+
medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
|
|
44
|
+
```
|
|
45
|
+
But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
|
|
46
|
+
|
|
42
47
|
## Examples
|
|
43
48
|
|
|
44
49
|
`paperscraper` is build on top of the packages [pymed](https://pypi.org/project/pymed/),
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Dict, List, Union
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
+
from tqdm import tqdm
|
|
4
5
|
|
|
5
6
|
import arxiv
|
|
6
7
|
|
|
@@ -11,6 +12,7 @@ arxiv_field_mapper = {
|
|
|
11
12
|
"published": "date",
|
|
12
13
|
"journal_ref": "journal",
|
|
13
14
|
"summary": "abstract",
|
|
15
|
+
"entry_id": "doi",
|
|
14
16
|
}
|
|
15
17
|
|
|
16
18
|
# Authors, date, and journal fields need specific processing
|
|
@@ -18,6 +20,7 @@ process_fields = {
|
|
|
18
20
|
"authors": lambda authors: ", ".join([a.name for a in authors]),
|
|
19
21
|
"date": lambda date: date.strftime("%Y-%m-%d"),
|
|
20
22
|
"journal": lambda j: j if j is not None else "",
|
|
23
|
+
"doi": lambda entry_id: f"10.48550/arXiv.{entry_id.split('/')[-1].split('v')[0]}",
|
|
21
24
|
}
|
|
22
25
|
|
|
23
26
|
|
|
@@ -58,9 +61,9 @@ def get_arxiv_papers(
|
|
|
58
61
|
arxiv_field_mapper.get(key, key), lambda x: x
|
|
59
62
|
)(value)
|
|
60
63
|
for key, value in vars(paper).items()
|
|
61
|
-
if arxiv_field_mapper.get(key, key) in fields
|
|
64
|
+
if arxiv_field_mapper.get(key, key) in fields and key != "doi"
|
|
62
65
|
}
|
|
63
|
-
for paper in results
|
|
66
|
+
for paper in tqdm(results, desc=f"Processing {query}")
|
|
64
67
|
]
|
|
65
68
|
)
|
|
66
69
|
return processed
|
|
@@ -71,7 +74,7 @@ def get_and_dump_arxiv_papers(
|
|
|
71
74
|
output_filepath: str,
|
|
72
75
|
fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
|
|
73
76
|
*args,
|
|
74
|
-
**kwargs
|
|
77
|
+
**kwargs,
|
|
75
78
|
):
|
|
76
79
|
"""
|
|
77
80
|
Combines get_arxiv_papers and dump_papers.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Dump bioRxiv data in JSONL format."""
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import pkg_resources
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from ..xrxiv.xrxiv_api import BioRxivApi
|
|
11
|
+
|
|
12
|
+
today = datetime.today().strftime("%Y-%m-%d")
|
|
13
|
+
save_path = os.path.join(
|
|
14
|
+
pkg_resources.resource_filename("paperscraper", "server_dumps"),
|
|
15
|
+
f"biorxiv_{today}.jsonl",
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def biorxiv(
|
|
20
|
+
begin_date: Optional[str] = None,
|
|
21
|
+
end_date: Optional[str] = None,
|
|
22
|
+
save_path: str = save_path,
|
|
23
|
+
):
|
|
24
|
+
"""Fetches papers from biorxiv based on time range, i.e., begin_date and end_date.
|
|
25
|
+
If the begin_date and end_date are not provided, papers will be fetched from biorxiv
|
|
26
|
+
from the launch date of biorxiv until the current date. The fetched papers will be
|
|
27
|
+
stored in jsonl format in save_path.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
save_path (str, optional): Path where the dump is stored.
|
|
31
|
+
Defaults to save_path.
|
|
32
|
+
begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
|
|
33
|
+
Defaults to None.
|
|
34
|
+
end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
|
|
35
|
+
Defaults to None.
|
|
36
|
+
"""
|
|
37
|
+
# create API client
|
|
38
|
+
api = BioRxivApi()
|
|
39
|
+
|
|
40
|
+
# dump all papers
|
|
41
|
+
with open(save_path, "w") as fp:
|
|
42
|
+
for index, paper in enumerate(
|
|
43
|
+
tqdm(api.get_papers(begin_date=begin_date, end_date=end_date))
|
|
44
|
+
):
|
|
45
|
+
if index > 0:
|
|
46
|
+
fp.write(os.linesep)
|
|
47
|
+
fp.write(json.dumps(paper))
|
|
@@ -3,6 +3,7 @@ import logging
|
|
|
3
3
|
import os
|
|
4
4
|
import sys
|
|
5
5
|
from datetime import datetime
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import pkg_resources
|
|
8
9
|
|
|
@@ -16,17 +17,27 @@ save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
|
|
|
16
17
|
save_path = os.path.join(save_folder, f"chemrxiv_{today}.jsonl")
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
def chemrxiv(
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
def chemrxiv(
|
|
21
|
+
begin_date: Optional[str] = None,
|
|
22
|
+
end_date: Optional[str] = None,
|
|
23
|
+
save_path: str = save_path,
|
|
24
|
+
) -> None:
|
|
25
|
+
"""Fetches papers from bichemrxiv based on time range, i.e., begin_date and end_date.
|
|
26
|
+
If the begin_date and end_date are not provided, papers will be fetched from chemrxiv
|
|
27
|
+
from the launch date of chemrxiv until the current date. The fetched papers will be
|
|
28
|
+
stored in jsonl format in save_path.
|
|
22
29
|
|
|
23
30
|
Args:
|
|
31
|
+
begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
|
|
32
|
+
Defaults to None.
|
|
33
|
+
end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
|
|
34
|
+
Defaults to None.
|
|
24
35
|
save_path (str, optional): Path where the dump is stored.
|
|
25
36
|
Defaults to save_path.
|
|
26
37
|
"""
|
|
27
38
|
|
|
28
39
|
# create API client
|
|
29
|
-
api = ChemrxivAPI()
|
|
40
|
+
api = ChemrxivAPI(begin_date, end_date)
|
|
30
41
|
# Download the data
|
|
31
42
|
download_full(save_folder, api)
|
|
32
43
|
# Convert to JSONL format.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Dump medrxiv data in JSONL format."""
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import pkg_resources
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from ..xrxiv.xrxiv_api import MedRxivApi
|
|
11
|
+
|
|
12
|
+
today = datetime.today().strftime("%Y-%m-%d")
|
|
13
|
+
save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
|
|
14
|
+
save_path = os.path.join(save_folder, f"medrxiv_{today}.jsonl")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def medrxiv(
|
|
18
|
+
begin_date: Optional[str] = None,
|
|
19
|
+
end_date: Optional[str] = None,
|
|
20
|
+
save_path: str = save_path,
|
|
21
|
+
):
|
|
22
|
+
"""Fetches papers from medrxiv based on time range, i.e., begin_date and end_date.
|
|
23
|
+
If the begin_date and end_date are not provided, then papers will be fetched from
|
|
24
|
+
medrxiv starting from the launch date of medrxiv until current date. The fetched
|
|
25
|
+
papers will be stored in jsonl format in save_path.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
save_path (str, optional): Path where the dump is stored.
|
|
29
|
+
Defaults to save_path.
|
|
30
|
+
begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
|
|
31
|
+
Defaults to None.
|
|
32
|
+
end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
|
|
33
|
+
Defaults to None.
|
|
34
|
+
"""
|
|
35
|
+
# create API client
|
|
36
|
+
api = MedRxivApi()
|
|
37
|
+
# dump all papers
|
|
38
|
+
with open(save_path, "w") as fp:
|
|
39
|
+
for index, paper in enumerate(
|
|
40
|
+
tqdm(api.get_papers(begin_date=begin_date, end_date=end_date))
|
|
41
|
+
):
|
|
42
|
+
if index > 0:
|
|
43
|
+
fp.write(os.linesep)
|
|
44
|
+
fp.write(json.dumps(paper))
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Dict, Optional
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
now_datetime = datetime.now()
|
|
13
|
+
launch_dates = {"chemrxiv": "2017-01-01"}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ChemrxivAPI:
|
|
17
|
+
"""Handle OpenEngage API requests, using access.
|
|
18
|
+
Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1"
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
begin_date: Optional[str] = None,
|
|
26
|
+
end_date: Optional[str] = None,
|
|
27
|
+
page_size: Optional[int] = None,
|
|
28
|
+
):
|
|
29
|
+
"""
|
|
30
|
+
Initialize API class.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
|
|
34
|
+
Defaults to None.
|
|
35
|
+
end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
|
|
36
|
+
Defaults to None.
|
|
37
|
+
page_size (int, optional): The batch size used to fetch the records from chemrxiv.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
self.page_size = page_size or 50
|
|
41
|
+
|
|
42
|
+
# Begin Date and End Date of the search
|
|
43
|
+
launch_date = launch_dates["chemrxiv"]
|
|
44
|
+
launch_datetime = datetime.fromisoformat(launch_date)
|
|
45
|
+
|
|
46
|
+
if begin_date:
|
|
47
|
+
begin_datetime = datetime.fromisoformat(begin_date)
|
|
48
|
+
if begin_datetime < launch_datetime:
|
|
49
|
+
self.begin_date = launch_date
|
|
50
|
+
logger.warning(
|
|
51
|
+
f"Begin date {begin_date} is before chemrxiv launch date. Will use {launch_date} instead."
|
|
52
|
+
)
|
|
53
|
+
else:
|
|
54
|
+
self.begin_date = begin_date
|
|
55
|
+
else:
|
|
56
|
+
self.begin_date = launch_date
|
|
57
|
+
if end_date:
|
|
58
|
+
end_datetime = datetime.fromisoformat(end_date)
|
|
59
|
+
if end_datetime > now_datetime:
|
|
60
|
+
logger.warning(
|
|
61
|
+
f"End date {end_date} is in the future. Will use {now_datetime} instead."
|
|
62
|
+
)
|
|
63
|
+
self.end_date = now_datetime.strftime("%Y-%m-%d")
|
|
64
|
+
else:
|
|
65
|
+
self.end_date = end_date
|
|
66
|
+
else:
|
|
67
|
+
self.end_date = now_datetime.strftime("%Y-%m-%d")
|
|
68
|
+
|
|
69
|
+
def request(self, url, method, params=None):
|
|
70
|
+
"""Send an API request to open Engage."""
|
|
71
|
+
|
|
72
|
+
if method.casefold() == "get":
|
|
73
|
+
return requests.get(url, params=params)
|
|
74
|
+
elif method.casefold() == "post":
|
|
75
|
+
return requests.post(url, json=params)
|
|
76
|
+
else:
|
|
77
|
+
raise ConnectionError(f"Unknown method for query: {method}")
|
|
78
|
+
|
|
79
|
+
def query(self, query, method="get", params=None):
|
|
80
|
+
"""Perform a direct query."""
|
|
81
|
+
r = self.request(
|
|
82
|
+
os.path.join(f"{self.base}", f"{query}"), method, params=params
|
|
83
|
+
)
|
|
84
|
+
r.raise_for_status()
|
|
85
|
+
return r.json()
|
|
86
|
+
|
|
87
|
+
def query_generator(self, query, method: str = "get", params: Dict = {}):
|
|
88
|
+
"""Query for a list of items, with paging. Returns a generator."""
|
|
89
|
+
|
|
90
|
+
page = 0
|
|
91
|
+
while True:
|
|
92
|
+
params.update(
|
|
93
|
+
{
|
|
94
|
+
"limit": self.page_size,
|
|
95
|
+
"skip": page * self.page_size,
|
|
96
|
+
"searchDateFrom": self.begin_date,
|
|
97
|
+
"searchDateTo": self.end_date,
|
|
98
|
+
}
|
|
99
|
+
)
|
|
100
|
+
r = self.request(os.path.join(self.base, query), method, params=params)
|
|
101
|
+
if r.status_code == 400:
|
|
102
|
+
raise ValueError(r.json()["message"])
|
|
103
|
+
r.raise_for_status()
|
|
104
|
+
r = r.json()
|
|
105
|
+
r = r["itemHits"]
|
|
106
|
+
|
|
107
|
+
# If we have no more results, bail out
|
|
108
|
+
if len(r) == 0:
|
|
109
|
+
return
|
|
110
|
+
|
|
111
|
+
yield from r
|
|
112
|
+
page += 1
|
|
113
|
+
|
|
114
|
+
def all_preprints(self):
|
|
115
|
+
"""Return a generator to all the chemRxiv articles."""
|
|
116
|
+
return self.query_generator("items")
|
|
117
|
+
|
|
118
|
+
def preprint(self, article_id):
|
|
119
|
+
"""Information on a given preprint.
|
|
120
|
+
.. seealso:: https://docs.figshare.com/#public_article
|
|
121
|
+
"""
|
|
122
|
+
return self.query(os.path.join("items", article_id))
|
|
123
|
+
|
|
124
|
+
def number_of_preprints(self):
|
|
125
|
+
return self.query("items")["totalCount"]
|
|
@@ -28,9 +28,7 @@ def get_author(author_list: List[Dict]) -> str:
|
|
|
28
28
|
str: ;-concatenated author list.
|
|
29
29
|
"""
|
|
30
30
|
|
|
31
|
-
return "; ".join(
|
|
32
|
-
[" ".join([a["firstName"], a["lastName"]]) for a in author_list]
|
|
33
|
-
)
|
|
31
|
+
return "; ".join([" ".join([a["firstName"], a["lastName"]]) for a in author_list])
|
|
34
32
|
|
|
35
33
|
|
|
36
34
|
def get_categories(category_list: List[Dict]) -> str:
|
|
@@ -143,7 +141,7 @@ def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:
|
|
|
143
141
|
except HTTPError:
|
|
144
142
|
logger.warning(f"HTTP API Client error for ID: {preprint_id}")
|
|
145
143
|
except SSLError:
|
|
146
|
-
logger.warning(f
|
|
144
|
+
logger.warning(f"SSLError for ID: {preprint_id}")
|
|
147
145
|
|
|
148
146
|
with open(path, "w") as file:
|
|
149
147
|
json.dump(preprint, file, indent=2)
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
from pymed.article import PubMedArticle
|
|
2
1
|
import warnings
|
|
3
2
|
from typing import List, Union
|
|
4
3
|
|
|
4
|
+
from pymed.article import PubMedArticle
|
|
5
|
+
|
|
5
6
|
finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
|
|
6
7
|
finalize_conjunction = lambda x: x[:-5]
|
|
7
8
|
date_root = '("{0}"[Date - Create] : "{1}"[Date - Create])'
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""API for bioRxiv and medRXiv."""
|
|
2
|
-
import requests
|
|
3
2
|
from datetime import datetime
|
|
4
|
-
from typing import
|
|
3
|
+
from typing import Generator, List, Optional
|
|
5
4
|
|
|
5
|
+
import requests
|
|
6
6
|
|
|
7
7
|
launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
|
|
8
8
|
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
|
|
6
6
|
Author: Jannis Born, Matteo Manica
|
|
7
7
|
Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
|
|
8
8
|
License: MIT
|
|
9
9
|
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
|
|
10
|
-
Platform: UNKNOWN
|
|
11
10
|
Classifier: Development Status :: 3 - Alpha
|
|
12
11
|
Classifier: Intended Audience :: Developers
|
|
13
12
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -56,10 +55,15 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
|
|
|
56
55
|
biorxiv() # Takes ~1h and should result in ~350 MB file
|
|
57
56
|
chemrxiv() # Takes ~45min and should result in ~20 MB file
|
|
58
57
|
```
|
|
59
|
-
|
|
60
58
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
|
|
61
59
|
so that the changes take effect.
|
|
62
60
|
|
|
61
|
+
Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
|
|
62
|
+
```py
|
|
63
|
+
medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
|
|
64
|
+
```
|
|
65
|
+
But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
|
|
66
|
+
|
|
63
67
|
## Examples
|
|
64
68
|
|
|
65
69
|
`paperscraper` is build on top of the packages [pymed](https://pypi.org/project/pymed/),
|
|
@@ -305,5 +309,3 @@ If you use `paperscraper`, please cite the papers that motivated our development
|
|
|
305
309
|
author = {Jannis Born and David Beymer and Deepta Rajan and Adam Coy and Vandana V. Mukherjee and Matteo Manica and Prasanth Prasanna and Deddeh Ballah and Michal Guindy and Dorith Shaham and Pallav L. Shah and Emmanouil Karteris and Jan L. Robertus and Maria Gabrani and Michal Rosen-Zvi}
|
|
306
310
|
}
|
|
307
311
|
```
|
|
308
|
-
|
|
309
|
-
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
"""Dump bioRxiv data in JSONL format."""
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
|
|
6
|
-
import pkg_resources
|
|
7
|
-
from tqdm import tqdm
|
|
8
|
-
|
|
9
|
-
from ..xrxiv.xrxiv_api import BioRxivApi
|
|
10
|
-
|
|
11
|
-
today = datetime.today().strftime("%Y-%m-%d")
|
|
12
|
-
save_path = os.path.join(
|
|
13
|
-
pkg_resources.resource_filename("paperscraper", "server_dumps"),
|
|
14
|
-
f"biorxiv_{today}.jsonl",
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def biorxiv(save_path: str = save_path):
|
|
19
|
-
"""Fetches all papers from biorxiv until current date, stores them in jsonl
|
|
20
|
-
format in save_path.
|
|
21
|
-
|
|
22
|
-
Args:
|
|
23
|
-
save_path (str, optional): Path where the dump is stored.
|
|
24
|
-
Defaults to save_path.
|
|
25
|
-
"""
|
|
26
|
-
# create API client
|
|
27
|
-
api = BioRxivApi()
|
|
28
|
-
|
|
29
|
-
# dump all papers
|
|
30
|
-
with open(save_path, "w") as fp:
|
|
31
|
-
for index, paper in enumerate(tqdm(api.get_papers())):
|
|
32
|
-
if index > 0:
|
|
33
|
-
fp.write(os.linesep)
|
|
34
|
-
fp.write(json.dumps(paper))
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
"""Dump medrxiv data in JSONL format."""
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
|
|
6
|
-
import pkg_resources
|
|
7
|
-
from tqdm import tqdm
|
|
8
|
-
|
|
9
|
-
from ..xrxiv.xrxiv_api import MedRxivApi
|
|
10
|
-
|
|
11
|
-
today = datetime.today().strftime("%Y-%m-%d")
|
|
12
|
-
save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
|
|
13
|
-
save_path = os.path.join(save_folder, f"medrxiv_{today}.jsonl")
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def medrxiv(save_path: str = save_path):
|
|
17
|
-
"""Fetches all papers from medrxiv until current date, stores them in jsonl
|
|
18
|
-
format in save_path.
|
|
19
|
-
|
|
20
|
-
Args:
|
|
21
|
-
save_path (str, optional): Path where the dump is stored.
|
|
22
|
-
Defaults to save_path.
|
|
23
|
-
"""
|
|
24
|
-
# create API client
|
|
25
|
-
api = MedRxivApi()
|
|
26
|
-
# dump all papers
|
|
27
|
-
with open(save_path, "w") as fp:
|
|
28
|
-
for index, paper in enumerate(tqdm(api.get_papers())):
|
|
29
|
-
if index > 0:
|
|
30
|
-
fp.write(os.linesep)
|
|
31
|
-
fp.write(json.dumps(paper))
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import Optional, Dict
|
|
3
|
-
|
|
4
|
-
import requests
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class ChemrxivAPI:
|
|
8
|
-
"""Handle OpenEngage API requests, using access.
|
|
9
|
-
Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1"
|
|
13
|
-
|
|
14
|
-
def __init__(self, page_size: Optional[int] = None):
|
|
15
|
-
|
|
16
|
-
self.page_size = page_size or 50
|
|
17
|
-
|
|
18
|
-
def request(self, url, method, params=None):
|
|
19
|
-
"""Send an API request to open Engage."""
|
|
20
|
-
|
|
21
|
-
if method.casefold() == "get":
|
|
22
|
-
return requests.get(url, params=params)
|
|
23
|
-
elif method.casefold() == "post":
|
|
24
|
-
return requests.post(url, json=params)
|
|
25
|
-
else:
|
|
26
|
-
raise ConnectionError(f"Unknown method for query: {method}")
|
|
27
|
-
|
|
28
|
-
def query(self, query, method="get", params=None):
|
|
29
|
-
"""Perform a direct query."""
|
|
30
|
-
r = self.request(
|
|
31
|
-
os.path.join(f"{self.base}", f"{query}"), method, params=params
|
|
32
|
-
)
|
|
33
|
-
r.raise_for_status()
|
|
34
|
-
return r.json()
|
|
35
|
-
|
|
36
|
-
def query_generator(self, query, method: str = "get", params: Dict = {}):
|
|
37
|
-
"""Query for a list of items, with paging. Returns a generator."""
|
|
38
|
-
|
|
39
|
-
page = 0
|
|
40
|
-
while True:
|
|
41
|
-
params.update({"limit": self.page_size, "skip": page * self.page_size})
|
|
42
|
-
r = self.request(os.path.join(self.base, query), method, params=params)
|
|
43
|
-
if r.status_code == 400:
|
|
44
|
-
raise ValueError(r.json()["message"])
|
|
45
|
-
r.raise_for_status()
|
|
46
|
-
r = r.json()
|
|
47
|
-
r = r["itemHits"]
|
|
48
|
-
|
|
49
|
-
# If we have no more results, bail out
|
|
50
|
-
if len(r) == 0:
|
|
51
|
-
return
|
|
52
|
-
|
|
53
|
-
yield from r
|
|
54
|
-
page += 1
|
|
55
|
-
|
|
56
|
-
def all_preprints(self):
|
|
57
|
-
"""Return a generator to all the chemRxiv articles."""
|
|
58
|
-
return self.query_generator("items")
|
|
59
|
-
|
|
60
|
-
def preprint(self, article_id):
|
|
61
|
-
"""Information on a given preprint.
|
|
62
|
-
.. seealso:: https://docs.figshare.com/#public_article
|
|
63
|
-
"""
|
|
64
|
-
return self.query(os.path.join("items", article_id))
|
|
65
|
-
|
|
66
|
-
def number_of_preprints(self):
|
|
67
|
-
return self.query("items")["totalCount"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|