paperscraper 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {paperscraper-0.2.4 → paperscraper-0.2.6}/PKG-INFO +7 -5
  2. {paperscraper-0.2.4 → paperscraper-0.2.6}/README.md +6 -1
  3. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/__init__.py +1 -1
  4. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/arxiv.py +6 -3
  5. paperscraper-0.2.6/paperscraper/get_dumps/biorxiv.py +47 -0
  6. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/chemrxiv.py +15 -4
  7. paperscraper-0.2.6/paperscraper/get_dumps/medrxiv.py +44 -0
  8. paperscraper-0.2.6/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +125 -0
  9. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/chemrxiv/utils.py +2 -4
  10. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/postprocessing.py +1 -1
  11. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/utils.py +2 -1
  12. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/xrxiv_api.py +2 -2
  13. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/PKG-INFO +7 -5
  14. paperscraper-0.2.4/paperscraper/get_dumps/biorxiv.py +0 -34
  15. paperscraper-0.2.4/paperscraper/get_dumps/medrxiv.py +0 -31
  16. paperscraper-0.2.4/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -67
  17. {paperscraper-0.2.4 → paperscraper-0.2.6}/LICENSE +0 -0
  18. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/__init__.py +0 -0
  19. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/utils.py +0 -0
  20. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/__init__.py +1 -1
  21. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/__init__.py +0 -0
  22. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +1 -1
  23. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/journal_if.py +0 -0
  24. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/load_dumps.py +0 -0
  25. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pdf.py +0 -0
  26. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/plotting.py +0 -0
  27. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/__init__.py +0 -0
  28. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/pubmed.py +0 -0
  29. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/scholar/__init__.py +0 -0
  30. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/scholar/scholar.py +0 -0
  31. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/server_dumps/__init__.py +0 -0
  32. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/utils.py +0 -0
  33. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/__init__.py +0 -0
  34. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  35. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/SOURCES.txt +0 -0
  36. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/dependency_links.txt +0 -0
  37. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/not-zip-safe +0 -0
  38. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/requires.txt +0 -0
  39. {paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/top_level.txt +0 -0
  40. {paperscraper-0.2.4 → paperscraper-0.2.6}/setup.cfg +0 -0
  41. {paperscraper-0.2.4 → paperscraper-0.2.6}/setup.py +0 -0
@@ -1,13 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: paperscraper
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
7
7
  Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
8
8
  License: MIT
9
9
  Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
10
- Platform: UNKNOWN
11
10
  Classifier: Development Status :: 3 - Alpha
12
11
  Classifier: Intended Audience :: Developers
13
12
  Classifier: Intended Audience :: Science/Research
@@ -56,10 +55,15 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
56
55
  biorxiv() # Takes ~1h and should result in ~350 MB file
57
56
  chemrxiv() # Takes ~45min and should result in ~20 MB file
58
57
  ```
59
-
60
58
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
61
59
  so that the changes take effect.
62
60
 
61
+ Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for the contributions!
62
+ ```py
63
+ medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
64
+ ```
65
+ But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
66
+
63
67
  ## Examples
64
68
 
65
69
  `paperscraper` is built on top of the packages [pymed](https://pypi.org/project/pymed/),
@@ -305,5 +309,3 @@ If you use `paperscraper`, please cite the papers that motivated our development
305
309
  author = {Jannis Born and David Beymer and Deepta Rajan and Adam Coy and Vandana V. Mukherjee and Matteo Manica and Prasanth Prasanna and Deddeh Ballah and Michal Guindy and Dorith Shaham and Pallav L. Shah and Emmanouil Karteris and Jan L. Robertus and Maria Gabrani and Michal Rosen-Zvi}
306
310
  }
307
311
  ```
308
-
309
-
@@ -35,10 +35,15 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
35
35
  biorxiv() # Takes ~1h and should result in ~350 MB file
36
36
  chemrxiv() # Takes ~45min and should result in ~20 MB file
37
37
  ```
38
-
39
38
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
40
39
  so that the changes take effect.
41
40
 
41
+ Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for the contributions!
42
+ ```py
43
+ medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
44
+ ```
45
+ But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
46
+
42
47
  ## Examples
43
48
 
44
49
  `paperscraper` is built on top of the packages [pymed](https://pypi.org/project/pymed/),
@@ -1,6 +1,6 @@
1
1
  """Initialize the module."""
2
2
  __name__ = "paperscraper"
3
- __version__ = "0.2.4"
3
+ __version__ = "0.2.6"
4
4
 
5
5
  import logging
6
6
  import os
@@ -1,6 +1,7 @@
1
1
  from typing import Dict, List, Union
2
2
 
3
3
  import pandas as pd
4
+ from tqdm import tqdm
4
5
 
5
6
  import arxiv
6
7
 
@@ -11,6 +12,7 @@ arxiv_field_mapper = {
11
12
  "published": "date",
12
13
  "journal_ref": "journal",
13
14
  "summary": "abstract",
15
+ "entry_id": "doi",
14
16
  }
15
17
 
16
18
  # Authors, date, and journal fields need specific processing
@@ -18,6 +20,7 @@ process_fields = {
18
20
  "authors": lambda authors: ", ".join([a.name for a in authors]),
19
21
  "date": lambda date: date.strftime("%Y-%m-%d"),
20
22
  "journal": lambda j: j if j is not None else "",
23
+ "doi": lambda entry_id: f"10.48550/arXiv.{entry_id.split('/')[-1].split('v')[0]}",
21
24
  }
22
25
 
23
26
 
@@ -58,9 +61,9 @@ def get_arxiv_papers(
58
61
  arxiv_field_mapper.get(key, key), lambda x: x
59
62
  )(value)
60
63
  for key, value in vars(paper).items()
61
- if arxiv_field_mapper.get(key, key) in fields
64
+ if arxiv_field_mapper.get(key, key) in fields and key != "doi"
62
65
  }
63
- for paper in results
66
+ for paper in tqdm(results, desc=f"Processing {query}")
64
67
  ]
65
68
  )
66
69
  return processed
@@ -71,7 +74,7 @@ def get_and_dump_arxiv_papers(
71
74
  output_filepath: str,
72
75
  fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
73
76
  *args,
74
- **kwargs
77
+ **kwargs,
75
78
  ):
76
79
  """
77
80
  Combines get_arxiv_papers and dump_papers.
@@ -0,0 +1,47 @@
1
+ """Dump bioRxiv data in JSONL format."""
2
+ import json
3
+ import os
4
+ from datetime import datetime
5
+ from typing import Optional
6
+
7
+ import pkg_resources
8
+ from tqdm import tqdm
9
+
10
+ from ..xrxiv.xrxiv_api import BioRxivApi
11
+
12
+ today = datetime.today().strftime("%Y-%m-%d")
13
+ save_path = os.path.join(
14
+ pkg_resources.resource_filename("paperscraper", "server_dumps"),
15
+ f"biorxiv_{today}.jsonl",
16
+ )
17
+
18
+
19
+ def biorxiv(
20
+ begin_date: Optional[str] = None,
21
+ end_date: Optional[str] = None,
22
+ save_path: str = save_path,
23
+ ):
24
+ """Fetches papers from biorxiv based on time range, i.e., begin_date and end_date.
25
+ If the begin_date and end_date are not provided, papers will be fetched from biorxiv
26
+ from the launch date of biorxiv until the current date. The fetched papers will be
27
+ stored in jsonl format in save_path.
28
+
29
+ Args:
30
+ save_path (str, optional): Path where the dump is stored.
31
+ Defaults to save_path.
32
+ begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
33
+ Defaults to None.
34
+ end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
35
+ Defaults to None.
36
+ """
37
+ # create API client
38
+ api = BioRxivApi()
39
+
40
+ # dump all papers
41
+ with open(save_path, "w") as fp:
42
+ for index, paper in enumerate(
43
+ tqdm(api.get_papers(begin_date=begin_date, end_date=end_date))
44
+ ):
45
+ if index > 0:
46
+ fp.write(os.linesep)
47
+ fp.write(json.dumps(paper))
@@ -3,6 +3,7 @@ import logging
3
3
  import os
4
4
  import sys
5
5
  from datetime import datetime
6
+ from typing import Optional
6
7
 
7
8
  import pkg_resources
8
9
 
@@ -16,17 +17,27 @@ save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
16
17
  save_path = os.path.join(save_folder, f"chemrxiv_{today}.jsonl")
17
18
 
18
19
 
19
- def chemrxiv(save_path: str = save_path) -> None:
20
- """Fetches all papers from biorxiv until current date, stores them in jsonl
21
- format in save_path.
20
+ def chemrxiv(
21
+ begin_date: Optional[str] = None,
22
+ end_date: Optional[str] = None,
23
+ save_path: str = save_path,
24
+ ) -> None:
25
+ """Fetches papers from chemrxiv based on time range, i.e., begin_date and end_date.
26
+ If the begin_date and end_date are not provided, papers will be fetched from chemrxiv
27
+ from the launch date of chemrxiv until the current date. The fetched papers will be
28
+ stored in jsonl format in save_path.
22
29
 
23
30
  Args:
31
+ begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
32
+ Defaults to None.
33
+ end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
34
+ Defaults to None.
24
35
  save_path (str, optional): Path where the dump is stored.
25
36
  Defaults to save_path.
26
37
  """
27
38
 
28
39
  # create API client
29
- api = ChemrxivAPI()
40
+ api = ChemrxivAPI(begin_date, end_date)
30
41
  # Download the data
31
42
  download_full(save_folder, api)
32
43
  # Convert to JSONL format.
@@ -0,0 +1,44 @@
1
+ """Dump medrxiv data in JSONL format."""
2
+ import json
3
+ import os
4
+ from datetime import datetime
5
+ from typing import Optional
6
+
7
+ import pkg_resources
8
+ from tqdm import tqdm
9
+
10
+ from ..xrxiv.xrxiv_api import MedRxivApi
11
+
12
+ today = datetime.today().strftime("%Y-%m-%d")
13
+ save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
14
+ save_path = os.path.join(save_folder, f"medrxiv_{today}.jsonl")
15
+
16
+
17
+ def medrxiv(
18
+ begin_date: Optional[str] = None,
19
+ end_date: Optional[str] = None,
20
+ save_path: str = save_path,
21
+ ):
22
+ """Fetches papers from medrxiv based on time range, i.e., begin_date and end_date.
23
+ If the begin_date and end_date are not provided, then papers will be fetched from
24
+ medrxiv starting from the launch date of medrxiv until current date. The fetched
25
+ papers will be stored in jsonl format in save_path.
26
+
27
+ Args:
28
+ save_path (str, optional): Path where the dump is stored.
29
+ Defaults to save_path.
30
+ begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
31
+ Defaults to None.
32
+ end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
33
+ Defaults to None.
34
+ """
35
+ # create API client
36
+ api = MedRxivApi()
37
+ # dump all papers
38
+ with open(save_path, "w") as fp:
39
+ for index, paper in enumerate(
40
+ tqdm(api.get_papers(begin_date=begin_date, end_date=end_date))
41
+ ):
42
+ if index > 0:
43
+ fp.write(os.linesep)
44
+ fp.write(json.dumps(paper))
@@ -0,0 +1,125 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from datetime import datetime
5
+ from typing import Dict, Optional
6
+
7
+ import requests
8
+
9
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
10
+ logger = logging.getLogger(__name__)
11
+
12
+ now_datetime = datetime.now()
13
+ launch_dates = {"chemrxiv": "2017-01-01"}
14
+
15
+
16
+ class ChemrxivAPI:
17
+ """Handle OpenEngage API requests, using access.
18
+ Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
19
+ """
20
+
21
+ base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1"
22
+
23
+ def __init__(
24
+ self,
25
+ begin_date: Optional[str] = None,
26
+ end_date: Optional[str] = None,
27
+ page_size: Optional[int] = None,
28
+ ):
29
+ """
30
+ Initialize API class.
31
+
32
+ Args:
33
+ begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
34
+ Defaults to None.
35
+ end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
36
+ Defaults to None.
37
+ page_size (int, optional): The batch size used to fetch the records from chemrxiv.
38
+ """
39
+
40
+ self.page_size = page_size or 50
41
+
42
+ # Begin Date and End Date of the search
43
+ launch_date = launch_dates["chemrxiv"]
44
+ launch_datetime = datetime.fromisoformat(launch_date)
45
+
46
+ if begin_date:
47
+ begin_datetime = datetime.fromisoformat(begin_date)
48
+ if begin_datetime < launch_datetime:
49
+ self.begin_date = launch_date
50
+ logger.warning(
51
+ f"Begin date {begin_date} is before chemrxiv launch date. Will use {launch_date} instead."
52
+ )
53
+ else:
54
+ self.begin_date = begin_date
55
+ else:
56
+ self.begin_date = launch_date
57
+ if end_date:
58
+ end_datetime = datetime.fromisoformat(end_date)
59
+ if end_datetime > now_datetime:
60
+ logger.warning(
61
+ f"End date {end_date} is in the future. Will use {now_datetime} instead."
62
+ )
63
+ self.end_date = now_datetime.strftime("%Y-%m-%d")
64
+ else:
65
+ self.end_date = end_date
66
+ else:
67
+ self.end_date = now_datetime.strftime("%Y-%m-%d")
68
+
69
+ def request(self, url, method, params=None):
70
+ """Send an API request to open Engage."""
71
+
72
+ if method.casefold() == "get":
73
+ return requests.get(url, params=params)
74
+ elif method.casefold() == "post":
75
+ return requests.post(url, json=params)
76
+ else:
77
+ raise ConnectionError(f"Unknown method for query: {method}")
78
+
79
+ def query(self, query, method="get", params=None):
80
+ """Perform a direct query."""
81
+ r = self.request(
82
+ os.path.join(f"{self.base}", f"{query}"), method, params=params
83
+ )
84
+ r.raise_for_status()
85
+ return r.json()
86
+
87
+ def query_generator(self, query, method: str = "get", params: Dict = {}):
88
+ """Query for a list of items, with paging. Returns a generator."""
89
+
90
+ page = 0
91
+ while True:
92
+ params.update(
93
+ {
94
+ "limit": self.page_size,
95
+ "skip": page * self.page_size,
96
+ "searchDateFrom": self.begin_date,
97
+ "searchDateTo": self.end_date,
98
+ }
99
+ )
100
+ r = self.request(os.path.join(self.base, query), method, params=params)
101
+ if r.status_code == 400:
102
+ raise ValueError(r.json()["message"])
103
+ r.raise_for_status()
104
+ r = r.json()
105
+ r = r["itemHits"]
106
+
107
+ # If we have no more results, bail out
108
+ if len(r) == 0:
109
+ return
110
+
111
+ yield from r
112
+ page += 1
113
+
114
+ def all_preprints(self):
115
+ """Return a generator to all the chemRxiv articles."""
116
+ return self.query_generator("items")
117
+
118
+ def preprint(self, article_id):
119
+ """Information on a given preprint.
120
+ .. seealso:: https://docs.figshare.com/#public_article
121
+ """
122
+ return self.query(os.path.join("items", article_id))
123
+
124
+ def number_of_preprints(self):
125
+ return self.query("items")["totalCount"]
@@ -28,9 +28,7 @@ def get_author(author_list: List[Dict]) -> str:
28
28
  str: ;-concatenated author list.
29
29
  """
30
30
 
31
- return "; ".join(
32
- [" ".join([a["firstName"], a["lastName"]]) for a in author_list]
33
- )
31
+ return "; ".join([" ".join([a["firstName"], a["lastName"]]) for a in author_list])
34
32
 
35
33
 
36
34
  def get_categories(category_list: List[Dict]) -> str:
@@ -143,7 +141,7 @@ def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:
143
141
  except HTTPError:
144
142
  logger.warning(f"HTTP API Client error for ID: {preprint_id}")
145
143
  except SSLError:
146
- logger.warning(f'SSLError for ID: {preprint_id}')
144
+ logger.warning(f"SSLError for ID: {preprint_id}")
147
145
 
148
146
  with open(path, "w") as file:
149
147
  json.dump(preprint, file, indent=2)
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import sys
3
- from typing import List, Dict
3
+ from typing import Dict, List
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -1,7 +1,8 @@
1
- from pymed.article import PubMedArticle
2
1
  import warnings
3
2
  from typing import List, Union
4
3
 
4
+ from pymed.article import PubMedArticle
5
+
5
6
  finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
6
7
  finalize_conjunction = lambda x: x[:-5]
7
8
  date_root = '("{0}"[Date - Create] : "{1}"[Date - Create])'
@@ -1,8 +1,8 @@
1
1
  """API for bioRxiv and medRXiv."""
2
- import requests
3
2
  from datetime import datetime
4
- from typing import Optional, List, Generator
3
+ from typing import Generator, List, Optional
5
4
 
5
+ import requests
6
6
 
7
7
  launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
8
8
 
@@ -1,13 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: paperscraper
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
7
7
  Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
8
8
  License: MIT
9
9
  Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
10
- Platform: UNKNOWN
11
10
  Classifier: Development Status :: 3 - Alpha
12
11
  Classifier: Intended Audience :: Developers
13
12
  Classifier: Intended Audience :: Science/Research
@@ -56,10 +55,15 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
56
55
  biorxiv() # Takes ~1h and should result in ~350 MB file
57
56
  chemrxiv() # Takes ~45min and should result in ~20 MB file
58
57
  ```
59
-
60
58
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
61
59
  so that the changes take effect.
62
60
 
61
+ Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for the contributions!
62
+ ```py
63
+ medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
64
+ ```
65
+ But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
66
+
63
67
  ## Examples
64
68
 
65
69
  `paperscraper` is built on top of the packages [pymed](https://pypi.org/project/pymed/),
@@ -305,5 +309,3 @@ If you use `paperscraper`, please cite the papers that motivated our development
305
309
  author = {Jannis Born and David Beymer and Deepta Rajan and Adam Coy and Vandana V. Mukherjee and Matteo Manica and Prasanth Prasanna and Deddeh Ballah and Michal Guindy and Dorith Shaham and Pallav L. Shah and Emmanouil Karteris and Jan L. Robertus and Maria Gabrani and Michal Rosen-Zvi}
306
310
  }
307
311
  ```
308
-
309
-
@@ -1,34 +0,0 @@
1
- """Dump bioRxiv data in JSONL format."""
2
- import json
3
- import os
4
- from datetime import datetime
5
-
6
- import pkg_resources
7
- from tqdm import tqdm
8
-
9
- from ..xrxiv.xrxiv_api import BioRxivApi
10
-
11
- today = datetime.today().strftime("%Y-%m-%d")
12
- save_path = os.path.join(
13
- pkg_resources.resource_filename("paperscraper", "server_dumps"),
14
- f"biorxiv_{today}.jsonl",
15
- )
16
-
17
-
18
- def biorxiv(save_path: str = save_path):
19
- """Fetches all papers from biorxiv until current date, stores them in jsonl
20
- format in save_path.
21
-
22
- Args:
23
- save_path (str, optional): Path where the dump is stored.
24
- Defaults to save_path.
25
- """
26
- # create API client
27
- api = BioRxivApi()
28
-
29
- # dump all papers
30
- with open(save_path, "w") as fp:
31
- for index, paper in enumerate(tqdm(api.get_papers())):
32
- if index > 0:
33
- fp.write(os.linesep)
34
- fp.write(json.dumps(paper))
@@ -1,31 +0,0 @@
1
- """Dump medrxiv data in JSONL format."""
2
- import json
3
- import os
4
- from datetime import datetime
5
-
6
- import pkg_resources
7
- from tqdm import tqdm
8
-
9
- from ..xrxiv.xrxiv_api import MedRxivApi
10
-
11
- today = datetime.today().strftime("%Y-%m-%d")
12
- save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
13
- save_path = os.path.join(save_folder, f"medrxiv_{today}.jsonl")
14
-
15
-
16
- def medrxiv(save_path: str = save_path):
17
- """Fetches all papers from medrxiv until current date, stores them in jsonl
18
- format in save_path.
19
-
20
- Args:
21
- save_path (str, optional): Path where the dump is stored.
22
- Defaults to save_path.
23
- """
24
- # create API client
25
- api = MedRxivApi()
26
- # dump all papers
27
- with open(save_path, "w") as fp:
28
- for index, paper in enumerate(tqdm(api.get_papers())):
29
- if index > 0:
30
- fp.write(os.linesep)
31
- fp.write(json.dumps(paper))
@@ -1,67 +0,0 @@
1
- import os
2
- from typing import Optional, Dict
3
-
4
- import requests
5
-
6
-
7
- class ChemrxivAPI:
8
- """Handle OpenEngage API requests, using access.
9
- Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
10
- """
11
-
12
- base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1"
13
-
14
- def __init__(self, page_size: Optional[int] = None):
15
-
16
- self.page_size = page_size or 50
17
-
18
- def request(self, url, method, params=None):
19
- """Send an API request to open Engage."""
20
-
21
- if method.casefold() == "get":
22
- return requests.get(url, params=params)
23
- elif method.casefold() == "post":
24
- return requests.post(url, json=params)
25
- else:
26
- raise ConnectionError(f"Unknown method for query: {method}")
27
-
28
- def query(self, query, method="get", params=None):
29
- """Perform a direct query."""
30
- r = self.request(
31
- os.path.join(f"{self.base}", f"{query}"), method, params=params
32
- )
33
- r.raise_for_status()
34
- return r.json()
35
-
36
- def query_generator(self, query, method: str = "get", params: Dict = {}):
37
- """Query for a list of items, with paging. Returns a generator."""
38
-
39
- page = 0
40
- while True:
41
- params.update({"limit": self.page_size, "skip": page * self.page_size})
42
- r = self.request(os.path.join(self.base, query), method, params=params)
43
- if r.status_code == 400:
44
- raise ValueError(r.json()["message"])
45
- r.raise_for_status()
46
- r = r.json()
47
- r = r["itemHits"]
48
-
49
- # If we have no more results, bail out
50
- if len(r) == 0:
51
- return
52
-
53
- yield from r
54
- page += 1
55
-
56
- def all_preprints(self):
57
- """Return a generator to all the chemRxiv articles."""
58
- return self.query_generator("items")
59
-
60
- def preprint(self, article_id):
61
- """Information on a given preprint.
62
- .. seealso:: https://docs.figshare.com/#public_article
63
- """
64
- return self.query(os.path.join("items", article_id))
65
-
66
- def number_of_preprints(self):
67
- return self.query("items")["totalCount"]
File without changes
@@ -1,3 +1,3 @@
1
1
  from .biorxiv import biorxiv # noqa
2
- from .medrxiv import medrxiv # noqa
3
2
  from .chemrxiv import chemrxiv # noqa
3
+ from .medrxiv import medrxiv # noqa
@@ -1,2 +1,2 @@
1
- from .utils import * # noqa
2
1
  from .chemrxiv_api import ChemrxivAPI # noqa
2
+ from .utils import * # noqa
File without changes
File without changes