PyPI - paperscraper - Versions diffs - 0.2.4__tar.gz → 0.2.6__tar.gz - Mend

paperscraper 0.2.4.tar.gz → 0.2.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{paperscraper-0.2.4 → paperscraper-0.2.6}/PKG-INFO RENAMED Viewed

@@ -1,13 +1,12 @@
 Metadata-Version: 2.1
 Name: paperscraper
-Version: 0.2.4
+Version: 0.2.6
 Summary: paperscraper: Package to scrape papers.
 Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
 Author: Jannis Born, Matteo Manica
 Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
 License: MIT
 Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
-Platform: UNKNOWN
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -56,10 +55,15 @@ medrxiv()  #  Takes ~30min and should result in ~35 MB file
 biorxiv()  # Takes ~1h and should result in ~350 MB file
 chemrxiv()  #  Takes ~45min and should result in ~20 MB file
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
 so that the changes take effect.
+Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
+```py
+medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
+```
+But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
 ## Examples
 `paperscraper` is built on top of the packages [pymed](https://pypi.org/project/pymed/),
@@ -305,5 +309,3 @@ If you use `paperscraper`, please cite the papers that motivated our development
 	author = {Jannis Born and David Beymer and Deepta Rajan and Adam Coy and Vandana V. Mukherjee and Matteo Manica and Prasanth Prasanna and Deddeh Ballah and Michal Guindy and Dorith Shaham and Pallav L. Shah and Emmanouil Karteris and Jan L. Robertus and Maria Gabrani and Michal Rosen-Zvi}
 }
 ```

{paperscraper-0.2.4 → paperscraper-0.2.6}/README.md RENAMED Viewed

@@ -35,10 +35,15 @@ medrxiv()  #  Takes ~30min and should result in ~35 MB file
 biorxiv()  # Takes ~1h and should result in ~350 MB file
 chemrxiv()  #  Takes ~45min and should result in ~20 MB file
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
 so that the changes take effect.
+Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
+```py
+medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
+```
+But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
 ## Examples
 `paperscraper` is built on top of the packages [pymed](https://pypi.org/project/pymed/),

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """Initialize the module."""
 __name__ = "paperscraper"
-__version__ = "0.2.4"
+__version__ = "0.2.6"
 import logging
 import os

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/arxiv.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from typing import Dict, List, Union
 import pandas as pd
+from tqdm import tqdm
 import arxiv
@@ -11,6 +12,7 @@ arxiv_field_mapper = {
     "published": "date",
     "journal_ref": "journal",
     "summary": "abstract",
+    "entry_id": "doi",
 }
 # Authors, date, and journal fields need specific processing
@@ -18,6 +20,7 @@ process_fields = {
     "authors": lambda authors: ", ".join([a.name for a in authors]),
     "date": lambda date: date.strftime("%Y-%m-%d"),
     "journal": lambda j: j if j is not None else "",
+    "doi": lambda entry_id: f"10.48550/arXiv.{entry_id.split('/')[-1].split('v')[0]}",
 }
@@ -58,9 +61,9 @@ def get_arxiv_papers(
                     arxiv_field_mapper.get(key, key), lambda x: x
                 )(value)
                 for key, value in vars(paper).items()
-                if arxiv_field_mapper.get(key, key) in fields
+                if arxiv_field_mapper.get(key, key) in fields and key != "doi"
             }
-            for paper in results
+            for paper in tqdm(results, desc=f"Processing {query}")
         ]
     )
     return processed
@@ -71,7 +74,7 @@ def get_and_dump_arxiv_papers(
     output_filepath: str,
     fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
     *args,
-    **kwargs
+    **kwargs,
 ):
     """
     Combines get_arxiv_papers and dump_papers.

paperscraper-0.2.6/paperscraper/get_dumps/biorxiv.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Dump bioRxiv data in JSONL format."""
+import json
+import os
+from datetime import datetime
+from typing import Optional
+import pkg_resources
+from tqdm import tqdm
+from ..xrxiv.xrxiv_api import BioRxivApi
+today = datetime.today().strftime("%Y-%m-%d")
+save_path = os.path.join(
+    pkg_resources.resource_filename("paperscraper", "server_dumps"),
+    f"biorxiv_{today}.jsonl",
+)
+def biorxiv(
+    begin_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+    save_path: str = save_path,
+):
+    """Fetches papers from biorxiv based on time range, i.e., begin_date and end_date.
+    If the begin_date and end_date are not provided, papers will be fetched from biorxiv
+    from the launch date of biorxiv until the current date. The fetched papers will be
+    stored in jsonl format in save_path.
+    Args:
+        save_path (str, optional): Path where the dump is stored.
+            Defaults to save_path.
+        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None.
+        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
+            Defaults to None.
+    """
+    # create API client
+    api = BioRxivApi()
+    # dump all papers
+    with open(save_path, "w") as fp:
+        for index, paper in enumerate(
+            tqdm(api.get_papers(begin_date=begin_date, end_date=end_date))
+        ):
+            if index > 0:
+                fp.write(os.linesep)
+            fp.write(json.dumps(paper))

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/chemrxiv.py RENAMED Viewed

@@ -3,6 +3,7 @@ import logging
 import os
 import sys
 from datetime import datetime
+from typing import Optional
 import pkg_resources
@@ -16,17 +17,27 @@ save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
 save_path = os.path.join(save_folder, f"chemrxiv_{today}.jsonl")
-def chemrxiv(save_path: str = save_path) -> None:
-    """Fetches all papers from biorxiv until current date, stores them in jsonl
-    format in save_path.
+def chemrxiv(
+    begin_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+    save_path: str = save_path,
+) -> None:
+    """Fetches papers from chemrxiv based on time range, i.e., begin_date and end_date.
+    If the begin_date and end_date are not provided, papers will be fetched from chemrxiv
+    from the launch date of chemrxiv until the current date. The fetched papers will be
+    stored in jsonl format in save_path.
     Args:
+        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None.
+        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
+            Defaults to None.
         save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
     """
     # create API client
-    api = ChemrxivAPI()
+    api = ChemrxivAPI(begin_date, end_date)
     # Download the data
     download_full(save_folder, api)
     # Convert to JSONL format.

paperscraper-0.2.6/paperscraper/get_dumps/medrxiv.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""Dump medrxiv data in JSONL format."""
+import json
+import os
+from datetime import datetime
+from typing import Optional
+import pkg_resources
+from tqdm import tqdm
+from ..xrxiv.xrxiv_api import MedRxivApi
+today = datetime.today().strftime("%Y-%m-%d")
+save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
+save_path = os.path.join(save_folder, f"medrxiv_{today}.jsonl")
+def medrxiv(
+    begin_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+    save_path: str = save_path,
+):
+    """Fetches papers from medrxiv based on time range, i.e., begin_date and end_date.
+    If the begin_date and end_date are not provided, then papers will be fetched from
+    medrxiv starting from the launch date of medrxiv until current date. The fetched
+    papers will be stored in jsonl format in save_path.
+    Args:
+        save_path (str, optional): Path where the dump is stored.
+            Defaults to save_path.
+        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None.
+        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
+            Defaults to None.
+    """
+    # create API client
+    api = MedRxivApi()
+    # dump all papers
+    with open(save_path, "w") as fp:
+        for index, paper in enumerate(
+            tqdm(api.get_papers(begin_date=begin_date, end_date=end_date))
+        ):
+            if index > 0:
+                fp.write(os.linesep)
+            fp.write(json.dumps(paper))

paperscraper-0.2.6/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py ADDED Viewed

@@ -0,0 +1,125 @@
+import logging
+import os
+import sys
+from datetime import datetime
+from typing import Dict, Optional
+import requests
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+now_datetime = datetime.now()
+launch_dates = {"chemrxiv": "2017-01-01"}
+class ChemrxivAPI:
+    """Handle OpenEngage API requests, using access.
+    Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
+    """
+    base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1"
+    def __init__(
+        self,
+        begin_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        page_size: Optional[int] = None,
+    ):
+        """
+        Initialize API class.
+        Args:
+            begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
+                Defaults to None.
+            end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
+                Defaults to None.
+            page_size (int, optional): The batch size used to fetch the records from chemrxiv.
+        """
+        self.page_size = page_size or 50
+        # Begin Date and End Date of the search
+        launch_date = launch_dates["chemrxiv"]
+        launch_datetime = datetime.fromisoformat(launch_date)
+        if begin_date:
+            begin_datetime = datetime.fromisoformat(begin_date)
+            if begin_datetime < launch_datetime:
+                self.begin_date = launch_date
+                logger.warning(
+                    f"Begin date {begin_date} is before chemrxiv launch date. Will use {launch_date} instead."
+                )
+            else:
+                self.begin_date = begin_date
+        else:
+            self.begin_date = launch_date
+        if end_date:
+            end_datetime = datetime.fromisoformat(end_date)
+            if end_datetime > now_datetime:
+                logger.warning(
+                    f"End date {end_date} is in the future. Will use {now_datetime} instead."
+                )
+                self.end_date = now_datetime.strftime("%Y-%m-%d")
+            else:
+                self.end_date = end_date
+        else:
+            self.end_date = now_datetime.strftime("%Y-%m-%d")
+    def request(self, url, method, params=None):
+        """Send an API request to open Engage."""
+        if method.casefold() == "get":
+            return requests.get(url, params=params)
+        elif method.casefold() == "post":
+            return requests.post(url, json=params)
+        else:
+            raise ConnectionError(f"Unknown method for query: {method}")
+    def query(self, query, method="get", params=None):
+        """Perform a direct query."""
+        r = self.request(
+            os.path.join(f"{self.base}", f"{query}"), method, params=params
+        )
+        r.raise_for_status()
+        return r.json()
+    def query_generator(self, query, method: str = "get", params: Dict = {}):
+        """Query for a list of items, with paging. Returns a generator."""
+        page = 0
+        while True:
+            params.update(
+                {
+                    "limit": self.page_size,
+                    "skip": page * self.page_size,
+                    "searchDateFrom": self.begin_date,
+                    "searchDateTo": self.end_date,
+                }
+            )
+            r = self.request(os.path.join(self.base, query), method, params=params)
+            if r.status_code == 400:
+                raise ValueError(r.json()["message"])
+            r.raise_for_status()
+            r = r.json()
+            r = r["itemHits"]
+            # If we have no more results, bail out
+            if len(r) == 0:
+                return
+            yield from r
+            page += 1
+    def all_preprints(self):
+        """Return a generator to all the chemRxiv articles."""
+        return self.query_generator("items")
+    def preprint(self, article_id):
+        """Information on a given preprint.
+        .. seealso:: https://docs.figshare.com/#public_article
+        """
+        return self.query(os.path.join("items", article_id))
+    def number_of_preprints(self):
+        return self.query("items")["totalCount"]

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/utils/chemrxiv/utils.py RENAMED Viewed

@@ -28,9 +28,7 @@ def get_author(author_list: List[Dict]) -> str:
         str: ;-concatenated author list.
     """
-    return "; ".join(
-        [" ".join([a["firstName"], a["lastName"]]) for a in author_list]
-    )
+    return "; ".join([" ".join([a["firstName"], a["lastName"]]) for a in author_list])
 def get_categories(category_list: List[Dict]) -> str:
@@ -143,7 +141,7 @@ def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:
         except HTTPError:
             logger.warning(f"HTTP API Client error for ID: {preprint_id}")
         except SSLError:
-            logger.warning(f'SSLError for ID: {preprint_id}')
+            logger.warning(f"SSLError for ID: {preprint_id}")
         with open(path, "w") as file:
             json.dump(preprint, file, indent=2)

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/postprocessing.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import logging
 import sys
-from typing import List, Dict
+from typing import Dict, List
 import numpy as np
 import pandas as pd

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/pubmed/utils.py RENAMED Viewed

@@ -1,7 +1,8 @@
-from pymed.article import PubMedArticle
 import warnings
 from typing import List, Union
+from pymed.article import PubMedArticle
 finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
 finalize_conjunction = lambda x: x[:-5]
 date_root = '("{0}"[Date - Create] : "{1}"[Date - Create])'

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/xrxiv/xrxiv_api.py RENAMED Viewed

@@ -1,8 +1,8 @@
 """API for bioRxiv and medRXiv."""
-import requests
 from datetime import datetime
-from typing import Optional, List, Generator
+from typing import Generator, List, Optional
+import requests
 launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper.egg-info/PKG-INFO RENAMED Viewed

@@ -1,13 +1,12 @@
 Metadata-Version: 2.1
 Name: paperscraper
-Version: 0.2.4
+Version: 0.2.6
 Summary: paperscraper: Package to scrape papers.
 Home-page: https://github.com/PhosphorylatedRabbits/paperscraper
 Author: Jannis Born, Matteo Manica
 Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
 License: MIT
 Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv
-Platform: UNKNOWN
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -56,10 +55,15 @@ medrxiv()  #  Takes ~30min and should result in ~35 MB file
 biorxiv()  # Takes ~1h and should result in ~350 MB file
 chemrxiv()  #  Takes ~45min and should result in ~20 MB file
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
 so that the changes take effect.
+Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
+```py
+medrxiv(begin_date="2023-04-01", end_date="2023-04-08")
+```
+But watch out. The resulting `.jsonl` file will be labelled according to the current date and all your subsequent searches will be based on this file **only**. If you use this option you might want to keep an eye on the source files (`paperscraper/server_dumps/*jsonl`) to ensure they contain the paper metadata for all papers you're interested in.
 ## Examples
 `paperscraper` is built on top of the packages [pymed](https://pypi.org/project/pymed/),
@@ -305,5 +309,3 @@ If you use `paperscraper`, please cite the papers that motivated our development
 	author = {Jannis Born and David Beymer and Deepta Rajan and Adam Coy and Vandana V. Mukherjee and Matteo Manica and Prasanth Prasanna and Deddeh Ballah and Michal Guindy and Dorith Shaham and Pallav L. Shah and Emmanouil Karteris and Jan L. Robertus and Maria Gabrani and Michal Rosen-Zvi}
 }
 ```

paperscraper-0.2.4/paperscraper/get_dumps/biorxiv.py DELETED Viewed

@@ -1,34 +0,0 @@
-"""Dump bioRxiv data in JSONL format."""
-import json
-import os
-from datetime import datetime
-import pkg_resources
-from tqdm import tqdm
-from ..xrxiv.xrxiv_api import BioRxivApi
-today = datetime.today().strftime("%Y-%m-%d")
-save_path = os.path.join(
-    pkg_resources.resource_filename("paperscraper", "server_dumps"),
-    f"biorxiv_{today}.jsonl",
-)
-def biorxiv(save_path: str = save_path):
-    """Fetches all papers from biorxiv until current date, stores them in jsonl
-    format in save_path.
-    Args:
-        save_path (str, optional): Path where the dump is stored.
-            Defaults to save_path.
-    """
-    # create API client
-    api = BioRxivApi()
-    # dump all papers
-    with open(save_path, "w") as fp:
-        for index, paper in enumerate(tqdm(api.get_papers())):
-            if index > 0:
-                fp.write(os.linesep)
-            fp.write(json.dumps(paper))

paperscraper-0.2.4/paperscraper/get_dumps/medrxiv.py DELETED Viewed

@@ -1,31 +0,0 @@
-"""Dump medrxiv data in JSONL format."""
-import json
-import os
-from datetime import datetime
-import pkg_resources
-from tqdm import tqdm
-from ..xrxiv.xrxiv_api import MedRxivApi
-today = datetime.today().strftime("%Y-%m-%d")
-save_folder = pkg_resources.resource_filename("paperscraper", "server_dumps")
-save_path = os.path.join(save_folder, f"medrxiv_{today}.jsonl")
-def medrxiv(save_path: str = save_path):
-    """Fetches all papers from medrxiv until current date, stores them in jsonl
-    format in save_path.
-    Args:
-        save_path (str, optional): Path where the dump is stored.
-            Defaults to save_path.
-    """
-    # create API client
-    api = MedRxivApi()
-    # dump all papers
-    with open(save_path, "w") as fp:
-        for index, paper in enumerate(tqdm(api.get_papers())):
-            if index > 0:
-                fp.write(os.linesep)
-            fp.write(json.dumps(paper))

paperscraper-0.2.4/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py DELETED Viewed

@@ -1,67 +0,0 @@
-import os
-from typing import Optional, Dict
-import requests
-class ChemrxivAPI:
-    """Handle OpenEngage API requests, using access.
-    Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
-    """
-    base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1"
-    def __init__(self, page_size: Optional[int] = None):
-        self.page_size = page_size or 50
-    def request(self, url, method, params=None):
-        """Send an API request to open Engage."""
-        if method.casefold() == "get":
-            return requests.get(url, params=params)
-        elif method.casefold() == "post":
-            return requests.post(url, json=params)
-        else:
-            raise ConnectionError(f"Unknown method for query: {method}")
-    def query(self, query, method="get", params=None):
-        """Perform a direct query."""
-        r = self.request(
-            os.path.join(f"{self.base}", f"{query}"), method, params=params
-        )
-        r.raise_for_status()
-        return r.json()
-    def query_generator(self, query, method: str = "get", params: Dict = {}):
-        """Query for a list of items, with paging. Returns a generator."""
-        page = 0
-        while True:
-            params.update({"limit": self.page_size, "skip": page * self.page_size})
-            r = self.request(os.path.join(self.base, query), method, params=params)
-            if r.status_code == 400:
-                raise ValueError(r.json()["message"])
-            r.raise_for_status()
-            r = r.json()
-            r = r["itemHits"]
-            # If we have no more results, bail out
-            if len(r) == 0:
-                return
-            yield from r
-            page += 1
-    def all_preprints(self):
-        """Return a generator to all the chemRxiv articles."""
-        return self.query_generator("items")
-    def preprint(self, article_id):
-        """Information on a given preprint.
-        .. seealso:: https://docs.figshare.com/#public_article
-        """
-        return self.query(os.path.join("items", article_id))
-    def number_of_preprints(self):
-        return self.query("items")["totalCount"]

{paperscraper-0.2.4 → paperscraper-0.2.6}/LICENSE RENAMED Viewed

File without changes

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/__init__.py RENAMED Viewed

File without changes

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/arxiv/utils.py RENAMED Viewed

File without changes

{paperscraper-0.2.4 → paperscraper-0.2.6}/paperscraper/get_dumps/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 from .biorxiv import biorxiv  # noqa
-from .medrxiv import medrxiv  # noqa
 from .chemrxiv import chemrxiv  # noqa
+from .medrxiv import medrxiv  # noqa