PyPI - paperscraper - Versions diffs - 0.3.0__tar.gz → 0.3.1__tar.gz - Mend

paperscraper 0.3.0tar.gz → 0.3.1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

{paperscraper-0.3.0 → paperscraper-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: paperscraper
-Version: 0.3.0
+Version: 0.3.1
 Summary: paperscraper: Package to scrape papers.
 Home-page: https://github.com/jannisborn/paperscraper
 Author: Jannis Born, Matteo Manica
@@ -20,7 +20,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: arxiv>=1.4.2
-Requires-Dist: pymed-paperscraper>=1.0.3
+Requires-Dist: pymed-paperscraper>=1.0.4
 Requires-Dist: pandas
 Requires-Dist: requests
 Requires-Dist: tqdm
@@ -35,6 +35,9 @@ Requires-Dist: pytest
 Requires-Dist: tldextract
 Requires-Dist: semanticscholar
 Requires-Dist: pydantic
+Requires-Dist: unidecode
+Requires-Dist: dotenv
+Requires-Dist: boto3
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -86,7 +89,7 @@ and plotting routines for meta-analysis.
 pip install paperscraper
 ```
-This is enough to query **PubMed**, **arXiv** or Google Scholar.
+This is enough to query PubMed, arXiv or Google Scholar.
 #### Download X-rxiv Dumps
@@ -230,6 +233,7 @@ For more comprehensive access to papers from major publishers, you can provide A
 - **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
 - **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
+- **bioRxiv TDM API**: Enables access to [bioRxiv](https://www.biorxiv.org/tdm) publications (since May 2025, bioRxiv has been protected by Cloudflare).
 To use publisher APIs:
@@ -237,7 +241,11 @@ To use publisher APIs:
 ```
 WILEY_TDM_API_TOKEN=your_wiley_token_here
 ELSEVIER_TDM_API_KEY=your_elsevier_key_here
+AWS_ACCESS_KEY_ID=your_aws_access_key_here
+AWS_SECRET_ACCESS_KEY=your_aws_secret_key_here
 ```
+NOTE: The AWS keys can be created in the IAM section of your AWS account. When creating the key, make sure you attach the `AmazonS3ReadOnlyAccess` permission policy.
+NOTE: If you name the file `.env`, it will be loaded automatically, provided it is located in the current working directory or in any parent directory up to your home directory.
 2. Pass the file path when calling retrieval functions:

{paperscraper-0.3.0 → paperscraper-0.3.1}/README.md RENAMED Viewed

@@ -37,7 +37,7 @@ and plotting routines for meta-analysis.
 pip install paperscraper
 ```
-This is enough to query **PubMed**, **arXiv** or Google Scholar.
+This is enough to query PubMed, arXiv or Google Scholar.
 #### Download X-rxiv Dumps
@@ -181,6 +181,7 @@ For more comprehensive access to papers from major publishers, you can provide A
 - **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
 - **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
+- **bioRxiv TDM API**: Enables access to [bioRxiv](https://www.biorxiv.org/tdm) publications (since May 2025, bioRxiv has been protected by Cloudflare).
 To use publisher APIs:
@@ -188,7 +189,11 @@ To use publisher APIs:
 ```
 WILEY_TDM_API_TOKEN=your_wiley_token_here
 ELSEVIER_TDM_API_KEY=your_elsevier_key_here
+AWS_ACCESS_KEY_ID=your_aws_access_key_here
+AWS_SECRET_ACCESS_KEY=your_aws_secret_key_here
 ```
+NOTE: The AWS keys can be created in the IAM section of your AWS account. When creating the key, make sure you attach the `AmazonS3ReadOnlyAccess` permission policy.
+NOTE: If you name the file `.env`, it will be loaded automatically, provided it is located in the current working directory or in any parent directory up to your home directory.
 2. Pass the file path when calling retrieval functions:

{paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/__init__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 """Initialize the module."""
 __name__ = "paperscraper"
-__version__ = "0.3.0"
+__version__ = "0.3.1"
 import logging
 import os
@@ -36,7 +36,7 @@ def dump_queries(keywords: List[List[Union[str, List[str]]]], dump_root: str) ->
     for idx, keyword in enumerate(keywords):
         for db, f in QUERY_FN_DICT.items():
-            logger.info(f" Keyword {idx+1}/{len(keywords)}, DB: {db}")
+            logger.info(f" Keyword {idx + 1}/{len(keywords)}, DB: {db}")
             filename = get_filename_from_query(keyword)
             os.makedirs(os.path.join(dump_root, db), exist_ok=True)
             f(keyword, output_filepath=os.path.join(dump_root, db, filename))

paperscraper-0.3.1/paperscraper/async_utils.py ADDED Viewed

@@ -0,0 +1,88 @@
+import asyncio
+import logging
+import sys
+import threading
+from functools import wraps
+from typing import Any, Awaitable, Callable, TypeVar, Union
+import httpx
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+T = TypeVar("T")
+F = TypeVar("F", bound=Callable[..., Awaitable[Any]])
+def _start_bg_loop(loop: asyncio.AbstractEventLoop):
+    """Install `loop` as the event loop of the calling thread and run it until stopped."""
+    asyncio.set_event_loop(loop)
+    loop.run_forever()
+# Start one background loop in its own daemon thread
+# NOTE: this is a module-level side effect, i.e. the thread is started as soon
+# as this module is imported. Because the thread is a daemon, it does not keep
+# the interpreter alive at shutdown.
+_background_loop = asyncio.new_event_loop()
+threading.Thread(target=_start_bg_loop, args=(_background_loop,), daemon=True).start()
+def optional_async(
+    func: Callable[..., Awaitable[T]],
+) -> Callable[..., Union[T, Awaitable[T]]]:
+    """
+    Allows an async function to be called from sync code (blocks until done)
+    or from within an async context (returns a coroutine to await).
+    """
+    @wraps(func)
+    def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]:
+        coro = func(*args, **kwargs)
+        try:
+            # If we're already in an asyncio loop, hand back the coroutine:
+            asyncio.get_running_loop()
+            return coro  # caller must await it
+        except RuntimeError:
+            # Otherwise, schedule on the background loop and block
+            future = asyncio.run_coroutine_threadsafe(coro, _background_loop)
+            return future.result()
+    return wrapper
+def retry_with_exponential_backoff(
+    *, max_retries: int = 5, base_delay: float = 1.0
+) -> Callable[[F], F]:
+    """
+    Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
+    Args:
+        max_retries: how many times to retry before giving up.
+        base_delay: initial delay in seconds; next delays will be duplication of previous.
+    Usage:
+        @retry_with_exponential_backoff(max_retries=3, base_delay=0.5)
+        async def fetch_data(...):
+            ...
+    """
+    def decorator(func: F) -> F:
+        @wraps(func)
+        async def wrapper(*args, **kwargs) -> Any:
+            delay = base_delay
+            for attempt in range(max_retries):
+                try:
+                    return await func(*args, **kwargs)
+                except httpx.HTTPStatusError as e:
+                    # only retry on 429
+                    status = e.response.status_code if e.response is not None else None
+                    if status != 429 or attempt == max_retries - 1:
+                        raise
+                # backoff
+                await asyncio.sleep(delay)
+                delay *= 2
+            # in theory we never reach here
+        return wrapper
+    return decorator

paperscraper-0.3.1/paperscraper/citations/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .citations import get_citations_by_doi, get_citations_from_title  # noqa
+from .core import SelfLinkClient  # noqa
+from .self_citations import self_citations_paper  # noqa
+from .self_references import self_references_paper  # noqa

{paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/citations/core.py RENAMED Viewed

@@ -1,15 +1,8 @@
-import asyncio
 import logging
-import re
 import sys
-from typing import Dict, Iterable, Literal, Union
+from typing import Literal
-import httpx
-from semanticscholar import SemanticScholar
-from ..utils import optional_async
 from .entity import Paper, Researcher
-from .utils import check_overlap, doi_pattern
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -20,6 +13,9 @@ ModeType = Literal[tuple(MODES := ("paper", "author"))]
 class SelfLinkClient:
     def __init__(self, entity: str, mode: ModeType = "paper") -> None:
+        self.mode = mode.lower()
+        if self.mode not in MODES:
+            raise ValueError(f"Unknown mode `{self.mode}`, chose from {MODES}")
         if self.mode == "paper":
             self.object = Paper(entity)

paperscraper-0.3.1/paperscraper/citations/entity/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .paper import Paper, PaperResult # noqa
2	+ from .researcher import Researcher, ResearcherResult # noqa

paperscraper-0.3.1/paperscraper/citations/entity/paper.py ADDED Viewed

@@ -0,0 +1,100 @@
+import logging
+import sys
+from typing import List, Literal, Optional
+from ..self_citations import CitationResult, self_citations_paper
+from ..self_references import ReferenceResult, self_references_paper
+from ..utils import (
+    determine_paper_input_type,
+    get_doi_from_ssid,
+    get_doi_from_title,
+    get_title_and_id_from_doi,
+)
+from .core import Entity
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+class PaperResult(ReferenceResult, CitationResult):
+    # Combined per-paper result: inherits the fields of both `ReferenceResult`
+    # and `CitationResult` and adds the paper title.
+    title: str
+ModeType = Literal[tuple(MODES := ("doi", "title", "ss_id", "infer"))]
+BASE_URL: str = "https://api.semanticscholar.org/graph/v1/paper/search"
+class Paper(Entity):
+    title: str = ""
+    doi: str = ""
+    authors: List[str] = []
+    def __init__(self, input: str, mode: ModeType = "infer"):
+        """
+        Set up a Paper object for analysis.
+        Args:
+            input: Paper identifier. This can be the title, DOI or semantic scholar ID
+                of the paper.
+            mode: The format in which the ID was provided. Defaults to "infer".
+        Raises:
+            ValueError: If unknown mode is given.
+        """
+        if mode not in MODES:
+            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")
+        input = input.strip()
+        self.input = input
+        if mode == "infer":
+            mode = determine_paper_input_type(input)
+        if mode == "doi":
+            self.doi = input
+        elif mode == "title":
+            self.doi = get_doi_from_title(input)
+        elif mode == "ssid":
+            self.doi = get_doi_from_ssid(input)
+        if self.doi is not None:
+            out = get_title_and_id_from_doi(self.doi)
+            if out is not None:
+                self.title = out["title"]
+                self.ssid = out["ssid"]
+    def self_references(self):
+        """
+        Extracts the self references of a paper, for each author.
+        """
+        if isinstance(self.doi, str):
+            self.ref_result: ReferenceResult = self_references_paper(self.doi)
+    def self_citations(self):
+        """
+        Extracts the self citations of a paper, for each author.
+        """
+        if isinstance(self.doi, str):
+            self.citation_result: CitationResult = self_citations_paper(self.doi)
+    def get_result(self) -> Optional[PaperResult]:
+        """
+        Provides the result of the analysis.
+        Returns: PaperResult if available.
+        """
+        if not hasattr(self, "ref_result"):
+            logger.warning(
+                f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
+            )
+            return
+        elif not hasattr(self, "citation_result"):
+            logger.warning(
+                f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
+            )
+            return
+        ref_result = self.ref_result.model_dump()
+        ref_result.pop("ssid", None)
+        return PaperResult(
+            title=self.title, **ref_result, **self.citation_result.model_dump()
+        )

paperscraper-0.3.1/paperscraper/citations/entity/researcher.py ADDED Viewed

@@ -0,0 +1,90 @@
+from typing import List, Literal, Optional
+from semanticscholar import SemanticScholar
+from tqdm import tqdm
+from ..orcid import orcid_to_author_name
+from ..self_references import ReferenceResult
+from ..utils import author_name_to_ssaid, get_papers_for_author
+from .core import Entity, EntityResult
+class ResearcherResult(EntityResult):
+    # Result container for a researcher-level analysis.
+    name: str
+    # presumably the Semantic Scholar author ID - TODO confirm
+    ssid: int
+    # ORCID iD of the researcher, if known
+    orcid: Optional[str] = None
+    # TODO: the ratios will be averaged across all papers for that author
+ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
+sch = SemanticScholar()
+class Researcher(Entity):
+    name: str
+    ssid: int
+    orcid: Optional[str] = None
+    def __init__(self, input: str, mode: ModeType = "infer"):
+        """
+        Construct researcher object for self citation/reference analysis.
+        Args:
+            input: A researcher to search for.
+            mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
+                Defaults to "infer".
+        Raises:
+            ValueError: Unknown mode
+        """
+        if mode not in MODES:
+            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")
+        input = input.strip()
+        if mode == "infer":
+            if input.isdigit():
+                mode = "ssaid"
+            elif (
+                input.count("-") == 3
+                and len(input) == 19
+                and all([x.isdigit() for x in input.split("-")])
+            ):
+                mode = "orcid"
+            else:
+                mode = "author"
+        if mode == "ssaid":
+            self.author = sch.get_author(input)
+            self.ssid = input
+        elif mode == "orcid":
+            self.author = orcid_to_author_name(input)
+            self.orcid = input
+            self.ssid = author_name_to_ssaid(input)
+        elif mode == "author":
+            self.author = input
+            self.ssid = author_name_to_ssaid(input)
+        # TODO: Skip over erratum / corrigendum
+        self.ssids = get_papers_for_author(self.ssid)
+    def self_references(self):
+        """
+        Sifts through all papers of a researcher and extracts the self references.
+        """
+        # TODO: Asynchronous call to self_references
+        print("Going through SSIDs", self.ssids)
+        # TODO: Aggregate results
+    def self_citations(self):
+        """
+        Sifts through all papers of a researcher and finds how often they are self-cited.
+        """
+        ...
+    def get_result(self) -> ResearcherResult:
+        """
+        Provides the result of the analysis.
+        """
+        ...

paperscraper-0.3.1/paperscraper/citations/orcid.py ADDED Viewed

@@ -0,0 +1,29 @@
+import logging
+import sys
+from typing import Optional
+import requests
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+BASE_URL = "https://pub.orcid.org/v3.0/"
+def orcid_to_author_name(orcid_id: str) -> Optional[str]:
+    """
+    Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'),
+    returns the full name of the author from the ORCID public API.
+    """
+    headers = {"Accept": "application/json"}
+    response = requests.get(f"{BASE_URL}{orcid_id}/person", headers=headers)
+    if response.status_code == 200:
+        data = response.json()
+        given = data.get("name", {}).get("given-names", {}).get("value", "")
+        family = data.get("name", {}).get("family-name", {}).get("value", "")
+        full_name = f"{given} {family}".strip()
+        return full_name
+    logger.error(
+        f"Error fetching ORCID data ({orcid_id}): {response.status_code} {response.text}"
+    )

paperscraper-0.3.1/paperscraper/citations/self_citations.py ADDED Viewed

@@ -0,0 +1,126 @@
+import asyncio
+import logging
+import re
+import sys
+from typing import Any, Dict, List, Union
+import httpx
+import numpy as np
+from pydantic import BaseModel
+from ..async_utils import optional_async, retry_with_exponential_backoff
+from .utils import DOI_PATTERN, find_matching
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+class CitationResult(BaseModel):
+    # Self-citation statistics for a single paper.
+    ssid: str  # semantic scholar paper id
+    # number of citing papers that were inspected
+    num_citations: int
+    # author name -> percentage of citing papers that count as self-citations
+    self_citations: Dict[str, float] = {}
+    # mean of the per-author percentages
+    citation_score: float
+async def _fetch_citation_data(
+    client: httpx.AsyncClient, suffix: str
+) -> Dict[str, Any]:
+    """
+    Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
+    Args:
+        client: An active httpx.AsyncClient.
+        suffix: Prefixed identifier (e.g., "DOI:10.1000/xyz123" or SSID).
+    Returns:
+        The JSON-decoded response as a dictionary.
+    """
+    response = await client.get(
+        f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
+        params={"fields": "title,authors,citations.authors"},
+    )
+    response.raise_for_status()
+    return response.json()
+async def _process_single(client: httpx.AsyncClient, identifier: str) -> CitationResult:
+    """
+    Compute self-citation stats for a single paper.
+    Args:
+        client: An active httpx.AsyncClient.
+        identifier: A DOI or Semantic Scholar ID.
+    Returns:
+        A CitationResult containing counts and percentages of self-citations.
+    """
+    # Determine prefix for Semantic Scholar API
+    if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
+        prefix = ""
+    elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1:
+        prefix = "DOI:"
+    else:
+        prefix = ""
+    suffix = f"{prefix}{identifier}"
+    paper = await _fetch_citation_data(client, suffix)
+    # Initialize counters
+    author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
+    citations = paper.get("citations", [])
+    total_cites = len(citations)
+    # Tally self-citations
+    for cite in citations:
+        matched = find_matching(paper.get("authors", []), cite.get("authors", []))
+        for name in matched:
+            author_counts[name] += 1
+    # Compute percentages
+    ratios: Dict[str, float] = {
+        name: round((count / total_cites * 100), 2) if total_cites > 0 else 0.0
+        for name, count in author_counts.items()
+    }
+    avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)
+    return CitationResult(
+        ssid=identifier,
+        num_citations=total_cites,
+        self_citations=ratios,
+        citation_score=avg_score,
+    )
+@optional_async
+@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
+async def self_citations_paper(
+    inputs: Union[str, List[str]], verbose: bool = False
+) -> Union[CitationResult, List[CitationResult]]:
+    """
+    Analyze self-citations for one or more papers by DOI or Semantic Scholar ID.
+    Args:
+        inputs: A single DOI/SSID string or a list of them.
+        verbose: If True, logs detailed information for each paper.
+    Returns:
+        A single CitationResult if a string was passed, else a list of CitationResults.
+    """
+    single_input = isinstance(inputs, str)
+    identifiers = [inputs] if single_input else list(inputs)
+    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
+        tasks = [_process_single(client, ident) for ident in identifiers]
+        results = await asyncio.gather(*tasks)
+    if verbose:
+        for res in results:
+            logger.info(
+                f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
+            )
+            for author, pct in res.self_citations.items():
+                logger.info(f"  {author}: {pct}%")
+    return results[0] if single_input else results

paperscraper 0.3.0__tar.gz → 0.3.1__tar.gz

paperscraper 0.3.0tar.gz → 0.3.1tar.gz