PyPI - paperscraper - Versions diffs - 0.3.4__tar.gz → 0.3.6__tar.gz - Mend

paperscraper 0.3.4__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

{paperscraper-0.3.4 → paperscraper-0.3.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: paperscraper
-Version: 0.3.4
+Version: 0.3.6
 Summary: paperscraper: Package to scrape papers.
 Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
 License: MIT
@@ -19,13 +19,13 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.9
+Requires-Python: <3.14,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: arxiv>=1.4.7
-Requires-Dist: pymed-paperscraper>=1.0.4
+Requires-Dist: arxiv>=2.4.0
+Requires-Dist: pymed-paperscraper>=1.0.6
 Requires-Dist: pandas>=1.0.4
-Requires-Dist: requests==2.32.0
+Requires-Dist: requests>=2.32.2
 Requires-Dist: tqdm>=4.51.0
 Requires-Dist: scholarly>=1.0.0
 Requires-Dist: seaborn>=0.11.0
@@ -102,12 +102,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://
 ```py
 from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
-chemrxiv()  #  Takes 30min -> +30K papers (~50 MB file)
-medrxiv()  #  Takes <1h -> +90K papers (~200 MB file)
-biorxiv()  # Up to 6h -> +400K papers (~800 MB file)
+chemrxiv()  #  Takes <15min -> +50K papers (~30 MB file)
+medrxiv()  #  Takes <30min -> +100K papers (~200 MB file)
+biorxiv()  # Takes <3h -> +450K papers (~800 MB file)
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
-*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
+*NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:
+```py
+biorxiv(
+    max_retries=12,
+    request_timeout=(5.0, 45.0),      # connect timeout, read timeout
+    retry_backoff_seconds=1.0,        # initial retry backoff
+    max_workers=8,                    # number of parallel date windows
+    window_days=30,                   # smaller windows increase parallelism
+)
+```
 Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
 ```py

{paperscraper-0.3.4 → paperscraper-0.3.6}/README.md RENAMED Viewed

@@ -57,12 +57,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://
 ```py
 from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
-chemrxiv()  #  Takes 30min -> +30K papers (~50 MB file)
-medrxiv()  #  Takes <1h -> +90K papers (~200 MB file)
-biorxiv()  # Up to 6h -> +400K papers (~800 MB file)
+chemrxiv()  #  Takes <15min -> +50K papers (~30 MB file)
+medrxiv()  #  Takes <30min -> +100K papers (~200 MB file)
+biorxiv()  # Takes <3h -> +450K papers (~800 MB file)
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
-*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
+*NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:
+```py
+biorxiv(
+    max_retries=12,
+    request_timeout=(5.0, 45.0),      # connect timeout, read timeout
+    retry_backoff_seconds=1.0,        # initial retry backoff
+    max_workers=8,                    # number of parallel date windows
+    window_days=30,                   # smaller windows increase parallelism
+)
+```
 Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
 ```py

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/__init__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 """Initialize the module."""
 __name__ = "paperscraper"
-__version__ = "0.3.4"
+__version__ = "0.3.6"
 import logging
 import os

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/async_utils.py RENAMED Viewed

@@ -48,6 +48,16 @@ def optional_async(
     return wrapper
+def run_sync(coroutine: Awaitable[T]) -> T:
+    """
+    Run a coroutine on the background loop and block for the result.
+    This is safe to call from sync or async contexts, but will block the caller.
+    """
+    future = asyncio.run_coroutine_threadsafe(coroutine, _background_loop)
+    return future.result()
 def retry_with_exponential_backoff(
     *,
     max_retries: int = 5,
@@ -96,7 +106,7 @@ def retry_with_exponential_backoff(
                                 pass
                     delay *= factor
-                except httpx.ReadError as e:
+                except (httpx.ReadError, httpx.TimeoutException, httpx.TransportError) as e:
                     last_exception = e
                     sleep_for = delay
                     delay *= factor

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/researcher.py RENAMED Viewed

@@ -1,9 +1,9 @@
-import asyncio
 import os
 from typing import Any, List, Literal, Optional, Tuple
 from semanticscholar import SemanticScholar
+from ...async_utils import run_sync
 from ..orcid import orcid_to_author_name
 from ..self_citations import CitationResult, self_citations_paper
 from ..self_references import ReferenceResult, self_references_paper
@@ -128,7 +128,7 @@ class Researcher(Entity):
         Returns:
             A ResearcherResult containing aggregated self-reference data.
         """
-        reference_results = asyncio.run(self._self_references_async(verbose=verbose))
+        reference_results = run_sync(self._self_references_async(verbose=verbose))
         individual_self_references = {
             getattr(result, "title"): getattr(result, "self_references").get(
@@ -182,8 +182,11 @@ class Researcher(Entity):
     def self_citations(self, verbose: bool = False) -> ResearcherResult:
         """
         Sifts through all papers of a researcher and finds how often they are self-cited.
+        Args:
+            verbose: If True, logs detailed information for each paper.
         """
-        citation_results = asyncio.run(self._self_citations_async(verbose=verbose))
+        citation_results = run_sync(self._self_citations_async(verbose=verbose))
         individual_self_citations = {
             getattr(result, "title"): getattr(result, "self_citations").get(
                 self.name, 0.0
@@ -214,8 +217,8 @@ class Researcher(Entity):
         """
         Provides the result of the analysis.
         """
-        if not hasattr(self, "self_ref"):
+        if getattr(self.result, "num_references", -1) < 0:
             self.self_references()
-        if not hasattr(self, "self_cite"):
+        if getattr(self.result, "num_citations", -1) < 0:
             self.self_citations()
         return self.result

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/self_citations.py RENAMED Viewed

@@ -7,9 +7,18 @@ from typing import Any, Dict, List, Union
 import httpx
 import numpy as np
 from pydantic import BaseModel
+from tqdm import tqdm
 from ..async_utils import optional_async, retry_with_exponential_backoff
-from .utils import DOI_PATTERN, find_matching
+from .utils import (
+    DOI_PATTERN,
+    HEADERS,
+    HTTPX_LIMITS,
+    REQUEST_SEMAPHORE,
+    REQUEST_TIMEOUT_SECONDS,
+    find_matching,
+    wait_for_request_slot,
+)
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -30,6 +39,7 @@ async def _fetch_citation_data(
 ) -> Dict[str, Any]:
     """
     Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
+    Respects rate limiting to avoid exceeding API limits.
     Args:
         client: An active httpx.AsyncClient.
@@ -38,9 +48,12 @@ async def _fetch_citation_data(
     Returns:
         The JSON-decoded response as a dictionary.
     """
+    await wait_for_request_slot()
     response = await client.get(
         f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
         params={"fields": "title,authors,citations.authors"},
+        headers=HEADERS,
     )
     response.raise_for_status()
     return response.json()
@@ -57,43 +70,44 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
     Returns:
         A CitationResult containing counts and percentages of self-citations.
     """
-    # Determine prefix for Semantic Scholar API
-    if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
-        prefix = ""
-    elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1:
-        prefix = "DOI:"
-    else:
-        prefix = ""
-    suffix = f"{prefix}{identifier}"
-    paper = await _fetch_citation_data(client, suffix)
-    # Initialize counters
-    author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
-    citations = paper.get("citations", [])
-    total_cites = len(citations)
-    # Tally self-citations
-    for cite in citations:
-        matched = find_matching(paper.get("authors", []), cite.get("authors", []))
-        for name in matched:
-            author_counts[name] += 1
-    # Compute percentages
-    ratios: Dict[str, float] = {
-        name: round((count / total_cites * 100), 2) if total_cites > 0 else 0.0
-        for name, count in author_counts.items()
-    }
-    avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)
-    return CitationResult(
-        ssid=identifier,
-        title=paper.get("title", ""),
-        num_citations=total_cites,
-        self_citations=ratios,
-        citation_score=avg_score,
-    )
+    async with REQUEST_SEMAPHORE:
+        # Determine prefix for Semantic Scholar API
+        if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
+            prefix = ""
+        elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1:
+            prefix = "DOI:"
+        else:
+            prefix = ""
+        suffix = f"{prefix}{identifier}"
+        paper = await _fetch_citation_data(client, suffix)
+        # Initialize counters
+        author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
+        citations = paper.get("citations", [])
+        total_cites = len(citations)
+        # Tally self-citations
+        for cite in citations:
+            matched = find_matching(paper.get("authors", []), cite.get("authors", []))
+            for name in matched:
+                author_counts[name] += 1
+        # Compute percentages
+        ratios: Dict[str, float] = {
+            name: round((count / total_cites * 100), 2) if total_cites > 0 else 0.0
+            for name, count in author_counts.items()
+        }
+        avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)
+        return CitationResult(
+            ssid=identifier,
+            title=paper.get("title", ""),
+            num_citations=total_cites,
+            self_citations=ratios,
+            citation_score=avg_score,
+        )
 @optional_async
@@ -114,9 +128,26 @@ async def self_citations_paper(
     single_input = isinstance(inputs, str)
     identifiers = [inputs] if single_input else list(inputs)
-    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
+    results: List[CitationResult] = []
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
+    ) as client:
         tasks = [_process_single(client, ident) for ident in identifiers]
-        results = await asyncio.gather(*tasks)
+        iterator = tqdm(
+            asyncio.as_completed(tasks),
+            total=len(tasks),
+            desc="Collecting self-citations",
+        )
+        for coro in iterator:
+            try:
+                res = await coro
+            except Exception as exc:
+                logger.warning(f"Self-citation fetch failed: {exc}")
+                continue
+            results.append(res)
     if verbose:
         for res in results:
@@ -126,4 +157,8 @@ async def self_citations_paper(
             for author, pct in res.self_citations.items():
                 logger.info(f"  {author}: {pct}%")
-    return results[0] if single_input else results
+    if single_input:
+        if not results:
+            raise RuntimeError("Failed to fetch self-citations for input.")
+        return results[0]
+    return results

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/self_references.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import asyncio
 import logging
-import os
 import re
 import sys
 from typing import Any, Dict, List, Literal, Union
@@ -11,7 +10,15 @@ from pydantic import BaseModel
 from tqdm import tqdm
 from ..async_utils import optional_async, retry_with_exponential_backoff
-from .utils import DOI_PATTERN, find_matching
+from .utils import (
+    DOI_PATTERN,
+    HEADERS,
+    HTTPX_LIMITS,
+    REQUEST_SEMAPHORE,
+    REQUEST_TIMEOUT_SECONDS,
+    find_matching,
+    wait_for_request_slot,
+)
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -19,15 +26,6 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
 ModeType = Literal[tuple(MODES := ("doi", "infer", "ssid"))]
-SS_API_KEY = os.getenv("SS_API_KEY")
-HEADERS: Dict[str, str] = {}
-if SS_API_KEY:
-    HEADERS["x-api-key"] = SS_API_KEY
-CONCURRENCY_LIMIT = 10
-_SEM = asyncio.Semaphore(CONCURRENCY_LIMIT)
 class ReferenceResult(BaseModel):
     ssid: str  # semantic scholar paper id
     title: str
@@ -42,6 +40,7 @@ async def _fetch_paper_with_references(
 ) -> Dict[str, Any]:
     """
     Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
+    Respects rate limiting to avoid exceeding API limits.
     Args:
         client: An active httpx.AsyncClient.
@@ -50,6 +49,8 @@ async def _fetch_paper_with_references(
     Returns:
         The JSON-decoded response as a dictionary.
     """
+    await wait_for_request_slot()
     response = await client.get(
         f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
         params={"fields": "title,authors,references.authors"},
@@ -72,7 +73,7 @@ async def _process_single_reference(
     Returns:
         A ReferenceResult containing counts and percentages of self-references.
     """
-    async with _SEM:
+    async with REQUEST_SEMAPHORE:
         # Determine prefix for API
         if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
             prefix = ""
@@ -134,18 +135,24 @@ async def self_references_paper(
     single_input = isinstance(inputs, str)
     identifiers = [inputs] if single_input else list(inputs)
-    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
+    ) as client:
         tasks = [_process_single_reference(client, ident) for ident in identifiers]
         results: List[ReferenceResult] = []
-        iterator = asyncio.as_completed(tasks)
-        if verbose:
-            iterator = tqdm(
-                iterator, total=len(tasks), desc="Collecting self-references"
-            )
+        iterator = tqdm(
+            asyncio.as_completed(tasks),
+            total=len(tasks),
+            desc="Collecting self-references",
+        )
         for coro in iterator:
-            res = await coro
+            try:
+                res = await coro
+            except Exception as exc:
+                logger.warning(f"Self-reference fetch failed: {exc}")
+                continue
             results.append(res)
     if verbose:
@@ -157,4 +164,8 @@ async def self_references_paper(
             for author, pct in res.self_references.items():
                 logger.info(f"  {author}: {pct}% self-references")
-    return results[0] if single_input else results
+    if single_input:
+        if not results:
+            raise RuntimeError("Failed to fetch self-references for input.")
+        return results[0]
+    return results

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_citations.py RENAMED Viewed

@@ -64,12 +64,15 @@ class TestSelfCitations:
             f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
         )
-        assert 0.1 * async_duration <= sync_duration, (
+        assert async_duration*0.8 <= sync_duration, (
             f"Async execution ({async_duration:.2f}s) is slower than sync execution "
             f"({sync_duration:.2f}s)"
         )
-        for a, s in zip(result, sync_result):
+        for a, s in zip(
+            sorted(result, key=lambda r: r.ssid),
+            sorted(sync_result, key=lambda r: r.ssid),
+        ):
             assert a == s, f"{a} vs {s}"
     def test_researcher(self):

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_references.py RENAMED Viewed

@@ -49,33 +49,6 @@ class TestSelfReferences:
                 assert isinstance(self_cites, float)
                 assert self_cites >= 0 and self_cites <= 100
-    def test_compare_async_and_sync_performance(self, dois):
-        """
-        Compares the execution time of asynchronous and synchronous `self_references`
-        for a list of DOIs.
-        """
-        start_time = time.perf_counter()
-        async_results = self_references_paper(dois)
-        async_duration = time.perf_counter() - start_time
-        # Measure synchronous execution time (three independent calls)
-        start_time = time.perf_counter()
-        sync_results = [self_references_paper(doi) for doi in dois]
-        sync_duration = time.perf_counter() - start_time
-        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
-        print(
-            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
-        )
-        assert len(sync_results) == len(async_results)
-        assert 0.5 * async_duration <= sync_duration, (
-            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
-            f"({sync_duration:.2f}s)"
-        )
     def test_researcher(self):
         """
         Tests calculation of self-references for all papers of an author.

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/utils.py RENAMED Viewed

@@ -1,8 +1,10 @@
+import asyncio
 import logging
 import os
 import re
 import sys
-from typing import Any, Dict, List, Literal, Optional, Tuple
+import time
+from typing import Dict, List, Literal, Optional, Tuple
 import httpx
 import requests
@@ -15,6 +17,11 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger(__name__)
 logging.getLogger("httpx").setLevel(logging.WARNING)
+REQUEST_TIMEOUT_SECONDS = float(os.getenv("SS_REQUEST_TIMEOUT", "20"))
+CONCURRENCY_LIMIT = max(1, int(os.getenv("SS_CONCURRENCY_LIMIT", "1")))
+# Minimum delay between outbound requests to Semantic Scholar.
+RATE_LIMIT_DELAY = max(0.0, float(os.getenv("SS_RATE_LIMIT_DELAY", "1.1")))
 DOI_PATTERN = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
 PAPER_URL: str = "https://api.semanticscholar.org/graph/v1/paper/"
 AUTHOR_URL: str = "https://api.semanticscholar.org/graph/v1/author/search"
@@ -25,6 +32,30 @@ HEADERS: Dict[str, str] = {}
 if SS_API_KEY:
     HEADERS["x-api-key"] = SS_API_KEY
+HTTPX_LIMITS = httpx.Limits(
+    max_connections=CONCURRENCY_LIMIT, max_keepalive_connections=CONCURRENCY_LIMIT
+)
+REQUEST_SEMAPHORE = asyncio.Semaphore(CONCURRENCY_LIMIT)
+_REQUEST_SCHEDULER_LOCK = asyncio.Lock()
+_NEXT_REQUEST_TIME = 0.0
+async def wait_for_request_slot() -> None:
+    """
+    Enforces global pacing between Semantic Scholar requests.
+    Uses a shared scheduler to avoid bursts across modules.
+    """
+    global _NEXT_REQUEST_TIME
+    async with _REQUEST_SCHEDULER_LOCK:
+        now = time.monotonic()
+        scheduled = max(_NEXT_REQUEST_TIME, now)
+        _NEXT_REQUEST_TIME = scheduled + RATE_LIMIT_DELAY
+    delay = scheduled - now
+    if delay > 0:
+        await asyncio.sleep(delay)
 def get_doi_from_title(title: str) -> Optional[str]:
     """
@@ -62,7 +93,9 @@ async def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]:
     Returns:
       str or None: The DOI of the paper, or None if not found or in case of an error.
     """
-    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
+    ) as client:
         logger.warning(
             "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput."
         )
@@ -99,7 +132,9 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
     Returns:
         dict or None: A dictionary with keys 'title' and 'ssid'.
     """
-    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
+    ) as client:
         # Send the GET request to Semantic Scholar
         response = await client.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS)
         if response.status_code == 200:
@@ -115,6 +150,7 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
 async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
     """
     Given an author name, returns the Semantic Scholar author ID.
+    Respects rate limiting to avoid exceeding API limits.
     Parameters:
         author_name (str): The full name of the author.
@@ -123,7 +159,11 @@ async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
         Tuple[str, str] or None: The SS author ID alongside the SS name (may differ
             slightly from input name) or None if no author is found.
     """
-    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
+    ) as client:
+        await wait_for_request_slot()
         response = await client.get(
             AUTHOR_URL,
             params={"query": author_name, "fields": "name", "limit": 1},
@@ -139,7 +179,7 @@ async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
         logger.error(
             f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}"
         )
-        return ('-1', 'N.A.')
+        return ("-1", "N.A.")
 def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
@@ -164,6 +204,7 @@ def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
     return mode
+@optional_async
 @retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
 async def get_papers_for_author(ss_author_id: str) -> List[str]:
     """
@@ -179,7 +220,9 @@ async def get_papers_for_author(ss_author_id: str) -> List[str]:
     offset = 0
     limit = 100
-    async with httpx.AsyncClient() as client:
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
+    ) as client:
         while True:
             response = await client.get(
                 f"https://api.semanticscholar.org/graph/v1/author/{ss_author_id}/papers",

{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/biorxiv.py RENAMED Viewed

@@ -1,11 +1,8 @@
 """Dump bioRxiv data in JSONL format."""
-import json
 import os
 from datetime import datetime
-from typing import Optional
-from tqdm import tqdm
+from typing import Optional, Tuple
 from ..utils import get_server_dumps_dir
 from ..xrxiv.xrxiv_api import BioRxivApi
@@ -22,6 +19,10 @@ def biorxiv(
     end_date: Optional[str] = None,
     save_path: str = save_path,
     max_retries: int = 10,
+    request_timeout: Tuple[float, float] = (5.0, 30.0),
+    retry_backoff_seconds: float = 1.0,
+    window_days: int = 30,
+    max_workers: int = 8,
 ):
     """Fetches papers from biorxiv based on time range, i.e., start_date and end_date.
     If the start_date and end_date are not provided, papers will be fetched from biorxiv
@@ -37,15 +38,28 @@ def biorxiv(
             Defaults to save_path.
         max_retries (int, optional): Number of retries when API shows connection issues.
             Defaults to 10.
+        request_timeout (Tuple[float, float], optional): (connect timeout, read timeout).
+            Defaults to (5.0, 30.0).
+        retry_backoff_seconds (float, optional): Initial retry backoff.
+            Defaults to 1.0.
+        window_days (int, optional): Date-window size used for pagination.
+            Defaults to 30.
+        max_workers (int, optional): Number of parallel workers over date windows.
+            Defaults to 8.
     """
-    # create API client
-    api = BioRxivApi(max_retries=max_retries)
-    # dump all papers
-    with open(save_path, "w") as fp:
-        for index, paper in enumerate(
-            tqdm(api.get_papers(start_date=start_date, end_date=end_date))
-        ):
-            if index > 0:
-                fp.write(os.linesep)
-            fp.write(json.dumps(paper))
+    api = BioRxivApi(
+        max_retries=max_retries,
+        request_timeout=request_timeout,
+        retry_backoff_seconds=retry_backoff_seconds,
+        window_days=max(1, int(window_days)),
+    )
+    api.dump_papers(
+        save_path=save_path,
+        start_date=start_date,
+        end_date=end_date,
+        max_retries=max_retries,
+        max_workers=max_workers,
+        window_days=window_days,
+        deduplicate_dois=False,
+        show_progress=True,
+    )

paperscraper 0.3.4__tar.gz → 0.3.6__tar.gz

paperscraper 0.3.4__tar.gz → 0.3.6__tar.gz