paperscraper 0.3.4__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. {paperscraper-0.3.4 → paperscraper-0.3.6}/PKG-INFO +19 -9
  2. {paperscraper-0.3.4 → paperscraper-0.3.6}/README.md +14 -4
  3. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/__init__.py +1 -1
  4. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/async_utils.py +11 -1
  5. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/researcher.py +8 -5
  6. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/self_citations.py +76 -41
  7. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/self_references.py +31 -20
  8. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_citations.py +5 -2
  9. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_references.py +0 -27
  10. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/utils.py +49 -6
  11. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/biorxiv.py +29 -15
  12. paperscraper-0.3.6/paperscraper/get_dumps/chemrxiv.py +62 -0
  13. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/medrxiv.py +29 -14
  14. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +3 -27
  15. paperscraper-0.3.6/paperscraper/get_dumps/utils/chemrxiv/crossref_api.py +226 -0
  16. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/chemrxiv/utils.py +74 -0
  17. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/load_dumps.py +5 -1
  18. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/pdf.py +31 -4
  19. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/test_dump.py +91 -36
  20. paperscraper-0.3.6/paperscraper/xrxiv/xrxiv_api.py +576 -0
  21. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/PKG-INFO +19 -9
  22. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/SOURCES.txt +1 -0
  23. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/requires.txt +3 -3
  24. {paperscraper-0.3.4 → paperscraper-0.3.6}/pyproject.toml +4 -4
  25. paperscraper-0.3.4/paperscraper/get_dumps/chemrxiv.py +0 -44
  26. paperscraper-0.3.4/paperscraper/xrxiv/xrxiv_api.py +0 -174
  27. {paperscraper-0.3.4 → paperscraper-0.3.6}/LICENSE +0 -0
  28. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/arxiv/__init__.py +0 -0
  29. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/arxiv/arxiv.py +0 -0
  30. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/arxiv/utils.py +0 -0
  31. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/__init__.py +0 -0
  32. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/citations.py +0 -0
  33. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/core.py +0 -0
  34. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/__init__.py +0 -0
  35. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/core.py +0 -0
  36. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/paper.py +0 -0
  37. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/orcid.py +0 -0
  38. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/__init__.py +0 -0
  39. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_citations.py +0 -0
  40. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_paper.py +0 -0
  41. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/__init__.py +0 -0
  42. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/arxiv.py +0 -0
  43. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/__init__.py +0 -0
  44. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  45. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/impact.py +0 -0
  46. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/__init__.py +0 -0
  47. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/fallbacks.py +0 -0
  48. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/utils.py +0 -0
  49. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/plotting.py +0 -0
  50. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/postprocessing.py +0 -0
  51. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/__init__.py +0 -0
  52. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/pubmed.py +0 -0
  53. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/tests/__init__.py +0 -0
  54. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
  55. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/utils.py +0 -0
  56. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/__init__.py +0 -0
  57. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/core.py +0 -0
  58. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/scholar.py +0 -0
  59. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/tests/__init__.py +0 -0
  60. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/tests/test_scholar.py +0 -0
  61. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/server_dumps/__init__.py +0 -0
  62. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/__init__.py +0 -0
  63. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/test_impactor.py +0 -0
  64. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/test_pdf.py +0 -0
  65. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/utils.py +0 -0
  66. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/__init__.py +0 -0
  67. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/tests/__init__.py +0 -0
  68. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
  69. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  70. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/dependency_links.txt +0 -0
  71. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/not-zip-safe +0 -0
  72. {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/top_level.txt +0 -0
  73. {paperscraper-0.3.4 → paperscraper-0.3.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.3.4
3
+ Version: 0.3.6
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
6
6
  License: MIT
@@ -19,13 +19,13 @@ Classifier: Programming Language :: Python :: 3.11
19
19
  Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3.13
21
21
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
- Requires-Python: >=3.9
22
+ Requires-Python: <3.14,>=3.9
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
- Requires-Dist: arxiv>=1.4.7
26
- Requires-Dist: pymed-paperscraper>=1.0.4
25
+ Requires-Dist: arxiv>=2.4.0
26
+ Requires-Dist: pymed-paperscraper>=1.0.6
27
27
  Requires-Dist: pandas>=1.0.4
28
- Requires-Dist: requests==2.32.0
28
+ Requires-Dist: requests>=2.32.2
29
29
  Requires-Dist: tqdm>=4.51.0
30
30
  Requires-Dist: scholarly>=1.0.0
31
31
  Requires-Dist: seaborn>=0.11.0
@@ -102,12 +102,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://
102
102
 
103
103
  ```py
104
104
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
105
- chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
106
- medrxiv() # Takes <1h -> +90K papers (~200 MB file)
107
- biorxiv() # Up to 6h -> +400K papers (~800 MB file)
105
+ chemrxiv() # Takes <15min -> +50K papers (~30 MB file)
106
+ medrxiv() # Takes <30min -> +100K papers (~200 MB file)
107
+ biorxiv() # Takes <3h -> +450K papers (~800 MB file)
108
108
  ```
109
109
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
110
- *NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
110
+ *NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:
111
+
112
+ ```py
113
+ biorxiv(
114
+ max_retries=12,
115
+ request_timeout=(5.0, 45.0), # connect timeout, read timeout
116
+ retry_backoff_seconds=1.0, # initial retry backoff
117
+ max_workers=8, # number of parallel date windows
118
+ window_days=30, # smaller windows increase parallelism
119
+ )
120
+ ```
111
121
 
112
122
  Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
113
123
  ```py
@@ -57,12 +57,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://
57
57
 
58
58
  ```py
59
59
  from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
60
- chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
61
- medrxiv() # Takes <1h -> +90K papers (~200 MB file)
62
- biorxiv() # Up to 6h -> +400K papers (~800 MB file)
60
+ chemrxiv() # Takes <15min -> +50K papers (~30 MB file)
61
+ medrxiv() # Takes <30min -> +100K papers (~200 MB file)
62
+ biorxiv() # Takes <3h -> +450K papers (~800 MB file)
63
63
  ```
64
64
  *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
65
- *NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
65
+ *NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:
66
+
67
+ ```py
68
+ biorxiv(
69
+ max_retries=12,
70
+ request_timeout=(5.0, 45.0), # connect timeout, read timeout
71
+ retry_backoff_seconds=1.0, # initial retry backoff
72
+ max_workers=8, # number of parallel date windows
73
+ window_days=30, # smaller windows increase parallelism
74
+ )
75
+ ```
66
76
 
67
77
  Since v0.2.5 `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
68
78
  ```py
@@ -1,7 +1,7 @@
1
1
  """Initialize the module."""
2
2
 
3
3
  __name__ = "paperscraper"
4
- __version__ = "0.3.4"
4
+ __version__ = "0.3.6"
5
5
 
6
6
  import logging
7
7
  import os
@@ -48,6 +48,16 @@ def optional_async(
48
48
  return wrapper
49
49
 
50
50
 
51
+ def run_sync(coroutine: Awaitable[T]) -> T:
52
+ """
53
+ Run a coroutine on the background loop and block for the result.
54
+
55
+ This is safe to call from sync or async contexts, but will block the caller.
56
+ """
57
+ future = asyncio.run_coroutine_threadsafe(coroutine, _background_loop)
58
+ return future.result()
59
+
60
+
51
61
  def retry_with_exponential_backoff(
52
62
  *,
53
63
  max_retries: int = 5,
@@ -96,7 +106,7 @@ def retry_with_exponential_backoff(
96
106
  pass
97
107
  delay *= factor
98
108
 
99
- except httpx.ReadError as e:
109
+ except (httpx.ReadError, httpx.TimeoutException, httpx.TransportError) as e:
100
110
  last_exception = e
101
111
  sleep_for = delay
102
112
  delay *= factor
@@ -1,9 +1,9 @@
1
- import asyncio
2
1
  import os
3
2
  from typing import Any, List, Literal, Optional, Tuple
4
3
 
5
4
  from semanticscholar import SemanticScholar
6
5
 
6
+ from ...async_utils import run_sync
7
7
  from ..orcid import orcid_to_author_name
8
8
  from ..self_citations import CitationResult, self_citations_paper
9
9
  from ..self_references import ReferenceResult, self_references_paper
@@ -128,7 +128,7 @@ class Researcher(Entity):
128
128
  Returns:
129
129
  A ResearcherResult containing aggregated self-reference data.
130
130
  """
131
- reference_results = asyncio.run(self._self_references_async(verbose=verbose))
131
+ reference_results = run_sync(self._self_references_async(verbose=verbose))
132
132
 
133
133
  individual_self_references = {
134
134
  getattr(result, "title"): getattr(result, "self_references").get(
@@ -182,8 +182,11 @@ class Researcher(Entity):
182
182
  def self_citations(self, verbose: bool = False) -> ResearcherResult:
183
183
  """
184
184
  Sifts through all papers of a researcher and finds how often they are self-cited.
185
+
186
+ Args:
187
+ verbose: If True, logs detailed information for each paper.
185
188
  """
186
- citation_results = asyncio.run(self._self_citations_async(verbose=verbose))
189
+ citation_results = run_sync(self._self_citations_async(verbose=verbose))
187
190
  individual_self_citations = {
188
191
  getattr(result, "title"): getattr(result, "self_citations").get(
189
192
  self.name, 0.0
@@ -214,8 +217,8 @@ class Researcher(Entity):
214
217
  """
215
218
  Provides the result of the analysis.
216
219
  """
217
- if not hasattr(self, "self_ref"):
220
+ if getattr(self.result, "num_references", -1) < 0:
218
221
  self.self_references()
219
- if not hasattr(self, "self_cite"):
222
+ if getattr(self.result, "num_citations", -1) < 0:
220
223
  self.self_citations()
221
224
  return self.result
@@ -7,9 +7,18 @@ from typing import Any, Dict, List, Union
7
7
  import httpx
8
8
  import numpy as np
9
9
  from pydantic import BaseModel
10
+ from tqdm import tqdm
10
11
 
11
12
  from ..async_utils import optional_async, retry_with_exponential_backoff
12
- from .utils import DOI_PATTERN, find_matching
13
+ from .utils import (
14
+ DOI_PATTERN,
15
+ HEADERS,
16
+ HTTPX_LIMITS,
17
+ REQUEST_SEMAPHORE,
18
+ REQUEST_TIMEOUT_SECONDS,
19
+ find_matching,
20
+ wait_for_request_slot,
21
+ )
13
22
 
14
23
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
15
24
  logger = logging.getLogger(__name__)
@@ -30,6 +39,7 @@ async def _fetch_citation_data(
30
39
  ) -> Dict[str, Any]:
31
40
  """
32
41
  Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
42
+ Respects rate limiting to avoid exceeding API limits.
33
43
 
34
44
  Args:
35
45
  client: An active httpx.AsyncClient.
@@ -38,9 +48,12 @@ async def _fetch_citation_data(
38
48
  Returns:
39
49
  The JSON-decoded response as a dictionary.
40
50
  """
51
+ await wait_for_request_slot()
52
+
41
53
  response = await client.get(
42
54
  f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
43
55
  params={"fields": "title,authors,citations.authors"},
56
+ headers=HEADERS,
44
57
  )
45
58
  response.raise_for_status()
46
59
  return response.json()
@@ -57,43 +70,44 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
57
70
  Returns:
58
71
  A CitationResult containing counts and percentages of self-citations.
59
72
  """
60
- # Determine prefix for Semantic Scholar API
61
- if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
62
- prefix = ""
63
- elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1:
64
- prefix = "DOI:"
65
- else:
66
- prefix = ""
67
-
68
- suffix = f"{prefix}{identifier}"
69
- paper = await _fetch_citation_data(client, suffix)
70
-
71
- # Initialize counters
72
- author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
73
- citations = paper.get("citations", [])
74
- total_cites = len(citations)
75
-
76
- # Tally self-citations
77
- for cite in citations:
78
- matched = find_matching(paper.get("authors", []), cite.get("authors", []))
79
- for name in matched:
80
- author_counts[name] += 1
81
-
82
- # Compute percentages
83
- ratios: Dict[str, float] = {
84
- name: round((count / total_cites * 100), 2) if total_cites > 0 else 0.0
85
- for name, count in author_counts.items()
86
- }
87
-
88
- avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)
89
-
90
- return CitationResult(
91
- ssid=identifier,
92
- title=paper.get("title", ""),
93
- num_citations=total_cites,
94
- self_citations=ratios,
95
- citation_score=avg_score,
96
- )
73
+ async with REQUEST_SEMAPHORE:
74
+ # Determine prefix for Semantic Scholar API
75
+ if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
76
+ prefix = ""
77
+ elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1:
78
+ prefix = "DOI:"
79
+ else:
80
+ prefix = ""
81
+
82
+ suffix = f"{prefix}{identifier}"
83
+ paper = await _fetch_citation_data(client, suffix)
84
+
85
+ # Initialize counters
86
+ author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
87
+ citations = paper.get("citations", [])
88
+ total_cites = len(citations)
89
+
90
+ # Tally self-citations
91
+ for cite in citations:
92
+ matched = find_matching(paper.get("authors", []), cite.get("authors", []))
93
+ for name in matched:
94
+ author_counts[name] += 1
95
+
96
+ # Compute percentages
97
+ ratios: Dict[str, float] = {
98
+ name: round((count / total_cites * 100), 2) if total_cites > 0 else 0.0
99
+ for name, count in author_counts.items()
100
+ }
101
+
102
+ avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)
103
+
104
+ return CitationResult(
105
+ ssid=identifier,
106
+ title=paper.get("title", ""),
107
+ num_citations=total_cites,
108
+ self_citations=ratios,
109
+ citation_score=avg_score,
110
+ )
97
111
 
98
112
 
99
113
  @optional_async
@@ -114,9 +128,26 @@ async def self_citations_paper(
114
128
  single_input = isinstance(inputs, str)
115
129
  identifiers = [inputs] if single_input else list(inputs)
116
130
 
117
- async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
131
+ results: List[CitationResult] = []
132
+
133
+ async with httpx.AsyncClient(
134
+ timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
135
+ ) as client:
118
136
  tasks = [_process_single(client, ident) for ident in identifiers]
119
- results = await asyncio.gather(*tasks)
137
+
138
+ iterator = tqdm(
139
+ asyncio.as_completed(tasks),
140
+ total=len(tasks),
141
+ desc="Collecting self-citations",
142
+ )
143
+
144
+ for coro in iterator:
145
+ try:
146
+ res = await coro
147
+ except Exception as exc:
148
+ logger.warning(f"Self-citation fetch failed: {exc}")
149
+ continue
150
+ results.append(res)
120
151
 
121
152
  if verbose:
122
153
  for res in results:
@@ -126,4 +157,8 @@ async def self_citations_paper(
126
157
  for author, pct in res.self_citations.items():
127
158
  logger.info(f" {author}: {pct}%")
128
159
 
129
- return results[0] if single_input else results
160
+ if single_input:
161
+ if not results:
162
+ raise RuntimeError("Failed to fetch self-citations for input.")
163
+ return results[0]
164
+ return results
@@ -1,6 +1,5 @@
1
1
  import asyncio
2
2
  import logging
3
- import os
4
3
  import re
5
4
  import sys
6
5
  from typing import Any, Dict, List, Literal, Union
@@ -11,7 +10,15 @@ from pydantic import BaseModel
11
10
  from tqdm import tqdm
12
11
 
13
12
  from ..async_utils import optional_async, retry_with_exponential_backoff
14
- from .utils import DOI_PATTERN, find_matching
13
+ from .utils import (
14
+ DOI_PATTERN,
15
+ HEADERS,
16
+ HTTPX_LIMITS,
17
+ REQUEST_SEMAPHORE,
18
+ REQUEST_TIMEOUT_SECONDS,
19
+ find_matching,
20
+ wait_for_request_slot,
21
+ )
15
22
 
16
23
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
17
24
  logger = logging.getLogger(__name__)
@@ -19,15 +26,6 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
19
26
  ModeType = Literal[tuple(MODES := ("doi", "infer", "ssid"))]
20
27
 
21
28
 
22
- SS_API_KEY = os.getenv("SS_API_KEY")
23
- HEADERS: Dict[str, str] = {}
24
- if SS_API_KEY:
25
- HEADERS["x-api-key"] = SS_API_KEY
26
-
27
- CONCURRENCY_LIMIT = 10
28
- _SEM = asyncio.Semaphore(CONCURRENCY_LIMIT)
29
-
30
-
31
29
  class ReferenceResult(BaseModel):
32
30
  ssid: str # semantic scholar paper id
33
31
  title: str
@@ -42,6 +40,7 @@ async def _fetch_paper_with_references(
42
40
  ) -> Dict[str, Any]:
43
41
  """
44
42
  Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
43
+ Respects rate limiting to avoid exceeding API limits.
45
44
 
46
45
  Args:
47
46
  client: An active httpx.AsyncClient.
@@ -50,6 +49,8 @@ async def _fetch_paper_with_references(
50
49
  Returns:
51
50
  The JSON-decoded response as a dictionary.
52
51
  """
52
+ await wait_for_request_slot()
53
+
53
54
  response = await client.get(
54
55
  f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
55
56
  params={"fields": "title,authors,references.authors"},
@@ -72,7 +73,7 @@ async def _process_single_reference(
72
73
  Returns:
73
74
  A ReferenceResult containing counts and percentages of self-references.
74
75
  """
75
- async with _SEM:
76
+ async with REQUEST_SEMAPHORE:
76
77
  # Determine prefix for API
77
78
  if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
78
79
  prefix = ""
@@ -134,18 +135,24 @@ async def self_references_paper(
134
135
  single_input = isinstance(inputs, str)
135
136
  identifiers = [inputs] if single_input else list(inputs)
136
137
 
137
- async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
138
+ async with httpx.AsyncClient(
139
+ timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
140
+ ) as client:
138
141
  tasks = [_process_single_reference(client, ident) for ident in identifiers]
139
142
  results: List[ReferenceResult] = []
140
143
 
141
- iterator = asyncio.as_completed(tasks)
142
- if verbose:
143
- iterator = tqdm(
144
- iterator, total=len(tasks), desc="Collecting self-references"
145
- )
144
+ iterator = tqdm(
145
+ asyncio.as_completed(tasks),
146
+ total=len(tasks),
147
+ desc="Collecting self-references",
148
+ )
146
149
 
147
150
  for coro in iterator:
148
- res = await coro
151
+ try:
152
+ res = await coro
153
+ except Exception as exc:
154
+ logger.warning(f"Self-reference fetch failed: {exc}")
155
+ continue
149
156
  results.append(res)
150
157
 
151
158
  if verbose:
@@ -157,4 +164,8 @@ async def self_references_paper(
157
164
  for author, pct in res.self_references.items():
158
165
  logger.info(f" {author}: {pct}% self-references")
159
166
 
160
- return results[0] if single_input else results
167
+ if single_input:
168
+ if not results:
169
+ raise RuntimeError("Failed to fetch self-references for input.")
170
+ return results[0]
171
+ return results
@@ -64,12 +64,15 @@ class TestSelfCitations:
64
64
  f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
65
65
  )
66
66
 
67
- assert 0.1 * async_duration <= sync_duration, (
67
+ assert async_duration*0.8 <= sync_duration, (
68
68
  f"Async execution ({async_duration:.2f}s) is slower than sync execution "
69
69
  f"({sync_duration:.2f}s)"
70
70
  )
71
71
 
72
- for a, s in zip(result, sync_result):
72
+ for a, s in zip(
73
+ sorted(result, key=lambda r: r.ssid),
74
+ sorted(sync_result, key=lambda r: r.ssid),
75
+ ):
73
76
  assert a == s, f"{a} vs {s}"
74
77
 
75
78
  def test_researcher(self):
@@ -49,33 +49,6 @@ class TestSelfReferences:
49
49
  assert isinstance(self_cites, float)
50
50
  assert self_cites >= 0 and self_cites <= 100
51
51
 
52
- def test_compare_async_and_sync_performance(self, dois):
53
- """
54
- Compares the execution time of asynchronous and synchronous `self_references`
55
- for a list of DOIs.
56
- """
57
-
58
- start_time = time.perf_counter()
59
- async_results = self_references_paper(dois)
60
- async_duration = time.perf_counter() - start_time
61
-
62
- # Measure synchronous execution time (three independent calls)
63
- start_time = time.perf_counter()
64
- sync_results = [self_references_paper(doi) for doi in dois]
65
-
66
- sync_duration = time.perf_counter() - start_time
67
-
68
- print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
69
- print(
70
- f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
71
- )
72
- assert len(sync_results) == len(async_results)
73
-
74
- assert 0.5 * async_duration <= sync_duration, (
75
- f"Async execution ({async_duration:.2f}s) is slower than sync execution "
76
- f"({sync_duration:.2f}s)"
77
- )
78
-
79
52
  def test_researcher(self):
80
53
  """
81
54
  Tests calculation of self-references for all papers of an author.
@@ -1,8 +1,10 @@
1
+ import asyncio
1
2
  import logging
2
3
  import os
3
4
  import re
4
5
  import sys
5
- from typing import Any, Dict, List, Literal, Optional, Tuple
6
+ import time
7
+ from typing import Dict, List, Literal, Optional, Tuple
6
8
 
7
9
  import httpx
8
10
  import requests
@@ -15,6 +17,11 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
15
17
  logger = logging.getLogger(__name__)
16
18
  logging.getLogger("httpx").setLevel(logging.WARNING)
17
19
 
20
+ REQUEST_TIMEOUT_SECONDS = float(os.getenv("SS_REQUEST_TIMEOUT", "20"))
21
+ CONCURRENCY_LIMIT = max(1, int(os.getenv("SS_CONCURRENCY_LIMIT", "1")))
22
+ # Minimum delay between outbound requests to Semantic Scholar.
23
+ RATE_LIMIT_DELAY = max(0.0, float(os.getenv("SS_RATE_LIMIT_DELAY", "1.1")))
24
+
18
25
  DOI_PATTERN = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
19
26
  PAPER_URL: str = "https://api.semanticscholar.org/graph/v1/paper/"
20
27
  AUTHOR_URL: str = "https://api.semanticscholar.org/graph/v1/author/search"
@@ -25,6 +32,30 @@ HEADERS: Dict[str, str] = {}
25
32
  if SS_API_KEY:
26
33
  HEADERS["x-api-key"] = SS_API_KEY
27
34
 
35
+ HTTPX_LIMITS = httpx.Limits(
36
+ max_connections=CONCURRENCY_LIMIT, max_keepalive_connections=CONCURRENCY_LIMIT
37
+ )
38
+ REQUEST_SEMAPHORE = asyncio.Semaphore(CONCURRENCY_LIMIT)
39
+ _REQUEST_SCHEDULER_LOCK = asyncio.Lock()
40
+ _NEXT_REQUEST_TIME = 0.0
41
+
42
+
43
+ async def wait_for_request_slot() -> None:
44
+ """
45
+ Enforces global pacing between Semantic Scholar requests.
46
+ Uses a shared scheduler to avoid bursts across modules.
47
+ """
48
+ global _NEXT_REQUEST_TIME
49
+
50
+ async with _REQUEST_SCHEDULER_LOCK:
51
+ now = time.monotonic()
52
+ scheduled = max(_NEXT_REQUEST_TIME, now)
53
+ _NEXT_REQUEST_TIME = scheduled + RATE_LIMIT_DELAY
54
+
55
+ delay = scheduled - now
56
+ if delay > 0:
57
+ await asyncio.sleep(delay)
58
+
28
59
 
29
60
  def get_doi_from_title(title: str) -> Optional[str]:
30
61
  """
@@ -62,7 +93,9 @@ async def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]:
62
93
  Returns:
63
94
  str or None: The DOI of the paper, or None if not found or in case of an error.
64
95
  """
65
- async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
96
+ async with httpx.AsyncClient(
97
+ timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
98
+ ) as client:
66
99
  logger.warning(
67
100
  "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput."
68
101
  )
@@ -99,7 +132,9 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
99
132
  Returns:
100
133
  dict or None: A dictionary with keys 'title' and 'ssid'.
101
134
  """
102
- async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
135
+ async with httpx.AsyncClient(
136
+ timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
137
+ ) as client:
103
138
  # Send the GET request to Semantic Scholar
104
139
  response = await client.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS)
105
140
  if response.status_code == 200:
@@ -115,6 +150,7 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
115
150
  async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
116
151
  """
117
152
  Given an author name, returns the Semantic Scholar author ID.
153
+ Respects rate limiting to avoid exceeding API limits.
118
154
 
119
155
  Parameters:
120
156
  author_name (str): The full name of the author.
@@ -123,7 +159,11 @@ async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
123
159
  Tuple[str, str] or None: The SS author ID alongside the SS name (may differ
124
160
  slightly from input name) or None if no author is found.
125
161
  """
126
- async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
162
+ async with httpx.AsyncClient(
163
+ timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
164
+ ) as client:
165
+ await wait_for_request_slot()
166
+
127
167
  response = await client.get(
128
168
  AUTHOR_URL,
129
169
  params={"query": author_name, "fields": "name", "limit": 1},
@@ -139,7 +179,7 @@ async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
139
179
  logger.error(
140
180
  f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}"
141
181
  )
142
- return ('-1', 'N.A.')
182
+ return ("-1", "N.A.")
143
183
 
144
184
 
145
185
  def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
@@ -164,6 +204,7 @@ def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
164
204
  return mode
165
205
 
166
206
 
207
+ @optional_async
167
208
  @retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
168
209
  async def get_papers_for_author(ss_author_id: str) -> List[str]:
169
210
  """
@@ -179,7 +220,9 @@ async def get_papers_for_author(ss_author_id: str) -> List[str]:
179
220
  offset = 0
180
221
  limit = 100
181
222
 
182
- async with httpx.AsyncClient() as client:
223
+ async with httpx.AsyncClient(
224
+ timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
225
+ ) as client:
183
226
  while True:
184
227
  response = await client.get(
185
228
  f"https://api.semanticscholar.org/graph/v1/author/{ss_author_id}/papers",
@@ -1,11 +1,8 @@
1
1
  """Dump bioRxiv data in JSONL format."""
2
2
 
3
- import json
4
3
  import os
5
4
  from datetime import datetime
6
- from typing import Optional
7
-
8
- from tqdm import tqdm
5
+ from typing import Optional, Tuple
9
6
 
10
7
  from ..utils import get_server_dumps_dir
11
8
  from ..xrxiv.xrxiv_api import BioRxivApi
@@ -22,6 +19,10 @@ def biorxiv(
22
19
  end_date: Optional[str] = None,
23
20
  save_path: str = save_path,
24
21
  max_retries: int = 10,
22
+ request_timeout: Tuple[float, float] = (5.0, 30.0),
23
+ retry_backoff_seconds: float = 1.0,
24
+ window_days: int = 30,
25
+ max_workers: int = 8,
25
26
  ):
26
27
  """Fetches papers from biorxiv based on time range, i.e., start_date and end_date.
27
28
  If the start_date and end_date are not provided, papers will be fetched from biorxiv
@@ -37,15 +38,28 @@ def biorxiv(
37
38
  Defaults to save_path.
38
39
  max_retries (int, optional): Number of retries when API shows connection issues.
39
40
  Defaults to 10.
41
+ request_timeout (Tuple[float, float], optional): (connect timeout, read timeout).
42
+ Defaults to (5.0, 30.0).
43
+ retry_backoff_seconds (float, optional): Initial retry backoff.
44
+ Defaults to 1.0.
45
+ window_days (int, optional): Date-window size used for pagination.
46
+ Defaults to 30.
47
+ max_workers (int, optional): Number of parallel workers over date windows.
48
+ Defaults to 8.
40
49
  """
41
- # create API client
42
- api = BioRxivApi(max_retries=max_retries)
43
-
44
- # dump all papers
45
- with open(save_path, "w") as fp:
46
- for index, paper in enumerate(
47
- tqdm(api.get_papers(start_date=start_date, end_date=end_date))
48
- ):
49
- if index > 0:
50
- fp.write(os.linesep)
51
- fp.write(json.dumps(paper))
50
+ api = BioRxivApi(
51
+ max_retries=max_retries,
52
+ request_timeout=request_timeout,
53
+ retry_backoff_seconds=retry_backoff_seconds,
54
+ window_days=max(1, int(window_days)),
55
+ )
56
+ api.dump_papers(
57
+ save_path=save_path,
58
+ start_date=start_date,
59
+ end_date=end_date,
60
+ max_retries=max_retries,
61
+ max_workers=max_workers,
62
+ window_days=window_days,
63
+ deduplicate_dois=False,
64
+ show_progress=True,
65
+ )