paperscraper 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.3.4 → paperscraper-0.3.6}/PKG-INFO +19 -9
- {paperscraper-0.3.4 → paperscraper-0.3.6}/README.md +14 -4
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/__init__.py +1 -1
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/async_utils.py +11 -1
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/researcher.py +8 -5
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/self_citations.py +76 -41
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/self_references.py +31 -20
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_citations.py +5 -2
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_references.py +0 -27
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/utils.py +49 -6
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/biorxiv.py +29 -15
- paperscraper-0.3.6/paperscraper/get_dumps/chemrxiv.py +62 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/medrxiv.py +29 -14
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +3 -27
- paperscraper-0.3.6/paperscraper/get_dumps/utils/chemrxiv/crossref_api.py +226 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/chemrxiv/utils.py +74 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/load_dumps.py +5 -1
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/pdf.py +31 -4
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/test_dump.py +91 -36
- paperscraper-0.3.6/paperscraper/xrxiv/xrxiv_api.py +576 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/PKG-INFO +19 -9
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/SOURCES.txt +1 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/requires.txt +3 -3
- {paperscraper-0.3.4 → paperscraper-0.3.6}/pyproject.toml +4 -4
- paperscraper-0.3.4/paperscraper/get_dumps/chemrxiv.py +0 -44
- paperscraper-0.3.4/paperscraper/xrxiv/xrxiv_api.py +0 -174
- {paperscraper-0.3.4 → paperscraper-0.3.6}/LICENSE +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/arxiv/arxiv.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/arxiv/utils.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/citations.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/core.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/core.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/entity/paper.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/orcid.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_citations.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_paper.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/arxiv.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/impact.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/fallbacks.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pdf/utils.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/plotting.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/postprocessing.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/pubmed.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/tests/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/pubmed/utils.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/core.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/scholar.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/tests/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/scholar/tests/test_scholar.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/server_dumps/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/test_impactor.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/tests/test_pdf.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/utils.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/tests/__init__.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.3.4 → paperscraper-0.3.6}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.6
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -19,13 +19,13 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.13
|
|
21
21
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
-
Requires-Python:
|
|
22
|
+
Requires-Python: <3.14,>=3.9
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE
|
|
25
|
-
Requires-Dist: arxiv>=
|
|
26
|
-
Requires-Dist: pymed-paperscraper>=1.0.
|
|
25
|
+
Requires-Dist: arxiv>=2.4.0
|
|
26
|
+
Requires-Dist: pymed-paperscraper>=1.0.6
|
|
27
27
|
Requires-Dist: pandas>=1.0.4
|
|
28
|
-
Requires-Dist: requests
|
|
28
|
+
Requires-Dist: requests>=2.32.2
|
|
29
29
|
Requires-Dist: tqdm>=4.51.0
|
|
30
30
|
Requires-Dist: scholarly>=1.0.0
|
|
31
31
|
Requires-Dist: seaborn>=0.11.0
|
|
@@ -102,12 +102,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://
|
|
|
102
102
|
|
|
103
103
|
```py
|
|
104
104
|
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
|
|
105
|
-
chemrxiv() # Takes
|
|
106
|
-
medrxiv() # Takes <
|
|
107
|
-
biorxiv() #
|
|
105
|
+
chemrxiv() # Takes <15min -> +50K papers (~30 MB file)
|
|
106
|
+
medrxiv() # Takes <30min -> +100K papers (~200 MB file)
|
|
107
|
+
biorxiv() # Takes <3h -> +450K papers (~800 MB file)
|
|
108
108
|
```
|
|
109
109
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
|
|
110
|
-
*NOTE*: If you experience API connection issues,
|
|
110
|
+
*NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:
|
|
111
|
+
|
|
112
|
+
```py
|
|
113
|
+
biorxiv(
|
|
114
|
+
max_retries=12,
|
|
115
|
+
request_timeout=(5.0, 45.0), # connect timeout, read timeout
|
|
116
|
+
retry_backoff_seconds=1.0, # initial retry backoff
|
|
117
|
+
max_workers=8, # number of parallel date windows
|
|
118
|
+
window_days=30, # smaller windows increase parallelism
|
|
119
|
+
)
|
|
120
|
+
```
|
|
111
121
|
|
|
112
122
|
Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
|
|
113
123
|
```py
|
|
@@ -57,12 +57,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://
|
|
|
57
57
|
|
|
58
58
|
```py
|
|
59
59
|
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
|
|
60
|
-
chemrxiv() # Takes
|
|
61
|
-
medrxiv() # Takes <
|
|
62
|
-
biorxiv() #
|
|
60
|
+
chemrxiv() # Takes <15min -> +50K papers (~30 MB file)
|
|
61
|
+
medrxiv() # Takes <30min -> +100K papers (~200 MB file)
|
|
62
|
+
biorxiv() # Takes <3h -> +450K papers (~800 MB file)
|
|
63
63
|
```
|
|
64
64
|
*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
|
|
65
|
-
*NOTE*: If you experience API connection issues,
|
|
65
|
+
*NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:
|
|
66
|
+
|
|
67
|
+
```py
|
|
68
|
+
biorxiv(
|
|
69
|
+
max_retries=12,
|
|
70
|
+
request_timeout=(5.0, 45.0), # connect timeout, read timeout
|
|
71
|
+
retry_backoff_seconds=1.0, # initial retry backoff
|
|
72
|
+
max_workers=8, # number of parallel date windows
|
|
73
|
+
window_days=30, # smaller windows increase parallelism
|
|
74
|
+
)
|
|
75
|
+
```
|
|
66
76
|
|
|
67
77
|
Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
|
|
68
78
|
```py
|
|
@@ -48,6 +48,16 @@ def optional_async(
|
|
|
48
48
|
return wrapper
|
|
49
49
|
|
|
50
50
|
|
|
51
|
+
def run_sync(coroutine: Awaitable[T]) -> T:
|
|
52
|
+
"""
|
|
53
|
+
Run a coroutine on the background loop and block for the result.
|
|
54
|
+
|
|
55
|
+
This is safe to call from sync or async contexts, but will block the caller.
|
|
56
|
+
"""
|
|
57
|
+
future = asyncio.run_coroutine_threadsafe(coroutine, _background_loop)
|
|
58
|
+
return future.result()
|
|
59
|
+
|
|
60
|
+
|
|
51
61
|
def retry_with_exponential_backoff(
|
|
52
62
|
*,
|
|
53
63
|
max_retries: int = 5,
|
|
@@ -96,7 +106,7 @@ def retry_with_exponential_backoff(
|
|
|
96
106
|
pass
|
|
97
107
|
delay *= factor
|
|
98
108
|
|
|
99
|
-
except httpx.ReadError as e:
|
|
109
|
+
except (httpx.ReadError, httpx.TimeoutException, httpx.TransportError) as e:
|
|
100
110
|
last_exception = e
|
|
101
111
|
sleep_for = delay
|
|
102
112
|
delay *= factor
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import asyncio
|
|
2
1
|
import os
|
|
3
2
|
from typing import Any, List, Literal, Optional, Tuple
|
|
4
3
|
|
|
5
4
|
from semanticscholar import SemanticScholar
|
|
6
5
|
|
|
6
|
+
from ...async_utils import run_sync
|
|
7
7
|
from ..orcid import orcid_to_author_name
|
|
8
8
|
from ..self_citations import CitationResult, self_citations_paper
|
|
9
9
|
from ..self_references import ReferenceResult, self_references_paper
|
|
@@ -128,7 +128,7 @@ class Researcher(Entity):
|
|
|
128
128
|
Returns:
|
|
129
129
|
A ResearcherResult containing aggregated self-reference data.
|
|
130
130
|
"""
|
|
131
|
-
reference_results =
|
|
131
|
+
reference_results = run_sync(self._self_references_async(verbose=verbose))
|
|
132
132
|
|
|
133
133
|
individual_self_references = {
|
|
134
134
|
getattr(result, "title"): getattr(result, "self_references").get(
|
|
@@ -182,8 +182,11 @@ class Researcher(Entity):
|
|
|
182
182
|
def self_citations(self, verbose: bool = False) -> ResearcherResult:
|
|
183
183
|
"""
|
|
184
184
|
Sifts through all papers of a researcher and finds how often they are self-cited.
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
verbose: If True, logs detailed information for each paper.
|
|
185
188
|
"""
|
|
186
|
-
citation_results =
|
|
189
|
+
citation_results = run_sync(self._self_citations_async(verbose=verbose))
|
|
187
190
|
individual_self_citations = {
|
|
188
191
|
getattr(result, "title"): getattr(result, "self_citations").get(
|
|
189
192
|
self.name, 0.0
|
|
@@ -214,8 +217,8 @@ class Researcher(Entity):
|
|
|
214
217
|
"""
|
|
215
218
|
Provides the result of the analysis.
|
|
216
219
|
"""
|
|
217
|
-
if
|
|
220
|
+
if getattr(self.result, "num_references", -1) < 0:
|
|
218
221
|
self.self_references()
|
|
219
|
-
if
|
|
222
|
+
if getattr(self.result, "num_citations", -1) < 0:
|
|
220
223
|
self.self_citations()
|
|
221
224
|
return self.result
|
|
@@ -7,9 +7,18 @@ from typing import Any, Dict, List, Union
|
|
|
7
7
|
import httpx
|
|
8
8
|
import numpy as np
|
|
9
9
|
from pydantic import BaseModel
|
|
10
|
+
from tqdm import tqdm
|
|
10
11
|
|
|
11
12
|
from ..async_utils import optional_async, retry_with_exponential_backoff
|
|
12
|
-
from .utils import
|
|
13
|
+
from .utils import (
|
|
14
|
+
DOI_PATTERN,
|
|
15
|
+
HEADERS,
|
|
16
|
+
HTTPX_LIMITS,
|
|
17
|
+
REQUEST_SEMAPHORE,
|
|
18
|
+
REQUEST_TIMEOUT_SECONDS,
|
|
19
|
+
find_matching,
|
|
20
|
+
wait_for_request_slot,
|
|
21
|
+
)
|
|
13
22
|
|
|
14
23
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
15
24
|
logger = logging.getLogger(__name__)
|
|
@@ -30,6 +39,7 @@ async def _fetch_citation_data(
|
|
|
30
39
|
) -> Dict[str, Any]:
|
|
31
40
|
"""
|
|
32
41
|
Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
|
|
42
|
+
Respects rate limiting to avoid exceeding API limits.
|
|
33
43
|
|
|
34
44
|
Args:
|
|
35
45
|
client: An active httpx.AsyncClient.
|
|
@@ -38,9 +48,12 @@ async def _fetch_citation_data(
|
|
|
38
48
|
Returns:
|
|
39
49
|
The JSON-decoded response as a dictionary.
|
|
40
50
|
"""
|
|
51
|
+
await wait_for_request_slot()
|
|
52
|
+
|
|
41
53
|
response = await client.get(
|
|
42
54
|
f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
|
|
43
55
|
params={"fields": "title,authors,citations.authors"},
|
|
56
|
+
headers=HEADERS,
|
|
44
57
|
)
|
|
45
58
|
response.raise_for_status()
|
|
46
59
|
return response.json()
|
|
@@ -57,43 +70,44 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
|
|
|
57
70
|
Returns:
|
|
58
71
|
A CitationResult containing counts and percentages of self-citations.
|
|
59
72
|
"""
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
73
|
+
async with REQUEST_SEMAPHORE:
|
|
74
|
+
# Determine prefix for Semantic Scholar API
|
|
75
|
+
if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
|
|
76
|
+
prefix = ""
|
|
77
|
+
elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1:
|
|
78
|
+
prefix = "DOI:"
|
|
79
|
+
else:
|
|
80
|
+
prefix = ""
|
|
81
|
+
|
|
82
|
+
suffix = f"{prefix}{identifier}"
|
|
83
|
+
paper = await _fetch_citation_data(client, suffix)
|
|
84
|
+
|
|
85
|
+
# Initialize counters
|
|
86
|
+
author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
|
|
87
|
+
citations = paper.get("citations", [])
|
|
88
|
+
total_cites = len(citations)
|
|
89
|
+
|
|
90
|
+
# Tally self-citations
|
|
91
|
+
for cite in citations:
|
|
92
|
+
matched = find_matching(paper.get("authors", []), cite.get("authors", []))
|
|
93
|
+
for name in matched:
|
|
94
|
+
author_counts[name] += 1
|
|
95
|
+
|
|
96
|
+
# Compute percentages
|
|
97
|
+
ratios: Dict[str, float] = {
|
|
98
|
+
name: round((count / total_cites * 100), 2) if total_cites > 0 else 0.0
|
|
99
|
+
for name, count in author_counts.items()
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)
|
|
103
|
+
|
|
104
|
+
return CitationResult(
|
|
105
|
+
ssid=identifier,
|
|
106
|
+
title=paper.get("title", ""),
|
|
107
|
+
num_citations=total_cites,
|
|
108
|
+
self_citations=ratios,
|
|
109
|
+
citation_score=avg_score,
|
|
110
|
+
)
|
|
97
111
|
|
|
98
112
|
|
|
99
113
|
@optional_async
|
|
@@ -114,9 +128,26 @@ async def self_citations_paper(
|
|
|
114
128
|
single_input = isinstance(inputs, str)
|
|
115
129
|
identifiers = [inputs] if single_input else list(inputs)
|
|
116
130
|
|
|
117
|
-
|
|
131
|
+
results: List[CitationResult] = []
|
|
132
|
+
|
|
133
|
+
async with httpx.AsyncClient(
|
|
134
|
+
timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
|
|
135
|
+
) as client:
|
|
118
136
|
tasks = [_process_single(client, ident) for ident in identifiers]
|
|
119
|
-
|
|
137
|
+
|
|
138
|
+
iterator = tqdm(
|
|
139
|
+
asyncio.as_completed(tasks),
|
|
140
|
+
total=len(tasks),
|
|
141
|
+
desc="Collecting self-citations",
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
for coro in iterator:
|
|
145
|
+
try:
|
|
146
|
+
res = await coro
|
|
147
|
+
except Exception as exc:
|
|
148
|
+
logger.warning(f"Self-citation fetch failed: {exc}")
|
|
149
|
+
continue
|
|
150
|
+
results.append(res)
|
|
120
151
|
|
|
121
152
|
if verbose:
|
|
122
153
|
for res in results:
|
|
@@ -126,4 +157,8 @@ async def self_citations_paper(
|
|
|
126
157
|
for author, pct in res.self_citations.items():
|
|
127
158
|
logger.info(f" {author}: {pct}%")
|
|
128
159
|
|
|
129
|
-
|
|
160
|
+
if single_input:
|
|
161
|
+
if not results:
|
|
162
|
+
raise RuntimeError("Failed to fetch self-citations for input.")
|
|
163
|
+
return results[0]
|
|
164
|
+
return results
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import logging
|
|
3
|
-
import os
|
|
4
3
|
import re
|
|
5
4
|
import sys
|
|
6
5
|
from typing import Any, Dict, List, Literal, Union
|
|
@@ -11,7 +10,15 @@ from pydantic import BaseModel
|
|
|
11
10
|
from tqdm import tqdm
|
|
12
11
|
|
|
13
12
|
from ..async_utils import optional_async, retry_with_exponential_backoff
|
|
14
|
-
from .utils import
|
|
13
|
+
from .utils import (
|
|
14
|
+
DOI_PATTERN,
|
|
15
|
+
HEADERS,
|
|
16
|
+
HTTPX_LIMITS,
|
|
17
|
+
REQUEST_SEMAPHORE,
|
|
18
|
+
REQUEST_TIMEOUT_SECONDS,
|
|
19
|
+
find_matching,
|
|
20
|
+
wait_for_request_slot,
|
|
21
|
+
)
|
|
15
22
|
|
|
16
23
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
17
24
|
logger = logging.getLogger(__name__)
|
|
@@ -19,15 +26,6 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
|
19
26
|
ModeType = Literal[tuple(MODES := ("doi", "infer", "ssid"))]
|
|
20
27
|
|
|
21
28
|
|
|
22
|
-
SS_API_KEY = os.getenv("SS_API_KEY")
|
|
23
|
-
HEADERS: Dict[str, str] = {}
|
|
24
|
-
if SS_API_KEY:
|
|
25
|
-
HEADERS["x-api-key"] = SS_API_KEY
|
|
26
|
-
|
|
27
|
-
CONCURRENCY_LIMIT = 10
|
|
28
|
-
_SEM = asyncio.Semaphore(CONCURRENCY_LIMIT)
|
|
29
|
-
|
|
30
|
-
|
|
31
29
|
class ReferenceResult(BaseModel):
|
|
32
30
|
ssid: str # semantic scholar paper id
|
|
33
31
|
title: str
|
|
@@ -42,6 +40,7 @@ async def _fetch_paper_with_references(
|
|
|
42
40
|
) -> Dict[str, Any]:
|
|
43
41
|
"""
|
|
44
42
|
Fetch raw paper data from Semantic Scholar by DOI or SSID suffix.
|
|
43
|
+
Respects rate limiting to avoid exceeding API limits.
|
|
45
44
|
|
|
46
45
|
Args:
|
|
47
46
|
client: An active httpx.AsyncClient.
|
|
@@ -50,6 +49,8 @@ async def _fetch_paper_with_references(
|
|
|
50
49
|
Returns:
|
|
51
50
|
The JSON-decoded response as a dictionary.
|
|
52
51
|
"""
|
|
52
|
+
await wait_for_request_slot()
|
|
53
|
+
|
|
53
54
|
response = await client.get(
|
|
54
55
|
f"https://api.semanticscholar.org/graph/v1/paper/{suffix}",
|
|
55
56
|
params={"fields": "title,authors,references.authors"},
|
|
@@ -72,7 +73,7 @@ async def _process_single_reference(
|
|
|
72
73
|
Returns:
|
|
73
74
|
A ReferenceResult containing counts and percentages of self-references.
|
|
74
75
|
"""
|
|
75
|
-
async with
|
|
76
|
+
async with REQUEST_SEMAPHORE:
|
|
76
77
|
# Determine prefix for API
|
|
77
78
|
if len(identifier) > 15 and identifier.isalnum() and identifier.islower():
|
|
78
79
|
prefix = ""
|
|
@@ -134,18 +135,24 @@ async def self_references_paper(
|
|
|
134
135
|
single_input = isinstance(inputs, str)
|
|
135
136
|
identifiers = [inputs] if single_input else list(inputs)
|
|
136
137
|
|
|
137
|
-
async with httpx.AsyncClient(
|
|
138
|
+
async with httpx.AsyncClient(
|
|
139
|
+
timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
|
|
140
|
+
) as client:
|
|
138
141
|
tasks = [_process_single_reference(client, ident) for ident in identifiers]
|
|
139
142
|
results: List[ReferenceResult] = []
|
|
140
143
|
|
|
141
|
-
iterator =
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
144
|
+
iterator = tqdm(
|
|
145
|
+
asyncio.as_completed(tasks),
|
|
146
|
+
total=len(tasks),
|
|
147
|
+
desc="Collecting self-references",
|
|
148
|
+
)
|
|
146
149
|
|
|
147
150
|
for coro in iterator:
|
|
148
|
-
|
|
151
|
+
try:
|
|
152
|
+
res = await coro
|
|
153
|
+
except Exception as exc:
|
|
154
|
+
logger.warning(f"Self-reference fetch failed: {exc}")
|
|
155
|
+
continue
|
|
149
156
|
results.append(res)
|
|
150
157
|
|
|
151
158
|
if verbose:
|
|
@@ -157,4 +164,8 @@ async def self_references_paper(
|
|
|
157
164
|
for author, pct in res.self_references.items():
|
|
158
165
|
logger.info(f" {author}: {pct}% self-references")
|
|
159
166
|
|
|
160
|
-
|
|
167
|
+
if single_input:
|
|
168
|
+
if not results:
|
|
169
|
+
raise RuntimeError("Failed to fetch self-references for input.")
|
|
170
|
+
return results[0]
|
|
171
|
+
return results
|
{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_citations.py
RENAMED
|
@@ -64,12 +64,15 @@ class TestSelfCitations:
|
|
|
64
64
|
f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
|
|
65
65
|
)
|
|
66
66
|
|
|
67
|
-
assert 0.
|
|
67
|
+
assert async_duration*0.8 <= sync_duration, (
|
|
68
68
|
f"Async execution ({async_duration:.2f}s) is slower than sync execution "
|
|
69
69
|
f"({sync_duration:.2f}s)"
|
|
70
70
|
)
|
|
71
71
|
|
|
72
|
-
for a, s in zip(
|
|
72
|
+
for a, s in zip(
|
|
73
|
+
sorted(result, key=lambda r: r.ssid),
|
|
74
|
+
sorted(sync_result, key=lambda r: r.ssid),
|
|
75
|
+
):
|
|
73
76
|
assert a == s, f"{a} vs {s}"
|
|
74
77
|
|
|
75
78
|
def test_researcher(self):
|
{paperscraper-0.3.4 → paperscraper-0.3.6}/paperscraper/citations/tests/test_self_references.py
RENAMED
|
@@ -49,33 +49,6 @@ class TestSelfReferences:
|
|
|
49
49
|
assert isinstance(self_cites, float)
|
|
50
50
|
assert self_cites >= 0 and self_cites <= 100
|
|
51
51
|
|
|
52
|
-
def test_compare_async_and_sync_performance(self, dois):
|
|
53
|
-
"""
|
|
54
|
-
Compares the execution time of asynchronous and synchronous `self_references`
|
|
55
|
-
for a list of DOIs.
|
|
56
|
-
"""
|
|
57
|
-
|
|
58
|
-
start_time = time.perf_counter()
|
|
59
|
-
async_results = self_references_paper(dois)
|
|
60
|
-
async_duration = time.perf_counter() - start_time
|
|
61
|
-
|
|
62
|
-
# Measure synchronous execution time (three independent calls)
|
|
63
|
-
start_time = time.perf_counter()
|
|
64
|
-
sync_results = [self_references_paper(doi) for doi in dois]
|
|
65
|
-
|
|
66
|
-
sync_duration = time.perf_counter() - start_time
|
|
67
|
-
|
|
68
|
-
print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
|
|
69
|
-
print(
|
|
70
|
-
f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
|
|
71
|
-
)
|
|
72
|
-
assert len(sync_results) == len(async_results)
|
|
73
|
-
|
|
74
|
-
assert 0.5 * async_duration <= sync_duration, (
|
|
75
|
-
f"Async execution ({async_duration:.2f}s) is slower than sync execution "
|
|
76
|
-
f"({sync_duration:.2f}s)"
|
|
77
|
-
)
|
|
78
|
-
|
|
79
52
|
def test_researcher(self):
|
|
80
53
|
"""
|
|
81
54
|
Tests calculation of self-references for all papers of an author.
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import logging
|
|
2
3
|
import os
|
|
3
4
|
import re
|
|
4
5
|
import sys
|
|
5
|
-
|
|
6
|
+
import time
|
|
7
|
+
from typing import Dict, List, Literal, Optional, Tuple
|
|
6
8
|
|
|
7
9
|
import httpx
|
|
8
10
|
import requests
|
|
@@ -15,6 +17,11 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
|
15
17
|
logger = logging.getLogger(__name__)
|
|
16
18
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
17
19
|
|
|
20
|
+
REQUEST_TIMEOUT_SECONDS = float(os.getenv("SS_REQUEST_TIMEOUT", "20"))
|
|
21
|
+
CONCURRENCY_LIMIT = max(1, int(os.getenv("SS_CONCURRENCY_LIMIT", "1")))
|
|
22
|
+
# Minimum delay between outbound requests to Semantic Scholar.
|
|
23
|
+
RATE_LIMIT_DELAY = max(0.0, float(os.getenv("SS_RATE_LIMIT_DELAY", "1.1")))
|
|
24
|
+
|
|
18
25
|
DOI_PATTERN = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
|
19
26
|
PAPER_URL: str = "https://api.semanticscholar.org/graph/v1/paper/"
|
|
20
27
|
AUTHOR_URL: str = "https://api.semanticscholar.org/graph/v1/author/search"
|
|
@@ -25,6 +32,30 @@ HEADERS: Dict[str, str] = {}
|
|
|
25
32
|
if SS_API_KEY:
|
|
26
33
|
HEADERS["x-api-key"] = SS_API_KEY
|
|
27
34
|
|
|
35
|
+
HTTPX_LIMITS = httpx.Limits(
|
|
36
|
+
max_connections=CONCURRENCY_LIMIT, max_keepalive_connections=CONCURRENCY_LIMIT
|
|
37
|
+
)
|
|
38
|
+
REQUEST_SEMAPHORE = asyncio.Semaphore(CONCURRENCY_LIMIT)
|
|
39
|
+
_REQUEST_SCHEDULER_LOCK = asyncio.Lock()
|
|
40
|
+
_NEXT_REQUEST_TIME = 0.0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
async def wait_for_request_slot() -> None:
|
|
44
|
+
"""
|
|
45
|
+
Enforces global pacing between Semantic Scholar requests.
|
|
46
|
+
Uses a shared scheduler to avoid bursts across modules.
|
|
47
|
+
"""
|
|
48
|
+
global _NEXT_REQUEST_TIME
|
|
49
|
+
|
|
50
|
+
async with _REQUEST_SCHEDULER_LOCK:
|
|
51
|
+
now = time.monotonic()
|
|
52
|
+
scheduled = max(_NEXT_REQUEST_TIME, now)
|
|
53
|
+
_NEXT_REQUEST_TIME = scheduled + RATE_LIMIT_DELAY
|
|
54
|
+
|
|
55
|
+
delay = scheduled - now
|
|
56
|
+
if delay > 0:
|
|
57
|
+
await asyncio.sleep(delay)
|
|
58
|
+
|
|
28
59
|
|
|
29
60
|
def get_doi_from_title(title: str) -> Optional[str]:
|
|
30
61
|
"""
|
|
@@ -62,7 +93,9 @@ async def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]:
|
|
|
62
93
|
Returns:
|
|
63
94
|
str or None: The DOI of the paper, or None if not found or in case of an error.
|
|
64
95
|
"""
|
|
65
|
-
async with httpx.AsyncClient(
|
|
96
|
+
async with httpx.AsyncClient(
|
|
97
|
+
timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
|
|
98
|
+
) as client:
|
|
66
99
|
logger.warning(
|
|
67
100
|
"Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput."
|
|
68
101
|
)
|
|
@@ -99,7 +132,9 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
|
|
|
99
132
|
Returns:
|
|
100
133
|
dict or None: A dictionary with keys 'title' and 'ssid'.
|
|
101
134
|
"""
|
|
102
|
-
async with httpx.AsyncClient(
|
|
135
|
+
async with httpx.AsyncClient(
|
|
136
|
+
timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
|
|
137
|
+
) as client:
|
|
103
138
|
# Send the GET request to Semantic Scholar
|
|
104
139
|
response = await client.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS)
|
|
105
140
|
if response.status_code == 200:
|
|
@@ -115,6 +150,7 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
|
|
|
115
150
|
async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
|
|
116
151
|
"""
|
|
117
152
|
Given an author name, returns the Semantic Scholar author ID.
|
|
153
|
+
Respects rate limiting to avoid exceeding API limits.
|
|
118
154
|
|
|
119
155
|
Parameters:
|
|
120
156
|
author_name (str): The full name of the author.
|
|
@@ -123,7 +159,11 @@ async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
|
|
|
123
159
|
Tuple[str, str] or None: The SS author ID alongside the SS name (may differ
|
|
124
160
|
slightly from input name) or None if no author is found.
|
|
125
161
|
"""
|
|
126
|
-
async with httpx.AsyncClient(
|
|
162
|
+
async with httpx.AsyncClient(
|
|
163
|
+
timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
|
|
164
|
+
) as client:
|
|
165
|
+
await wait_for_request_slot()
|
|
166
|
+
|
|
127
167
|
response = await client.get(
|
|
128
168
|
AUTHOR_URL,
|
|
129
169
|
params={"query": author_name, "fields": "name", "limit": 1},
|
|
@@ -139,7 +179,7 @@ async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
|
|
|
139
179
|
logger.error(
|
|
140
180
|
f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}"
|
|
141
181
|
)
|
|
142
|
-
return (
|
|
182
|
+
return ("-1", "N.A.")
|
|
143
183
|
|
|
144
184
|
|
|
145
185
|
def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
|
|
@@ -164,6 +204,7 @@ def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
|
|
|
164
204
|
return mode
|
|
165
205
|
|
|
166
206
|
|
|
207
|
+
@optional_async
|
|
167
208
|
@retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
|
|
168
209
|
async def get_papers_for_author(ss_author_id: str) -> List[str]:
|
|
169
210
|
"""
|
|
@@ -179,7 +220,9 @@ async def get_papers_for_author(ss_author_id: str) -> List[str]:
|
|
|
179
220
|
offset = 0
|
|
180
221
|
limit = 100
|
|
181
222
|
|
|
182
|
-
async with httpx.AsyncClient(
|
|
223
|
+
async with httpx.AsyncClient(
|
|
224
|
+
timeout=httpx.Timeout(REQUEST_TIMEOUT_SECONDS), limits=HTTPX_LIMITS
|
|
225
|
+
) as client:
|
|
183
226
|
while True:
|
|
184
227
|
response = await client.get(
|
|
185
228
|
f"https://api.semanticscholar.org/graph/v1/author/{ss_author_id}/papers",
|
|
@@ -1,11 +1,8 @@
|
|
|
1
1
|
"""Dump bioRxiv data in JSONL format."""
|
|
2
2
|
|
|
3
|
-
import json
|
|
4
3
|
import os
|
|
5
4
|
from datetime import datetime
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from tqdm import tqdm
|
|
5
|
+
from typing import Optional, Tuple
|
|
9
6
|
|
|
10
7
|
from ..utils import get_server_dumps_dir
|
|
11
8
|
from ..xrxiv.xrxiv_api import BioRxivApi
|
|
@@ -22,6 +19,10 @@ def biorxiv(
|
|
|
22
19
|
end_date: Optional[str] = None,
|
|
23
20
|
save_path: str = save_path,
|
|
24
21
|
max_retries: int = 10,
|
|
22
|
+
request_timeout: Tuple[float, float] = (5.0, 30.0),
|
|
23
|
+
retry_backoff_seconds: float = 1.0,
|
|
24
|
+
window_days: int = 30,
|
|
25
|
+
max_workers: int = 8,
|
|
25
26
|
):
|
|
26
27
|
"""Fetches papers from biorxiv based on time range, i.e., start_date and end_date.
|
|
27
28
|
If the start_date and end_date are not provided, papers will be fetched from biorxiv
|
|
@@ -37,15 +38,28 @@ def biorxiv(
|
|
|
37
38
|
Defaults to save_path.
|
|
38
39
|
max_retries (int, optional): Number of retries when API shows connection issues.
|
|
39
40
|
Defaults to 10.
|
|
41
|
+
request_timeout (Tuple[float, float], optional): (connect timeout, read timeout).
|
|
42
|
+
Defaults to (5.0, 30.0).
|
|
43
|
+
retry_backoff_seconds (float, optional): Initial retry backoff.
|
|
44
|
+
Defaults to 1.0.
|
|
45
|
+
window_days (int, optional): Date-window size used for pagination.
|
|
46
|
+
Defaults to 30.
|
|
47
|
+
max_workers (int, optional): Number of parallel workers over date windows.
|
|
48
|
+
Defaults to 8.
|
|
40
49
|
"""
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
api = BioRxivApi(
|
|
51
|
+
max_retries=max_retries,
|
|
52
|
+
request_timeout=request_timeout,
|
|
53
|
+
retry_backoff_seconds=retry_backoff_seconds,
|
|
54
|
+
window_days=max(1, int(window_days)),
|
|
55
|
+
)
|
|
56
|
+
api.dump_papers(
|
|
57
|
+
save_path=save_path,
|
|
58
|
+
start_date=start_date,
|
|
59
|
+
end_date=end_date,
|
|
60
|
+
max_retries=max_retries,
|
|
61
|
+
max_workers=max_workers,
|
|
62
|
+
window_days=window_days,
|
|
63
|
+
deduplicate_dois=False,
|
|
64
|
+
show_progress=True,
|
|
65
|
+
)
|