paperscraper 0.3.3__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.3.3 → paperscraper-0.3.5}/PKG-INFO +38 -26
- {paperscraper-0.3.3 → paperscraper-0.3.5}/README.md +19 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/__init__.py +1 -1
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/arxiv/arxiv.py +2 -3
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/arxiv/utils.py +2 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/async_utils.py +36 -9
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/citations.py +2 -1
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/entity/core.py +6 -5
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/entity/paper.py +17 -15
- paperscraper-0.3.5/paperscraper/citations/entity/researcher.py +221 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/self_citations.py +5 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/self_references.py +68 -42
- paperscraper-0.3.5/paperscraper/citations/tests/test_citations.py +32 -0
- paperscraper-0.3.5/paperscraper/citations/tests/test_self_citations.py +147 -0
- paperscraper-0.3.5/paperscraper/citations/tests/test_self_references.py +96 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/utils.py +99 -51
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/arxiv.py +2 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/biorxiv.py +2 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/chemrxiv.py +2 -3
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/medrxiv.py +2 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +39 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/chemrxiv/utils.py +20 -12
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/load_dumps.py +2 -3
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pdf/fallbacks.py +134 -55
- paperscraper-0.3.5/paperscraper/pdf/pdf.py +442 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pdf/utils.py +21 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/pubmed.py +10 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/test_dump.py +8 -2
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/test_impactor.py +23 -4
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/test_pdf.py +0 -5
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/utils.py +6 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/PKG-INFO +38 -26
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/SOURCES.txt +1 -1
- paperscraper-0.3.5/paperscraper.egg-info/requires.txt +19 -0
- paperscraper-0.3.5/pyproject.toml +90 -0
- paperscraper-0.3.3/paperscraper/citations/entity/researcher.py +0 -90
- paperscraper-0.3.3/paperscraper/citations/tests/test_citations.py +0 -18
- paperscraper-0.3.3/paperscraper/citations/tests/test_self_citations.py +0 -71
- paperscraper-0.3.3/paperscraper/citations/tests/test_self_references.py +0 -78
- paperscraper-0.3.3/paperscraper/pdf/pdf.py +0 -250
- paperscraper-0.3.3/paperscraper.egg-info/requires.txt +0 -19
- paperscraper-0.3.3/setup.py +0 -77
- {paperscraper-0.3.3 → paperscraper-0.3.5}/LICENSE +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/core.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/entity/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/orcid.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/tests/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/tests/test_paper.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/impact.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pdf/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/plotting.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/postprocessing.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/tests/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/utils.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/core.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/scholar.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/tests/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/tests/test_scholar.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/server_dumps/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/tests/__init__.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/xrxiv_api.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.3.3 → paperscraper-0.3.5}/setup.cfg +0 -0
|
@@ -1,54 +1,47 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.5
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
|
-
|
|
6
|
-
Author: Jannis Born, Matteo Manica
|
|
7
|
-
Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
|
|
5
|
+
Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
|
|
8
6
|
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jannisborn/paperscraper
|
|
8
|
+
Project-URL: Documentation, https://jannisborn.github.io/paperscraper/
|
|
9
|
+
Project-URL: Repository, https://github.com/jannisborn/paperscraper
|
|
9
10
|
Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
|
|
10
11
|
Classifier: Development Status :: 3 - Alpha
|
|
11
12
|
Classifier: Intended Audience :: Developers
|
|
12
13
|
Classifier: Intended Audience :: Science/Research
|
|
13
14
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
15
|
Classifier: Programming Language :: Python :: 3
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
21
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.9
|
|
20
23
|
Description-Content-Type: text/markdown
|
|
21
24
|
License-File: LICENSE
|
|
22
|
-
Requires-Dist: arxiv>=1.4.
|
|
25
|
+
Requires-Dist: arxiv>=1.4.7
|
|
23
26
|
Requires-Dist: pymed-paperscraper>=1.0.4
|
|
24
|
-
Requires-Dist: pandas
|
|
25
|
-
Requires-Dist: requests
|
|
26
|
-
Requires-Dist: tqdm
|
|
27
|
+
Requires-Dist: pandas>=1.0.4
|
|
28
|
+
Requires-Dist: requests>=2.32.2
|
|
29
|
+
Requires-Dist: tqdm>=4.51.0
|
|
27
30
|
Requires-Dist: scholarly>=1.0.0
|
|
28
|
-
Requires-Dist: seaborn
|
|
29
|
-
Requires-Dist: matplotlib
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist: bs4
|
|
32
|
-
Requires-Dist: impact-factor>=1.1.
|
|
33
|
-
Requires-Dist: thefuzz
|
|
31
|
+
Requires-Dist: seaborn>=0.11.0
|
|
32
|
+
Requires-Dist: matplotlib>=3.3.2
|
|
33
|
+
Requires-Dist: matplotlib-venn>=0.11.5
|
|
34
|
+
Requires-Dist: bs4>=0.0.1
|
|
35
|
+
Requires-Dist: impact-factor>=1.1.3
|
|
36
|
+
Requires-Dist: thefuzz>=0.20.0
|
|
34
37
|
Requires-Dist: pytest
|
|
35
38
|
Requires-Dist: tldextract
|
|
36
|
-
Requires-Dist: semanticscholar
|
|
39
|
+
Requires-Dist: semanticscholar>=0.8.4
|
|
37
40
|
Requires-Dist: pydantic
|
|
38
41
|
Requires-Dist: unidecode
|
|
39
42
|
Requires-Dist: dotenv
|
|
40
43
|
Requires-Dist: boto3
|
|
41
|
-
Dynamic: author
|
|
42
|
-
Dynamic: author-email
|
|
43
|
-
Dynamic: classifier
|
|
44
|
-
Dynamic: description
|
|
45
|
-
Dynamic: description-content-type
|
|
46
|
-
Dynamic: home-page
|
|
47
|
-
Dynamic: keywords
|
|
48
|
-
Dynamic: license
|
|
49
44
|
Dynamic: license-file
|
|
50
|
-
Dynamic: requires-dist
|
|
51
|
-
Dynamic: summary
|
|
52
45
|
|
|
53
46
|
[](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
|
|
54
47
|
[](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
|
|
@@ -91,6 +84,18 @@ pip install paperscraper
|
|
|
91
84
|
|
|
92
85
|
This is enough to query PubMed, arXiv or Google Scholar.
|
|
93
86
|
|
|
87
|
+
### Local development
|
|
88
|
+
|
|
89
|
+
```console
|
|
90
|
+
uv sync
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
|
|
94
|
+
|
|
95
|
+
```console
|
|
96
|
+
uv run python -c "import paperscraper"
|
|
97
|
+
```
|
|
98
|
+
|
|
94
99
|
#### Download X-rxiv Dumps
|
|
95
100
|
|
|
96
101
|
However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
|
|
@@ -280,6 +285,13 @@ doi = '10.1021/acs.jcim.3c00132'
|
|
|
280
285
|
get_citations_by_doi(doi)
|
|
281
286
|
```
|
|
282
287
|
|
|
288
|
+
NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial) which is bandwidth-limited. If you have an API Key set it via:
|
|
289
|
+
```sh
|
|
290
|
+
export SS_API_KEY=YOUR_API_KEY
|
|
291
|
+
```
|
|
292
|
+
This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
|
|
293
|
+
|
|
294
|
+
|
|
283
295
|
### Journal impact factor
|
|
284
296
|
|
|
285
297
|
You can also retrieve the impact factor for all journals:
|
|
@@ -39,6 +39,18 @@ pip install paperscraper
|
|
|
39
39
|
|
|
40
40
|
This is enough to query PubMed, arXiv or Google Scholar.
|
|
41
41
|
|
|
42
|
+
### Local development
|
|
43
|
+
|
|
44
|
+
```console
|
|
45
|
+
uv sync
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
|
|
49
|
+
|
|
50
|
+
```console
|
|
51
|
+
uv run python -c "import paperscraper"
|
|
52
|
+
```
|
|
53
|
+
|
|
42
54
|
#### Download X-rxiv Dumps
|
|
43
55
|
|
|
44
56
|
However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
|
|
@@ -228,6 +240,13 @@ doi = '10.1021/acs.jcim.3c00132'
|
|
|
228
240
|
get_citations_by_doi(doi)
|
|
229
241
|
```
|
|
230
242
|
|
|
243
|
+
NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial) which is bandwidth-limited. If you have an API Key set it via:
|
|
244
|
+
```sh
|
|
245
|
+
export SS_API_KEY=YOUR_API_KEY
|
|
246
|
+
```
|
|
247
|
+
This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
|
|
248
|
+
|
|
249
|
+
|
|
231
250
|
### Journal impact factor
|
|
232
251
|
|
|
233
252
|
You can also retrieve the impact factor for all journals:
|
|
@@ -6,17 +6,16 @@ from typing import Dict, List, Literal, Union
|
|
|
6
6
|
|
|
7
7
|
import arxiv
|
|
8
8
|
import pandas as pd
|
|
9
|
-
import pkg_resources
|
|
10
9
|
from tqdm import tqdm
|
|
11
10
|
|
|
12
|
-
from ..utils import dump_papers
|
|
11
|
+
from ..utils import dump_papers, get_server_dumps_dir
|
|
13
12
|
from ..xrxiv.xrxiv_query import XRXivQuery
|
|
14
13
|
from .utils import get_query_from_keywords, infer_backend
|
|
15
14
|
|
|
16
15
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
18
17
|
|
|
19
|
-
dump_root =
|
|
18
|
+
dump_root = get_server_dumps_dir()
|
|
20
19
|
|
|
21
20
|
global ARXIV_QUERIER
|
|
22
21
|
ARXIV_QUERIER = None
|
|
@@ -3,7 +3,7 @@ import os
|
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from typing import List, Union
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
from ..utils import get_server_dumps_dir
|
|
7
7
|
|
|
8
8
|
finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
|
|
9
9
|
finalize_conjunction = lambda x: x[:-5]
|
|
@@ -59,6 +59,6 @@ def get_query_from_keywords(
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
def infer_backend():
|
|
62
|
-
dump_root =
|
|
62
|
+
dump_root = get_server_dumps_dir()
|
|
63
63
|
dump_paths = glob.glob(os.path.join(dump_root, "arxiv" + "*"))
|
|
64
64
|
return "api" if not dump_paths else "local"
|
|
@@ -49,14 +49,20 @@ def optional_async(
|
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def retry_with_exponential_backoff(
|
|
52
|
-
*,
|
|
52
|
+
*,
|
|
53
|
+
max_retries: int = 5,
|
|
54
|
+
base_delay: float = 1.0,
|
|
55
|
+
factor: float = 1.3,
|
|
56
|
+
constant_delay: float = 0.2,
|
|
53
57
|
) -> Callable[[F], F]:
|
|
54
58
|
"""
|
|
55
59
|
Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
|
|
56
60
|
|
|
57
61
|
Args:
|
|
58
62
|
max_retries: how many times to retry before giving up.
|
|
59
|
-
base_delay: initial delay in seconds; next delays will be
|
|
63
|
+
base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
|
|
64
|
+
factor: multiplier for delay after each retry.
|
|
65
|
+
constant_delay: fixed delay before each attempt.
|
|
60
66
|
|
|
61
67
|
Usage:
|
|
62
68
|
|
|
@@ -70,18 +76,39 @@ def retry_with_exponential_backoff(
|
|
|
70
76
|
@wraps(func)
|
|
71
77
|
async def wrapper(*args, **kwargs) -> Any:
|
|
72
78
|
delay = base_delay
|
|
73
|
-
|
|
79
|
+
last_exception: BaseException | None = None
|
|
80
|
+
for attempt in range(1, max_retries + 1):
|
|
81
|
+
await asyncio.sleep(constant_delay)
|
|
74
82
|
try:
|
|
75
83
|
return await func(*args, **kwargs)
|
|
76
84
|
except httpx.HTTPStatusError as e:
|
|
77
|
-
# only retry on 429
|
|
78
85
|
status = e.response.status_code if e.response is not None else None
|
|
79
|
-
if status != 429
|
|
86
|
+
if status != 429:
|
|
80
87
|
raise
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
88
|
+
last_exception = e
|
|
89
|
+
sleep_for = delay
|
|
90
|
+
if e.response is not None:
|
|
91
|
+
ra = e.response.headers.get("Retry-After")
|
|
92
|
+
if ra is not None:
|
|
93
|
+
try:
|
|
94
|
+
sleep_for = float(ra)
|
|
95
|
+
except ValueError:
|
|
96
|
+
pass
|
|
97
|
+
delay *= factor
|
|
98
|
+
|
|
99
|
+
except httpx.ReadError as e:
|
|
100
|
+
last_exception = e
|
|
101
|
+
sleep_for = delay
|
|
102
|
+
delay *= factor
|
|
103
|
+
|
|
104
|
+
if attempt == max_retries:
|
|
105
|
+
msg = (
|
|
106
|
+
f"{func.__name__} failed after {attempt} attempts with "
|
|
107
|
+
f"last delay {sleep_for:.2f}s"
|
|
108
|
+
)
|
|
109
|
+
raise RuntimeError(msg) from last_exception
|
|
110
|
+
|
|
111
|
+
await asyncio.sleep(sleep_for)
|
|
85
112
|
|
|
86
113
|
return wrapper
|
|
87
114
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import os
|
|
2
3
|
import sys
|
|
3
4
|
from time import sleep
|
|
4
5
|
|
|
@@ -7,7 +8,7 @@ from semanticscholar import SemanticScholar, SemanticScholarException
|
|
|
7
8
|
|
|
8
9
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
9
10
|
logger = logging.getLogger(__name__)
|
|
10
|
-
sch = SemanticScholar()
|
|
11
|
+
sch = SemanticScholar(api_key=os.getenv("SS_API_KEY"))
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def get_citations_by_doi(doi: str) -> int:
|
|
@@ -5,14 +5,15 @@ from pydantic import BaseModel
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class EntityResult(BaseModel):
|
|
8
|
-
num_citations: int
|
|
9
|
-
num_references: int
|
|
10
|
-
# keys are authors or papers and values are absolute self links
|
|
11
|
-
self_citations: Dict[str, int] = {}
|
|
12
|
-
self_references: Dict[str, int] = {}
|
|
13
8
|
# aggregated results
|
|
14
9
|
self_citation_ratio: float = 0
|
|
15
10
|
self_reference_ratio: float = 0
|
|
11
|
+
# total number of author citations/references
|
|
12
|
+
num_citations: int
|
|
13
|
+
num_references: int
|
|
14
|
+
# keys are papers and values are percentage of self citations/references
|
|
15
|
+
self_citations: Dict[str, float] = {}
|
|
16
|
+
self_references: Dict[str, float] = {}
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
class Entity:
|
|
@@ -68,14 +68,14 @@ class Paper(Entity):
|
|
|
68
68
|
Extracts the self references of a paper, for each author.
|
|
69
69
|
"""
|
|
70
70
|
if isinstance(self.doi, str):
|
|
71
|
-
self.
|
|
71
|
+
self.self_ref: ReferenceResult = self_references_paper(self.doi)
|
|
72
72
|
|
|
73
73
|
def self_citations(self):
|
|
74
74
|
"""
|
|
75
75
|
Extracts the self citations of a paper, for each author.
|
|
76
76
|
"""
|
|
77
77
|
if isinstance(self.doi, str):
|
|
78
|
-
self.
|
|
78
|
+
self.self_cite: CitationResult = self_citations_paper(self.doi)
|
|
79
79
|
|
|
80
80
|
def get_result(self) -> Optional[PaperResult]:
|
|
81
81
|
"""
|
|
@@ -83,18 +83,20 @@ class Paper(Entity):
|
|
|
83
83
|
|
|
84
84
|
Returns: PaperResult if available.
|
|
85
85
|
"""
|
|
86
|
-
if not hasattr(self, "
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
)
|
|
90
|
-
return
|
|
91
|
-
elif not hasattr(self, "citation_result"):
|
|
92
|
-
logger.warning(
|
|
93
|
-
f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
|
|
94
|
-
)
|
|
95
|
-
return
|
|
96
|
-
ref_result = self.ref_result.model_dump()
|
|
97
|
-
ref_result.pop("ssid", None)
|
|
86
|
+
if not hasattr(self, "self_ref"):
|
|
87
|
+
self.self_references()
|
|
88
|
+
if not hasattr(self, "self_cite"):
|
|
89
|
+
self.self_citations()
|
|
98
90
|
return PaperResult(
|
|
99
|
-
title=self.title,
|
|
91
|
+
title=self.title,
|
|
92
|
+
**{
|
|
93
|
+
k: v
|
|
94
|
+
for k, v in self.self_ref.model_dump().items()
|
|
95
|
+
if k not in ["ssid", "title"]
|
|
96
|
+
},
|
|
97
|
+
**{
|
|
98
|
+
k: v
|
|
99
|
+
for k, v in self.self_cite.model_dump().items()
|
|
100
|
+
if k not in ["title"]
|
|
101
|
+
},
|
|
100
102
|
)
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, List, Literal, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
from semanticscholar import SemanticScholar
|
|
6
|
+
|
|
7
|
+
from ..orcid import orcid_to_author_name
|
|
8
|
+
from ..self_citations import CitationResult, self_citations_paper
|
|
9
|
+
from ..self_references import ReferenceResult, self_references_paper
|
|
10
|
+
from ..utils import author_name_to_ssaid, get_papers_for_author
|
|
11
|
+
from .core import Entity, EntityResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ResearcherResult(EntityResult):
|
|
15
|
+
name: str
|
|
16
|
+
ssaid: int
|
|
17
|
+
orcid: Optional[str] = None
|
|
18
|
+
|
|
19
|
+
def _ordered_items(self) -> List[Tuple[str, Any]]:
|
|
20
|
+
# enforce specific ordering
|
|
21
|
+
return [
|
|
22
|
+
("name", self.name),
|
|
23
|
+
("self_reference_ratio", self.self_reference_ratio),
|
|
24
|
+
("self_citation_ratio", self.self_citation_ratio),
|
|
25
|
+
("num_references", self.num_references),
|
|
26
|
+
("num_citations", self.num_citations),
|
|
27
|
+
("self_references", self.self_references),
|
|
28
|
+
("self_citations", self.self_citations),
|
|
29
|
+
("ssaid", self.ssaid),
|
|
30
|
+
("orcid", self.orcid),
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
def __repr__(self) -> str:
|
|
34
|
+
inner = ", ".join(f"{k}={v!r}" for k, v in self._ordered_items())
|
|
35
|
+
return f"{self.__class__.__name__}({inner})"
|
|
36
|
+
|
|
37
|
+
def __str__(self) -> str:
|
|
38
|
+
return " ".join(f"{k}={v!r}" for k, v in self._ordered_items())
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
|
|
42
|
+
|
|
43
|
+
sch = SemanticScholar(api_key=os.getenv("SS_API_KEY"))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Researcher(Entity):
|
|
47
|
+
name: str
|
|
48
|
+
ssaid: int
|
|
49
|
+
orcid: Optional[str] = None
|
|
50
|
+
ssids: List[int] = []
|
|
51
|
+
|
|
52
|
+
def __init__(self, input: str, mode: ModeType = "infer"):
|
|
53
|
+
"""
|
|
54
|
+
Construct researcher object for self citation/reference analysis.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
|
|
58
|
+
mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
|
|
59
|
+
Defaults to "infer".
|
|
60
|
+
|
|
61
|
+
Raises:
|
|
62
|
+
ValueError: Unknown mode
|
|
63
|
+
"""
|
|
64
|
+
if mode not in MODES:
|
|
65
|
+
raise ValueError(f"Unknown mode {mode} chose from {MODES}.")
|
|
66
|
+
|
|
67
|
+
input = input.strip()
|
|
68
|
+
if mode == "infer":
|
|
69
|
+
if input.isdigit():
|
|
70
|
+
mode = "ssaid"
|
|
71
|
+
elif (
|
|
72
|
+
input.count("-") == 3
|
|
73
|
+
and len(input) == 19
|
|
74
|
+
and all([x.isdigit() for x in input.split("-")])
|
|
75
|
+
):
|
|
76
|
+
mode = "orcid"
|
|
77
|
+
else:
|
|
78
|
+
mode = "name"
|
|
79
|
+
if mode == "ssaid":
|
|
80
|
+
self.name = sch.get_author(input)._name
|
|
81
|
+
self.ssaid = input
|
|
82
|
+
elif mode == "orcid":
|
|
83
|
+
orcid_name = orcid_to_author_name(input)
|
|
84
|
+
self.orcid = input
|
|
85
|
+
self.ssaid, self.name = author_name_to_ssaid(orcid_name)
|
|
86
|
+
elif mode == "name":
|
|
87
|
+
self.name = input
|
|
88
|
+
self.ssaid, self.name = author_name_to_ssaid(input)
|
|
89
|
+
|
|
90
|
+
self.result = ResearcherResult(
|
|
91
|
+
name=self.name,
|
|
92
|
+
ssaid=int(self.ssaid),
|
|
93
|
+
orcid=self.orcid,
|
|
94
|
+
num_citations=-1,
|
|
95
|
+
num_references=-1,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
async def _self_references_async(
|
|
99
|
+
self, verbose: bool = False
|
|
100
|
+
) -> List[ReferenceResult]:
|
|
101
|
+
"""Async version of self_references."""
|
|
102
|
+
if self.ssaid == "-1":
|
|
103
|
+
return []
|
|
104
|
+
if self.ssids == []:
|
|
105
|
+
self.ssids = await get_papers_for_author(self.ssaid)
|
|
106
|
+
|
|
107
|
+
results: List[ReferenceResult] = await self_references_paper(
|
|
108
|
+
self.ssids, verbose=verbose
|
|
109
|
+
)
|
|
110
|
+
# Remove papers with zero references or that are erratum/corrigendum
|
|
111
|
+
results = [
|
|
112
|
+
r
|
|
113
|
+
for r in results
|
|
114
|
+
if r.num_references > 0
|
|
115
|
+
and "erratum" not in r.title.lower()
|
|
116
|
+
and "corrigendum" not in r.title.lower()
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
return results
|
|
120
|
+
|
|
121
|
+
def self_references(self, verbose: bool = False) -> ResearcherResult:
|
|
122
|
+
"""
|
|
123
|
+
Sifts through all papers of a researcher and extracts the self references.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
verbose: If True, logs detailed information for each paper.
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
A ResearcherResult containing aggregated self-reference data.
|
|
130
|
+
"""
|
|
131
|
+
reference_results = asyncio.run(self._self_references_async(verbose=verbose))
|
|
132
|
+
|
|
133
|
+
individual_self_references = {
|
|
134
|
+
getattr(result, "title"): getattr(result, "self_references").get(
|
|
135
|
+
self.name, 0.0
|
|
136
|
+
)
|
|
137
|
+
for result in reference_results
|
|
138
|
+
}
|
|
139
|
+
reference_ratio = sum(individual_self_references.values()) / max(
|
|
140
|
+
1, len(individual_self_references)
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
self.result = self.result.model_copy(
|
|
144
|
+
update={
|
|
145
|
+
"num_references": sum(r.num_references for r in reference_results),
|
|
146
|
+
"self_references": dict(
|
|
147
|
+
sorted(
|
|
148
|
+
individual_self_references.items(),
|
|
149
|
+
key=lambda x: x[1],
|
|
150
|
+
reverse=True,
|
|
151
|
+
)
|
|
152
|
+
),
|
|
153
|
+
"self_reference_ratio": round(reference_ratio, 3),
|
|
154
|
+
}
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
return self.result
|
|
158
|
+
|
|
159
|
+
async def _self_citations_async(
|
|
160
|
+
self, verbose: bool = False
|
|
161
|
+
) -> List[CitationResult]:
|
|
162
|
+
"""Async version of self_citations."""
|
|
163
|
+
if self.ssaid == "-1":
|
|
164
|
+
return []
|
|
165
|
+
if self.ssids == []:
|
|
166
|
+
self.ssids = await get_papers_for_author(self.ssaid)
|
|
167
|
+
|
|
168
|
+
results: List[CitationResult] = await self_citations_paper(
|
|
169
|
+
self.ssids, verbose=verbose
|
|
170
|
+
)
|
|
171
|
+
# Remove papers with zero references or that are erratum/corrigendum
|
|
172
|
+
results = [
|
|
173
|
+
r
|
|
174
|
+
for r in results
|
|
175
|
+
if r.num_citations > 0
|
|
176
|
+
and "erratum" not in r.title.lower()
|
|
177
|
+
and "corrigendum" not in r.title.lower()
|
|
178
|
+
]
|
|
179
|
+
|
|
180
|
+
return results
|
|
181
|
+
|
|
182
|
+
def self_citations(self, verbose: bool = False) -> ResearcherResult:
|
|
183
|
+
"""
|
|
184
|
+
Sifts through all papers of a researcher and finds how often they are self-cited.
|
|
185
|
+
"""
|
|
186
|
+
citation_results = asyncio.run(self._self_citations_async(verbose=verbose))
|
|
187
|
+
individual_self_citations = {
|
|
188
|
+
getattr(result, "title"): getattr(result, "self_citations").get(
|
|
189
|
+
self.name, 0.0
|
|
190
|
+
)
|
|
191
|
+
for result in citation_results
|
|
192
|
+
}
|
|
193
|
+
citation_ratio = sum(individual_self_citations.values()) / max(
|
|
194
|
+
1, len(individual_self_citations)
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
self.result = self.result.model_copy(
|
|
198
|
+
update={
|
|
199
|
+
"num_citations": sum(r.num_citations for r in citation_results),
|
|
200
|
+
"self_citations": dict(
|
|
201
|
+
sorted(
|
|
202
|
+
individual_self_citations.items(),
|
|
203
|
+
key=lambda x: x[1],
|
|
204
|
+
reverse=True,
|
|
205
|
+
)
|
|
206
|
+
),
|
|
207
|
+
"self_citation_ratio": round(citation_ratio, 3),
|
|
208
|
+
}
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
return self.result
|
|
212
|
+
|
|
213
|
+
def get_result(self) -> ResearcherResult:
|
|
214
|
+
"""
|
|
215
|
+
Provides the result of the analysis.
|
|
216
|
+
"""
|
|
217
|
+
if not hasattr(self, "self_ref"):
|
|
218
|
+
self.self_references()
|
|
219
|
+
if not hasattr(self, "self_cite"):
|
|
220
|
+
self.self_citations()
|
|
221
|
+
return self.result
|
|
@@ -18,11 +18,13 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
|
18
18
|
|
|
19
19
|
class CitationResult(BaseModel):
|
|
20
20
|
ssid: str # semantic scholar paper id
|
|
21
|
+
title: str
|
|
21
22
|
num_citations: int
|
|
22
23
|
self_citations: Dict[str, float] = {}
|
|
23
24
|
citation_score: float
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
@retry_with_exponential_backoff(max_retries=14, base_delay=1.0)
|
|
26
28
|
async def _fetch_citation_data(
|
|
27
29
|
client: httpx.AsyncClient, suffix: str
|
|
28
30
|
) -> Dict[str, Any]:
|
|
@@ -87,6 +89,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
|
|
|
87
89
|
|
|
88
90
|
return CitationResult(
|
|
89
91
|
ssid=identifier,
|
|
92
|
+
title=paper.get("title", ""),
|
|
90
93
|
num_citations=total_cites,
|
|
91
94
|
self_citations=ratios,
|
|
92
95
|
citation_score=avg_score,
|
|
@@ -94,7 +97,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
|
|
|
94
97
|
|
|
95
98
|
|
|
96
99
|
@optional_async
|
|
97
|
-
@retry_with_exponential_backoff(max_retries=
|
|
100
|
+
@retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
|
|
98
101
|
async def self_citations_paper(
|
|
99
102
|
inputs: Union[str, List[str]], verbose: bool = False
|
|
100
103
|
) -> Union[CitationResult, List[CitationResult]]:
|
|
@@ -118,7 +121,7 @@ async def self_citations_paper(
|
|
|
118
121
|
if verbose:
|
|
119
122
|
for res in results:
|
|
120
123
|
logger.info(
|
|
121
|
-
f'Self-citations in "{res.
|
|
124
|
+
f'Self-citations in "{res.title}": N={res.num_citations}, Score={res.citation_score}%'
|
|
122
125
|
)
|
|
123
126
|
for author, pct in res.self_citations.items():
|
|
124
127
|
logger.info(f" {author}: {pct}%")
|