paperscraper 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paperscraper-0.3.0 → paperscraper-0.3.1}/PKG-INFO +11 -3
- {paperscraper-0.3.0 → paperscraper-0.3.1}/README.md +6 -1
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/__init__.py +2 -2
- paperscraper-0.3.1/paperscraper/async_utils.py +88 -0
- paperscraper-0.3.1/paperscraper/citations/__init__.py +4 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/citations/core.py +4 -8
- paperscraper-0.3.1/paperscraper/citations/entity/__init__.py +2 -0
- paperscraper-0.3.1/paperscraper/citations/entity/paper.py +100 -0
- paperscraper-0.3.1/paperscraper/citations/entity/researcher.py +90 -0
- paperscraper-0.3.1/paperscraper/citations/orcid.py +29 -0
- paperscraper-0.3.1/paperscraper/citations/self_citations.py +126 -0
- paperscraper-0.3.1/paperscraper/citations/self_references.py +134 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/citations/tests/test_citations.py +2 -3
- paperscraper-0.3.1/paperscraper/citations/tests/test_paper.py +52 -0
- paperscraper-0.3.1/paperscraper/citations/tests/test_self_citations.py +72 -0
- paperscraper-0.3.1/paperscraper/citations/tests/test_self_references.py +79 -0
- paperscraper-0.3.1/paperscraper/citations/utils.py +241 -0
- paperscraper-0.3.1/paperscraper/pdf/__init__.py +1 -0
- paperscraper-0.3.0/paperscraper/pdf.py → paperscraper-0.3.1/paperscraper/pdf/fallbacks.py +181 -242
- paperscraper-0.3.1/paperscraper/pdf/pdf.py +250 -0
- paperscraper-0.3.1/paperscraper/pdf/utils.py +33 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/tests/test_dump.py +19 -19
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/tests/test_pdf.py +137 -27
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/utils.py +0 -17
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/xrxiv/tests/test_xrxiv.py +3 -3
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper.egg-info/PKG-INFO +11 -3
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper.egg-info/SOURCES.txt +8 -1
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper.egg-info/requires.txt +4 -1
- {paperscraper-0.3.0 → paperscraper-0.3.1}/setup.py +4 -1
- paperscraper-0.3.0/paperscraper/citations/__init__.py +0 -3
- paperscraper-0.3.0/paperscraper/citations/entity/__init__.py +0 -2
- paperscraper-0.3.0/paperscraper/citations/entity/paper.py +0 -39
- paperscraper-0.3.0/paperscraper/citations/entity/researcher.py +0 -48
- paperscraper-0.3.0/paperscraper/citations/self_citations.py +0 -0
- paperscraper-0.3.0/paperscraper/citations/self_references.py +0 -170
- paperscraper-0.3.0/paperscraper/citations/tests/test_self_references.py +0 -80
- paperscraper-0.3.0/paperscraper/citations/utils.py +0 -23
- {paperscraper-0.3.0 → paperscraper-0.3.1}/LICENSE +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/arxiv/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/arxiv/arxiv.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/arxiv/utils.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/citations/citations.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/citations/entity/core.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/citations/tests/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/arxiv.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/biorxiv.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/chemrxiv.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/medrxiv.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/utils/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/get_dumps/utils/chemrxiv/utils.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/impact.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/load_dumps.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/plotting.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/postprocessing.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/pubmed/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/pubmed/pubmed.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/pubmed/tests/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/pubmed/utils.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/scholar/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/scholar/core.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/scholar/scholar.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/scholar/tests/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/scholar/tests/test_scholar.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/server_dumps/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/tests/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/tests/test_impactor.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/xrxiv/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/xrxiv/tests/__init__.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/xrxiv/xrxiv_api.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper/xrxiv/xrxiv_query.py +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper.egg-info/dependency_links.txt +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper.egg-info/not-zip-safe +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/paperscraper.egg-info/top_level.txt +0 -0
- {paperscraper-0.3.0 → paperscraper-0.3.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paperscraper
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: paperscraper: Package to scrape papers.
|
|
5
5
|
Home-page: https://github.com/jannisborn/paperscraper
|
|
6
6
|
Author: Jannis Born, Matteo Manica
|
|
@@ -20,7 +20,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: arxiv>=1.4.2
|
|
23
|
-
Requires-Dist: pymed-paperscraper>=1.0.
|
|
23
|
+
Requires-Dist: pymed-paperscraper>=1.0.4
|
|
24
24
|
Requires-Dist: pandas
|
|
25
25
|
Requires-Dist: requests
|
|
26
26
|
Requires-Dist: tqdm
|
|
@@ -35,6 +35,9 @@ Requires-Dist: pytest
|
|
|
35
35
|
Requires-Dist: tldextract
|
|
36
36
|
Requires-Dist: semanticscholar
|
|
37
37
|
Requires-Dist: pydantic
|
|
38
|
+
Requires-Dist: unidecode
|
|
39
|
+
Requires-Dist: dotenv
|
|
40
|
+
Requires-Dist: boto3
|
|
38
41
|
Dynamic: author
|
|
39
42
|
Dynamic: author-email
|
|
40
43
|
Dynamic: classifier
|
|
@@ -86,7 +89,7 @@ and plotting routines for meta-analysis.
|
|
|
86
89
|
pip install paperscraper
|
|
87
90
|
```
|
|
88
91
|
|
|
89
|
-
This is enough to query
|
|
92
|
+
This is enough to query PubMed, arXiv or Google Scholar.
|
|
90
93
|
|
|
91
94
|
#### Download X-rxiv Dumps
|
|
92
95
|
|
|
@@ -230,6 +233,7 @@ For more comprehensive access to papers from major publishers, you can provide A
|
|
|
230
233
|
|
|
231
234
|
- **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
|
|
232
235
|
- **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
|
|
236
|
+
- **bioRxiv TDM API**: Enables access to [bioRxiv](https://www.biorxiv.org/tdm) publications (since May 2025, bioRxiv is protected by Cloudflare).
|
|
233
237
|
|
|
234
238
|
To use publisher APIs:
|
|
235
239
|
|
|
@@ -237,7 +241,11 @@ To use publisher APIs:
|
|
|
237
241
|
```
|
|
238
242
|
WILEY_TDM_API_TOKEN=your_wiley_token_here
|
|
239
243
|
ELSEVIER_TDM_API_KEY=your_elsevier_key_here
|
|
244
|
+
AWS_ACCESS_KEY_ID=your_aws_access_key_here
|
|
245
|
+
AWS_SECRET_ACCESS_KEY=your_aws_secret_key_here
|
|
240
246
|
```
|
|
247
|
+
NOTE: The AWS keys can be created in your AWS/IAM account. When creating the key, make sure you tick the `AmazonS3ReadOnlyAccess` permission policy.
|
|
248
|
+
NOTE: If you name the file `.env`, it will be loaded automatically, provided it is located in the current working directory or in any parent directory up to your home directory.
|
|
241
249
|
|
|
242
250
|
2. Pass the file path when calling retrieval functions:
|
|
243
251
|
|
|
@@ -37,7 +37,7 @@ and plotting routines for meta-analysis.
|
|
|
37
37
|
pip install paperscraper
|
|
38
38
|
```
|
|
39
39
|
|
|
40
|
-
This is enough to query
|
|
40
|
+
This is enough to query PubMed, arXiv or Google Scholar.
|
|
41
41
|
|
|
42
42
|
#### Download X-rxiv Dumps
|
|
43
43
|
|
|
@@ -181,6 +181,7 @@ For more comprehensive access to papers from major publishers, you can provide A
|
|
|
181
181
|
|
|
182
182
|
- **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
|
|
183
183
|
- **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
|
|
184
|
+
- **bioRxiv TDM API**: Enables access to [bioRxiv](https://www.biorxiv.org/tdm) publications (since May 2025, bioRxiv is protected by Cloudflare).
|
|
184
185
|
|
|
185
186
|
To use publisher APIs:
|
|
186
187
|
|
|
@@ -188,7 +189,11 @@ To use publisher APIs:
|
|
|
188
189
|
```
|
|
189
190
|
WILEY_TDM_API_TOKEN=your_wiley_token_here
|
|
190
191
|
ELSEVIER_TDM_API_KEY=your_elsevier_key_here
|
|
192
|
+
AWS_ACCESS_KEY_ID=your_aws_access_key_here
|
|
193
|
+
AWS_SECRET_ACCESS_KEY=your_aws_secret_key_here
|
|
191
194
|
```
|
|
195
|
+
NOTE: The AWS keys can be created in your AWS/IAM account. When creating the key, make sure you tick the `AmazonS3ReadOnlyAccess` permission policy.
|
|
196
|
+
NOTE: If you name the file `.env`, it will be loaded automatically, provided it is located in the current working directory or in any parent directory up to your home directory.
|
|
192
197
|
|
|
193
198
|
2. Pass the file path when calling retrieval functions:
|
|
194
199
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Initialize the module."""
|
|
2
2
|
|
|
3
3
|
__name__ = "paperscraper"
|
|
4
|
-
__version__ = "0.3.
|
|
4
|
+
__version__ = "0.3.1"
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
import os
|
|
@@ -36,7 +36,7 @@ def dump_queries(keywords: List[List[Union[str, List[str]]]], dump_root: str) ->
|
|
|
36
36
|
|
|
37
37
|
for idx, keyword in enumerate(keywords):
|
|
38
38
|
for db, f in QUERY_FN_DICT.items():
|
|
39
|
-
logger.info(f" Keyword {idx+1}/{len(keywords)}, DB: {db}")
|
|
39
|
+
logger.info(f" Keyword {idx + 1}/{len(keywords)}, DB: {db}")
|
|
40
40
|
filename = get_filename_from_query(keyword)
|
|
41
41
|
os.makedirs(os.path.join(dump_root, db), exist_ok=True)
|
|
42
42
|
f(keyword, output_filepath=os.path.join(dump_root, db, filename))
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
import threading
|
|
5
|
+
from functools import wraps
|
|
6
|
+
from typing import Any, Awaitable, Callable, TypeVar, Union
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
13
|
+
|
|
14
|
+
T = TypeVar("T")
|
|
15
|
+
F = TypeVar("F", bound=Callable[..., Awaitable[Any]])
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _start_bg_loop(loop: asyncio.AbstractEventLoop):
|
|
19
|
+
asyncio.set_event_loop(loop)
|
|
20
|
+
loop.run_forever()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Start one background loop in its own daemon thread
|
|
24
|
+
_background_loop = asyncio.new_event_loop()
|
|
25
|
+
threading.Thread(target=_start_bg_loop, args=(_background_loop,), daemon=True).start()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def optional_async(
    func: Callable[..., Awaitable[T]],
) -> Callable[..., Union[T, Awaitable[T]]]:
    """
    Make an `async def` function callable from both sync and async code.

    If an event loop is running in the calling thread, the coroutine is handed
    back for the caller to await. Otherwise the coroutine is submitted to the
    module's background loop and the call blocks until it has finished.

    Args:
        func: The coroutine function to wrap.

    Returns:
        A regular function returning either the final result (sync caller) or
        the un-awaited coroutine (async caller).
    """

    @wraps(func)
    def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]:
        pending = func(*args, **kwargs)
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: run on the background loop and block.
            submitted = asyncio.run_coroutine_threadsafe(pending, _background_loop)
            return submitted.result()
        # Already inside an event loop: the caller must await the coroutine.
        return pending

    return wrapper
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def retry_with_exponential_backoff(
    *, max_retries: int = 5, base_delay: float = 1.0
) -> "Callable[[F], F]":
    """
    Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.

    Only `httpx.HTTPStatusError` with status code 429 triggers a retry; any other
    exception, and the 429 raised by the final attempt, is propagated unchanged.

    Args:
        max_retries: Maximum number of attempts in total, the first call included.
            Values below 1 are treated as 1 so the wrapped function always runs
            at least once.
        base_delay: Delay in seconds before the second attempt; the delay doubles
            after every further failed attempt.

    Returns:
        A decorator that wraps a coroutine function with the retry logic.

    Usage:

        @retry_with_exponential_backoff(max_retries=3, base_delay=0.5)
        async def fetch_data(...):
            ...

    """
    # Guard against `max_retries <= 0`, which would otherwise skip the call entirely
    # and make the wrapper return None without ever running `func`.
    attempts = max(1, max_retries)

    def decorator(func: "F") -> "F":
        @wraps(func)
        async def wrapper(*args, **kwargs) -> Any:
            delay = base_delay
            for attempt in range(1, attempts + 1):
                try:
                    return await func(*args, **kwargs)
                except httpx.HTTPStatusError as e:
                    # only retry on 429, and never after the last attempt
                    status = e.response.status_code if e.response is not None else None
                    if status != 429 or attempt == attempts:
                        raise
                    # backoff
                    await asyncio.sleep(delay)
                    delay *= 2

        return wrapper

    return decorator
|
|
@@ -1,15 +1,8 @@
|
|
|
1
|
-
import asyncio
|
|
2
1
|
import logging
|
|
3
|
-
import re
|
|
4
2
|
import sys
|
|
5
|
-
from typing import
|
|
3
|
+
from typing import Literal
|
|
6
4
|
|
|
7
|
-
import httpx
|
|
8
|
-
from semanticscholar import SemanticScholar
|
|
9
|
-
|
|
10
|
-
from ..utils import optional_async
|
|
11
5
|
from .entity import Paper, Researcher
|
|
12
|
-
from .utils import check_overlap, doi_pattern
|
|
13
6
|
|
|
14
7
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
15
8
|
logger = logging.getLogger(__name__)
|
|
@@ -20,6 +13,9 @@ ModeType = Literal[tuple(MODES := ("paper", "author"))]
|
|
|
20
13
|
|
|
21
14
|
class SelfLinkClient:
|
|
22
15
|
def __init__(self, entity: str, mode: ModeType = "paper") -> None:
|
|
16
|
+
self.mode = mode.lower()
|
|
17
|
+
if self.mode not in MODES:
|
|
18
|
+
raise ValueError(f"Unknown mode `{self.mode}`, chose from {MODES}")
|
|
23
19
|
if self.mode == "paper":
|
|
24
20
|
self.object = Paper(entity)
|
|
25
21
|
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from typing import List, Literal, Optional
|
|
4
|
+
|
|
5
|
+
from ..self_citations import CitationResult, self_citations_paper
|
|
6
|
+
from ..self_references import ReferenceResult, self_references_paper
|
|
7
|
+
from ..utils import (
|
|
8
|
+
determine_paper_input_type,
|
|
9
|
+
get_doi_from_ssid,
|
|
10
|
+
get_doi_from_title,
|
|
11
|
+
get_title_and_id_from_doi,
|
|
12
|
+
)
|
|
13
|
+
from .core import Entity
|
|
14
|
+
|
|
15
|
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PaperResult(ReferenceResult, CitationResult):
    """Merged reference and citation result of one paper, extended by its title."""

    title: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
ModeType = Literal[tuple(MODES := ("doi", "title", "ss_id", "infer"))]
|
|
24
|
+
|
|
25
|
+
BASE_URL: str = "https://api.semanticscholar.org/graph/v1/paper/search"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Paper(Entity):
    """
    A single paper for self-reference / self-citation analysis.

    The constructor stores the stripped identifier in `input`, resolves it to a
    DOI (`doi`) and, if the DOI lookup returns data, sets `title` and `ssid`.
    """

    title: str = ""
    doi: str = ""
    authors: List[str] = []

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Set up a Paper object for analysis.

        Args:
            input: Paper identifier. This can be the title, DOI or semantic scholar ID
                of the paper.
            mode: The format in which the ID was provided. Defaults to "infer".

        Raises:
            ValueError: If unknown mode is given.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        self.input = input
        if mode == "infer":
            mode = determine_paper_input_type(input)

        if mode == "doi":
            self.doi = input
        elif mode == "title":
            self.doi = get_doi_from_title(input)
        elif mode in ("ss_id", "ssid"):
            # `MODES` spells this mode "ss_id"; "ssid" is accepted too in case the
            # inference helper (not visible here) reports it with that spelling.
            self.doi = get_doi_from_ssid(input)

        if self.doi is not None:
            out = get_title_and_id_from_doi(self.doi)
            if out is not None:
                self.title = out["title"]
                self.ssid = out["ssid"]

    def self_references(self):
        """
        Extracts the self references of a paper, for each author.

        The outcome is stored in `self.ref_result`; nothing happens if no DOI
        string is available.
        """
        if isinstance(self.doi, str):
            self.ref_result: ReferenceResult = self_references_paper(self.doi)

    def self_citations(self):
        """
        Extracts the self citations of a paper, for each author.

        The outcome is stored in `self.citation_result`; nothing happens if no
        DOI string is available.
        """
        if isinstance(self.doi, str):
            self.citation_result: CitationResult = self_citations_paper(self.doi)

    def get_result(self) -> Optional[PaperResult]:
        """
        Provides the result of the analysis.

        Returns: PaperResult if both `.self_references` and `.self_citations`
            have produced a result, otherwise None (a warning is logged).
        """
        if not hasattr(self, "ref_result"):
            logger.warning(
                f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
            )
            return
        elif not hasattr(self, "citation_result"):
            logger.warning(
                f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
            )
            return
        ref_result = self.ref_result.model_dump()
        # Both partial results carry an `ssid`; drop one so the keyword is not passed twice.
        ref_result.pop("ssid", None)
        return PaperResult(
            title=self.title, **ref_result, **self.citation_result.model_dump()
        )
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from typing import List, Literal, Optional
|
|
2
|
+
|
|
3
|
+
from semanticscholar import SemanticScholar
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
from ..orcid import orcid_to_author_name
|
|
7
|
+
from ..self_references import ReferenceResult
|
|
8
|
+
from ..utils import author_name_to_ssaid, get_papers_for_author
|
|
9
|
+
from .core import Entity, EntityResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ResearcherResult(EntityResult):
    """Result container for the analysis of a single researcher."""

    name: str
    ssid: int
    orcid: Optional[str] = None
    # TODO: the ratios will be averaged across all papers for that author
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
|
|
20
|
+
|
|
21
|
+
sch = SemanticScholar()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Researcher(Entity):
    """
    A researcher for self-citation / self-reference analysis.

    The constructor resolves the given identifier to a Semantic Scholar author
    ID (`ssid`) and fetches that author's papers into `ssids`.
    """

    name: str
    ssid: int
    orcid: Optional[str] = None

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Construct researcher object for self citation/reference analysis.

        Args:
            input: A researcher to search for.
            mode: This can be a `name`, `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
                Defaults to "infer".

        Raises:
            ValueError: Unknown mode, or no author name could be resolved for
                the given ORCID iD.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        if mode == "infer":
            parts = input.split("-")
            if input.isdigit():
                mode = "ssaid"
            elif (
                len(input) == 19
                and len(parts) == 4
                and all(len(part) == 4 for part in parts)
                and "".join(parts)[:-1].isdigit()
                # The final character of an ORCID iD is a checksum: a digit or "X".
                and (input[-1].isdigit() or input[-1] == "X")
            ):
                mode = "orcid"
            else:
                # Use the spelling declared in `MODES` so that inferred and
                # explicitly requested name lookups share one branch below.
                mode = "name"

        if mode == "ssaid":
            self.author = sch.get_author(input)
            self.ssid = input
        elif mode == "orcid":
            self.author = orcid_to_author_name(input)
            self.orcid = input
            if not self.author:
                raise ValueError(f"Could not resolve an author name for ORCID iD {input}.")
            # Look up the Semantic Scholar ID by the resolved name, not by the ORCID iD.
            self.ssid = author_name_to_ssaid(self.author)
        elif mode == "name":
            self.author = input
            self.ssid = author_name_to_ssaid(input)

        # TODO: Skip over erratum / corrigendum
        self.ssids = get_papers_for_author(self.ssid)

    def self_references(self):
        """
        Sifts through all papers of a researcher and extracts the self references.

        Currently only prints the collected paper IDs; the analysis is not implemented.
        """
        # TODO: Asynchronous call to self_references
        print("Going through SSIDs", self.ssids)

        # TODO: Aggregate results

    def self_citations(self):
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.

        Not implemented yet.
        """
        ...

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis.

        Not implemented yet.
        """
        ...
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
BASE_URL = "https://pub.orcid.org/v3.0/"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def orcid_to_author_name(orcid_id: str, timeout: float = 10.0) -> Optional[str]:
    """
    Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'),
    returns the full name of the author from the ORCID public API.

    Args:
        orcid_id: The ORCID iD to look up.
        timeout: Seconds to wait for the API before `requests` gives up.
            Defaults to 10.0.

    Returns:
        "<given> <family>" with surrounding whitespace stripped (possibly an
        empty string if the record exposes no name), or None if the API did not
        answer with status 200 (the error is logged).
    """

    headers = {"Accept": "application/json"}
    response = requests.get(
        f"{BASE_URL}{orcid_id}/person", headers=headers, timeout=timeout
    )
    if response.status_code == 200:
        data = response.json()
        # A key that is present with a JSON null value bypasses the default of
        # `.get(key, {})`, so fall back with `or` at every level instead.
        name = data.get("name") or {}
        given = (name.get("given-names") or {}).get("value") or ""
        family = (name.get("family-name") or {}).get("value") or ""
        full_name = f"{given} {family}".strip()
        return full_name
    logger.error(
        f"Error fetching ORCID data ({orcid_id}): {response.status_code} {response.text}"
    )
    return None
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any, Dict, List, Union
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
import numpy as np
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from ..async_utils import optional_async, retry_with_exponential_backoff
|
|
12
|
+
from .utils import DOI_PATTERN, find_matching
|
|
13
|
+
|
|
14
|
+
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CitationResult(BaseModel):
    """Self-citation statistics of a single paper."""

    ssid: str  # semantic scholar paper id
    num_citations: int
    self_citations: Dict[str, float] = {}
    citation_score: float
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def _fetch_citation_data(
    client: httpx.AsyncClient, suffix: str
) -> Dict[str, Any]:
    """
    Request title, authors and citing-paper authors for one paper from Semantic Scholar.

    Args:
        client: An active httpx.AsyncClient.
        suffix: Paper identifier as it appears in the URL path
            (e.g., "DOI:10.1000/xyz123" or an SSID).

    Returns:
        The JSON-decoded response as a dictionary.

    Raises:
        httpx.HTTPStatusError: If the response carries an error status.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{suffix}"
    query = {"fields": "title,authors,citations.authors"}
    reply = await client.get(url, params=query)
    reply.raise_for_status()
    return reply.json()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
async def _process_single(client: httpx.AsyncClient, identifier: str) -> CitationResult:
    """
    Compute self-citation stats for a single paper.

    Args:
        client: An active httpx.AsyncClient.
        identifier: A DOI or Semantic Scholar ID.

    Returns:
        A CitationResult with the number of citations, the per-author share of
        self-citations (in percent) and the mean of those shares.
    """
    # Long, lowercase, purely alphanumeric strings are passed on without a prefix;
    # only identifiers with exactly one DOI match receive the "DOI:" prefix.
    looks_like_ssid = (
        len(identifier) > 15 and identifier.isalnum() and identifier.islower()
    )
    is_doi = (
        not looks_like_ssid
        and len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1
    )
    prefix = "DOI:" if is_doi else ""

    paper = await _fetch_citation_data(client, f"{prefix}{identifier}")

    # One counter per author of the paper
    author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
    citations = paper.get("citations", [])
    total_cites = len(citations)

    # Count, per author, the citing papers that share this author
    for cite in citations:
        for name in find_matching(paper.get("authors", []), cite.get("authors", [])):
            author_counts[name] += 1

    # Turn counts into percentages of all citations
    ratios: Dict[str, float] = {}
    for name, count in author_counts.items():
        if total_cites > 0:
            ratios[name] = round((count / total_cites * 100), 2)
        else:
            ratios[name] = 0.0

    avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3)

    return CitationResult(
        ssid=identifier,
        num_citations=total_cites,
        self_citations=ratios,
        citation_score=avg_score,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@optional_async
@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
async def self_citations_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[CitationResult, List[CitationResult]]:
    """
    Analyze self-citations for one or more papers by DOI or Semantic Scholar ID.

    All papers are processed concurrently over one shared HTTP client.

    Args:
        inputs: A single DOI/SSID string or a list of them.
        verbose: If True, logs detailed information for each paper.

    Returns:
        A single CitationResult if a string was passed, else a list of CitationResults.
    """
    single_input = isinstance(inputs, str)
    identifiers = [inputs] if single_input else list(inputs)

    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single(client, ident) for ident in identifiers)
        )

    if verbose:
        for res in results:
            logger.info(
                f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
            )
            for author, pct in res.self_citations.items():
                logger.info(f"  {author}: {pct}%")

    if single_input:
        return results[0]
    return results
|