paperscraper 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. {paperscraper-0.3.0 → paperscraper-0.3.2}/PKG-INFO +11 -4
  2. {paperscraper-0.3.0 → paperscraper-0.3.2}/README.md +6 -2
  3. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/__init__.py +2 -2
  4. paperscraper-0.3.2/paperscraper/async_utils.py +88 -0
  5. paperscraper-0.3.2/paperscraper/citations/__init__.py +4 -0
  6. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/citations/core.py +4 -8
  7. paperscraper-0.3.2/paperscraper/citations/entity/__init__.py +2 -0
  8. paperscraper-0.3.2/paperscraper/citations/entity/paper.py +100 -0
  9. paperscraper-0.3.2/paperscraper/citations/entity/researcher.py +90 -0
  10. paperscraper-0.3.2/paperscraper/citations/orcid.py +29 -0
  11. paperscraper-0.3.2/paperscraper/citations/self_citations.py +126 -0
  12. paperscraper-0.3.2/paperscraper/citations/self_references.py +134 -0
  13. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/citations/tests/test_citations.py +2 -3
  14. paperscraper-0.3.2/paperscraper/citations/tests/test_paper.py +52 -0
  15. paperscraper-0.3.2/paperscraper/citations/tests/test_self_citations.py +72 -0
  16. paperscraper-0.3.2/paperscraper/citations/tests/test_self_references.py +78 -0
  17. paperscraper-0.3.2/paperscraper/citations/utils.py +241 -0
  18. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +7 -0
  19. paperscraper-0.3.2/paperscraper/pdf/__init__.py +1 -0
  20. paperscraper-0.3.0/paperscraper/pdf.py → paperscraper-0.3.2/paperscraper/pdf/fallbacks.py +181 -242
  21. paperscraper-0.3.2/paperscraper/pdf/pdf.py +250 -0
  22. paperscraper-0.3.2/paperscraper/pdf/utils.py +33 -0
  23. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/tests/test_dump.py +19 -19
  24. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/tests/test_pdf.py +140 -37
  25. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/utils.py +0 -17
  26. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/xrxiv/tests/test_xrxiv.py +3 -3
  27. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper.egg-info/PKG-INFO +11 -4
  28. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper.egg-info/SOURCES.txt +8 -1
  29. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper.egg-info/requires.txt +4 -1
  30. {paperscraper-0.3.0 → paperscraper-0.3.2}/setup.py +4 -1
  31. paperscraper-0.3.0/paperscraper/citations/__init__.py +0 -3
  32. paperscraper-0.3.0/paperscraper/citations/entity/__init__.py +0 -2
  33. paperscraper-0.3.0/paperscraper/citations/entity/paper.py +0 -39
  34. paperscraper-0.3.0/paperscraper/citations/entity/researcher.py +0 -48
  35. paperscraper-0.3.0/paperscraper/citations/self_citations.py +0 -0
  36. paperscraper-0.3.0/paperscraper/citations/self_references.py +0 -170
  37. paperscraper-0.3.0/paperscraper/citations/tests/test_self_references.py +0 -80
  38. paperscraper-0.3.0/paperscraper/citations/utils.py +0 -23
  39. {paperscraper-0.3.0 → paperscraper-0.3.2}/LICENSE +0 -0
  40. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/arxiv/__init__.py +0 -0
  41. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/arxiv/arxiv.py +0 -0
  42. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/arxiv/utils.py +0 -0
  43. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/citations/citations.py +0 -0
  44. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/citations/entity/core.py +0 -0
  45. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/citations/tests/__init__.py +0 -0
  46. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/__init__.py +0 -0
  47. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/arxiv.py +0 -0
  48. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/biorxiv.py +0 -0
  49. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/chemrxiv.py +0 -0
  50. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/medrxiv.py +0 -0
  51. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/utils/__init__.py +0 -0
  52. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  53. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/get_dumps/utils/chemrxiv/utils.py +0 -0
  54. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/impact.py +0 -0
  55. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/load_dumps.py +0 -0
  56. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/plotting.py +0 -0
  57. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/postprocessing.py +0 -0
  58. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/pubmed/__init__.py +0 -0
  59. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/pubmed/pubmed.py +0 -0
  60. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/pubmed/tests/__init__.py +0 -0
  61. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
  62. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/pubmed/utils.py +0 -0
  63. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/scholar/__init__.py +0 -0
  64. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/scholar/core.py +0 -0
  65. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/scholar/scholar.py +0 -0
  66. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/scholar/tests/__init__.py +0 -0
  67. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/scholar/tests/test_scholar.py +0 -0
  68. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/server_dumps/__init__.py +0 -0
  69. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/tests/__init__.py +0 -0
  70. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/tests/test_impactor.py +0 -0
  71. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/xrxiv/__init__.py +0 -0
  72. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/xrxiv/tests/__init__.py +0 -0
  73. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/xrxiv/xrxiv_api.py +0 -0
  74. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  75. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper.egg-info/dependency_links.txt +0 -0
  76. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper.egg-info/not-zip-safe +0 -0
  77. {paperscraper-0.3.0 → paperscraper-0.3.2}/paperscraper.egg-info/top_level.txt +0 -0
  78. {paperscraper-0.3.0 → paperscraper-0.3.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: paperscraper: Package to scrape papers.
5
5
  Home-page: https://github.com/jannisborn/paperscraper
6
6
  Author: Jannis Born, Matteo Manica
@@ -20,7 +20,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
22
  Requires-Dist: arxiv>=1.4.2
23
- Requires-Dist: pymed-paperscraper>=1.0.3
23
+ Requires-Dist: pymed-paperscraper>=1.0.4
24
24
  Requires-Dist: pandas
25
25
  Requires-Dist: requests
26
26
  Requires-Dist: tqdm
@@ -35,6 +35,9 @@ Requires-Dist: pytest
35
35
  Requires-Dist: tldextract
36
36
  Requires-Dist: semanticscholar
37
37
  Requires-Dist: pydantic
38
+ Requires-Dist: unidecode
39
+ Requires-Dist: dotenv
40
+ Requires-Dist: boto3
38
41
  Dynamic: author
39
42
  Dynamic: author-email
40
43
  Dynamic: classifier
@@ -53,7 +56,6 @@ Dynamic: summary
53
56
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
54
57
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
55
58
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
56
- [![Downloads](https://static.pepy.tech/badge/paperscraper/month)](https://pepy.tech/project/paperscraper)
57
59
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
58
60
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
59
61
  # paperscraper
@@ -86,7 +88,7 @@ and plotting routines for meta-analysis.
86
88
  pip install paperscraper
87
89
  ```
88
90
 
89
- This is enough to query **PubMed**, **arXiv** or Google Scholar.
91
+ This is enough to query PubMed, arXiv or Google Scholar.
90
92
 
91
93
  #### Download X-rxiv Dumps
92
94
 
@@ -230,6 +232,7 @@ For more comprehensive access to papers from major publishers, you can provide A
230
232
 
231
233
  - **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
232
234
  - **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
235
+ - **bioRxiv TDM API**: Enables access to [bioRxiv](https://www.biorxiv.org/tdm) publications (since May 2025, bioRxiv has been protected by Cloudflare).
233
236
 
234
237
  To use publisher APIs:
235
238
 
@@ -237,7 +240,11 @@ To use publisher APIs:
237
240
  ```
238
241
  WILEY_TDM_API_TOKEN=your_wiley_token_here
239
242
  ELSEVIER_TDM_API_KEY=your_elsevier_key_here
243
+ AWS_ACCESS_KEY_ID=your_aws_access_key_here
244
+ AWS_SECRET_ACCESS_KEY=your_aws_secret_key_here
240
245
  ```
246
+ NOTE: The AWS keys can be created in your AWS/IAM account. When creating the key, make sure you tick the `AmazonS3ReadOnlyAccess` permission policy.
247
+ NOTE: If you name the file `.env`, it will be loaded automatically (provided it is in the current working directory or in any parent directory up to your home directory).
241
248
 
242
249
  2. Pass the file path when calling retrieval functions:
243
250
 
@@ -4,7 +4,6 @@
4
4
  MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
5
  [![PyPI version](https://badge.fury.io/py/paperscraper.svg)](https://badge.fury.io/py/paperscraper)
6
6
  [![Downloads](https://static.pepy.tech/badge/paperscraper)](https://pepy.tech/project/paperscraper)
7
- [![Downloads](https://static.pepy.tech/badge/paperscraper/month)](https://pepy.tech/project/paperscraper)
8
7
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
9
8
  [![codecov](https://codecov.io/github/jannisborn/paperscraper/branch/main/graph/badge.svg?token=Clwi0pu61a)](https://codecov.io/github/jannisborn/paperscraper)
10
9
  # paperscraper
@@ -37,7 +36,7 @@ and plotting routines for meta-analysis.
37
36
  pip install paperscraper
38
37
  ```
39
38
 
40
- This is enough to query **PubMed**, **arXiv** or Google Scholar.
39
+ This is enough to query PubMed, arXiv or Google Scholar.
41
40
 
42
41
  #### Download X-rxiv Dumps
43
42
 
@@ -181,6 +180,7 @@ For more comprehensive access to papers from major publishers, you can provide A
181
180
 
182
181
  - **Wiley TDM API**: Enables access to [Wiley](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) publications (2,000+ journals).
183
182
  - **Elsevier TDM API**: Enables access to [Elsevier](https://www.elsevier.com/about/policies-and-standards/text-and-data-mining) publications (The Lancet, Cell, ...).
183
+ - **bioRxiv TDM API**: Enables access to [bioRxiv](https://www.biorxiv.org/tdm) publications (since May 2025, bioRxiv has been protected by Cloudflare).
184
184
 
185
185
  To use publisher APIs:
186
186
 
@@ -188,7 +188,11 @@ To use publisher APIs:
188
188
  ```
189
189
  WILEY_TDM_API_TOKEN=your_wiley_token_here
190
190
  ELSEVIER_TDM_API_KEY=your_elsevier_key_here
191
+ AWS_ACCESS_KEY_ID=your_aws_access_key_here
192
+ AWS_SECRET_ACCESS_KEY=your_aws_secret_key_here
191
193
  ```
194
+ NOTE: The AWS keys can be created in your AWS/IAM account. When creating the key, make sure you tick the `AmazonS3ReadOnlyAccess` permission policy.
195
+ NOTE: If you name the file `.env`, it will be loaded automatically (provided it is in the current working directory or in any parent directory up to your home directory).
192
196
 
193
197
  2. Pass the file path when calling retrieval functions:
194
198
 
@@ -1,7 +1,7 @@
1
1
  """Initialize the module."""
2
2
 
3
3
  __name__ = "paperscraper"
4
- __version__ = "0.3.0"
4
+ __version__ = "0.3.2"
5
5
 
6
6
  import logging
7
7
  import os
@@ -36,7 +36,7 @@ def dump_queries(keywords: List[List[Union[str, List[str]]]], dump_root: str) ->
36
36
 
37
37
  for idx, keyword in enumerate(keywords):
38
38
  for db, f in QUERY_FN_DICT.items():
39
- logger.info(f" Keyword {idx+1}/{len(keywords)}, DB: {db}")
39
+ logger.info(f" Keyword {idx + 1}/{len(keywords)}, DB: {db}")
40
40
  filename = get_filename_from_query(keyword)
41
41
  os.makedirs(os.path.join(dump_root, db), exist_ok=True)
42
42
  f(keyword, output_filepath=os.path.join(dump_root, db, filename))
@@ -0,0 +1,88 @@
1
+ import asyncio
2
+ import logging
3
+ import sys
4
+ import threading
5
+ from functools import wraps
6
+ from typing import Any, Awaitable, Callable, TypeVar, Union
7
+
8
+ import httpx
9
+
10
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+ logging.getLogger("httpx").setLevel(logging.WARNING)
13
+
14
+ T = TypeVar("T")
15
+ F = TypeVar("F", bound=Callable[..., Awaitable[Any]])
16
+
17
+
18
def _start_bg_loop(loop: asyncio.AbstractEventLoop) -> None:
    """
    Thread target that installs `loop` as the current event loop and runs it.

    The call only returns once the loop has been stopped.

    Args:
        loop: The event loop to register for the calling thread and run.
    """
    asyncio.set_event_loop(loop)
    loop.run_forever()
21
+
22
+
23
+ # Start one background loop in its own daemon thread
24
+ _background_loop = asyncio.new_event_loop()
25
+ threading.Thread(target=_start_bg_loop, args=(_background_loop,), daemon=True).start()
26
+
27
+
28
def optional_async(
    func: Callable[..., Awaitable[T]],
) -> Callable[..., Union[T, Awaitable[T]]]:
    """
    Make an `async def` usable from both synchronous and asynchronous callers.

    Args:
        func: The coroutine function to wrap.

    Returns:
        A plain function. Called while an event loop is running in the current
        thread, it returns the un-awaited coroutine (the caller must await it).
        Called without a running loop, it submits the coroutine to the module's
        background loop and blocks until the result is available.
    """

    @wraps(func)
    def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]:
        pending = func(*args, **kwargs)
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: execute on the background
            # loop and wait synchronously for the outcome.
            submitted = asyncio.run_coroutine_threadsafe(pending, _background_loop)
            return submitted.result()
        # A loop is already running here, so awaiting is the caller's job.
        return pending

    return wrapper
49
+
50
+
51
def retry_with_exponential_backoff(
    *, max_retries: int = 5, base_delay: float = 1.0
) -> Callable[[F], F]:
    """
    Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.

    Args:
        max_retries: total number of attempts (the first call included) before
            the last `httpx.HTTPStatusError` is re-raised. Values below 1 are
            treated as 1 so that the wrapped coroutine is always called.
        base_delay: delay in seconds before the second attempt; the delay is
            doubled after every further rate-limited attempt.

    Returns:
        A decorator that wraps a coroutine function with the retry logic.

    Raises:
        httpx.HTTPStatusError: Immediately for any status other than 429, or
            for a 429 once all attempts are used up (raised by the wrapper).

    Usage:

        @retry_with_exponential_backoff(max_retries=3, base_delay=0.5)
        async def fetch_data(...):
            ...

    """
    # Guard against `max_retries <= 0`: an empty attempt loop would return
    # None without ever calling the wrapped function.
    attempts = max(1, max_retries)

    def decorator(func: F) -> F:
        @wraps(func)
        async def wrapper(*args, **kwargs) -> Any:
            delay = base_delay
            for attempt in range(attempts):
                try:
                    return await func(*args, **kwargs)
                except httpx.HTTPStatusError as e:
                    # Only rate limiting (429) is worth retrying.
                    status = e.response.status_code if e.response is not None else None
                    if status != 429 or attempt == attempts - 1:
                        raise
                # Rate limited with attempts left: wait, then double the delay.
                await asyncio.sleep(delay)
                delay *= 2

        return wrapper

    return decorator
@@ -0,0 +1,4 @@
1
+ from .citations import get_citations_by_doi, get_citations_from_title # noqa
2
+ from .core import SelfLinkClient # noqa
3
+ from .self_citations import self_citations_paper # noqa
4
+ from .self_references import self_references_paper # noqa
@@ -1,15 +1,8 @@
1
- import asyncio
2
1
  import logging
3
- import re
4
2
  import sys
5
- from typing import Dict, Iterable, Literal, Union
3
+ from typing import Literal
6
4
 
7
- import httpx
8
- from semanticscholar import SemanticScholar
9
-
10
- from ..utils import optional_async
11
5
  from .entity import Paper, Researcher
12
- from .utils import check_overlap, doi_pattern
13
6
 
14
7
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
15
8
  logger = logging.getLogger(__name__)
@@ -20,6 +13,9 @@ ModeType = Literal[tuple(MODES := ("paper", "author"))]
20
13
 
21
14
  class SelfLinkClient:
22
15
  def __init__(self, entity: str, mode: ModeType = "paper") -> None:
16
+ self.mode = mode.lower()
17
+ if self.mode not in MODES:
18
+ raise ValueError(f"Unknown mode `{self.mode}`, chose from {MODES}")
23
19
  if self.mode == "paper":
24
20
  self.object = Paper(entity)
25
21
 
@@ -0,0 +1,2 @@
1
+ from .paper import Paper, PaperResult # noqa
2
+ from .researcher import Researcher, ResearcherResult # noqa
@@ -0,0 +1,100 @@
1
+ import logging
2
+ import sys
3
+ from typing import List, Literal, Optional
4
+
5
+ from ..self_citations import CitationResult, self_citations_paper
6
+ from ..self_references import ReferenceResult, self_references_paper
7
+ from ..utils import (
8
+ determine_paper_input_type,
9
+ get_doi_from_ssid,
10
+ get_doi_from_title,
11
+ get_title_and_id_from_doi,
12
+ )
13
+ from .core import Entity
14
+
15
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class PaperResult(ReferenceResult, CitationResult):
    """
    Combined self-reference and self-citation result of one paper.

    Inherits every field of `ReferenceResult` and `CitationResult` and adds
    the paper title.
    """

    # Title of the analysed paper.
    title: str
21
+
22
+
23
+ ModeType = Literal[tuple(MODES := ("doi", "title", "ss_id", "infer"))]
24
+
25
+ BASE_URL: str = "https://api.semanticscholar.org/graph/v1/paper/search"
26
+
27
+
28
class Paper(Entity):
    """
    A single paper whose self-references and self-citations can be analysed.

    The paper is resolved to a DOI on construction; `self_references` and
    `self_citations` then store their results on the instance and
    `get_result` merges both into a `PaperResult`.
    """

    title: str = ""
    doi: str = ""
    # NOTE: class-level list shared by all instances; it is never mutated in
    # this class, reassign it per instance rather than appending to it.
    authors: List[str] = []

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Set up a Paper object for analysis.

        Args:
            input: Paper identifier. This can be the title, DOI or semantic scholar ID
                of the paper.
            mode: The format in which the ID was provided. Defaults to "infer".

        Raises:
            ValueError: If unknown mode is given.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        self.input = input
        if mode == "infer":
            mode = determine_paper_input_type(input)

        if mode == "doi":
            self.doi = input
        elif mode == "title":
            self.doi = get_doi_from_title(input)
        elif mode in ("ss_id", "ssid"):
            # `MODES` advertises "ss_id" while this branch previously only
            # matched "ssid", so an explicit `mode="ss_id"` was silently
            # ignored. Both spellings are accepted here.
            self.doi = get_doi_from_ssid(input)

        # Skip the metadata lookup when no DOI could be determined (None or
        # the empty class default).
        if self.doi:
            out = get_title_and_id_from_doi(self.doi)
            if out is not None:
                self.title = out["title"]
                self.ssid = out["ssid"]

    def self_references(self):
        """
        Extracts the self references of a paper, for each author.

        The result is stored in `self.ref_result`; nothing happens if the DOI
        is not a string.
        """
        if isinstance(self.doi, str):
            self.ref_result: ReferenceResult = self_references_paper(self.doi)

    def self_citations(self):
        """
        Extracts the self citations of a paper, for each author.

        The result is stored in `self.citation_result`; nothing happens if the
        DOI is not a string.
        """
        if isinstance(self.doi, str):
            self.citation_result: CitationResult = self_citations_paper(self.doi)

    def get_result(self) -> Optional[PaperResult]:
        """
        Provides the result of the analysis.

        Returns: PaperResult if available, otherwise None (a warning is logged
            naming the analysis step that still has to be run).
        """
        if not hasattr(self, "ref_result"):
            logger.warning(
                f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
            )
            return None
        if not hasattr(self, "citation_result"):
            logger.warning(
                f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
            )
            return None
        ref_result = self.ref_result.model_dump()
        # Both result dumps may carry `ssid`; drop one copy so the keyword is
        # not passed twice to PaperResult.
        ref_result.pop("ssid", None)
        return PaperResult(
            title=self.title, **ref_result, **self.citation_result.model_dump()
        )
@@ -0,0 +1,90 @@
1
+ from typing import List, Literal, Optional
2
+
3
+ from semanticscholar import SemanticScholar
4
+ from tqdm import tqdm
5
+
6
+ from ..orcid import orcid_to_author_name
7
+ from ..self_references import ReferenceResult
8
+ from ..utils import author_name_to_ssaid, get_papers_for_author
9
+ from .core import Entity, EntityResult
10
+
11
+
12
class ResearcherResult(EntityResult):
    """Result container for the self-citation/reference analysis of a researcher."""

    # Name of the researcher.
    name: str
    # Identifier of the researcher (presumably the Semantic Scholar author ID; verify).
    ssid: int
    # ORCID iD of the researcher, if known.
    orcid: Optional[str] = None
    # TODO: the ratios will be averaged across all papers for that author
17
+
18
+
19
+ ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
20
+
21
+ sch = SemanticScholar()
22
+
23
+
24
class Researcher(Entity):
    """
    A researcher whose papers can be analysed for self-citations/references.

    On construction the researcher is resolved to an author identifier and
    the list of their papers is fetched via `get_papers_for_author`.
    """

    name: str
    ssid: int
    orcid: Optional[str] = None

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Construct researcher object for self citation/reference analysis.

        Args:
            input: A researcher to search for.
            mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
                Defaults to "infer".

        Raises:
            ValueError: Unknown mode, or an ORCID iD that could not be resolved
                to an author name.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        if mode == "infer":
            if input.isdigit():
                mode = "ssaid"
            elif self._looks_like_orcid(input):
                mode = "orcid"
            else:
                # Use the spelling advertised in MODES; previously inference
                # produced "author", which an explicit `mode="name"` never
                # matched, leaving `self.ssid` unset.
                mode = "name"

        if mode == "ssaid":
            self.author = sch.get_author(input)
            self.ssid = input
        elif mode == "orcid":
            self.author = orcid_to_author_name(input)
            self.orcid = input
            if not self.author:
                raise ValueError(
                    f"Could not resolve ORCID iD {input} to an author name."
                )
            # Look up the author by the resolved name, not by the ORCID iD.
            self.ssid = author_name_to_ssaid(self.author)
        else:  # mode == "name"
            self.author = input
            self.ssid = author_name_to_ssaid(input)

        # TODO: Skip over erratum / corrigendum
        self.ssids = get_papers_for_author(self.ssid)

    @staticmethod
    def _looks_like_orcid(value: str) -> bool:
        """Return True if `value` has the shape `dddd-dddd-dddd-dddX` of an ORCID iD."""
        groups = value.split("-")
        if len(groups) != 4 or any(len(group) != 4 for group in groups):
            return False
        compact = "".join(groups)
        # The final character is a checksum that may be the letter X.
        return compact[:-1].isdigit() and (compact[-1].isdigit() or compact[-1] == "X")

    def self_references(self):
        """
        Sifts through all papers of a researcher and extracts the self references.
        """
        # TODO: Asynchronous call to self_references
        print("Going through SSIDs", self.ssids)

        # TODO: Aggregate results

    def self_citations(self):
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.
        """
        ...

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis.
        """
        ...
@@ -0,0 +1,29 @@
1
+ import logging
2
+ import sys
3
+ from typing import Optional
4
+
5
+ import requests
6
+
7
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
8
+ logger = logging.getLogger(__name__)
9
+
10
+ BASE_URL = "https://pub.orcid.org/v3.0/"
11
+
12
+
13
def orcid_to_author_name(orcid_id: str) -> Optional[str]:
    """
    Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'),
    returns the full name of the author from the ORCID public API.

    Args:
        orcid_id: The ORCID iD to look up.

    Returns:
        The given and family name joined by a space (possibly an empty string
        if the record exposes neither), or None if the request did not return
        HTTP 200 (the error is logged).
    """

    headers = {"Accept": "application/json"}
    # A timeout keeps an unresponsive server from blocking the caller forever.
    response = requests.get(
        f"{BASE_URL}{orcid_id}/person", headers=headers, timeout=30
    )
    if response.status_code != 200:
        logger.error(
            f"Error fetching ORCID data ({orcid_id}): {response.status_code} {response.text}"
        )
        return None

    # The API may return JSON null for `name` or its parts; a `.get(key, {})`
    # default does not cover a present-but-null key, hence the `or` fallbacks.
    name = response.json().get("name") or {}
    given = (name.get("given-names") or {}).get("value") or ""
    family = (name.get("family-name") or {}).get("value") or ""
    return f"{given} {family}".strip()
@@ -0,0 +1,126 @@
1
+ import asyncio
2
+ import logging
3
+ import re
4
+ import sys
5
+ from typing import Any, Dict, List, Union
6
+
7
+ import httpx
8
+ import numpy as np
9
+ from pydantic import BaseModel
10
+
11
+ from ..async_utils import optional_async, retry_with_exponential_backoff
12
+ from .utils import DOI_PATTERN, find_matching
13
+
14
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+ logging.getLogger("httpx").setLevel(logging.WARNING)
17
+
18
+
19
class CitationResult(BaseModel):
    """Self-citation statistics of a single paper."""

    # semantic scholar paper id
    ssid: str
    # Total number of citing papers considered.
    num_citations: int
    # Per-author self-citation share in percent, keyed by author name.
    self_citations: Dict[str, float] = {}
    # Mean of the per-author shares.
    citation_score: float
24
+
25
+
26
async def _fetch_citation_data(
    client: httpx.AsyncClient, suffix: str
) -> Dict[str, Any]:
    """
    Download the raw record of one paper from the Semantic Scholar graph API.

    Args:
        client: An active httpx.AsyncClient.
        suffix: Paper identifier appended to the endpoint URL
            (e.g., "DOI:10.1000/xyz123" or an SSID).

    Returns:
        The decoded JSON body, holding the requested fields
        (title, authors and the authors of citing papers).

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{suffix}"
    query = {"fields": "title,authors,citations.authors"}
    reply = await client.get(url, params=query)
    reply.raise_for_status()
    return reply.json()
45
+
46
+
47
async def _process_single(client: httpx.AsyncClient, identifier: str) -> CitationResult:
    """
    Derive the self-citation statistics of one paper.

    Args:
        client: An active httpx.AsyncClient.
        identifier: A DOI or Semantic Scholar ID.

    Returns:
        A CitationResult with the number of citing papers, the per-author
        percentage of citing papers matched to that author, and the mean of
        those percentages.
    """
    # Long, lowercase, purely alphanumeric strings are taken as Semantic
    # Scholar IDs and sent without prefix; only otherwise is the DOI pattern
    # consulted.
    ssid_like = len(identifier) > 15 and identifier.isalnum() and identifier.islower()
    is_doi = (
        not ssid_like
        and len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1
    )
    prefix = "DOI:" if is_doi else ""

    paper = await _fetch_citation_data(client, f"{prefix}{identifier}")

    # One counter per author of the paper.
    author_counts: Dict[str, int] = {}
    for author in paper.get("authors", []):
        author_counts[author["name"]] = 0

    citations = paper.get("citations", [])
    total_cites = len(citations)

    # Count, per author, the citing papers that share this author.
    for cite in citations:
        for name in find_matching(paper.get("authors", []), cite.get("authors", [])):
            author_counts[name] += 1

    # Convert the counts into percentages of all citing papers.
    ratios: Dict[str, float] = {}
    for name, count in author_counts.items():
        if total_cites > 0:
            ratios[name] = round((count / total_cites * 100), 2)
        else:
            ratios[name] = 0.0

    mean_ratio = float(np.mean(list(ratios.values()))) if ratios else 0.0
    avg_score = round(mean_ratio, 3)

    return CitationResult(
        ssid=identifier,
        num_citations=total_cites,
        self_citations=ratios,
        citation_score=avg_score,
    )
94
+
95
+
96
@optional_async
@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
async def self_citations_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[CitationResult, List[CitationResult]]:
    """
    Compute self-citation statistics for one paper or a batch of papers.

    Args:
        inputs: A single DOI/SSID string or a list of them.
        verbose: If True, logs the overall and per-author numbers of each paper.

    Returns:
        A single CitationResult if a string was passed, else a list of
        CitationResults in the order of `inputs`.
    """
    if isinstance(inputs, str):
        single_input = True
        identifiers = [inputs]
    else:
        single_input = False
        identifiers = list(inputs)

    # All papers are fetched concurrently over one shared client.
    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single(client, ident) for ident in identifiers)
        )

    if verbose:
        for res in results:
            logger.info(
                f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
            )
            for author, pct in res.self_citations.items():
                logger.info(f" {author}: {pct}%")

    if single_input:
        return results[0]
    return results