paperscraper 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. {paperscraper-0.3.3 → paperscraper-0.3.5}/PKG-INFO +38 -26
  2. {paperscraper-0.3.3 → paperscraper-0.3.5}/README.md +19 -0
  3. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/__init__.py +1 -1
  4. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/arxiv/arxiv.py +2 -3
  5. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/arxiv/utils.py +2 -2
  6. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/async_utils.py +36 -9
  7. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/citations.py +2 -1
  8. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/entity/core.py +6 -5
  9. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/entity/paper.py +17 -15
  10. paperscraper-0.3.5/paperscraper/citations/entity/researcher.py +221 -0
  11. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/self_citations.py +5 -2
  12. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/self_references.py +68 -42
  13. paperscraper-0.3.5/paperscraper/citations/tests/test_citations.py +32 -0
  14. paperscraper-0.3.5/paperscraper/citations/tests/test_self_citations.py +147 -0
  15. paperscraper-0.3.5/paperscraper/citations/tests/test_self_references.py +96 -0
  16. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/utils.py +99 -51
  17. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/arxiv.py +2 -2
  18. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/biorxiv.py +2 -2
  19. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/chemrxiv.py +2 -3
  20. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/medrxiv.py +2 -2
  21. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py +39 -2
  22. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/chemrxiv/utils.py +20 -12
  23. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/load_dumps.py +2 -3
  24. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pdf/fallbacks.py +134 -55
  25. paperscraper-0.3.5/paperscraper/pdf/pdf.py +442 -0
  26. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pdf/utils.py +21 -0
  27. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/pubmed.py +10 -2
  28. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/test_dump.py +8 -2
  29. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/test_impactor.py +23 -4
  30. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/test_pdf.py +0 -5
  31. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/utils.py +6 -0
  32. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/PKG-INFO +38 -26
  33. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/SOURCES.txt +1 -1
  34. paperscraper-0.3.5/paperscraper.egg-info/requires.txt +19 -0
  35. paperscraper-0.3.5/pyproject.toml +90 -0
  36. paperscraper-0.3.3/paperscraper/citations/entity/researcher.py +0 -90
  37. paperscraper-0.3.3/paperscraper/citations/tests/test_citations.py +0 -18
  38. paperscraper-0.3.3/paperscraper/citations/tests/test_self_citations.py +0 -71
  39. paperscraper-0.3.3/paperscraper/citations/tests/test_self_references.py +0 -78
  40. paperscraper-0.3.3/paperscraper/pdf/pdf.py +0 -250
  41. paperscraper-0.3.3/paperscraper.egg-info/requires.txt +0 -19
  42. paperscraper-0.3.3/setup.py +0 -77
  43. {paperscraper-0.3.3 → paperscraper-0.3.5}/LICENSE +0 -0
  44. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/arxiv/__init__.py +0 -0
  45. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/__init__.py +0 -0
  46. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/core.py +0 -0
  47. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/entity/__init__.py +0 -0
  48. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/orcid.py +0 -0
  49. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/tests/__init__.py +0 -0
  50. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/citations/tests/test_paper.py +0 -0
  51. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/__init__.py +0 -0
  52. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/__init__.py +0 -0
  53. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/get_dumps/utils/chemrxiv/__init__.py +0 -0
  54. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/impact.py +0 -0
  55. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pdf/__init__.py +0 -0
  56. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/plotting.py +0 -0
  57. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/postprocessing.py +0 -0
  58. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/__init__.py +0 -0
  59. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/tests/__init__.py +0 -0
  60. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/tests/test_pubmed.py +0 -0
  61. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/pubmed/utils.py +0 -0
  62. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/__init__.py +0 -0
  63. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/core.py +0 -0
  64. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/scholar.py +0 -0
  65. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/tests/__init__.py +0 -0
  66. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/scholar/tests/test_scholar.py +0 -0
  67. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/server_dumps/__init__.py +0 -0
  68. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/tests/__init__.py +0 -0
  69. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/__init__.py +0 -0
  70. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/tests/__init__.py +0 -0
  71. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/tests/test_xrxiv.py +0 -0
  72. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/xrxiv_api.py +0 -0
  73. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper/xrxiv/xrxiv_query.py +0 -0
  74. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/dependency_links.txt +0 -0
  75. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/not-zip-safe +0 -0
  76. {paperscraper-0.3.3 → paperscraper-0.3.5}/paperscraper.egg-info/top_level.txt +0 -0
  77. {paperscraper-0.3.3 → paperscraper-0.3.5}/setup.cfg +0 -0
@@ -1,54 +1,47 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paperscraper
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: paperscraper: Package to scrape papers.
5
- Home-page: https://github.com/jannisborn/paperscraper
6
- Author: Jannis Born, Matteo Manica
7
- Author-email: jannis.born@gmx.de, drugilsberg@gmail.com
5
+ Author-email: Jannis Born <jannis.born@gmx.de>, Matteo Manica <drugilsberg@gmail.com>
8
6
  License: MIT
7
+ Project-URL: Homepage, https://github.com/jannisborn/paperscraper
8
+ Project-URL: Documentation, https://jannisborn.github.io/paperscraper/
9
+ Project-URL: Repository, https://github.com/jannisborn/paperscraper
9
10
  Keywords: Academics,Science,Publication,Search,PubMed,Arxiv,Medrxiv,Biorxiv,Chemrxiv,Google Scholar
10
11
  Classifier: Development Status :: 3 - Alpha
11
12
  Classifier: Intended Audience :: Developers
12
13
  Classifier: Intended Audience :: Science/Research
13
14
  Classifier: License :: OSI Approved :: MIT License
14
15
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.8
16
16
  Classifier: Programming Language :: Python :: 3.9
17
17
  Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
19
21
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.9
20
23
  Description-Content-Type: text/markdown
21
24
  License-File: LICENSE
22
- Requires-Dist: arxiv>=1.4.2
25
+ Requires-Dist: arxiv>=1.4.7
23
26
  Requires-Dist: pymed-paperscraper>=1.0.4
24
- Requires-Dist: pandas
25
- Requires-Dist: requests
26
- Requires-Dist: tqdm
27
+ Requires-Dist: pandas>=1.0.4
28
+ Requires-Dist: requests>=2.32.2
29
+ Requires-Dist: tqdm>=4.51.0
27
30
  Requires-Dist: scholarly>=1.0.0
28
- Requires-Dist: seaborn
29
- Requires-Dist: matplotlib
30
- Requires-Dist: matplotlib_venn
31
- Requires-Dist: bs4
32
- Requires-Dist: impact-factor>=1.1.1
33
- Requires-Dist: thefuzz
31
+ Requires-Dist: seaborn>=0.11.0
32
+ Requires-Dist: matplotlib>=3.3.2
33
+ Requires-Dist: matplotlib-venn>=0.11.5
34
+ Requires-Dist: bs4>=0.0.1
35
+ Requires-Dist: impact-factor>=1.1.3
36
+ Requires-Dist: thefuzz>=0.20.0
34
37
  Requires-Dist: pytest
35
38
  Requires-Dist: tldextract
36
- Requires-Dist: semanticscholar
39
+ Requires-Dist: semanticscholar>=0.8.4
37
40
  Requires-Dist: pydantic
38
41
  Requires-Dist: unidecode
39
42
  Requires-Dist: dotenv
40
43
  Requires-Dist: boto3
41
- Dynamic: author
42
- Dynamic: author-email
43
- Dynamic: classifier
44
- Dynamic: description
45
- Dynamic: description-content-type
46
- Dynamic: home-page
47
- Dynamic: keywords
48
- Dynamic: license
49
44
  Dynamic: license-file
50
- Dynamic: requires-dist
51
- Dynamic: summary
52
45
 
53
46
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_tip.yml?query=branch%3Amain)
54
47
  [![build](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml/badge.svg?branch=main)](https://github.com/jannisborn/paperscraper/actions/workflows/test_pypi.yml?query=branch%3Amain)
@@ -91,6 +84,18 @@ pip install paperscraper
91
84
 
92
85
  This is enough to query PubMed, arXiv or Google Scholar.
93
86
 
87
+ ### Local development
88
+
89
+ ```console
90
+ uv sync
91
+ ```
92
+
93
+ This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
94
+
95
+ ```console
96
+ uv run python -c "import paperscraper"
97
+ ```
98
+
94
99
  #### Download X-rxiv Dumps
95
100
 
96
101
  However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
@@ -280,6 +285,13 @@ doi = '10.1021/acs.jcim.3c00132'
280
285
  get_citations_by_doi(doi)
281
286
  ```
282
287
 
288
+ NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial), which is rate-limited. If you have an API key, set it via:
289
+ ```sh
290
+ export SS_API_KEY=YOUR_API_KEY
291
+ ```
292
+ This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
293
+
294
+
283
295
  ### Journal impact factor
284
296
 
285
297
  You can also retrieve the impact factor for all journals:
@@ -39,6 +39,18 @@ pip install paperscraper
39
39
 
40
40
  This is enough to query PubMed, arXiv or Google Scholar.
41
41
 
42
+ ### Local development
43
+
44
+ ```console
45
+ uv sync
46
+ ```
47
+
48
+ This installs the project and dev tooling into `.venv`. Use `uv run` to execute commands, for example:
49
+
50
+ ```console
51
+ uv run python -c "import paperscraper"
52
+ ```
53
+
42
54
  #### Download X-rxiv Dumps
43
55
 
44
56
  However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while, as of November 2025:
@@ -228,6 +240,13 @@ doi = '10.1021/acs.jcim.3c00132'
228
240
  get_citations_by_doi(doi)
229
241
  ```
230
242
 
243
+ NOTE: This uses the [Semantic Scholar API](https://www.semanticscholar.org/product/api/tutorial), which is rate-limited. If you have an API key, set it via:
244
+ ```sh
245
+ export SS_API_KEY=YOUR_API_KEY
246
+ ```
247
+ This will increase your throughput for using `paperscraper.citations` based on the rate limits of your key.
248
+
249
+
231
250
  ### Journal impact factor
232
251
 
233
252
  You can also retrieve the impact factor for all journals:
@@ -1,7 +1,7 @@
1
1
  """Initialize the module."""
2
2
 
3
3
  __name__ = "paperscraper"
4
- __version__ = "0.3.3"
4
+ __version__ = "0.3.5"
5
5
 
6
6
  import logging
7
7
  import os
@@ -6,17 +6,16 @@ from typing import Dict, List, Literal, Union
6
6
 
7
7
  import arxiv
8
8
  import pandas as pd
9
- import pkg_resources
10
9
  from tqdm import tqdm
11
10
 
12
- from ..utils import dump_papers
11
+ from ..utils import dump_papers, get_server_dumps_dir
13
12
  from ..xrxiv.xrxiv_query import XRXivQuery
14
13
  from .utils import get_query_from_keywords, infer_backend
15
14
 
16
15
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
17
16
  logger = logging.getLogger(__name__)
18
17
 
19
- dump_root = pkg_resources.resource_filename("paperscraper", "server_dumps")
18
+ dump_root = get_server_dumps_dir()
20
19
 
21
20
  global ARXIV_QUERIER
22
21
  ARXIV_QUERIER = None
@@ -3,7 +3,7 @@ import os
3
3
  from datetime import datetime
4
4
  from typing import List, Union
5
5
 
6
- import pkg_resources
6
+ from ..utils import get_server_dumps_dir
7
7
 
8
8
  finalize_disjunction = lambda x: "(" + x[:-4] + ") AND "
9
9
  finalize_conjunction = lambda x: x[:-5]
@@ -59,6 +59,6 @@ def get_query_from_keywords(
59
59
 
60
60
 
61
61
  def infer_backend():
62
- dump_root = pkg_resources.resource_filename("paperscraper", "server_dumps")
62
+ dump_root = get_server_dumps_dir()
63
63
  dump_paths = glob.glob(os.path.join(dump_root, "arxiv" + "*"))
64
64
  return "api" if not dump_paths else "local"
@@ -49,14 +49,20 @@ def optional_async(
49
49
 
50
50
 
51
51
  def retry_with_exponential_backoff(
52
- *, max_retries: int = 5, base_delay: float = 1.0
52
+ *,
53
+ max_retries: int = 5,
54
+ base_delay: float = 1.0,
55
+ factor: float = 1.3,
56
+ constant_delay: float = 0.2,
53
57
  ) -> Callable[[F], F]:
54
58
  """
55
59
  Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
56
60
 
57
61
  Args:
58
62
  max_retries: how many times to retry before giving up.
59
- base_delay: initial delay in seconds; next delays will be duplication of previous.
63
+ base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
64
+ factor: multiplier for delay after each retry.
65
+ constant_delay: fixed delay before each attempt.
60
66
 
61
67
  Usage:
62
68
 
@@ -70,18 +76,39 @@ def retry_with_exponential_backoff(
70
76
  @wraps(func)
71
77
  async def wrapper(*args, **kwargs) -> Any:
72
78
  delay = base_delay
73
- for attempt in range(max_retries):
79
+ last_exception: BaseException | None = None
80
+ for attempt in range(1, max_retries + 1):
81
+ await asyncio.sleep(constant_delay)
74
82
  try:
75
83
  return await func(*args, **kwargs)
76
84
  except httpx.HTTPStatusError as e:
77
- # only retry on 429
78
85
  status = e.response.status_code if e.response is not None else None
79
- if status != 429 or attempt == max_retries - 1:
86
+ if status != 429:
80
87
  raise
81
- # backoff
82
- await asyncio.sleep(delay)
83
- delay *= 2
84
- # in theory we never reach here
88
+ last_exception = e
89
+ sleep_for = delay
90
+ if e.response is not None:
91
+ ra = e.response.headers.get("Retry-After")
92
+ if ra is not None:
93
+ try:
94
+ sleep_for = float(ra)
95
+ except ValueError:
96
+ pass
97
+ delay *= factor
98
+
99
+ except httpx.ReadError as e:
100
+ last_exception = e
101
+ sleep_for = delay
102
+ delay *= factor
103
+
104
+ if attempt == max_retries:
105
+ msg = (
106
+ f"{func.__name__} failed after {attempt} attempts with "
107
+ f"last delay {sleep_for:.2f}s"
108
+ )
109
+ raise RuntimeError(msg) from last_exception
110
+
111
+ await asyncio.sleep(sleep_for)
85
112
 
86
113
  return wrapper
87
114
 
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import os
2
3
  import sys
3
4
  from time import sleep
4
5
 
@@ -7,7 +8,7 @@ from semanticscholar import SemanticScholar, SemanticScholarException
7
8
 
8
9
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
9
10
  logger = logging.getLogger(__name__)
10
- sch = SemanticScholar()
11
+ sch = SemanticScholar(api_key=os.getenv("SS_API_KEY"))
11
12
 
12
13
 
13
14
  def get_citations_by_doi(doi: str) -> int:
@@ -5,14 +5,15 @@ from pydantic import BaseModel
5
5
 
6
6
 
7
7
  class EntityResult(BaseModel):
8
- num_citations: int
9
- num_references: int
10
- # keys are authors or papers and values are absolute self links
11
- self_citations: Dict[str, int] = {}
12
- self_references: Dict[str, int] = {}
13
8
  # aggregated results
14
9
  self_citation_ratio: float = 0
15
10
  self_reference_ratio: float = 0
11
+ # total number of author citations/references
12
+ num_citations: int
13
+ num_references: int
14
+ # keys are papers and values are percentage of self citations/references
15
+ self_citations: Dict[str, float] = {}
16
+ self_references: Dict[str, float] = {}
16
17
 
17
18
 
18
19
  class Entity:
@@ -68,14 +68,14 @@ class Paper(Entity):
68
68
  Extracts the self references of a paper, for each author.
69
69
  """
70
70
  if isinstance(self.doi, str):
71
- self.ref_result: ReferenceResult = self_references_paper(self.doi)
71
+ self.self_ref: ReferenceResult = self_references_paper(self.doi)
72
72
 
73
73
  def self_citations(self):
74
74
  """
75
75
  Extracts the self citations of a paper, for each author.
76
76
  """
77
77
  if isinstance(self.doi, str):
78
- self.citation_result: CitationResult = self_citations_paper(self.doi)
78
+ self.self_cite: CitationResult = self_citations_paper(self.doi)
79
79
 
80
80
  def get_result(self) -> Optional[PaperResult]:
81
81
  """
@@ -83,18 +83,20 @@ class Paper(Entity):
83
83
 
84
84
  Returns: PaperResult if available.
85
85
  """
86
- if not hasattr(self, "ref_result"):
87
- logger.warning(
88
- f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
89
- )
90
- return
91
- elif not hasattr(self, "citation_result"):
92
- logger.warning(
93
- f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
94
- )
95
- return
96
- ref_result = self.ref_result.model_dump()
97
- ref_result.pop("ssid", None)
86
+ if not hasattr(self, "self_ref"):
87
+ self.self_references()
88
+ if not hasattr(self, "self_cite"):
89
+ self.self_citations()
98
90
  return PaperResult(
99
- title=self.title, **ref_result, **self.citation_result.model_dump()
91
+ title=self.title,
92
+ **{
93
+ k: v
94
+ for k, v in self.self_ref.model_dump().items()
95
+ if k not in ["ssid", "title"]
96
+ },
97
+ **{
98
+ k: v
99
+ for k, v in self.self_cite.model_dump().items()
100
+ if k not in ["title"]
101
+ },
100
102
  )
@@ -0,0 +1,221 @@
1
+ import asyncio
2
+ import os
3
+ from typing import Any, List, Literal, Optional, Tuple
4
+
5
+ from semanticscholar import SemanticScholar
6
+
7
+ from ..orcid import orcid_to_author_name
8
+ from ..self_citations import CitationResult, self_citations_paper
9
+ from ..self_references import ReferenceResult, self_references_paper
10
+ from ..utils import author_name_to_ssaid, get_papers_for_author
11
+ from .core import Entity, EntityResult
12
+
13
+
14
+ class ResearcherResult(EntityResult):
15
+ name: str
16
+ ssaid: int
17
+ orcid: Optional[str] = None
18
+
19
+ def _ordered_items(self) -> List[Tuple[str, Any]]:
20
+ # enforce specific ordering
21
+ return [
22
+ ("name", self.name),
23
+ ("self_reference_ratio", self.self_reference_ratio),
24
+ ("self_citation_ratio", self.self_citation_ratio),
25
+ ("num_references", self.num_references),
26
+ ("num_citations", self.num_citations),
27
+ ("self_references", self.self_references),
28
+ ("self_citations", self.self_citations),
29
+ ("ssaid", self.ssaid),
30
+ ("orcid", self.orcid),
31
+ ]
32
+
33
+ def __repr__(self) -> str:
34
+ inner = ", ".join(f"{k}={v!r}" for k, v in self._ordered_items())
35
+ return f"{self.__class__.__name__}({inner})"
36
+
37
+ def __str__(self) -> str:
38
+ return " ".join(f"{k}={v!r}" for k, v in self._ordered_items())
39
+
40
+
41
+ ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
42
+
43
+ sch = SemanticScholar(api_key=os.getenv("SS_API_KEY"))
44
+
45
+
46
+ class Researcher(Entity):
47
+ name: str
48
+ ssaid: int
49
+ orcid: Optional[str] = None
50
+ ssids: List[int] = []
51
+
52
+ def __init__(self, input: str, mode: ModeType = "infer"):
53
+ """
54
+ Construct researcher object for self citation/reference analysis.
55
+
56
+ Args:
57
+ input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
58
+ mode: One of `name`, `orcid` (ORCID iD), `ssaid` (Semantic Scholar Author ID) or `infer` (guess the mode from the input).
59
+ Defaults to "infer".
60
+
61
+ Raises:
62
+ ValueError: Unknown mode
63
+ """
64
+ if mode not in MODES:
65
+ raise ValueError(f"Unknown mode {mode} chose from {MODES}.")
66
+
67
+ input = input.strip()
68
+ if mode == "infer":
69
+ if input.isdigit():
70
+ mode = "ssaid"
71
+ elif (
72
+ input.count("-") == 3
73
+ and len(input) == 19
74
+ and all([x.isdigit() for x in input.split("-")])
75
+ ):
76
+ mode = "orcid"
77
+ else:
78
+ mode = "name"
79
+ if mode == "ssaid":
80
+ self.name = sch.get_author(input)._name
81
+ self.ssaid = input
82
+ elif mode == "orcid":
83
+ orcid_name = orcid_to_author_name(input)
84
+ self.orcid = input
85
+ self.ssaid, self.name = author_name_to_ssaid(orcid_name)
86
+ elif mode == "name":
87
+ self.name = input
88
+ self.ssaid, self.name = author_name_to_ssaid(input)
89
+
90
+ self.result = ResearcherResult(
91
+ name=self.name,
92
+ ssaid=int(self.ssaid),
93
+ orcid=self.orcid,
94
+ num_citations=-1,
95
+ num_references=-1,
96
+ )
97
+
98
+ async def _self_references_async(
99
+ self, verbose: bool = False
100
+ ) -> List[ReferenceResult]:
101
+ """Async version of self_references."""
102
+ if self.ssaid == "-1":
103
+ return []
104
+ if self.ssids == []:
105
+ self.ssids = await get_papers_for_author(self.ssaid)
106
+
107
+ results: List[ReferenceResult] = await self_references_paper(
108
+ self.ssids, verbose=verbose
109
+ )
110
+ # Remove papers with zero references or that are erratum/corrigendum
111
+ results = [
112
+ r
113
+ for r in results
114
+ if r.num_references > 0
115
+ and "erratum" not in r.title.lower()
116
+ and "corrigendum" not in r.title.lower()
117
+ ]
118
+
119
+ return results
120
+
121
+ def self_references(self, verbose: bool = False) -> ResearcherResult:
122
+ """
123
+ Sifts through all papers of a researcher and extracts the self references.
124
+
125
+ Args:
126
+ verbose: If True, logs detailed information for each paper.
127
+
128
+ Returns:
129
+ A ResearcherResult containing aggregated self-reference data.
130
+ """
131
+ reference_results = asyncio.run(self._self_references_async(verbose=verbose))
132
+
133
+ individual_self_references = {
134
+ getattr(result, "title"): getattr(result, "self_references").get(
135
+ self.name, 0.0
136
+ )
137
+ for result in reference_results
138
+ }
139
+ reference_ratio = sum(individual_self_references.values()) / max(
140
+ 1, len(individual_self_references)
141
+ )
142
+
143
+ self.result = self.result.model_copy(
144
+ update={
145
+ "num_references": sum(r.num_references for r in reference_results),
146
+ "self_references": dict(
147
+ sorted(
148
+ individual_self_references.items(),
149
+ key=lambda x: x[1],
150
+ reverse=True,
151
+ )
152
+ ),
153
+ "self_reference_ratio": round(reference_ratio, 3),
154
+ }
155
+ )
156
+
157
+ return self.result
158
+
159
+ async def _self_citations_async(
160
+ self, verbose: bool = False
161
+ ) -> List[CitationResult]:
162
+ """Async version of self_citations."""
163
+ if self.ssaid == "-1":
164
+ return []
165
+ if self.ssids == []:
166
+ self.ssids = await get_papers_for_author(self.ssaid)
167
+
168
+ results: List[CitationResult] = await self_citations_paper(
169
+ self.ssids, verbose=verbose
170
+ )
171
+ # Remove papers with zero citations or that are erratum/corrigendum
172
+ results = [
173
+ r
174
+ for r in results
175
+ if r.num_citations > 0
176
+ and "erratum" not in r.title.lower()
177
+ and "corrigendum" not in r.title.lower()
178
+ ]
179
+
180
+ return results
181
+
182
+ def self_citations(self, verbose: bool = False) -> ResearcherResult:
183
+ """
184
+ Sifts through all papers of a researcher and finds how often they are self-cited.
185
+ """
186
+ citation_results = asyncio.run(self._self_citations_async(verbose=verbose))
187
+ individual_self_citations = {
188
+ getattr(result, "title"): getattr(result, "self_citations").get(
189
+ self.name, 0.0
190
+ )
191
+ for result in citation_results
192
+ }
193
+ citation_ratio = sum(individual_self_citations.values()) / max(
194
+ 1, len(individual_self_citations)
195
+ )
196
+
197
+ self.result = self.result.model_copy(
198
+ update={
199
+ "num_citations": sum(r.num_citations for r in citation_results),
200
+ "self_citations": dict(
201
+ sorted(
202
+ individual_self_citations.items(),
203
+ key=lambda x: x[1],
204
+ reverse=True,
205
+ )
206
+ ),
207
+ "self_citation_ratio": round(citation_ratio, 3),
208
+ }
209
+ )
210
+
211
+ return self.result
212
+
213
+ def get_result(self) -> ResearcherResult:
214
+ """
215
+ Provides the result of the analysis.
216
+ """
217
+ if not hasattr(self, "self_ref"):
218
+ self.self_references()
219
+ if not hasattr(self, "self_cite"):
220
+ self.self_citations()
221
+ return self.result
@@ -18,11 +18,13 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
18
18
 
19
19
  class CitationResult(BaseModel):
20
20
  ssid: str # semantic scholar paper id
21
+ title: str
21
22
  num_citations: int
22
23
  self_citations: Dict[str, float] = {}
23
24
  citation_score: float
24
25
 
25
26
 
27
+ @retry_with_exponential_backoff(max_retries=14, base_delay=1.0)
26
28
  async def _fetch_citation_data(
27
29
  client: httpx.AsyncClient, suffix: str
28
30
  ) -> Dict[str, Any]:
@@ -87,6 +89,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
87
89
 
88
90
  return CitationResult(
89
91
  ssid=identifier,
92
+ title=paper.get("title", ""),
90
93
  num_citations=total_cites,
91
94
  self_citations=ratios,
92
95
  citation_score=avg_score,
@@ -94,7 +97,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
94
97
 
95
98
 
96
99
  @optional_async
97
- @retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
100
+ @retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
98
101
  async def self_citations_paper(
99
102
  inputs: Union[str, List[str]], verbose: bool = False
100
103
  ) -> Union[CitationResult, List[CitationResult]]:
@@ -118,7 +121,7 @@ async def self_citations_paper(
118
121
  if verbose:
119
122
  for res in results:
120
123
  logger.info(
121
- f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
124
+ f'Self-citations in "{res.title}": N={res.num_citations}, Score={res.citation_score}%'
122
125
  )
123
126
  for author, pct in res.self_citations.items():
124
127
  logger.info(f" {author}: {pct}%")