pub-analyzer 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pub_analyzer/__init__.py +1 -0
- pub_analyzer/__main__.py +7 -0
- pub_analyzer/css/body.tcss +87 -0
- pub_analyzer/css/buttons.tcss +24 -0
- pub_analyzer/css/checkbox.tcss +29 -0
- pub_analyzer/css/collapsible.tcss +31 -0
- pub_analyzer/css/datatable.tcss +50 -0
- pub_analyzer/css/editor.tcss +60 -0
- pub_analyzer/css/main.tcss +50 -0
- pub_analyzer/css/report.tcss +131 -0
- pub_analyzer/css/search.tcss +81 -0
- pub_analyzer/css/summary.tcss +75 -0
- pub_analyzer/css/tabs.tcss +18 -0
- pub_analyzer/css/tree.tcss +44 -0
- pub_analyzer/internal/__init__.py +1 -0
- pub_analyzer/internal/identifier.py +106 -0
- pub_analyzer/internal/limiter.py +34 -0
- pub_analyzer/internal/render.py +41 -0
- pub_analyzer/internal/report.py +497 -0
- pub_analyzer/internal/templates/author_report.typ +591 -0
- pub_analyzer/main.py +81 -0
- pub_analyzer/models/__init__.py +1 -0
- pub_analyzer/models/author.py +87 -0
- pub_analyzer/models/concept.py +19 -0
- pub_analyzer/models/institution.py +138 -0
- pub_analyzer/models/report.py +111 -0
- pub_analyzer/models/source.py +77 -0
- pub_analyzer/models/topic.py +59 -0
- pub_analyzer/models/work.py +158 -0
- pub_analyzer/widgets/__init__.py +1 -0
- pub_analyzer/widgets/author/__init__.py +1 -0
- pub_analyzer/widgets/author/cards.py +65 -0
- pub_analyzer/widgets/author/core.py +122 -0
- pub_analyzer/widgets/author/tables.py +50 -0
- pub_analyzer/widgets/body.py +55 -0
- pub_analyzer/widgets/common/__init__.py +18 -0
- pub_analyzer/widgets/common/card.py +29 -0
- pub_analyzer/widgets/common/filesystem.py +203 -0
- pub_analyzer/widgets/common/filters.py +111 -0
- pub_analyzer/widgets/common/input.py +97 -0
- pub_analyzer/widgets/common/label.py +36 -0
- pub_analyzer/widgets/common/modal.py +43 -0
- pub_analyzer/widgets/common/selector.py +66 -0
- pub_analyzer/widgets/common/summary.py +7 -0
- pub_analyzer/widgets/institution/__init__.py +1 -0
- pub_analyzer/widgets/institution/cards.py +78 -0
- pub_analyzer/widgets/institution/core.py +122 -0
- pub_analyzer/widgets/institution/tables.py +24 -0
- pub_analyzer/widgets/report/__init__.py +1 -0
- pub_analyzer/widgets/report/author.py +43 -0
- pub_analyzer/widgets/report/cards.py +130 -0
- pub_analyzer/widgets/report/concept.py +47 -0
- pub_analyzer/widgets/report/core.py +308 -0
- pub_analyzer/widgets/report/editor.py +80 -0
- pub_analyzer/widgets/report/export.py +112 -0
- pub_analyzer/widgets/report/grants.py +85 -0
- pub_analyzer/widgets/report/institution.py +39 -0
- pub_analyzer/widgets/report/locations.py +75 -0
- pub_analyzer/widgets/report/source.py +90 -0
- pub_analyzer/widgets/report/topic.py +55 -0
- pub_analyzer/widgets/report/work.py +391 -0
- pub_analyzer/widgets/search/__init__.py +11 -0
- pub_analyzer/widgets/search/core.py +96 -0
- pub_analyzer/widgets/search/results.py +82 -0
- pub_analyzer/widgets/sidebar.py +70 -0
- pub_analyzer-0.5.6.dist-info/METADATA +102 -0
- pub_analyzer-0.5.6.dist-info/RECORD +70 -0
- pub_analyzer-0.5.6.dist-info/WHEEL +4 -0
- pub_analyzer-0.5.6.dist-info/entry_points.txt +3 -0
- pub_analyzer-0.5.6.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Functions to extract OpenAlex IDs from Models."""
|
|
2
|
+
|
|
3
|
+
from pub_analyzer.models.author import Author, AuthorOpenAlexKey, AuthorResult, DehydratedAuthor
|
|
4
|
+
from pub_analyzer.models.institution import DehydratedInstitution, Institution, InstitutionOpenAlexKey, InstitutionResult
|
|
5
|
+
from pub_analyzer.models.source import DehydratedSource, Source
|
|
6
|
+
from pub_analyzer.models.work import Work
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_author_id(author: Author | AuthorResult | DehydratedAuthor) -> AuthorOpenAlexKey:
    """Extract OpenAlex ID from author Model.

    Args:
        author: Author model instance.

    Returns:
        Author OpenAlex ID, or an empty string when the ID URL has no path.

    Example:
        ```python
        from pub_analyzer.internal.identifier import get_author_id
        from pub_analyzer.models.author import DehydratedAuthor

        author = DehydratedAuthor(id="https://openalex.org/A000000000")
        print(get_author_id(author))
        # 'A000000000'
        ```
    """
    # The OpenAlex key is the last segment of the URL path, e.g. "/A000000000".
    path = author.id.path
    return path.rpartition("/")[2] if path else ""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_institution_id(institution: Institution | InstitutionResult | DehydratedInstitution) -> InstitutionOpenAlexKey:
    """Extract OpenAlex ID from institution Model.

    Args:
        institution: Institution model instance.

    Returns:
        Institution OpenAlex ID, or an empty string when the ID URL has no path.

    Example:
        ```python
        from pub_analyzer.internal.identifier import get_institution_id
        from pub_analyzer.models.institution import DehydratedInstitution

        institution = DehydratedInstitution(id="https://openalex.org/I000000000", **kwargs)
        print(get_institution_id(institution))
        # 'I000000000'
        ```
    """
    url_path = institution.id.path
    if not url_path:
        return ""
    # Keep only the final path segment of the OpenAlex URL.
    return url_path.rpartition("/")[2]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_work_id(work: Work) -> str:
    """Extract OpenAlex ID from Work Model.

    Args:
        work: Work model instance.

    Returns:
        Work OpenAlex ID, or an empty string when the ID URL has no path.

    Example:
        ```python
        from pub_analyzer.internal.identifier import get_work_id
        from pub_analyzer.models.work import Work

        work = Work(id="https://openalex.org/W000000000", **kwargs)
        print(get_work_id(work))
        # 'W000000000'
        ```
    """
    # Everything after the last "/" of the URL path is the OpenAlex key.
    work_path = work.id.path
    return work_path.rpartition("/")[2] if work_path else ""
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_source_id(source: DehydratedSource | Source) -> str:
    """Extract OpenAlex ID from Source Model.

    Args:
        source: Source model instance.

    Returns:
        Source OpenAlex ID, or an empty string when the ID URL has no path.

    Example:
        ```python
        from pub_analyzer.internal.identifier import get_source_id
        from pub_analyzer.models.source import Source

        source = Source(id="https://openalex.org/S000000000", **kwargs)
        print(get_source_id(source))
        # 'S000000000'
        ```
    """
    if not source.id.path:
        return ""
    # The key sits after the final "/" of the URL path.
    return source.id.path.rpartition("/")[2]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Rate limiter module."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class RateLimiter:
    """Token-bucket rate limiter for cooperating asyncio tasks."""

    def __init__(self, rate: int, per_second: float = 1.0) -> None:
        """Allow up to `rate` acquisitions per `per_second` seconds."""
        self.rate = rate
        self.per = per_second
        # The bucket starts full; tokens regenerate continuously at rate/per per second.
        self._tokens = float(rate)
        self._updated_at = time.monotonic()
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Wait until new token is available."""
        while True:
            async with self._lock:
                now = time.monotonic()
                elapsed = now - self._updated_at
                if elapsed > 0:
                    # Refill proportionally to the time passed, capped at the bucket size.
                    refill = elapsed * (self.rate / self.per)
                    self._tokens = min(self.rate, self._tokens + refill)
                    self._updated_at = now

                if self._tokens >= 1.0:
                    self._tokens -= 1.0
                    return

                # Not enough tokens: estimate how long until one is available.
                shortfall = 1.0 - self._tokens
                delay = shortfall * (self.per / self.rate)

            await asyncio.sleep(delay)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Render reports."""
|
|
2
|
+
|
|
3
|
+
import pathlib
|
|
4
|
+
import time
|
|
5
|
+
from importlib.metadata import version
|
|
6
|
+
|
|
7
|
+
import typst
|
|
8
|
+
from textual import log
|
|
9
|
+
|
|
10
|
+
from pub_analyzer.models.report import AuthorReport, InstitutionReport
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def render_report(report: AuthorReport | InstitutionReport, file_path: pathlib.Path | None) -> bytes | None:
    """Render report to PDF.

    Args:
        report: Report Model.
        file_path: Path to save the compiled file.

    Returns:
        PDF bytes or None if output file path is defined.

    Raises:
        SyntaxError: If typst compiler syntax error.
    """
    if isinstance(report, AuthorReport):
        # Typst templates live next to this module.
        typst_file = pathlib.Path(__file__).parent.resolve() / "templates" / "author_report.typ"
    if isinstance(report, InstitutionReport):
        raise NotImplementedError

    # The report payload and app version are handed to the template via sys.inputs.
    sys_inputs = {"report": report.model_dump_json(by_alias=True), "version": version("pub-analyzer")}

    start_time = time.time()
    compile_kwargs: dict[str, Any] = {"input": typst_file, "sys_inputs": sys_inputs}
    if file_path:
        compile_kwargs["output"] = file_path
    result = typst.compile(**compile_kwargs)

    log.info(f"Typst compile time: {round((time.time() - start_time), 2)} seconds.")
    return result
|
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
"""Functions to make reports."""
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
import math
|
|
5
|
+
from typing import Any, NewType
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
from pydantic import TypeAdapter
|
|
9
|
+
from textual import log
|
|
10
|
+
|
|
11
|
+
from pub_analyzer.internal import identifier
|
|
12
|
+
from pub_analyzer.internal.limiter import RateLimiter
|
|
13
|
+
from pub_analyzer.models.author import Author, AuthorOpenAlexKey, AuthorResult, AuthorYearCount, DehydratedAuthor
|
|
14
|
+
from pub_analyzer.models.institution import (
|
|
15
|
+
DehydratedInstitution,
|
|
16
|
+
Institution,
|
|
17
|
+
InstitutionOpenAlexKey,
|
|
18
|
+
InstitutionResult,
|
|
19
|
+
InstitutionYearCount,
|
|
20
|
+
)
|
|
21
|
+
from pub_analyzer.models.report import (
|
|
22
|
+
AuthorReport,
|
|
23
|
+
CitationReport,
|
|
24
|
+
CitationSummary,
|
|
25
|
+
CitationType,
|
|
26
|
+
InstitutionReport,
|
|
27
|
+
OpenAccessSummary,
|
|
28
|
+
SourcesSummary,
|
|
29
|
+
WorkReport,
|
|
30
|
+
WorkTypeCounter,
|
|
31
|
+
)
|
|
32
|
+
from pub_analyzer.models.source import DehydratedSource, Source
|
|
33
|
+
from pub_analyzer.models.work import Authorship, Work
|
|
34
|
+
|
|
35
|
+
FromDate = NewType("FromDate", datetime.datetime)
"""DateTime marker for works published from this date."""

ToDate = NewType("ToDate", datetime.datetime)
"""DateTime marker for works published up to this date."""

REQUEST_RATE_PER_SECOND = 8
"""The OpenAlex API requires a maximum of 10 requests per second. We limit this to 8 per second."""

# Number of results requested per page from the OpenAlex API (used in every works URL below).
PER_PAGE_SIZE = 100
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_author_profiles_keys(
    author: Author, extra_profiles: list[Author | AuthorResult | DehydratedAuthor] | None
) -> list[AuthorOpenAlexKey]:
    """Create a list of profiles IDs joining main author profile and extra author profiles.

    Args:
        author: Main OpenAlex author object.
        extra_profiles: Extra OpenAlex authors objects related with the main author.

    Returns:
        List of Author OpenAlex Keys.
    """
    # The main profile always comes first; extra profiles (if any) follow.
    profiles = [author, *(extra_profiles or [])]
    return [identifier.get_author_id(profile) for profile in profiles]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _get_institution_keys(
    institution: Institution, extra_profiles: list[Institution | InstitutionResult | DehydratedInstitution] | None
) -> list[InstitutionOpenAlexKey]:
    """Create a list of profiles IDs joining main institution profile and extra institution profiles.

    Args:
        institution: Main OpenAlex institution object.
        extra_profiles: Extra OpenAlex institutions objects related with the main institution.

    Returns:
        List of Institution OpenAlex Keys.
    """
    # Main institution first, then any related profiles.
    all_profiles: list[Institution | InstitutionResult | DehydratedInstitution] = [institution]
    if extra_profiles:
        all_profiles += extra_profiles
    return [identifier.get_institution_id(profile) for profile in all_profiles]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _get_authors_list(authorships: list[Authorship]) -> list[str]:
    """Collect OpenAlex IDs from authors in a list of authorship's.

    Args:
        authorships: List of authorships.

    Returns:
        Authors keys IDs.
    """
    return [identifier.get_author_id(entry.author) for entry in authorships]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _get_citation_type(original_work_authors: list[str], cited_work_authors: list[str]) -> CitationType:
    """Compare two lists of authors and returns the citation type.

    Based on the authors of a given work and the authors of another work that cites the analyzed work,
    calculate the citation type.

    Args:
        original_work_authors: List of the authors of the evaluated work.
        cited_work_authors: List of the authors of the citing document.

    Returns:
        Calculated cite type (Type A or Type B).

    Info:
        **Type A:** Citations made by researchers in documents where the evaluated author or
        one of his co-authors does not appear as part of the authorship of the citing documents.

        **Type B:** Citations generated by the author or one of the co-authors of the work being
        analyzed.
    """
    # Any author shared between the two works makes the citation a self/co-author cite (Type B).
    shared_authors = set(original_work_authors) & set(cited_work_authors)
    return CitationType.TypeB if shared_authors else CitationType.TypeA
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _add_work_abstract(work: dict[str, Any]) -> dict[str, Any]:
|
|
123
|
+
"""Get work abstract from abstract_inverted_index and insert new key `abstract`.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
work: Raw work.
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
Work with new key `abstract`.
|
|
130
|
+
"""
|
|
131
|
+
abstract_inverted_index = work.get("abstract_inverted_index")
|
|
132
|
+
if abstract_inverted_index:
|
|
133
|
+
work["abstract"] = " ".join(abstract_inverted_index)
|
|
134
|
+
else:
|
|
135
|
+
work["abstract"] = None
|
|
136
|
+
return work
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _get_valid_works(works: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
140
|
+
"""Skip works that do not contain enough data.
|
|
141
|
+
|
|
142
|
+
Args:
|
|
143
|
+
works: List of raw works.
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
List of raw works with enough data to pass the Works validation.
|
|
147
|
+
|
|
148
|
+
Danger:
|
|
149
|
+
Sometimes OpenAlex provides works with insufficient information to be considered.
|
|
150
|
+
In response, we have chosen to exclude such works at this stage, thus avoiding
|
|
151
|
+
the need to handle exceptions within the Model validators.
|
|
152
|
+
"""
|
|
153
|
+
valid_works = []
|
|
154
|
+
for work in works:
|
|
155
|
+
if work["title"] is not None:
|
|
156
|
+
valid_works.append(_add_work_abstract(work))
|
|
157
|
+
else:
|
|
158
|
+
log.warning(f"Discarded work: {work['id']}")
|
|
159
|
+
|
|
160
|
+
return valid_works
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _get_year_counter(
|
|
164
|
+
counts_by_year: list[AuthorYearCount] | list[InstitutionYearCount], work_publication_year: int
|
|
165
|
+
) -> AuthorYearCount | InstitutionYearCount | None:
|
|
166
|
+
"""Iterate over the counts_by_year and return the corresponding year counter."""
|
|
167
|
+
return next((year_counter for year_counter in counts_by_year if year_counter.year == work_publication_year), None)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
async def _get_works(client: httpx.AsyncClient, url: str, limiter: RateLimiter) -> list[Work]:
    """Get all works given a URL.

    Iterate over all pages of the URL.

    Args:
        client: HTTPX asynchronous client to be used to make the requests.
        url: URL of works with all filters and sorting applied.
        limiter: Rate limiter shared by all requests made against the OpenAlex API.

    Returns:
        List of Works Models.

    Raises:
        httpx.HTTPStatusError: One response from OpenAlex API had an error HTTP status of 4xx or 5xx.
    """
    await limiter.acquire()
    response = await client.get(url=url, follow_redirects=True)
    response.raise_for_status()

    json_response = response.json()
    meta_info = json_response["meta"]
    # Total pages = ceil(total result count / page size reported by the API).
    page_count = math.ceil(meta_info["count"] / meta_info["per_page"])

    works_data = list(_get_valid_works(json_response["results"]))

    # Page numbers are 1-based and page 1 was fetched above.
    for page_number in range(1, page_count):
        await limiter.acquire()
        page_response = await client.get(url + f"&page={page_number + 1}", follow_redirects=True)
        # Fix: paged responses were previously not checked for HTTP errors, so a
        # failing page surfaced as a confusing KeyError on "results" instead of
        # the documented httpx.HTTPStatusError.
        page_response.raise_for_status()
        works_data.extend(_get_valid_works(page_response.json()["results"]))

    return TypeAdapter(list[Work]).validate_python(works_data)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
async def _get_source(client: httpx.AsyncClient, url: str, limiter: RateLimiter) -> Source:
    """Get source given a URL.

    Args:
        client: HTTPX asynchronous client to be used to make the requests.
        url: URL of works with all filters.
        limiter: Rate limiter shared by all requests made against the OpenAlex API.

    Returns:
        Source Model.

    Raises:
        httpx.HTTPStatusError: One response from OpenAlex API had an error HTTP status of 4xx or 5xx.
    """
    await limiter.acquire()
    response = await client.get(url=url, follow_redirects=True)
    response.raise_for_status()

    json_response = response.json()
    hp_url = json_response["homepage_url"]
    if isinstance(hp_url, str) and not hp_url.startswith(("http://", "https://")):
        # Drop malformed homepage URLs so they do not break the model validation.
        # Fix: the previous check startswith(("http", "https")) was redundant
        # ("https…" already starts with "http") and accepted any string with an
        # "http" prefix, e.g. "httpfoo". Also fixed the "Discarted" typo in the log.
        json_response["homepage_url"] = None
        log.warning(f"Discarded source homepage url: {url}")

    return Source(**json_response)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
async def make_author_report(  # noqa: C901
    author: Author,
    extra_profiles: list[Author | AuthorResult | DehydratedAuthor] | None = None,
    pub_from_date: FromDate | None = None,
    pub_to_date: ToDate | None = None,
    cited_from_date: FromDate | None = None,
    cited_to_date: ToDate | None = None,
) -> AuthorReport:
    """Make a scientific production report by Author.

    Args:
        author: Author to whom the report is generated.
        extra_profiles: List of author profiles whose works will be attached.

        pub_from_date: Filter works published from this date.
        pub_to_date: Filter works published up to this date.

        cited_from_date: Filter works that cite the author, published after this date.
        cited_to_date: Filter works that cite the author, published up to this date.

    Returns:
        Author's scientific production report Model.

    Raises:
        httpx.HTTPStatusError: One response from OpenAlex API had an error HTTP status of 4xx or 5xx.
    """
    # All profiles (main + extras) are OR-ed in a single OpenAlex filter with "|".
    author_profiles_keys = _get_author_profiles_keys(author, extra_profiles)
    profiles_query_parameter = "|".join(author_profiles_keys)

    pub_from_filter = f",from_publication_date:{pub_from_date:%Y-%m-%d}" if pub_from_date else ""
    pub_to_filter = f",to_publication_date:{pub_to_date:%Y-%m-%d}" if pub_to_date else ""
    url = f"https://api.openalex.org/works?filter=author.id:{profiles_query_parameter}{pub_from_filter}{pub_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"

    limiter = RateLimiter(rate=REQUEST_RATE_PER_SECOND, per_second=1.0)
    async with httpx.AsyncClient(http2=True, timeout=None) as client:
        # Getting all the author works.
        author_works = await _get_works(client, url, limiter)

        # Extra filters
        cited_from_filter = f",from_publication_date:{cited_from_date:%Y-%m-%d}" if cited_from_date else ""
        cited_to_filter = f",to_publication_date:{cited_to_date:%Y-%m-%d}" if cited_to_date else ""

        # Report fields.
        works: list[WorkReport] = []
        report_citation_summary = CitationSummary()
        open_access_summary = OpenAccessSummary()
        works_type_counter: list[WorkTypeCounter] = []
        dehydrated_sources: list[DehydratedSource] = []
        counts_by_year: list[AuthorYearCount] = []

        # Getting all works that have cited the author.
        author_works_count = len(author_works)
        for idx_work, author_work in enumerate(author_works, 1):
            work_id = identifier.get_work_id(author_work)
            log.info(f"[{work_id}] Work [{idx_work}/{author_works_count}]")

            work_authors = _get_authors_list(authorships=author_work.authorships)
            cited_by_api_url = f"https://api.openalex.org/works?filter=cites:{work_id}{cited_from_filter}{cited_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"

            # Add work to the count by year
            if author_work.publication_year:
                year_counter = _get_year_counter(counts_by_year, author_work.publication_year)
                if year_counter:
                    year_counter.works_count += 1
                else:
                    counts_by_year.append(AuthorYearCount(year=author_work.publication_year, works_count=1, cited_by_count=0))

            # Adding the type of OpenAccess in the counter.
            open_access_summary.add_oa_type(author_work.open_access.oa_status)

            # Adding the work type to works type counter.
            work_type = next((work_type for work_type in works_type_counter if work_type.type_name == author_work.type), None)
            if work_type:
                work_type.count += 1
            else:
                works_type_counter.append(WorkTypeCounter(type_name=author_work.type, count=1))

            # Add Sources to global list (deduplicated by source id).
            for location in author_work.locations:
                if location.source and not any(source.id == location.source.id for source in dehydrated_sources):
                    dehydrated_sources.append(location.source)

            cited_by_works = await _get_works(client, cited_by_api_url, limiter)
            cited_by: list[CitationReport] = []
            work_citation_summary = CitationSummary()
            for cited_by_work in cited_by_works:
                cited_authors = _get_authors_list(authorships=cited_by_work.authorships)
                citation_type = _get_citation_type(work_authors, cited_authors)

                # Adding the type of cites in the counters.
                report_citation_summary.add_cite_type(citation_type)
                work_citation_summary.add_cite_type(citation_type)

                # Add work to the count by year
                if cited_by_work.publication_year:
                    year_counter = _get_year_counter(counts_by_year, cited_by_work.publication_year)
                    if year_counter:
                        year_counter.cited_by_count += 1
                    else:
                        counts_by_year.append(AuthorYearCount(year=cited_by_work.publication_year, works_count=0, cited_by_count=1))

                cited_by.append(CitationReport(work=cited_by_work, citation_type=citation_type))

            works.append(WorkReport(work=author_work, cited_by=cited_by, citation_summary=work_citation_summary))

        # Replace counts by year
        counts_by_year.sort(key=lambda c: c.year)
        author.counts_by_year = counts_by_year

        # Get sources full info.
        sources: list[Source] = []
        sources_count = len(dehydrated_sources)
        for idx, dehydrated_source in enumerate(dehydrated_sources, 1):
            source_id = identifier.get_source_id(dehydrated_source)
            source_url = f"https://api.openalex.org/sources/{source_id}"

            log.info(f"[{source_id}] Getting Sources... [{idx}/{sources_count}]")

            # Fix: for consistency with make_institution_report, a single failing
            # source no longer aborts the whole report — it is logged and skipped.
            try:
                sources.append(await _get_source(client, source_url, limiter))
            except httpx.HTTPStatusError as exc:
                log.warning(f"Failed to retrieve {source_id}: {exc}")

        # Sort sources by impact factor (2-year mean citedness), highest first.
        sources_sorted = sorted(sources, key=lambda source: source.summary_stats.two_yr_mean_citedness, reverse=True)
        sources_summary = SourcesSummary(sources=sources_sorted)

    return AuthorReport(
        author=author,
        works=works,
        citation_summary=report_citation_summary,
        open_access_summary=open_access_summary,
        works_type_summary=works_type_counter,
        sources_summary=sources_summary,
    )
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
async def make_institution_report(  # noqa: C901
    institution: Institution,
    extra_profiles: list[Institution | InstitutionResult | DehydratedInstitution] | None = None,
    pub_from_date: FromDate | None = None,
    pub_to_date: ToDate | None = None,
    cited_from_date: FromDate | None = None,
    cited_to_date: ToDate | None = None,
) -> InstitutionReport:
    """Make a scientific production report by Institution.

    Args:
        institution: Institution to which the report is generated.
        extra_profiles: List of institutions profiles whose works will be attached.

        pub_from_date: Filter works published from this date.
        pub_to_date: Filter works published up to this date.

        cited_from_date: Filter works that cite the institution, published after this date.
        cited_to_date: Filter works that cite the institution, published up to this date.

    Returns:
        Institution's scientific production report Model.

    Raises:
        httpx.HTTPStatusError: One response from OpenAlex API had an error HTTP status of 4xx or 5xx.
    """
    # All profiles (main + extras) are OR-ed in a single OpenAlex filter with "|".
    institution_keys = _get_institution_keys(institution, extra_profiles)
    institution_query_parameter = "|".join(institution_keys)

    pub_from_filter = f",from_publication_date:{pub_from_date:%Y-%m-%d}" if pub_from_date else ""
    pub_to_filter = f",to_publication_date:{pub_to_date:%Y-%m-%d}" if pub_to_date else ""
    url = f"https://api.openalex.org/works?filter=institutions.id:{institution_query_parameter}{pub_from_filter}{pub_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"

    limiter = RateLimiter(rate=REQUEST_RATE_PER_SECOND, per_second=1.0)
    async with httpx.AsyncClient(http2=True, timeout=None) as client:
        # Getting all the institution works.
        institution_works = await _get_works(client=client, url=url, limiter=limiter)

        # Extra filters
        cited_from_filter = f",from_publication_date:{cited_from_date:%Y-%m-%d}" if cited_from_date else ""
        cited_to_filter = f",to_publication_date:{cited_to_date:%Y-%m-%d}" if cited_to_date else ""

        # Report fields.
        works: list[WorkReport] = []
        report_citation_summary = CitationSummary()
        open_access_summary = OpenAccessSummary()
        works_type_counter: list[WorkTypeCounter] = []
        dehydrated_sources: list[DehydratedSource] = []
        counts_by_year: list[InstitutionYearCount] = []

        # Getting all works that have cited a work.
        institution_works_count = len(institution_works)
        for idx_work, institution_work in enumerate(institution_works, 1):
            work_id = identifier.get_work_id(institution_work)
            log.info(f"[{work_id}] Work [{idx_work}/{institution_works_count}]")

            work_authors = _get_authors_list(authorships=institution_work.authorships)
            cited_by_api_url = f"https://api.openalex.org/works?filter=cites:{work_id}{cited_from_filter}{cited_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"

            # Add work to the count by year
            if institution_work.publication_year:
                year_counter = _get_year_counter(counts_by_year, institution_work.publication_year)
                if year_counter:
                    year_counter.works_count += 1
                else:
                    counts_by_year.append(InstitutionYearCount(year=institution_work.publication_year, works_count=1, cited_by_count=0))

            # Adding the type of OpenAccess in the counter.
            open_access_summary.add_oa_type(institution_work.open_access.oa_status)

            # Adding the work type to works type counter.
            work_type = next((work_type for work_type in works_type_counter if work_type.type_name == institution_work.type), None)
            if work_type:
                work_type.count += 1
            else:
                works_type_counter.append(WorkTypeCounter(type_name=institution_work.type, count=1))

            # Add Sources to global list (deduplicated by source id).
            for location in institution_work.locations:
                if location.source and not any(source.id == location.source.id for source in dehydrated_sources):
                    dehydrated_sources.append(location.source)

            cited_by_works = await _get_works(client, cited_by_api_url, limiter)
            cited_by: list[CitationReport] = []
            work_citation_summary = CitationSummary()
            for cited_by_work in cited_by_works:
                cited_authors = _get_authors_list(authorships=cited_by_work.authorships)
                citation_type = _get_citation_type(work_authors, cited_authors)

                # Adding the type of cites in the counters.
                report_citation_summary.add_cite_type(citation_type)
                work_citation_summary.add_cite_type(citation_type)

                # Add work to the count by year
                if cited_by_work.publication_year:
                    year_counter = _get_year_counter(counts_by_year, cited_by_work.publication_year)
                    if year_counter:
                        year_counter.cited_by_count += 1
                    else:
                        counts_by_year.append(InstitutionYearCount(year=cited_by_work.publication_year, works_count=0, cited_by_count=1))

                cited_by.append(CitationReport(work=cited_by_work, citation_type=citation_type))

            works.append(WorkReport(work=institution_work, cited_by=cited_by, citation_summary=work_citation_summary))

        # Replace counts by year
        counts_by_year.sort(key=lambda c: c.year)
        institution.counts_by_year = counts_by_year

        # Get sources full info.
        sources: list[Source] = []
        sources_count = len(dehydrated_sources)
        for idx, dehydrated_source in enumerate(dehydrated_sources, 1):
            source_id = identifier.get_source_id(dehydrated_source)
            source_url = f"https://api.openalex.org/sources/{source_id}"

            log.info(f"[{source_id}] Getting Sources... [{idx}/{sources_count}]")

            # A single failing source is logged and skipped rather than aborting the report.
            try:
                sources.append(await _get_source(client, source_url, limiter))
            except httpx.HTTPStatusError as exc:
                # Fix: corrected "Fail to retrive" typo in the log message.
                log.warning(f"Failed to retrieve {source_id}: {exc}")

        # Sort sources by impact factor (2-year mean citedness), highest first.
        sources_sorted = sorted(sources, key=lambda source: source.summary_stats.two_yr_mean_citedness, reverse=True)
        sources_summary = SourcesSummary(sources=sources_sorted)

    return InstitutionReport(
        institution=institution,
        works=works,
        citation_summary=report_citation_summary,
        open_access_summary=open_access_summary,
        works_type_summary=works_type_counter,
        sources_summary=sources_summary,
    )
|