pub-analyzer 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pub-analyzer might be problematic. Click here for more details.

@@ -0,0 +1,60 @@
1
+ /* COLORS */
2
+ $bg-main-color: white;
3
+ $bg-secondary-color: #e5e7eb;
4
+ $bg-secondary-color-accent: #d1d5db;
5
+ $text-primary-color: black;
6
+
7
+ $bg-main-color-darken: #1e293b;
8
+ $bg-secondary-color-darken: #0f172a;
9
+ $text-primary-color-darken: black;
10
+
11
+ $primary-color: #b91c1c;
12
+ $primary-color-accent: #991b1b;
13
+ $primary-color-highlight: #dc2626;
14
+
15
+ TextEditor {
16
+ #dialog {
17
+ margin: 0 10;
18
+ min-height: 10vh;
19
+ max-height: 60vh;
20
+ }
21
+
22
+ #text-editor-container {
23
+ height: 1fr;
24
+ }
25
+
26
+ TextArea{
27
+ height: auto;
28
+ padding: 1 3;
29
+
30
+ background: $bg-main-color;
31
+ border: none;
32
+
33
+ .text-area--cursor {
34
+ background: $primary-color;
35
+ }
36
+ .text-area--cursor-gutter {
37
+ color: $bg-main-color;
38
+ background: $primary-color-accent;
39
+ }
40
+ .text-area--cursor-line {
41
+ background: $bg-main-color;
42
+ }
43
+ .text-area--matching-bracket {
44
+ background: $primary-color-highlight 30%;
45
+ }
46
+
47
+ }
48
+
49
+ #actions-buttons {
50
+ height: 3;
51
+ margin-top: 1;
52
+ margin-bottom: 2;
53
+
54
+ align: center middle;
55
+
56
+ Button {
57
+ margin: 0 5;
58
+ }
59
+ }
60
+ }
@@ -67,29 +67,29 @@ LoadReportWidget .button-container {
67
67
  }
68
68
 
69
69
  /* Export Report Pane */
70
- ExportReportPane #export-form {
70
+ #export-form {
71
71
  height: auto;
72
72
  }
73
73
 
74
- ExportReportPane .export-form-input-container {
74
+ .export-form-input-container {
75
75
  height: auto;
76
76
  margin-bottom: 2;
77
77
  }
78
78
 
79
- ExportReportPane .export-form-label {
79
+ .export-form-label {
80
80
  width: 25vw;
81
81
  border-bottom: solid $text-primary-color;
82
82
  }
83
83
 
84
- ExportReportPane .file-selector-container {
84
+ .file-selector-container {
85
85
  height: 3;
86
86
  }
87
87
 
88
- ExportReportPane .export-form-input {
88
+ .export-form-input {
89
89
  width: 50vw;
90
90
  }
91
91
 
92
- ExportReportPane .export-form-buttons {
92
+ .export-form-buttons {
93
93
  align: center middle;
94
94
  height: 3;
95
95
  }
@@ -113,6 +113,15 @@ WorkModal #dialog .abstract {
113
113
  padding: 1 2;
114
114
  }
115
115
 
116
+ WorkModal TabPane EditWidget {
117
+ height: 3;
118
+ margin-top: 1;
119
+
120
+ Horizontal {
121
+ align: center middle;
122
+ }
123
+ }
124
+
116
125
  WorkModal #dialog #tables-container {
117
126
  margin: 1 0;
118
127
  }
@@ -0,0 +1,34 @@
1
+ """Rate limiter module."""
2
+
3
+ import asyncio
4
+ import time
5
+
6
+
7
+ class RateLimiter:
8
+ """Rate limiter."""
9
+
10
+ def __init__(self, rate: int, per_second: float = 1.0) -> None:
11
+ self.rate = rate
12
+ self.per = per_second
13
+ self._tokens = float(rate)
14
+ self._updated_at = time.monotonic()
15
+ self._lock = asyncio.Lock()
16
+
17
+ async def acquire(self) -> None:
18
+ """Wait until new token is available."""
19
+ while True:
20
+ async with self._lock:
21
+ now = time.monotonic()
22
+ elapsed = now - self._updated_at
23
+ if elapsed > 0:
24
+ self._tokens = min(self.rate, self._tokens + elapsed * (self.rate / self.per))
25
+ self._updated_at = now
26
+
27
+ if self._tokens >= 1.0:
28
+ self._tokens -= 1.0
29
+ return
30
+
31
+ missing = 1.0 - self._tokens
32
+ wait_time = missing * (self.per / self.rate)
33
+
34
+ await asyncio.sleep(wait_time)
@@ -1,68 +1,41 @@
1
1
  """Render reports."""
2
2
 
3
3
  import pathlib
4
+ import time
4
5
  from importlib.metadata import version
5
6
 
6
7
  import typst
7
- from jinja2 import Environment, FileSystemLoader
8
+ from textual import log
8
9
 
9
10
  from pub_analyzer.models.report import AuthorReport, InstitutionReport
10
11
 
11
12
 
12
- async def render_template_report(report: AuthorReport | InstitutionReport) -> str:
13
- """Render report template.
14
-
15
- Render the report to typst format using the templates.
13
def render_report(report: AuthorReport | InstitutionReport, file_path: pathlib.Path | None) -> bytes | None:
    """Render report to PDF.

    Compile the bundled typst template, injecting the report data through
    ``sys_inputs`` so no intermediate file has to be written.

    Args:
        report: Report Model.
        file_path: Path to save the compiled file. If ``None``, the PDF
            bytes are returned instead of being written to disk.

    Returns:
        PDF bytes or None if output file path is defined.

    Raises:
        NotImplementedError: If report is `InstitutionReport` type.
        TypeError: If report is not a supported report type.
        SyntaxError: If typst compiler syntax error.
    """
    if isinstance(report, InstitutionReport):
        raise NotImplementedError
    if not isinstance(report, AuthorReport):
        # Fail fast with a clear error instead of hitting an unbound
        # `typst_file` NameError further down.
        raise TypeError(f"Unsupported report type: {type(report).__name__}")

    templates_path = pathlib.Path(__file__).parent.resolve().joinpath("templates")
    typst_file = templates_path / "author_report.typ"

    # The template reads the report as JSON plus the package version via typst sys inputs.
    sys_inputs = {"report": report.model_dump_json(by_alias=True), "version": version("pub-analyzer")}

    start_time = time.time()
    if file_path:
        result = typst.compile(input=typst_file, output=file_path, sys_inputs=sys_inputs)
    else:
        result = typst.compile(input=typst_file, sys_inputs=sys_inputs)

    log.info(f"Typst compile time: {round((time.time() - start_time), 2)} seconds.")
    return result
@@ -6,8 +6,10 @@ from typing import Any, NewType
6
6
 
7
7
  import httpx
8
8
  from pydantic import TypeAdapter
9
+ from textual import log
9
10
 
10
11
  from pub_analyzer.internal import identifier
12
+ from pub_analyzer.internal.limiter import RateLimiter
11
13
  from pub_analyzer.models.author import Author, AuthorOpenAlexKey, AuthorResult, DehydratedAuthor
12
14
  from pub_analyzer.models.institution import DehydratedInstitution, Institution, InstitutionOpenAlexKey, InstitutionResult
13
15
  from pub_analyzer.models.report import (
@@ -30,6 +32,10 @@ FromDate = NewType("FromDate", datetime.datetime)
30
32
  ToDate = NewType("ToDate", datetime.datetime)
31
33
  """DateTime marker for works published up to this date."""
32
34
 
35
+ REQUEST_RATE_PER_SECOND = 8
36
+ """The OpenAlex API requires a maximum of 10 requests per second. We limit this to 8 per second."""
37
+ PER_PAGE_SIZE = 100
38
+
33
39
 
34
40
  def _get_author_profiles_keys(
35
41
  author: Author, extra_profiles: list[Author | AuthorResult | DehydratedAuthor] | None
@@ -138,10 +144,17 @@ def _get_valid_works(works: list[dict[str, Any]]) -> list[dict[str, Any]]:
138
144
  In response, we have chosen to exclude such works at this stage, thus avoiding
139
145
  the need to handle exceptions within the Model validators.
140
146
  """
141
- return [_add_work_abstract(work) for work in works if work["title"] is not None]
147
+ valid_works = []
148
+ for work in works:
149
+ if work["title"] is not None:
150
+ valid_works.append(_add_work_abstract(work))
151
+ else:
152
+ log.warning(f"Discarded work: {work['id']}")
153
+
154
+ return valid_works
142
155
 
143
156
 
144
- async def _get_works(client: httpx.AsyncClient, url: str) -> list[Work]:
157
+ async def _get_works(client: httpx.AsyncClient, url: str, limiter: RateLimiter) -> list[Work]:
145
158
  """Get all works given a URL.
146
159
 
147
160
  Iterate over all pages of the URL
@@ -156,7 +169,8 @@ async def _get_works(client: httpx.AsyncClient, url: str) -> list[Work]:
156
169
  Raises:
157
170
  httpx.HTTPStatusError: One response from OpenAlex API had an error HTTP status of 4xx or 5xx.
158
171
  """
159
- response = await client.get(url=url)
172
+ await limiter.acquire()
173
+ response = await client.get(url=url, follow_redirects=True)
160
174
  response.raise_for_status()
161
175
 
162
176
  json_response = response.json()
@@ -166,13 +180,14 @@ async def _get_works(client: httpx.AsyncClient, url: str) -> list[Work]:
166
180
  works_data = list(_get_valid_works(json_response["results"]))
167
181
 
168
182
  for page_number in range(1, page_count):
169
- page_result = (await client.get(url + f"&page={page_number + 1}")).json()
183
+ await limiter.acquire()
184
+ page_result = (await client.get(url + f"&page={page_number + 1}", follow_redirects=True)).json()
170
185
  works_data.extend(_get_valid_works(page_result["results"]))
171
186
 
172
187
  return TypeAdapter(list[Work]).validate_python(works_data)
173
188
 
174
189
 
175
- async def _get_source(client: httpx.AsyncClient, url: str) -> Source:
190
+ async def _get_source(client: httpx.AsyncClient, url: str, limiter: RateLimiter) -> Source:
176
191
  """Get source given a URL.
177
192
 
178
193
  Args:
@@ -185,10 +200,18 @@ async def _get_source(client: httpx.AsyncClient, url: str) -> Source:
185
200
  Raises:
186
201
  httpx.HTTPStatusError: One response from OpenAlex API had an error HTTP status of 4xx or 5xx.
187
202
  """
188
- response = await client.get(url=url)
203
+ await limiter.acquire()
204
+ response = await client.get(url=url, follow_redirects=True)
189
205
  response.raise_for_status()
190
206
 
191
- return Source(**response.json())
207
+ json_response = response.json()
208
+ hp_url = json_response["homepage_url"]
209
+ if isinstance(hp_url, str):
210
+ if not hp_url.startswith(("http", "https")):
211
+ json_response["homepage_url"] = None
212
+ log.warning(f"Discarted source homepage url: {url}")
213
+
214
+ return Source(**json_response)
192
215
 
193
216
 
194
217
  async def make_author_report(
@@ -222,13 +245,12 @@ async def make_author_report(
222
245
 
223
246
  pub_from_filter = f",from_publication_date:{pub_from_date:%Y-%m-%d}" if pub_from_date else ""
224
247
  pub_to_filter = f",to_publication_date:{pub_to_date:%Y-%m-%d}" if pub_to_date else ""
225
- url = (
226
- f"https://api.openalex.org/works?filter=author.id:{profiles_query_parameter}{pub_from_filter}{pub_to_filter}&sort=publication_date"
227
- )
248
+ url = f"https://api.openalex.org/works?filter=author.id:{profiles_query_parameter}{pub_from_filter}{pub_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"
228
249
 
229
- async with httpx.AsyncClient() as client:
250
+ limiter = RateLimiter(rate=REQUEST_RATE_PER_SECOND, per_second=1.0)
251
+ async with httpx.AsyncClient(http2=True, timeout=None) as client:
230
252
  # Getting all the author works.
231
- author_works = await _get_works(client, url)
253
+ author_works = await _get_works(client, url, limiter)
232
254
 
233
255
  # Extra filters
234
256
  cited_from_filter = f",from_publication_date:{cited_from_date:%Y-%m-%d}" if cited_from_date else ""
@@ -242,12 +264,13 @@ async def make_author_report(
242
264
  dehydrated_sources: list[DehydratedSource] = []
243
265
 
244
266
  # Getting all works that have cited the author.
245
- for author_work in author_works:
267
+ author_works_count = len(author_works)
268
+ for idx_work, author_work in enumerate(author_works, 1):
246
269
  work_id = identifier.get_work_id(author_work)
270
+ log.info(f"[{work_id}] Work [{idx_work}/{author_works_count}]")
271
+
247
272
  work_authors = _get_authors_list(authorships=author_work.authorships)
248
- cited_by_api_url = (
249
- f"https://api.openalex.org/works?filter=cites:{work_id}{cited_from_filter}{cited_to_filter}&sort=publication_date"
250
- )
273
+ cited_by_api_url = f"https://api.openalex.org/works?filter=cites:{work_id}{cited_from_filter}{cited_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"
251
274
 
252
275
  # Adding the type of OpenAccess in the counter.
253
276
  open_access_summary.add_oa_type(author_work.open_access.oa_status)
@@ -264,7 +287,7 @@ async def make_author_report(
264
287
  if location.source and not any(source.id == location.source.id for source in dehydrated_sources):
265
288
  dehydrated_sources.append(location.source)
266
289
 
267
- cited_by_works = await _get_works(client, cited_by_api_url)
290
+ cited_by_works = await _get_works(client, cited_by_api_url, limiter)
268
291
  cited_by: list[CitationReport] = []
269
292
  work_citation_summary = CitationSummary()
270
293
  for cited_by_work in cited_by_works:
@@ -281,10 +304,13 @@ async def make_author_report(
281
304
 
282
305
  # Get sources full info.
283
306
  sources: list[Source] = []
284
- for dehydrated_source in dehydrated_sources:
307
+ sources_count = len(dehydrated_sources)
308
+ for idx, dehydrated_source in enumerate(dehydrated_sources, 1):
285
309
  source_id = identifier.get_source_id(dehydrated_source)
286
310
  source_url = f"https://api.openalex.org/sources/{source_id}"
287
- sources.append(await _get_source(client, source_url))
311
+
312
+ log.info(f"Getting Sources... [{idx}/{sources_count}]")
313
+ sources.append(await _get_source(client, source_url, limiter))
288
314
 
289
315
  # Sort sources by h_index
290
316
  sources_sorted = sorted(sources, key=lambda source: source.summary_stats.two_yr_mean_citedness, reverse=True)
@@ -331,11 +357,12 @@ async def make_institution_report(
331
357
 
332
358
  pub_from_filter = f",from_publication_date:{pub_from_date:%Y-%m-%d}" if pub_from_date else ""
333
359
  pub_to_filter = f",to_publication_date:{pub_to_date:%Y-%m-%d}" if pub_to_date else ""
334
- url = f"https://api.openalex.org/works?filter=institutions.id:{institution_query_parameter}{pub_from_filter}{pub_to_filter}&sort=publication_date"
360
+ url = f"https://api.openalex.org/works?filter=institutions.id:{institution_query_parameter}{pub_from_filter}{pub_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"
335
361
 
336
- async with httpx.AsyncClient() as client:
362
+ limiter = RateLimiter(rate=REQUEST_RATE_PER_SECOND, per_second=1.0)
363
+ async with httpx.AsyncClient(http2=True, timeout=None) as client:
337
364
  # Getting all the institution works.
338
- institution_works = await _get_works(client=client, url=url)
365
+ institution_works = await _get_works(client=client, url=url, limiter=limiter)
339
366
 
340
367
  # Extra filters
341
368
  cited_from_filter = f",from_publication_date:{cited_from_date:%Y-%m-%d}" if cited_from_date else ""
@@ -349,12 +376,13 @@ async def make_institution_report(
349
376
  dehydrated_sources: list[DehydratedSource] = []
350
377
 
351
378
  # Getting all works that have cited a work.
352
- for institution_work in institution_works:
379
+ institution_works_count = len(institution_works)
380
+ for idx_work, institution_work in enumerate(institution_works, 1):
353
381
  work_id = identifier.get_work_id(institution_work)
382
+ log.info(f"[{work_id}] Work [{idx_work}/{institution_works_count}]")
383
+
354
384
  work_authors = _get_authors_list(authorships=institution_work.authorships)
355
- cited_by_api_url = (
356
- f"https://api.openalex.org/works?filter=cites:{work_id}{cited_from_filter}{cited_to_filter}&sort=publication_date"
357
- )
385
+ cited_by_api_url = f"https://api.openalex.org/works?filter=cites:{work_id}{cited_from_filter}{cited_to_filter}&sort=publication_date&per-page={PER_PAGE_SIZE}"
358
386
 
359
387
  # Adding the type of OpenAccess in the counter.
360
388
  open_access_summary.add_oa_type(institution_work.open_access.oa_status)
@@ -371,7 +399,7 @@ async def make_institution_report(
371
399
  if location.source and not any(source.id == location.source.id for source in dehydrated_sources):
372
400
  dehydrated_sources.append(location.source)
373
401
 
374
- cited_by_works = await _get_works(client, cited_by_api_url)
402
+ cited_by_works = await _get_works(client, cited_by_api_url, limiter)
375
403
  cited_by: list[CitationReport] = []
376
404
  work_citation_summary = CitationSummary()
377
405
  for cited_by_work in cited_by_works:
@@ -388,10 +416,13 @@ async def make_institution_report(
388
416
 
389
417
  # Get sources full info.
390
418
  sources: list[Source] = []
391
- for dehydrated_source in dehydrated_sources:
419
+ sources_count = len(dehydrated_sources)
420
+ for idx, dehydrated_source in enumerate(dehydrated_sources, 1):
392
421
  source_id = identifier.get_source_id(dehydrated_source)
393
422
  source_url = f"https://api.openalex.org/sources/{source_id}"
394
- sources.append(await _get_source(client, source_url))
423
+
424
+ log.debug(f"[{work_id}] Getting Sources... [{idx}/{sources_count}]")
425
+ sources.append(await _get_source(client, source_url, limiter))
395
426
 
396
427
  # Sort sources by h_index
397
428
  sources_sorted = sorted(sources, key=lambda source: source.summary_stats.two_yr_mean_citedness, reverse=True)