aletheca 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. aletheca-0.1.0/PKG-INFO +183 -0
  2. aletheca-0.1.0/README.md +157 -0
  3. aletheca-0.1.0/pyproject.toml +119 -0
  4. aletheca-0.1.0/src/aletheca/__init__.py +64 -0
  5. aletheca-0.1.0/src/aletheca/_helpers.py +105 -0
  6. aletheca-0.1.0/src/aletheca/client.py +162 -0
  7. aletheca-0.1.0/src/aletheca/config.py +45 -0
  8. aletheca-0.1.0/src/aletheca/constants.py +21 -0
  9. aletheca-0.1.0/src/aletheca/endpoints.py +338 -0
  10. aletheca-0.1.0/src/aletheca/models/__init__.py +127 -0
  11. aletheca-0.1.0/src/aletheca/models/author.py +43 -0
  12. aletheca-0.1.0/src/aletheca/models/award.py +54 -0
  13. aletheca-0.1.0/src/aletheca/models/base.py +45 -0
  14. aletheca-0.1.0/src/aletheca/models/common.py +361 -0
  15. aletheca-0.1.0/src/aletheca/models/dehydrated.py +60 -0
  16. aletheca-0.1.0/src/aletheca/models/funder.py +33 -0
  17. aletheca-0.1.0/src/aletheca/models/ids.py +99 -0
  18. aletheca-0.1.0/src/aletheca/models/institution.py +71 -0
  19. aletheca-0.1.0/src/aletheca/models/keyword.py +18 -0
  20. aletheca-0.1.0/src/aletheca/models/publisher.py +46 -0
  21. aletheca-0.1.0/src/aletheca/models/safe_types.py +5 -0
  22. aletheca-0.1.0/src/aletheca/models/source.py +64 -0
  23. aletheca-0.1.0/src/aletheca/models/topic.py +29 -0
  24. aletheca-0.1.0/src/aletheca/models/work.py +129 -0
  25. aletheca-0.1.0/src/aletheca/py.typed +0 -0
  26. aletheca-0.1.0/src/aletheca/queries.py +165 -0
  27. aletheca-0.1.0/src/aletheca/resources/__init__.py +23 -0
  28. aletheca-0.1.0/src/aletheca/resources/_standard.py +84 -0
  29. aletheca-0.1.0/src/aletheca/resources/authors_client.py +21 -0
  30. aletheca-0.1.0/src/aletheca/resources/awards_client.py +21 -0
  31. aletheca-0.1.0/src/aletheca/resources/funders_client.py +21 -0
  32. aletheca-0.1.0/src/aletheca/resources/institutions_client.py +21 -0
  33. aletheca-0.1.0/src/aletheca/resources/keywords_client.py +21 -0
  34. aletheca-0.1.0/src/aletheca/resources/publishers_client.py +21 -0
  35. aletheca-0.1.0/src/aletheca/resources/sources_client.py +21 -0
  36. aletheca-0.1.0/src/aletheca/resources/topics_client.py +21 -0
  37. aletheca-0.1.0/src/aletheca/resources/works_client.py +35 -0
  38. aletheca-0.1.0/src/aletheca/session.py +102 -0
  39. aletheca-0.1.0/src/aletheca/unwrapper.py +40 -0
@@ -0,0 +1,183 @@
1
+ Metadata-Version: 2.3
2
+ Name: aletheca
3
+ Version: 0.1.0
4
+ Summary: Python interface for the OpenAlex API, built on top of the bibliofabric framework.
5
+ Author: Samuel Mok
6
+ Author-email: Samuel Mok <s.mok@utwente.nl>
7
+ License: MIT
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Framework :: AsyncIO
13
+ Classifier: Typing :: Typed
14
+ Requires-Dist: bibliofabric>=0.4.1,<0.5.0
15
+ Requires-Dist: polars ; extra == 'analysis'
16
+ Requires-Dist: duckdb>=1.3.0 ; extra == 'analysis'
17
+ Requires-Dist: matplotlib>=3.8.0 ; extra == 'analysis'
18
+ Requires-Dist: rich>=13.0.0 ; extra == 'analysis'
19
+ Requires-Dist: pandas>=2.1.0 ; extra == 'analysis'
20
+ Requires-Dist: numpy>=1.26.0 ; extra == 'analysis'
21
+ Requires-Dist: pyarrow>=14.0.0 ; extra == 'analysis'
22
+ Requires-Python: >=3.12
23
+ Project-URL: Homepage, https://github.com/utsmok/aletheca
24
+ Provides-Extra: analysis
25
+ Description-Content-Type: text/markdown
26
+
27
+ # Aletheca: Asynchronous Python client for the OpenAlex API
28
+
29
+ Samuel Mok -- s.mok@utwente.nl -- 2025-2026
30
+
31
+ Aletheca is an async Python client for the [OpenAlex API](https://docs.openalex.org/), built on [bibliofabric](https://github.com/utsmok/bibliofabric).
32
+
33
+ **Docs:** [utsmok.github.io/aletheca](https://utsmok.github.io/aletheca/) -- **PyPI:** [aletheca](https://pypi.org/project/aletheca/) -- **License:** MIT
34
+
35
+ ## Features
36
+
37
+ - **Async by design** -- built on `httpx` + `asyncio` with proper connection pooling
38
+ - **Typed throughout** -- Pydantic v2 models for all entities, PEP 561 `py.typed` marker
39
+ - **Cursor pagination** -- efficient iteration over large result sets via cursor-based auto-pagination
40
+ - **Filter serialization** -- automatic conversion to OpenAlex `filter=key:value` syntax with Pydantic filter models
41
+ - **Safe types** -- `SafeList` and `SafeStr` for None-safe traversal of API responses
42
+ - **Convenience queries** -- high-level functions for common workflows (`works_by_author`, `citing_works`, etc.)
43
+
44
+ ## Installation
45
+
46
+ ```bash
47
+ uv add aletheca
48
+ ```
49
+
50
+ Or with pip: `pip install aletheca`. Requires Python >=3.12.
51
+
52
+ ## Quick Start
53
+
54
+ ```python
55
+ import asyncio
56
+ from aletheca import AlethecaSession
57
+
58
+ async def main():
59
+     async with AlethecaSession() as session:
60
+         # Get a work by OpenAlex ID
61
+         work = await session.works.get("W1234567890")
62
+         print(work.title)
63
+
64
+         # Search works
65
+         results = await session.works.search(search="machine learning", page_size=10)
66
+         for work in results.results:
67
+             print(f"{work.title} ({work.publication_year})")
68
+
69
+         # Iterate all works by an author (cursor-based auto-pagination)
70
+         async for work in session.works.iterate(
71
+             filters={"authorships.author.id": "A1234567890"},
72
+             page_size=200,
73
+         ):
74
+             print(work.title)
75
+
76
+ asyncio.run(main())
77
+ ```
78
+
79
+ No authentication required -- the OpenAlex API works without it. For higher rate limits, see [Authentication](#authentication).
80
+
81
+ ## Examples
82
+
83
+ All examples in [`examples/`](examples/) are dual-purpose -- run as scripts or as interactive [marimo](https://marimo.io) notebooks:
84
+
85
+ ```bash
86
+ # As a script
87
+ uv run examples/simple_example.py
88
+
89
+ # As an interactive notebook
90
+ uv run marimo edit examples/simple_example.py
91
+ ```
92
+
93
+ | Script | Description |
94
+ |--------|-------------|
95
+ | `simple_example.py` | Search, iterate, get works |
96
+ | `02_filtering_and_search.py` | WorksFilters, AuthorsFilters, and other filter models |
97
+ | `03_institution_research.py` | Works by institution, topic analysis |
98
+ | `04_author_discovery.py` | Find authors, retrieve their works |
99
+ | `05_advanced_queries.py` | Cursor pagination, select fields, sort |
100
+ | `06_convenience_queries.py` | `session.queries.*` convenience functions |
101
+ | `07_iterator_helpers.py` | `collect()`, `count()`, `first()` from bibliofabric mixins |
102
+ | `08_safe_types_and_helpers.py` | SafeList, SafeStr, DOI normalization, abstract reconstruction |
103
+
104
+ ## Authentication
105
+
106
+ Aletheca auto-detects the OpenAlex API key from environment variables or `.env` files (prefixed with `ALETHECA_`). No auth is the default if nothing is configured.
107
+
108
+ ```dotenv
109
+ ALETHECA_OPENALEX_API_KEY=your_api_key
110
+ ```
111
+
112
+ Or pass explicitly:
113
+
114
+ ```python
115
+ async with AlethecaSession(api_key="your_api_key") as session:
116
+     ...
117
+ ```
118
+
119
+ With an API key you get faster responses (dedicated pool). Without one, you use the polite pool (slower).
120
+
121
+ ## Basic Usage
122
+
123
+ ### Get a single entity
124
+
125
+ ```python
126
+ work = await session.works.get("W2741809801")
127
+ print(work.title, work.doi, work.publication_year)
128
+ ```
129
+
130
+ ### Search
131
+
132
+ ```python
133
+ results = await session.works.search(search="machine learning", page_size=5)
134
+ for work in results.results:
135
+     print(work.title)
136
+ ```
137
+
138
+ ### Iterate all results
139
+
140
+ ```python
141
+ async for work in session.works.iterate(
142
+     filters={"publication_year": 2024, "is_oa": True},
143
+     page_size=200,
144
+ ):
145
+     print(work.title)
146
+     break  # stop when you want
147
+ ```
148
+
149
+ ### Convenience queries
150
+
151
+ ```python
152
+ citations = await session.queries.citing_works("W2741809801")
153
+ print(f"{len(citations)} citations")
154
+ ```
155
+
156
+ ## Known OpenAlex API Issues
157
+
158
+ Full bug report with reproduction steps: [`OPENALEX_BUG_REPORT.md`](OPENALEX_BUG_REPORT.md).
159
+
160
+ - **OpenAPI spec is substantially incomplete** -- 50+ fields returned by the live API are missing from the spec schemas across all entity types. Several spec fields don't exist in the live API.
161
+ - **Wrong field names in spec** -- `content_url` (spec) vs `content_urls` (live), `grants_count` (spec) vs `awards_count` (live)
162
+ - **Undocumented fields** -- `institution_awarded` on Awards is not documented anywhere; 15+ nested Award filters are missing from the docs filter table
163
+ - **Awards endpoint missing from `llms.txt`** -- the awards endpoint is not listed in the API quick reference
164
+ - **`per_page` max is 200, not 100** -- documented as 100 but the API accepts 200
165
+
166
+ ## Development
167
+
168
+ ```bash
169
+ uv sync --all-groups --all-extras # install everything
170
+ uv run ruff check src/ --fix # lint
171
+ uv run ruff format src/ # format
172
+ uvx ty check src/ # type check
173
+ uv run pytest tests/ # run tests
174
+ uv run pytest --cov=aletheca tests/ # coverage (CI threshold: 95%)
175
+ uv build # build package
176
+ uv run mkdocs serve # local docs
177
+ ```
178
+
179
+ Contributions welcome -- see [Contributing](https://utsmok.github.io/aletheca/contributing/).
180
+
181
+ ## License
182
+
183
+ MIT
@@ -0,0 +1,157 @@
1
+ # Aletheca: Asynchronous Python client for the OpenAlex API
2
+
3
+ Samuel Mok -- s.mok@utwente.nl -- 2025-2026
4
+
5
+ Aletheca is an async Python client for the [OpenAlex API](https://docs.openalex.org/), built on [bibliofabric](https://github.com/utsmok/bibliofabric).
6
+
7
+ **Docs:** [utsmok.github.io/aletheca](https://utsmok.github.io/aletheca/) -- **PyPI:** [aletheca](https://pypi.org/project/aletheca/) -- **License:** MIT
8
+
9
+ ## Features
10
+
11
+ - **Async by design** -- built on `httpx` + `asyncio` with proper connection pooling
12
+ - **Typed throughout** -- Pydantic v2 models for all entities, PEP 561 `py.typed` marker
13
+ - **Cursor pagination** -- efficient iteration over large result sets via cursor-based auto-pagination
14
+ - **Filter serialization** -- automatic conversion to OpenAlex `filter=key:value` syntax with Pydantic filter models
15
+ - **Safe types** -- `SafeList` and `SafeStr` for None-safe traversal of API responses
16
+ - **Convenience queries** -- high-level functions for common workflows (`works_by_author`, `citing_works`, etc.)
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ uv add aletheca
22
+ ```
23
+
24
+ Or with pip: `pip install aletheca`. Requires Python >=3.12.
25
+
26
+ ## Quick Start
27
+
28
+ ```python
29
+ import asyncio
30
+ from aletheca import AlethecaSession
31
+
32
+ async def main():
33
+     async with AlethecaSession() as session:
34
+         # Get a work by OpenAlex ID
35
+         work = await session.works.get("W1234567890")
36
+         print(work.title)
37
+
38
+         # Search works
39
+         results = await session.works.search(search="machine learning", page_size=10)
40
+         for work in results.results:
41
+             print(f"{work.title} ({work.publication_year})")
42
+
43
+         # Iterate all works by an author (cursor-based auto-pagination)
44
+         async for work in session.works.iterate(
45
+             filters={"authorships.author.id": "A1234567890"},
46
+             page_size=200,
47
+         ):
48
+             print(work.title)
49
+
50
+ asyncio.run(main())
51
+ ```
52
+
53
+ No authentication required -- the OpenAlex API works without it. For higher rate limits, see [Authentication](#authentication).
54
+
55
+ ## Examples
56
+
57
+ All examples in [`examples/`](examples/) are dual-purpose -- run as scripts or as interactive [marimo](https://marimo.io) notebooks:
58
+
59
+ ```bash
60
+ # As a script
61
+ uv run examples/simple_example.py
62
+
63
+ # As an interactive notebook
64
+ uv run marimo edit examples/simple_example.py
65
+ ```
66
+
67
+ | Script | Description |
68
+ |--------|-------------|
69
+ | `simple_example.py` | Search, iterate, get works |
70
+ | `02_filtering_and_search.py` | WorksFilters, AuthorsFilters, and other filter models |
71
+ | `03_institution_research.py` | Works by institution, topic analysis |
72
+ | `04_author_discovery.py` | Find authors, retrieve their works |
73
+ | `05_advanced_queries.py` | Cursor pagination, select fields, sort |
74
+ | `06_convenience_queries.py` | `session.queries.*` convenience functions |
75
+ | `07_iterator_helpers.py` | `collect()`, `count()`, `first()` from bibliofabric mixins |
76
+ | `08_safe_types_and_helpers.py` | SafeList, SafeStr, DOI normalization, abstract reconstruction |
77
+
78
+ ## Authentication
79
+
80
+ Aletheca auto-detects the OpenAlex API key from environment variables or `.env` files (prefixed with `ALETHECA_`). No auth is the default if nothing is configured.
81
+
82
+ ```dotenv
83
+ ALETHECA_OPENALEX_API_KEY=your_api_key
84
+ ```
85
+
86
+ Or pass explicitly:
87
+
88
+ ```python
89
+ async with AlethecaSession(api_key="your_api_key") as session:
90
+     ...
91
+ ```
92
+
93
+ With an API key you get faster responses (dedicated pool). Without one, you use the polite pool (slower).
94
+
95
+ ## Basic Usage
96
+
97
+ ### Get a single entity
98
+
99
+ ```python
100
+ work = await session.works.get("W2741809801")
101
+ print(work.title, work.doi, work.publication_year)
102
+ ```
103
+
104
+ ### Search
105
+
106
+ ```python
107
+ results = await session.works.search(search="machine learning", page_size=5)
108
+ for work in results.results:
109
+     print(work.title)
110
+ ```
111
+
112
+ ### Iterate all results
113
+
114
+ ```python
115
+ async for work in session.works.iterate(
116
+     filters={"publication_year": 2024, "is_oa": True},
117
+     page_size=200,
118
+ ):
119
+     print(work.title)
120
+     break  # stop when you want
121
+ ```
122
+
123
+ ### Convenience queries
124
+
125
+ ```python
126
+ citations = await session.queries.citing_works("W2741809801")
127
+ print(f"{len(citations)} citations")
128
+ ```
129
+
130
+ ## Known OpenAlex API Issues
131
+
132
+ Full bug report with reproduction steps: [`OPENALEX_BUG_REPORT.md`](OPENALEX_BUG_REPORT.md).
133
+
134
+ - **OpenAPI spec is substantially incomplete** -- 50+ fields returned by the live API are missing from the spec schemas across all entity types. Several spec fields don't exist in the live API.
135
+ - **Wrong field names in spec** -- `content_url` (spec) vs `content_urls` (live), `grants_count` (spec) vs `awards_count` (live)
136
+ - **Undocumented fields** -- `institution_awarded` on Awards is not documented anywhere; 15+ nested Award filters are missing from the docs filter table
137
+ - **Awards endpoint missing from `llms.txt`** -- the awards endpoint is not listed in the API quick reference
138
+ - **`per_page` max is 200, not 100** -- documented as 100 but the API accepts 200
139
+
140
+ ## Development
141
+
142
+ ```bash
143
+ uv sync --all-groups --all-extras # install everything
144
+ uv run ruff check src/ --fix # lint
145
+ uv run ruff format src/ # format
146
+ uvx ty check src/ # type check
147
+ uv run pytest tests/ # run tests
148
+ uv run pytest --cov=aletheca tests/ # coverage (CI threshold: 95%)
149
+ uv build # build package
150
+ uv run mkdocs serve # local docs
151
+ ```
152
+
153
+ Contributions welcome -- see [Contributing](https://utsmok.github.io/aletheca/contributing/).
154
+
155
+ ## License
156
+
157
+ MIT
@@ -0,0 +1,119 @@
1
+ [project]
2
+ name = "aletheca"
3
+ version = "0.1.0"
4
+ description = "Python interface for the OpenAlex API, built on top of the bibliofabric framework."
5
+ authors = [
6
+ {name = "Samuel Mok", email = "s.mok@utwente.nl"}
7
+ ]
8
+ readme = "README.md"
9
+ license = {text = "MIT"}
10
+ requires-python = ">=3.12"
11
+ dependencies = [
12
+ "bibliofabric>=0.4.1,<0.5.0",
13
+ ]
14
+
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Framework :: AsyncIO",
21
+ "Typing :: Typed",
22
+ ]
23
+
24
+ [dependency-groups]
25
+ dev = [
26
+ "pytest>=8.3.4",
27
+ "pytest-asyncio>=0.26.0",
28
+ "pytest-cov>=6.1.1",
29
+ "pytest-httpx>=0.35.0",
30
+ "python-dotenv>=1.0.0",
31
+ ]
32
+ docs = [
33
+ "mkdocs~=1.6.0",
34
+ "mkdocs-material~=9.5.0",
35
+ "mkdocstrings[python]",
36
+ ]
37
+ lint = ["ruff>=0.8.0"]
38
+ test = ["pytest", "pytest-randomly"]
39
+
40
+ [project.optional-dependencies]
41
+ analysis = [
42
+ "polars",
43
+ "duckdb>=1.3.0",
44
+ "matplotlib>=3.8.0",
45
+ "rich>=13.0.0",
46
+ "pandas>=2.1.0",
47
+ "numpy>=1.26.0",
48
+ "pyarrow>=14.0.0",
49
+ ]
50
+
51
+ [project.urls]
52
+ "Homepage" = "https://github.com/utsmok/aletheca"
53
+
54
+ [build-system]
55
+ requires = ["uv_build>=0.11.19,<0.12"]
56
+ build-backend = "uv_build"
57
+
58
+ [tool.uv.pip]
59
+ generate-hashes = true
60
+
61
+ [tool.ruff]
62
+ line-length = 88
63
+
64
+ [tool.ruff.format]
65
+ docstring-code-format = true
66
+ docstring-code-line-length = 60
67
+
68
+ [tool.ruff.lint]
69
+ select = [
70
+ "E4",
71
+ "E7",
72
+ "E9",
73
+ "F",
74
+ "I", # isort
75
+ "B", # bugbear -- flake8 bugfinder
76
+ "Q", # correct quotes usage
77
+ "PTH", # Replace os functions with pathlib functions
78
+ "SIM", # Simplify statements
79
+ "RET", # Return value related rules
80
+ "PIE", # misc flake8 rules
81
+ "FBT", # boolean traps
82
+ "PERF", # performance optimization
83
+ "PL", # pylint
84
+ "UP", # check for deprecated ways of coding
85
+ "FURB",
86
+ ]
87
+ ignore = ["PLR2004"]
88
+
89
+ [tool.ruff.lint.isort]
90
+ combine-as-imports = true
91
+
92
+ [tool.ruff.lint.per-file-ignores]
93
+ "__init__.py" = ["F401"]
94
+ "**/client.py" = ["PLR0913", "PLR0912", "PLR0915", "PLC0415"]
95
+ "**/session.py" = ["PLC0415"]
96
+ "**/queries.py" = ["PLC0415"]
97
+ "**/endpoints.py" = ["PLR0913"]
98
+ "examples/**/*.py" = ["PLC0415", "F821", "PERF401", "B007", "F841"]
99
+ "marimo_checks/**/*.py" = ["PLC0415", "F821", "F841", "B905", "PLW2901", "I001", "PLR1711", "F404", "UP037"]
100
+
101
+ [tool.ruff.lint.pylint]
102
+ max-args = 10
103
+ max-branches = 25
104
+ max-statements = 75
105
+ max-returns = 10
106
+
107
+ [tool.pytest.ini_options]
108
+ pythonpath = [
109
+ "src"
110
+ ]
111
+ testpaths = ["tests"]
112
+ python_files = "test_*.py"
113
+ python_functions = "test_*"
114
+ asyncio_mode = "auto"
115
+ asyncio_default_fixture_loop_scope = "function"
116
+ markers = [
117
+ "live_api: marks tests that hit the live OpenAlex API (requires API key, skipped in CI)",
118
+ ]
119
+ addopts = "-m 'not live_api'"
@@ -0,0 +1,64 @@
1
+ """Aletheca: Python interface for the OpenAlex API."""
2
+
3
+ try:
4
+ from importlib.metadata import PackageNotFoundError, version as _get_version
5
+
6
+ __version__ = _get_version("aletheca")
7
+ except PackageNotFoundError:
8
+ __version__ = "0.0.0"
9
+
10
+ from bibliofabric.exceptions import (
11
+ APIError,
12
+ AuthError,
13
+ BibliofabricError,
14
+ ConfigurationError,
15
+ NetworkError,
16
+ NotFoundError,
17
+ RateLimitError,
18
+ TimeoutError,
19
+ ValidationError,
20
+ )
21
+
22
+ from .client import AlethecaClient
23
+ from .models import (
24
+ ApiResponse,
25
+ Author,
26
+ Award,
27
+ BaseEntity,
28
+ Funder,
29
+ Institution,
30
+ Keyword,
31
+ Meta,
32
+ Publisher,
33
+ Source,
34
+ Topic,
35
+ Work,
36
+ )
37
+ from .session import AlethecaSession
38
+
39
+ __all__ = [
40
+ "__version__",
41
+ "APIError",
42
+ "ApiResponse",
43
+ "AuthError",
44
+ "Award",
45
+ "Author",
46
+ "BaseEntity",
47
+ "BibliofabricError",
48
+ "ConfigurationError",
49
+ "Funder",
50
+ "Institution",
51
+ "Keyword",
52
+ "Meta",
53
+ "NetworkError",
54
+ "NotFoundError",
55
+ "Publisher",
56
+ "RateLimitError",
57
+ "Source",
58
+ "AlethecaClient",
59
+ "AlethecaSession",
60
+ "TimeoutError",
61
+ "Topic",
62
+ "ValidationError",
63
+ "Work",
64
+ ]
@@ -0,0 +1,105 @@
1
+ """Utility helpers for working with OpenAlex identifiers and data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+
8
def normalize_doi(doi: str) -> str:
    """Normalize a DOI to its bare form (no URL or ``doi:`` prefix).

    Surrounding whitespace is removed first. A leading resolver URL
    (``doi.org`` or ``dx.doi.org``, with or without scheme) or a ``doi:``
    label is then stripped. Prefix matching is case-insensitive; the
    casing of the DOI itself is preserved.

    Args:
        doi: A DOI string, possibly with a ``https://doi.org/`` style prefix.

    Returns:
        The bare DOI string. Input without a recognised prefix is returned
        stripped but otherwise unchanged.

    Examples:
        >>> normalize_doi("https://doi.org/10.1234/x")
        '10.1234/x'
        >>> normalize_doi("10.1234/x")
        '10.1234/x'
        >>> normalize_doi("DOI:10.1234/x")
        '10.1234/x'
    """
    doi = doi.strip()
    prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "dx.doi.org/",
        "doi.org/",
        "doi:",
    )
    for prefix in prefixes:
        # Lower-case only the candidate slice so the comparison is
        # case-insensitive while the returned DOI keeps its original casing.
        if doi[: len(prefix)].lower() == prefix:
            # lstrip tolerates a space after the label, e.g. "doi: 10.1/x".
            return doi[len(prefix) :].lstrip()
    return doi
28
+
29
+
30
def parse_openalex_id(url_or_id: str) -> str:
    """Extract the short OpenAlex ID from a full URL or bare ID.

    The ID is one prefix letter from ``WAITSFPDC`` followed by digits. It is
    matched case-insensitively, in line with ``detect_id_type``, and
    returned with an upper-case prefix letter. The match must be a
    standalone token: it may not be directly preceded or followed by an
    ASCII letter or digit, so fragments of longer tokens such as
    ``ABC123`` are not mistaken for an ID.

    Args:
        url_or_id: An OpenAlex ID or URL (e.g., ``https://openalex.org/W123``).

    Returns:
        The short ID (e.g., ``W123``). If no ID is found, the stripped
        input is returned unchanged.

    Examples:
        >>> parse_openalex_id("https://openalex.org/W1234567890")
        'W1234567890'
        >>> parse_openalex_id("W1234567890")
        'W1234567890'
        >>> parse_openalex_id("w1234567890")
        'W1234567890'
    """
    url_or_id = url_or_id.strip()
    match = re.search(
        r"(?<![A-Za-z0-9])([WAITSFPDC]\d+)(?![A-Za-z0-9])",
        url_or_id,
        re.IGNORECASE,
    )
    if match:
        return match.group(1).upper()
    return url_or_id
50
+
51
+
52
+ def detect_id_type(identifier: str) -> str | None:
53
+ """Detect the type of a scholarly identifier.
54
+
55
+ Args:
56
+ identifier: A string identifier.
57
+
58
+ Returns:
59
+ One of ``"openalex"``, ``"doi"``, ``"pmid"``, ``"orcid"``,
60
+ ``"issn"``, ``"ror"``, or ``None``.
61
+ """
62
+ identifier = identifier.strip()
63
+ if re.match(r"^[WAITSFPDC]\d+$", identifier, re.IGNORECASE):
64
+ return "openalex"
65
+ identifier_lower = identifier.lower()
66
+ if identifier_lower.startswith("10.") or "doi.org/" in identifier_lower:
67
+ return "doi"
68
+ if re.match(r"^\d{4}-\d{3,4}$", identifier_lower):
69
+ return "issn"
70
+ if re.match(r"^\d{7,8}$", identifier_lower):
71
+ return "pmid"
72
+ if identifier_lower.startswith("https://orcid.org/") or re.match(
73
+ r"\d{4}-\d{4}-\d{4}-\d{4}", identifier_lower
74
+ ):
75
+ return "orcid"
76
+ if identifier_lower.startswith("https://ror.org/") or re.match(
77
+ r"^0[a-hj-km-np-tv-z]{2,3}\w{3,14}$", identifier_lower
78
+ ):
79
+ return "ror"
80
+ return None
81
+
82
+
83
+ def reconstruct_abstract(
84
+ inverted_index: dict[str, list[int]] | None,
85
+ ) -> str | None:
86
+ """Reconstruct an abstract from OpenAlex's inverted index format.
87
+
88
+ Args:
89
+ inverted_index: Mapping of word → list of positions.
90
+
91
+ Returns:
92
+ The reconstructed abstract string, or None if input is None/empty.
93
+ """
94
+ if not inverted_index:
95
+ return None
96
+
97
+ words: dict[int, str] = {}
98
+ for word, positions in inverted_index.items():
99
+ for pos in positions:
100
+ words[pos] = word
101
+
102
+ if not words:
103
+ return None
104
+
105
+ return " ".join(words[i] for i in sorted(words.keys()))