aletheca 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aletheca-0.1.0/PKG-INFO +183 -0
- aletheca-0.1.0/README.md +157 -0
- aletheca-0.1.0/pyproject.toml +119 -0
- aletheca-0.1.0/src/aletheca/__init__.py +64 -0
- aletheca-0.1.0/src/aletheca/_helpers.py +105 -0
- aletheca-0.1.0/src/aletheca/client.py +162 -0
- aletheca-0.1.0/src/aletheca/config.py +45 -0
- aletheca-0.1.0/src/aletheca/constants.py +21 -0
- aletheca-0.1.0/src/aletheca/endpoints.py +338 -0
- aletheca-0.1.0/src/aletheca/models/__init__.py +127 -0
- aletheca-0.1.0/src/aletheca/models/author.py +43 -0
- aletheca-0.1.0/src/aletheca/models/award.py +54 -0
- aletheca-0.1.0/src/aletheca/models/base.py +45 -0
- aletheca-0.1.0/src/aletheca/models/common.py +361 -0
- aletheca-0.1.0/src/aletheca/models/dehydrated.py +60 -0
- aletheca-0.1.0/src/aletheca/models/funder.py +33 -0
- aletheca-0.1.0/src/aletheca/models/ids.py +99 -0
- aletheca-0.1.0/src/aletheca/models/institution.py +71 -0
- aletheca-0.1.0/src/aletheca/models/keyword.py +18 -0
- aletheca-0.1.0/src/aletheca/models/publisher.py +46 -0
- aletheca-0.1.0/src/aletheca/models/safe_types.py +5 -0
- aletheca-0.1.0/src/aletheca/models/source.py +64 -0
- aletheca-0.1.0/src/aletheca/models/topic.py +29 -0
- aletheca-0.1.0/src/aletheca/models/work.py +129 -0
- aletheca-0.1.0/src/aletheca/py.typed +0 -0
- aletheca-0.1.0/src/aletheca/queries.py +165 -0
- aletheca-0.1.0/src/aletheca/resources/__init__.py +23 -0
- aletheca-0.1.0/src/aletheca/resources/_standard.py +84 -0
- aletheca-0.1.0/src/aletheca/resources/authors_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/awards_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/funders_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/institutions_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/keywords_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/publishers_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/sources_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/topics_client.py +21 -0
- aletheca-0.1.0/src/aletheca/resources/works_client.py +35 -0
- aletheca-0.1.0/src/aletheca/session.py +102 -0
- aletheca-0.1.0/src/aletheca/unwrapper.py +40 -0
aletheca-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: aletheca
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python interface for the OpenAlex API, built on top of the bibliofabric framework.
|
|
5
|
+
Author: Samuel Mok
|
|
6
|
+
Author-email: Samuel Mok <s.mok@utwente.nl>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Framework :: AsyncIO
|
|
13
|
+
Classifier: Typing :: Typed
|
|
14
|
+
Requires-Dist: bibliofabric>=0.4.1,<0.5.0
|
|
15
|
+
Requires-Dist: polars ; extra == 'analysis'
|
|
16
|
+
Requires-Dist: duckdb>=1.3.0 ; extra == 'analysis'
|
|
17
|
+
Requires-Dist: matplotlib>=3.8.0 ; extra == 'analysis'
|
|
18
|
+
Requires-Dist: rich>=13.0.0 ; extra == 'analysis'
|
|
19
|
+
Requires-Dist: pandas>=2.1.0 ; extra == 'analysis'
|
|
20
|
+
Requires-Dist: numpy>=1.26.0 ; extra == 'analysis'
|
|
21
|
+
Requires-Dist: pyarrow>=14.0.0 ; extra == 'analysis'
|
|
22
|
+
Requires-Python: >=3.12
|
|
23
|
+
Project-URL: Homepage, https://github.com/utsmok/aletheca
|
|
24
|
+
Provides-Extra: analysis
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# Aletheca: Asynchronous Python client for the OpenAlex API
|
|
28
|
+
|
|
29
|
+
Samuel Mok -- s.mok@utwente.nl -- 2025-2026
|
|
30
|
+
|
|
31
|
+
Aletheca is an async Python client for the [OpenAlex API](https://docs.openalex.org/), built on [bibliofabric](https://github.com/utsmok/bibliofabric).
|
|
32
|
+
|
|
33
|
+
**Docs:** [utsmok.github.io/aletheca](https://utsmok.github.io/aletheca/) -- **PyPI:** [aletheca](https://pypi.org/project/aletheca/) -- **License:** MIT
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Async by design** -- built on `httpx` + `asyncio` with proper connection pooling
|
|
38
|
+
- **Typed throughout** -- Pydantic v2 models for all entities, PEP 561 `py.typed` marker
|
|
39
|
+
- **Cursor pagination** -- efficient iteration over large result sets via cursor-based auto-pagination
|
|
40
|
+
- **Filter serialization** -- automatic conversion to OpenAlex `filter=key:value` syntax with Pydantic filter models
|
|
41
|
+
- **Safe types** -- `SafeList` and `SafeStr` for None-safe traversal of API responses
|
|
42
|
+
- **Convenience queries** -- high-level functions for common workflows (`works_by_author`, `citing_works`, etc.)
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
uv add aletheca
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or with pip: `pip install aletheca`. Requires Python >=3.12.
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import asyncio
|
|
56
|
+
from aletheca import AlethecaSession
|
|
57
|
+
|
|
58
|
+
async def main():
|
|
59
|
+
async with AlethecaSession() as session:
|
|
60
|
+
# Get a work by OpenAlex ID
|
|
61
|
+
work = await session.works.get("W1234567890")
|
|
62
|
+
print(work.title)
|
|
63
|
+
|
|
64
|
+
# Search works
|
|
65
|
+
results = await session.works.search(search="machine learning", page_size=10)
|
|
66
|
+
for work in results.results:
|
|
67
|
+
print(f"{work.title} ({work.publication_year})")
|
|
68
|
+
|
|
69
|
+
# Iterate all works by an author (cursor-based auto-pagination)
|
|
70
|
+
async for work in session.works.iterate(
|
|
71
|
+
filters={"authorships.author.id": "A1234567890"},
|
|
72
|
+
page_size=200,
|
|
73
|
+
):
|
|
74
|
+
print(work.title)
|
|
75
|
+
|
|
76
|
+
asyncio.run(main())
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
No authentication required -- the OpenAlex API works without it. For higher rate limits, see [Authentication](#authentication).
|
|
80
|
+
|
|
81
|
+
## Examples
|
|
82
|
+
|
|
83
|
+
All examples in [`examples/`](examples/) are dual-purpose -- run as scripts or as interactive [marimo](https://marimo.io) notebooks:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# As a script
|
|
87
|
+
uv run examples/simple_example.py
|
|
88
|
+
|
|
89
|
+
# As an interactive notebook
|
|
90
|
+
uv run marimo edit examples/simple_example.py
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
| Script | Description |
|
|
94
|
+
|--------|-------------|
|
|
95
|
+
| `simple_example.py` | Search, iterate, get works |
|
|
96
|
+
| `02_filtering_and_search.py` | WorksFilters, AuthorsFilters, and other filter models |
|
|
97
|
+
| `03_institution_research.py` | Works by institution, topic analysis |
|
|
98
|
+
| `04_author_discovery.py` | Find authors, retrieve their works |
|
|
99
|
+
| `05_advanced_queries.py` | Cursor pagination, select fields, sort |
|
|
100
|
+
| `06_convenience_queries.py` | `session.queries.*` convenience functions |
|
|
101
|
+
| `07_iterator_helpers.py` | `collect()`, `count()`, `first()` from bibliofabric mixins |
|
|
102
|
+
| `08_safe_types_and_helpers.py` | SafeList, SafeStr, DOI normalization, abstract reconstruction |
|
|
103
|
+
|
|
104
|
+
## Authentication
|
|
105
|
+
|
|
106
|
+
Aletheca auto-detects the OpenAlex API key from `ALETHECA_`-prefixed environment variables, which can also be defined in a `.env` file. If nothing is configured, requests are sent without authentication.
|
|
107
|
+
|
|
108
|
+
```dotenv
|
|
109
|
+
ALETHECA_OPENALEX_API_KEY=your_api_key
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Or pass explicitly:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
async with AlethecaSession(api_key="your_api_key") as session:
|
|
116
|
+
...
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
With an API key you get faster responses (dedicated pool). Without one, you use the polite pool (slower).
|
|
120
|
+
|
|
121
|
+
## Basic Usage
|
|
122
|
+
|
|
123
|
+
### Get a single entity
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
work = await session.works.get("W2741809801")
|
|
127
|
+
print(work.title, work.doi, work.publication_year)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Search
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
results = await session.works.search(search="machine learning", page_size=5)
|
|
134
|
+
for work in results.results:
|
|
135
|
+
print(work.title)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Iterate all results
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
async for work in session.works.iterate(
|
|
142
|
+
filters={"publication_year": 2024, "is_oa": True},
|
|
143
|
+
page_size=200,
|
|
144
|
+
):
|
|
145
|
+
print(work.title)
|
|
146
|
+
break # stop when you want
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Convenience queries
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
citations = await session.queries.citing_works("W2741809801")
|
|
153
|
+
print(f"{len(citations)} citations")
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Known OpenAlex API Issues
|
|
157
|
+
|
|
158
|
+
Full bug report with reproduction steps: [`OPENALEX_BUG_REPORT.md`](OPENALEX_BUG_REPORT.md).
|
|
159
|
+
|
|
160
|
+
- **OpenAPI spec is substantially incomplete** -- 50+ fields returned by the live API are missing from the spec schemas across all entity types. Several spec fields don't exist in the live API.
|
|
161
|
+
- **Wrong field names in spec** -- `content_url` (spec) vs `content_urls` (live), `grants_count` (spec) vs `awards_count` (live)
|
|
162
|
+
- **Undocumented fields** -- `institution_awarded` on Awards is not documented anywhere; 15+ nested Award filters are missing from the docs filter table
|
|
163
|
+
- **Awards endpoint missing from `llms.txt`** -- the awards endpoint is not listed in the API quick reference
|
|
164
|
+
- **`per_page` max is 200, not 100** -- documented as 100 but the API accepts 200
|
|
165
|
+
|
|
166
|
+
## Development
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
uv sync --all-groups --all-extras # install everything
|
|
170
|
+
uv run ruff check src/ --fix # lint
|
|
171
|
+
uv run ruff format src/ # format
|
|
172
|
+
uvx ty check src/ # type check
|
|
173
|
+
uv run pytest tests/ # run tests
|
|
174
|
+
uv run pytest --cov=aletheca tests/ # coverage (CI threshold: 95%)
|
|
175
|
+
uv build # build package
|
|
176
|
+
uv run mkdocs serve # local docs
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Contributions welcome -- see [Contributing](https://utsmok.github.io/aletheca/contributing/).
|
|
180
|
+
|
|
181
|
+
## License
|
|
182
|
+
|
|
183
|
+
MIT
|
aletheca-0.1.0/README.md
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Aletheca: Asynchronous Python client for the OpenAlex API
|
|
2
|
+
|
|
3
|
+
Samuel Mok -- s.mok@utwente.nl -- 2025-2026
|
|
4
|
+
|
|
5
|
+
Aletheca is an async Python client for the [OpenAlex API](https://docs.openalex.org/), built on [bibliofabric](https://github.com/utsmok/bibliofabric).
|
|
6
|
+
|
|
7
|
+
**Docs:** [utsmok.github.io/aletheca](https://utsmok.github.io/aletheca/) -- **PyPI:** [aletheca](https://pypi.org/project/aletheca/) -- **License:** MIT
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Async by design** -- built on `httpx` + `asyncio` with proper connection pooling
|
|
12
|
+
- **Typed throughout** -- Pydantic v2 models for all entities, PEP 561 `py.typed` marker
|
|
13
|
+
- **Cursor pagination** -- efficient iteration over large result sets via cursor-based auto-pagination
|
|
14
|
+
- **Filter serialization** -- automatic conversion to OpenAlex `filter=key:value` syntax with Pydantic filter models
|
|
15
|
+
- **Safe types** -- `SafeList` and `SafeStr` for None-safe traversal of API responses
|
|
16
|
+
- **Convenience queries** -- high-level functions for common workflows (`works_by_author`, `citing_works`, etc.)
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv add aletheca
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or with pip: `pip install aletheca`. Requires Python >=3.12.
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import asyncio
|
|
30
|
+
from aletheca import AlethecaSession
|
|
31
|
+
|
|
32
|
+
async def main():
|
|
33
|
+
async with AlethecaSession() as session:
|
|
34
|
+
# Get a work by OpenAlex ID
|
|
35
|
+
work = await session.works.get("W1234567890")
|
|
36
|
+
print(work.title)
|
|
37
|
+
|
|
38
|
+
# Search works
|
|
39
|
+
results = await session.works.search(search="machine learning", page_size=10)
|
|
40
|
+
for work in results.results:
|
|
41
|
+
print(f"{work.title} ({work.publication_year})")
|
|
42
|
+
|
|
43
|
+
# Iterate all works by an author (cursor-based auto-pagination)
|
|
44
|
+
async for work in session.works.iterate(
|
|
45
|
+
filters={"authorships.author.id": "A1234567890"},
|
|
46
|
+
page_size=200,
|
|
47
|
+
):
|
|
48
|
+
print(work.title)
|
|
49
|
+
|
|
50
|
+
asyncio.run(main())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
No authentication required -- the OpenAlex API works without it. For higher rate limits, see [Authentication](#authentication).
|
|
54
|
+
|
|
55
|
+
## Examples
|
|
56
|
+
|
|
57
|
+
All examples in [`examples/`](examples/) are dual-purpose -- run as scripts or as interactive [marimo](https://marimo.io) notebooks:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# As a script
|
|
61
|
+
uv run examples/simple_example.py
|
|
62
|
+
|
|
63
|
+
# As an interactive notebook
|
|
64
|
+
uv run marimo edit examples/simple_example.py
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
| Script | Description |
|
|
68
|
+
|--------|-------------|
|
|
69
|
+
| `simple_example.py` | Search, iterate, get works |
|
|
70
|
+
| `02_filtering_and_search.py` | WorksFilters, AuthorsFilters, and other filter models |
|
|
71
|
+
| `03_institution_research.py` | Works by institution, topic analysis |
|
|
72
|
+
| `04_author_discovery.py` | Find authors, retrieve their works |
|
|
73
|
+
| `05_advanced_queries.py` | Cursor pagination, select fields, sort |
|
|
74
|
+
| `06_convenience_queries.py` | `session.queries.*` convenience functions |
|
|
75
|
+
| `07_iterator_helpers.py` | `collect()`, `count()`, `first()` from bibliofabric mixins |
|
|
76
|
+
| `08_safe_types_and_helpers.py` | SafeList, SafeStr, DOI normalization, abstract reconstruction |
|
|
77
|
+
|
|
78
|
+
## Authentication
|
|
79
|
+
|
|
80
|
+
Aletheca auto-detects the OpenAlex API key from `ALETHECA_`-prefixed environment variables, which can also be defined in a `.env` file. If nothing is configured, requests are sent without authentication.
|
|
81
|
+
|
|
82
|
+
```dotenv
|
|
83
|
+
ALETHECA_OPENALEX_API_KEY=your_api_key
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Or pass explicitly:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
async with AlethecaSession(api_key="your_api_key") as session:
|
|
90
|
+
...
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
With an API key you get faster responses (dedicated pool). Without one, you use the polite pool (slower).
|
|
94
|
+
|
|
95
|
+
## Basic Usage
|
|
96
|
+
|
|
97
|
+
### Get a single entity
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
work = await session.works.get("W2741809801")
|
|
101
|
+
print(work.title, work.doi, work.publication_year)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Search
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
results = await session.works.search(search="machine learning", page_size=5)
|
|
108
|
+
for work in results.results:
|
|
109
|
+
print(work.title)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Iterate all results
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
async for work in session.works.iterate(
|
|
116
|
+
filters={"publication_year": 2024, "is_oa": True},
|
|
117
|
+
page_size=200,
|
|
118
|
+
):
|
|
119
|
+
print(work.title)
|
|
120
|
+
break # stop when you want
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Convenience queries
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
citations = await session.queries.citing_works("W2741809801")
|
|
127
|
+
print(f"{len(citations)} citations")
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Known OpenAlex API Issues
|
|
131
|
+
|
|
132
|
+
Full bug report with reproduction steps: [`OPENALEX_BUG_REPORT.md`](OPENALEX_BUG_REPORT.md).
|
|
133
|
+
|
|
134
|
+
- **OpenAPI spec is substantially incomplete** -- 50+ fields returned by the live API are missing from the spec schemas across all entity types. Several spec fields don't exist in the live API.
|
|
135
|
+
- **Wrong field names in spec** -- `content_url` (spec) vs `content_urls` (live), `grants_count` (spec) vs `awards_count` (live)
|
|
136
|
+
- **Undocumented fields** -- `institution_awarded` on Awards is not documented anywhere; 15+ nested Award filters are missing from the docs filter table
|
|
137
|
+
- **Awards endpoint missing from `llms.txt`** -- the awards endpoint is not listed in the API quick reference
|
|
138
|
+
- **`per_page` max is 200, not 100** -- documented as 100 but the API accepts 200
|
|
139
|
+
|
|
140
|
+
## Development
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
uv sync --all-groups --all-extras # install everything
|
|
144
|
+
uv run ruff check src/ --fix # lint
|
|
145
|
+
uv run ruff format src/ # format
|
|
146
|
+
uvx ty check src/ # type check
|
|
147
|
+
uv run pytest tests/ # run tests
|
|
148
|
+
uv run pytest --cov=aletheca tests/ # coverage (CI threshold: 95%)
|
|
149
|
+
uv build # build package
|
|
150
|
+
uv run mkdocs serve # local docs
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Contributions welcome -- see [Contributing](https://utsmok.github.io/aletheca/contributing/).
|
|
154
|
+
|
|
155
|
+
## License
|
|
156
|
+
|
|
157
|
+
MIT
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "aletheca"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Python interface for the OpenAlex API, built on top of the bibliofabric framework."
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Samuel Mok", email = "s.mok@utwente.nl"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = {text = "MIT"}
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"bibliofabric>=0.4.1,<0.5.0",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Framework :: AsyncIO",
|
|
21
|
+
"Typing :: Typed",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[dependency-groups]
|
|
25
|
+
dev = [
|
|
26
|
+
"pytest>=8.3.4",
|
|
27
|
+
"pytest-asyncio>=0.26.0",
|
|
28
|
+
"pytest-cov>=6.1.1",
|
|
29
|
+
"pytest-httpx>=0.35.0",
|
|
30
|
+
"python-dotenv>=1.0.0",
|
|
31
|
+
]
|
|
32
|
+
docs = [
|
|
33
|
+
"mkdocs~=1.6.0",
|
|
34
|
+
"mkdocs-material~=9.5.0",
|
|
35
|
+
"mkdocstrings[python]",
|
|
36
|
+
]
|
|
37
|
+
lint = ["ruff>=0.8.0"]
|
|
38
|
+
test = ["pytest", "pytest-randomly"]
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
analysis = [
|
|
42
|
+
"polars",
|
|
43
|
+
"duckdb>=1.3.0",
|
|
44
|
+
"matplotlib>=3.8.0",
|
|
45
|
+
"rich>=13.0.0",
|
|
46
|
+
"pandas>=2.1.0",
|
|
47
|
+
"numpy>=1.26.0",
|
|
48
|
+
"pyarrow>=14.0.0",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.urls]
|
|
52
|
+
"Homepage" = "https://github.com/utsmok/aletheca"
|
|
53
|
+
|
|
54
|
+
[build-system]
|
|
55
|
+
requires = ["uv_build>=0.11.19,<0.12"]
|
|
56
|
+
build-backend = "uv_build"
|
|
57
|
+
|
|
58
|
+
[tool.uv.pip]
|
|
59
|
+
generate-hashes = true
|
|
60
|
+
|
|
61
|
+
[tool.ruff]
|
|
62
|
+
line-length = 88
|
|
63
|
+
|
|
64
|
+
[tool.ruff.format]
|
|
65
|
+
docstring-code-format = true
|
|
66
|
+
docstring-code-line-length = 60
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint]
|
|
69
|
+
select = [
|
|
70
|
+
"E4",
|
|
71
|
+
"E7",
|
|
72
|
+
"E9",
|
|
73
|
+
"F",
|
|
74
|
+
"I", # isort
|
|
75
|
+
"B", # bugbear -- flake8 bugfinder
|
|
76
|
+
"Q", # correct quotes usage
|
|
77
|
+
"PTH", # Replace os functions with pathlib functions
|
|
78
|
+
"SIM", # Simplify statements
|
|
79
|
+
"RET", # Return value related rules
|
|
80
|
+
"PIE", # misc flake8 rules
|
|
81
|
+
"FBT", # boolean traps
|
|
82
|
+
"PERF", # performance optimization
|
|
83
|
+
"PL", # pylint
|
|
84
|
+
"UP", # check for deprecated ways of coding
|
|
85
|
+
"FURB",
|
|
86
|
+
]
|
|
87
|
+
ignore = ["PLR2004"]
|
|
88
|
+
|
|
89
|
+
[tool.ruff.lint.isort]
|
|
90
|
+
combine-as-imports = true
|
|
91
|
+
|
|
92
|
+
[tool.ruff.lint.per-file-ignores]
|
|
93
|
+
"__init__.py" = ["F401"]
|
|
94
|
+
"**/client.py" = ["PLR0913", "PLR0912", "PLR0915", "PLC0415"]
|
|
95
|
+
"**/session.py" = ["PLC0415"]
|
|
96
|
+
"**/queries.py" = ["PLC0415"]
|
|
97
|
+
"**/endpoints.py" = ["PLR0913"]
|
|
98
|
+
"examples/**/*.py" = ["PLC0415", "F821", "PERF401", "B007", "F841"]
|
|
99
|
+
"marimo_checks/**/*.py" = ["PLC0415", "F821", "F841", "B905", "PLW2901", "I001", "PLR1711", "F404", "UP037"]
|
|
100
|
+
|
|
101
|
+
[tool.ruff.lint.pylint]
|
|
102
|
+
max-args = 10
|
|
103
|
+
max-branches = 25
|
|
104
|
+
max-statements = 75
|
|
105
|
+
max-returns = 10
|
|
106
|
+
|
|
107
|
+
[tool.pytest.ini_options]
|
|
108
|
+
pythonpath = [
|
|
109
|
+
"src"
|
|
110
|
+
]
|
|
111
|
+
testpaths = ["tests"]
|
|
112
|
+
python_files = "test_*.py"
|
|
113
|
+
python_functions = "test_*"
|
|
114
|
+
asyncio_mode = "auto"
|
|
115
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
116
|
+
markers = [
|
|
117
|
+
"live_api: marks tests that hit the live OpenAlex API (requires API key, skipped in CI)",
|
|
118
|
+
]
|
|
119
|
+
addopts = "-m 'not live_api'"
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Aletheca: Python interface for the OpenAlex API."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version as _get_version

# Look up the version of the installed "aletheca" distribution. Only the
# lookup itself is guarded; when no such distribution is found, fall back to
# a placeholder version string.
try:
    __version__ = _get_version("aletheca")
except PackageNotFoundError:
    __version__ = "0.0.0"
|
|
9
|
+
|
|
10
|
+
from bibliofabric.exceptions import (
|
|
11
|
+
APIError,
|
|
12
|
+
AuthError,
|
|
13
|
+
BibliofabricError,
|
|
14
|
+
ConfigurationError,
|
|
15
|
+
NetworkError,
|
|
16
|
+
NotFoundError,
|
|
17
|
+
RateLimitError,
|
|
18
|
+
TimeoutError,
|
|
19
|
+
ValidationError,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from .client import AlethecaClient
|
|
23
|
+
from .models import (
|
|
24
|
+
ApiResponse,
|
|
25
|
+
Author,
|
|
26
|
+
Award,
|
|
27
|
+
BaseEntity,
|
|
28
|
+
Funder,
|
|
29
|
+
Institution,
|
|
30
|
+
Keyword,
|
|
31
|
+
Meta,
|
|
32
|
+
Publisher,
|
|
33
|
+
Source,
|
|
34
|
+
Topic,
|
|
35
|
+
Work,
|
|
36
|
+
)
|
|
37
|
+
from .session import AlethecaSession
|
|
38
|
+
|
|
39
|
+
# Public API of the package: the version string, the exception types
# re-exported from ``bibliofabric.exceptions``, the client and session entry
# points, and the entity/response models imported from ``.models``.
__all__ = [
    "__version__",
    "APIError",
    "ApiResponse",
    "AuthError",
    "Award",
    "Author",
    "BaseEntity",
    "BibliofabricError",
    "ConfigurationError",
    "Funder",
    "Institution",
    "Keyword",
    "Meta",
    "NetworkError",
    "NotFoundError",
    "Publisher",
    "RateLimitError",
    "Source",
    "AlethecaClient",
    "AlethecaSession",
    "TimeoutError",
    "Topic",
    "ValidationError",
    "Work",
]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Utility helpers for working with OpenAlex identifiers and data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def normalize_doi(doi: str) -> str:
    """Normalize a DOI to its bare form (no resolver URL or ``doi:`` prefix).

    Prefix matching is case-insensitive. The DOI itself is returned with its
    original casing; only the prefix and surrounding whitespace are removed.

    Args:
        doi: A DOI string, possibly prefixed with ``https://doi.org/``,
            ``http://dx.doi.org/``, ``doi.org/`` or ``doi:``.

    Returns:
        The bare DOI string. Input without a recognized prefix is returned
        unchanged apart from whitespace stripping.

    Examples:
        >>> normalize_doi("https://doi.org/10.1234/x")
        '10.1234/x'
        >>> normalize_doi("DOI:10.1234/x")
        '10.1234/x'
        >>> normalize_doi("10.1234/x")
        '10.1234/x'
    """
    doi = doi.strip()
    # Longer/more specific prefixes come first so that e.g. "dx.doi.org/" is
    # not left behind by a match on a shorter alternative.
    prefixes = (
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "https://doi.org/",
        "http://doi.org/",
        "dx.doi.org/",
        "doi.org/",
        "doi:",
    )
    for prefix in prefixes:
        # Compare only the leading slice, lower-cased, so the DOI suffix keeps
        # its original case and slicing offsets stay exact.
        if doi[: len(prefix)].lower() == prefix:
            return doi[len(prefix) :].lstrip()
    return doi
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_openalex_id(url_or_id: str) -> str:
    """Extract the short OpenAlex ID from a full URL or bare ID.

    The ID must form a complete token: it has to start the string or follow a
    ``/``, and be followed by the end of the string, ``/``, ``?`` or ``#``.
    This prevents picking up a letter-plus-digits run from the middle of an
    unrelated token. Matching is case-insensitive and the returned ID is
    upper-cased.

    Args:
        url_or_id: An OpenAlex ID or URL (e.g., ``https://openalex.org/W123``).

    Returns:
        The short ID (e.g., ``W123``). If no ID is found, the stripped input
        is returned unchanged.

    Examples:
        >>> parse_openalex_id("https://openalex.org/W1234567890")
        'W1234567890'
        >>> parse_openalex_id("W1234567890")
        'W1234567890'
    """
    url_or_id = url_or_id.strip()
    match = re.search(
        r"(?:^|/)([WAITSFPDC]\d+)(?=$|[/?#])", url_or_id, re.IGNORECASE
    )
    if match:
        return match.group(1).upper()
    return url_or_id
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def detect_id_type(identifier: str) -> str | None:
|
|
53
|
+
"""Detect the type of a scholarly identifier.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
identifier: A string identifier.
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
One of ``"openalex"``, ``"doi"``, ``"pmid"``, ``"orcid"``,
|
|
60
|
+
``"issn"``, ``"ror"``, or ``None``.
|
|
61
|
+
"""
|
|
62
|
+
identifier = identifier.strip()
|
|
63
|
+
if re.match(r"^[WAITSFPDC]\d+$", identifier, re.IGNORECASE):
|
|
64
|
+
return "openalex"
|
|
65
|
+
identifier_lower = identifier.lower()
|
|
66
|
+
if identifier_lower.startswith("10.") or "doi.org/" in identifier_lower:
|
|
67
|
+
return "doi"
|
|
68
|
+
if re.match(r"^\d{4}-\d{3,4}$", identifier_lower):
|
|
69
|
+
return "issn"
|
|
70
|
+
if re.match(r"^\d{7,8}$", identifier_lower):
|
|
71
|
+
return "pmid"
|
|
72
|
+
if identifier_lower.startswith("https://orcid.org/") or re.match(
|
|
73
|
+
r"\d{4}-\d{4}-\d{4}-\d{4}", identifier_lower
|
|
74
|
+
):
|
|
75
|
+
return "orcid"
|
|
76
|
+
if identifier_lower.startswith("https://ror.org/") or re.match(
|
|
77
|
+
r"^0[a-hj-km-np-tv-z]{2,3}\w{3,14}$", identifier_lower
|
|
78
|
+
):
|
|
79
|
+
return "ror"
|
|
80
|
+
return None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def reconstruct_abstract(
|
|
84
|
+
inverted_index: dict[str, list[int]] | None,
|
|
85
|
+
) -> str | None:
|
|
86
|
+
"""Reconstruct an abstract from OpenAlex's inverted index format.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
inverted_index: Mapping of word → list of positions.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
The reconstructed abstract string, or None if input is None/empty.
|
|
93
|
+
"""
|
|
94
|
+
if not inverted_index:
|
|
95
|
+
return None
|
|
96
|
+
|
|
97
|
+
words: dict[int, str] = {}
|
|
98
|
+
for word, positions in inverted_index.items():
|
|
99
|
+
for pos in positions:
|
|
100
|
+
words[pos] = word
|
|
101
|
+
|
|
102
|
+
if not words:
|
|
103
|
+
return None
|
|
104
|
+
|
|
105
|
+
return " ".join(words[i] for i in sorted(words.keys()))
|