litfetch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litfetch-0.1.0/LICENSE +21 -0
- litfetch-0.1.0/PKG-INFO +230 -0
- litfetch-0.1.0/README.md +202 -0
- litfetch-0.1.0/litfetch/__init__.py +92 -0
- litfetch-0.1.0/litfetch/_doi.py +78 -0
- litfetch-0.1.0/litfetch/_http.py +194 -0
- litfetch-0.1.0/litfetch/artifacts.py +91 -0
- litfetch-0.1.0/litfetch/crossref.py +51 -0
- litfetch-0.1.0/litfetch/fetchers.py +888 -0
- litfetch-0.1.0/litfetch/ids.py +41 -0
- litfetch-0.1.0/litfetch/py.typed +0 -0
- litfetch-0.1.0/litfetch/relations.py +114 -0
- litfetch-0.1.0/litfetch/resolvers.py +202 -0
- litfetch-0.1.0/litfetch/semantic_scholar.py +71 -0
- litfetch-0.1.0/litfetch/serde.py +67 -0
- litfetch-0.1.0/litfetch/sessions.py +344 -0
- litfetch-0.1.0/litfetch/source_metadata.py +118 -0
- litfetch-0.1.0/litfetch/unpaywall.py +58 -0
- litfetch-0.1.0/litfetch.egg-info/PKG-INFO +230 -0
- litfetch-0.1.0/litfetch.egg-info/SOURCES.txt +31 -0
- litfetch-0.1.0/litfetch.egg-info/dependency_links.txt +1 -0
- litfetch-0.1.0/litfetch.egg-info/requires.txt +5 -0
- litfetch-0.1.0/litfetch.egg-info/top_level.txt +1 -0
- litfetch-0.1.0/pyproject.toml +129 -0
- litfetch-0.1.0/setup.cfg +4 -0
- litfetch-0.1.0/tests/test_doi.py +80 -0
- litfetch-0.1.0/tests/test_fetchers.py +746 -0
- litfetch-0.1.0/tests/test_http.py +103 -0
- litfetch-0.1.0/tests/test_relations.py +79 -0
- litfetch-0.1.0/tests/test_resolvers.py +147 -0
- litfetch-0.1.0/tests/test_serde.py +28 -0
- litfetch-0.1.0/tests/test_sessions.py +184 -0
- litfetch-0.1.0/tests/test_source_metadata.py +130 -0
litfetch-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Centre for Population Genomics
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
litfetch-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: litfetch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Identifier -> the retrievable artifacts of a scholarly article: a pluggable source ladder + identifier resolvers.
|
|
5
|
+
Author: Toby Sargeant
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/populationgenomics/litfetch
|
|
8
|
+
Project-URL: Issues, https://github.com/populationgenomics/litfetch/issues
|
|
9
|
+
Keywords: pubmed,pmc,full-text,jats,elsevier,supplementary
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: defusedxml>=0.7
|
|
24
|
+
Requires-Dist: httpx>=0.28
|
|
25
|
+
Provides-Extra: biorxiv
|
|
26
|
+
Requires-Dist: curl_cffi>=0.7; extra == "biorxiv"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# litfetch
|
|
30
|
+
|
|
31
|
+
Resolve a scholarly article identifier to its **retrievable artifacts** — the
|
|
32
|
+
full-text body and any supplementary material — and fetch their bytes.
|
|
33
|
+
|
|
34
|
+
litfetch is two cooperating seams:
|
|
35
|
+
|
|
36
|
+
- a **fetch ladder** — pluggable `Fetcher` backends (PMC Open Access S3, Europe
|
|
37
|
+
PMC, Elsevier OA) tried in priority order; the first to serve the body wins,
|
|
38
|
+
returning a `Blob` (a `File` plus its bytes);
|
|
39
|
+
- an optional **resolver layer** — pluggable `Resolver`s that enrich what you
|
|
40
|
+
know about a paper (`pmid` → `pmcid`/`doi`, etc.) so the ladder can act.
|
|
41
|
+
|
|
42
|
+
You hand it an `ArticleIds` bundle (any of `pmid` / `pmcid` / `doi`). Resolution
|
|
43
|
+
is **demand-driven**: a resolver only runs when the next fetcher needs an
|
|
44
|
+
identifier you don't yet have, and runs at most once.
|
|
45
|
+
|
|
46
|
+
An article is modelled as a **file-set**: a collection of `File` references (the
|
|
47
|
+
body in its various media types, plus supplementary material, distinguished by
|
|
48
|
+
`FileKind`), each hosted upstream. litfetch fetches the raw artifacts and reports
|
|
49
|
+
their access terms; it does **not** render them. To turn a fetched JATS/Elsevier
|
|
50
|
+
body into markdown, run [litdown](https://github.com/populationgenomics/litdown)
|
|
51
|
+
on the bytes yourself (see [Render to markdown](#render-to-markdown)).
|
|
52
|
+
|
|
53
|
+
The examples below are a tour; [`docs/api.md`](docs/api.md) is the full
|
|
54
|
+
reference for the public surface.
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```sh
|
|
59
|
+
pip install litfetch
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
bioRxiv / medRxiv preprint full text needs a browser-fingerprint HTTP client,
|
|
63
|
+
enabled by the `biorxiv` extra:
|
|
64
|
+
|
|
65
|
+
```sh
|
|
66
|
+
pip install 'litfetch[biorxiv]'
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Usage
|
|
70
|
+
|
|
71
|
+
### Fetch the body
|
|
72
|
+
|
|
73
|
+
Hand `fetch_body` an `ArticleIds`; the default ladder serves the first available
|
|
74
|
+
body as a `Blob`:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from litfetch import ArticleIds, fetch_body
|
|
78
|
+
|
|
79
|
+
blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
|
|
80
|
+
if blob:
|
|
81
|
+
print(blob.file.source, blob.file.media_type, len(blob.content))
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Render to markdown
|
|
85
|
+
|
|
86
|
+
litfetch returns raw bytes, not markdown. Convert a JATS/Elsevier body with
|
|
87
|
+
[litdown](https://github.com/populationgenomics/litdown) — you pick and pin the
|
|
88
|
+
converter:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import io
|
|
92
|
+
import litdown
|
|
93
|
+
from litfetch import ArticleIds, fetch_body
|
|
94
|
+
|
|
95
|
+
blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
|
|
96
|
+
if blob:
|
|
97
|
+
markdown = litdown.convert(io.BytesIO(blob.content))
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Inject your own resolver
|
|
101
|
+
|
|
102
|
+
A resolver is an async `(ArticleIds, Http) -> ArticleIds` — the session running
|
|
103
|
+
it supplies the `Http`. Enrich from whatever you have — a corpus client, a local
|
|
104
|
+
cache, an API — and `merge` it in (this one ignores `Http`, hence `_http`):
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from litfetch import ArticleIds, Http, fetch_body
|
|
108
|
+
|
|
109
|
+
async def my_resolver(ids: ArticleIds, _http: Http) -> ArticleIds:
|
|
110
|
+
if not ids.pmid:
|
|
111
|
+
return ids
|
|
112
|
+
pmcid, doi = await my_corpus.lookup(ids.pmid)
|
|
113
|
+
return ids.merge(ArticleIds(pmcid=pmcid, doi=doi))
|
|
114
|
+
|
|
115
|
+
blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=my_resolver)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Use a bundled resolver
|
|
119
|
+
|
|
120
|
+
Bundled resolvers are constructed with their config, then passed in the same
|
|
121
|
+
slot. `chain(...)` composes several (yours first, fallbacks after); it stops
|
|
122
|
+
once every identifier is known:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from litfetch import ArticleIds, fetch_body
|
|
126
|
+
from litfetch.resolvers import SemanticScholarResolver, NcbiIdConverterResolver, chain
|
|
127
|
+
|
|
128
|
+
resolver = chain(
|
|
129
|
+
my_resolver, # your own
|
|
130
|
+
SemanticScholarResolver(api_key=S2_KEY), # bundled
|
|
131
|
+
NcbiIdConverterResolver(tool='myapp'), # bundled
|
|
132
|
+
)
|
|
133
|
+
blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=resolver)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Polite-pool identification (NCBI/Crossref `email`, Unpaywall's required `email`)
|
|
137
|
+
comes from a session `contact`, not a hardcoded default — set it on the session:
|
|
138
|
+
`async with litfetch.Session(contact='you@example.org') as s: await s.fetch_body(...)`.
|
|
139
|
+
|
|
140
|
+
`default_resolver()` is a batteries-included, keyless chain
|
|
141
|
+
(Europe PMC search + NCBI ID Converter).
|
|
142
|
+
|
|
143
|
+
### No resolver — you already hold the IDs
|
|
144
|
+
|
|
145
|
+
A non-PubMed paper you only have a DOI for, plus your own Elsevier key:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
blob = await fetch_body(
|
|
149
|
+
ArticleIds(doi='10.1016/j.cell.2020.01.001'),
|
|
150
|
+
credentials={'elsevier_api_key': key},
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Supplementary material
|
|
155
|
+
|
|
156
|
+
`list_files` enumerates the file-set (references, no bytes); `fetch_file`
|
|
157
|
+
materialises one:
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from litfetch import ArticleIds, FileKind, list_files, fetch_file
|
|
161
|
+
|
|
162
|
+
files = await list_files(ArticleIds(pmcid='PMC5334499'), kind=FileKind.SUPPLEMENTARY)
|
|
163
|
+
for file in files:
|
|
164
|
+
blob = await fetch_file(file)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Access terms
|
|
168
|
+
|
|
169
|
+
Read the licence from the fetched bytes, falling back to an access authority
|
|
170
|
+
(Unpaywall) when the bytes carry none:
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from litfetch import ArticleIds, extract_source_metadata, resolve_access
|
|
174
|
+
|
|
175
|
+
meta = extract_source_metadata(blob) # from the JATS/Elsevier bytes
|
|
176
|
+
if meta.licence is None:
|
|
177
|
+
meta = await resolve_access(ArticleIds(doi='10.1016/j.cell.2020.01.001'))
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Resolvers stand alone
|
|
181
|
+
|
|
182
|
+
Each resolver is usable on its own as a cross-reference tool, independent of
|
|
183
|
+
fetching. A resolver is given the `Http` to use, so run it inside a session:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from litfetch import ArticleIds, Session
|
|
187
|
+
from litfetch.resolvers import SemanticScholarResolver
|
|
188
|
+
|
|
189
|
+
async with Session() as s:
|
|
190
|
+
ids = await SemanticScholarResolver()(ArticleIds(doi='10.1016/j.cell.2020.01.001'), s)
|
|
191
|
+
print(ids.pmid, ids.pmcid)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Batch: one session, a scope per paper
|
|
195
|
+
|
|
196
|
+
The one-shot functions above each open a throwaway session. For many papers,
|
|
197
|
+
hold one `Session` (pooled connection, shared pacing) and open a `scope` per
|
|
198
|
+
paper — the scope caches within itself, so a duplicate upstream call (e.g.
|
|
199
|
+
Unpaywall for both licence and PDF) is fetched once:
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from litfetch import ArticleIds, Session
|
|
203
|
+
|
|
204
|
+
async with Session() as session:
|
|
205
|
+
for pmid in pmids:
|
|
206
|
+
async with session.scope() as s:
|
|
207
|
+
blob = await s.fetch_body(ArticleIds(pmid=pmid))
|
|
208
|
+
access = await s.resolve_access(ArticleIds(pmid=pmid))
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## Extending
|
|
212
|
+
|
|
213
|
+
- **A new body fetcher:** implement the `Fetcher` protocol — a `name`, a
|
|
214
|
+
`requires: frozenset[str]` of the `ArticleIds` fields it needs, and an async
|
|
215
|
+
`fetch(ids, *, credentials, http)` returning a body `Blob` or `None`.
|
|
216
|
+
Add it to a `fetchers=` list (or your own `default_fetchers`).
|
|
217
|
+
- **A new file source:** implement the `FileSource` protocol — a `name`, and
|
|
218
|
+
async `list_files(ids, ...)` / `fetch_file(file, ...)` — to enumerate and
|
|
219
|
+
materialise an article's file-set (body renditions and supplementary alike).
|
|
220
|
+
- **A new resolver:** write an async `(ArticleIds, Http) -> ArticleIds` that fills gaps
|
|
221
|
+
via `ArticleIds.merge` and never overwrites a known id.
|
|
222
|
+
|
|
223
|
+
## Development
|
|
224
|
+
|
|
225
|
+
```sh
|
|
226
|
+
uv sync
|
|
227
|
+
uv run ruff check . && uv run ruff format --check .
|
|
228
|
+
uv run pyright
|
|
229
|
+
uv run pytest
|
|
230
|
+
```
|
litfetch-0.1.0/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# litfetch
|
|
2
|
+
|
|
3
|
+
Resolve a scholarly article identifier to its **retrievable artifacts** — the
|
|
4
|
+
full-text body and any supplementary material — and fetch their bytes.
|
|
5
|
+
|
|
6
|
+
litfetch is two cooperating seams:
|
|
7
|
+
|
|
8
|
+
- a **fetch ladder** — pluggable `Fetcher` backends (PMC Open Access S3, Europe
|
|
9
|
+
PMC, Elsevier OA) tried in priority order; the first to serve the body wins,
|
|
10
|
+
returning a `Blob` (a `File` plus its bytes);
|
|
11
|
+
- an optional **resolver layer** — pluggable `Resolver`s that enrich what you
|
|
12
|
+
know about a paper (`pmid` → `pmcid`/`doi`, etc.) so the ladder can act.
|
|
13
|
+
|
|
14
|
+
You hand it an `ArticleIds` bundle (any of `pmid` / `pmcid` / `doi`). Resolution
|
|
15
|
+
is **demand-driven**: a resolver only runs when the next fetcher needs an
|
|
16
|
+
identifier you don't yet have, and runs at most once.
|
|
17
|
+
|
|
18
|
+
An article is modelled as a **file-set**: a collection of `File` references (the
|
|
19
|
+
body in its various media types, plus supplementary material, distinguished by
|
|
20
|
+
`FileKind`), each hosted upstream. litfetch fetches the raw artifacts and reports
|
|
21
|
+
their access terms; it does **not** render them. To turn a fetched JATS/Elsevier
|
|
22
|
+
body into markdown, run [litdown](https://github.com/populationgenomics/litdown)
|
|
23
|
+
on the bytes yourself (see [Render to markdown](#render-to-markdown)).
|
|
24
|
+
|
|
25
|
+
The examples below are a tour; [`docs/api.md`](docs/api.md) is the full
|
|
26
|
+
reference for the public surface.
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```sh
|
|
31
|
+
pip install litfetch
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
bioRxiv / medRxiv preprint full text needs a browser-fingerprint HTTP client,
|
|
35
|
+
enabled by the `biorxiv` extra:
|
|
36
|
+
|
|
37
|
+
```sh
|
|
38
|
+
pip install 'litfetch[biorxiv]'
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
### Fetch the body
|
|
44
|
+
|
|
45
|
+
Hand `fetch_body` an `ArticleIds`; the default ladder serves the first available
|
|
46
|
+
body as a `Blob`:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from litfetch import ArticleIds, fetch_body
|
|
50
|
+
|
|
51
|
+
blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
|
|
52
|
+
if blob:
|
|
53
|
+
print(blob.file.source, blob.file.media_type, len(blob.content))
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Render to markdown
|
|
57
|
+
|
|
58
|
+
litfetch returns raw bytes, not markdown. Convert a JATS/Elsevier body with
|
|
59
|
+
[litdown](https://github.com/populationgenomics/litdown) — you pick and pin the
|
|
60
|
+
converter:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import io
|
|
64
|
+
import litdown
|
|
65
|
+
from litfetch import ArticleIds, fetch_body
|
|
66
|
+
|
|
67
|
+
blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
|
|
68
|
+
if blob:
|
|
69
|
+
markdown = litdown.convert(io.BytesIO(blob.content))
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Inject your own resolver
|
|
73
|
+
|
|
74
|
+
A resolver is an async `(ArticleIds, Http) -> ArticleIds` — the session running
|
|
75
|
+
it supplies the `Http`. Enrich from whatever you have — a corpus client, a local
|
|
76
|
+
cache, an API — and `merge` it in (this one ignores `Http`, hence `_http`):
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from litfetch import ArticleIds, Http, fetch_body
|
|
80
|
+
|
|
81
|
+
async def my_resolver(ids: ArticleIds, _http: Http) -> ArticleIds:
|
|
82
|
+
if not ids.pmid:
|
|
83
|
+
return ids
|
|
84
|
+
pmcid, doi = await my_corpus.lookup(ids.pmid)
|
|
85
|
+
return ids.merge(ArticleIds(pmcid=pmcid, doi=doi))
|
|
86
|
+
|
|
87
|
+
blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=my_resolver)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Use a bundled resolver
|
|
91
|
+
|
|
92
|
+
Bundled resolvers are constructed with their config, then passed in the same
|
|
93
|
+
slot. `chain(...)` composes several (yours first, fallbacks after); it stops
|
|
94
|
+
once every identifier is known:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from litfetch import ArticleIds, fetch_body
|
|
98
|
+
from litfetch.resolvers import SemanticScholarResolver, NcbiIdConverterResolver, chain
|
|
99
|
+
|
|
100
|
+
resolver = chain(
|
|
101
|
+
my_resolver, # your own
|
|
102
|
+
SemanticScholarResolver(api_key=S2_KEY), # bundled
|
|
103
|
+
NcbiIdConverterResolver(tool='myapp'), # bundled
|
|
104
|
+
)
|
|
105
|
+
blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=resolver)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Polite-pool identification (NCBI/Crossref `email`, Unpaywall's required `email`)
|
|
109
|
+
comes from a session `contact`, not a hardcoded default — set it on the session:
|
|
110
|
+
`async with litfetch.Session(contact='you@example.org') as s: await s.fetch_body(...)`.
|
|
111
|
+
|
|
112
|
+
`default_resolver()` is a batteries-included, keyless chain
|
|
113
|
+
(Europe PMC search + NCBI ID Converter).
|
|
114
|
+
|
|
115
|
+
### No resolver — you already hold the IDs
|
|
116
|
+
|
|
117
|
+
A non-PubMed paper you only have a DOI for, plus your own Elsevier key:
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
blob = await fetch_body(
|
|
121
|
+
ArticleIds(doi='10.1016/j.cell.2020.01.001'),
|
|
122
|
+
credentials={'elsevier_api_key': key},
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Supplementary material
|
|
127
|
+
|
|
128
|
+
`list_files` enumerates the file-set (references, no bytes); `fetch_file`
|
|
129
|
+
materialises one:
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from litfetch import ArticleIds, FileKind, list_files, fetch_file
|
|
133
|
+
|
|
134
|
+
files = await list_files(ArticleIds(pmcid='PMC5334499'), kind=FileKind.SUPPLEMENTARY)
|
|
135
|
+
for file in files:
|
|
136
|
+
blob = await fetch_file(file)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Access terms
|
|
140
|
+
|
|
141
|
+
Read the licence from the fetched bytes, falling back to an access authority
|
|
142
|
+
(Unpaywall) when the bytes carry none:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from litfetch import ArticleIds, extract_source_metadata, resolve_access
|
|
146
|
+
|
|
147
|
+
meta = extract_source_metadata(blob) # from the JATS/Elsevier bytes
|
|
148
|
+
if meta.licence is None:
|
|
149
|
+
meta = await resolve_access(ArticleIds(doi='10.1016/j.cell.2020.01.001'))
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Resolvers stand alone
|
|
153
|
+
|
|
154
|
+
Each resolver is usable on its own as a cross-reference tool, independent of
|
|
155
|
+
fetching. A resolver is given the `Http` to use, so run it inside a session:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from litfetch import ArticleIds, Session
|
|
159
|
+
from litfetch.resolvers import SemanticScholarResolver
|
|
160
|
+
|
|
161
|
+
async with Session() as s:
|
|
162
|
+
ids = await SemanticScholarResolver()(ArticleIds(doi='10.1016/j.cell.2020.01.001'), s)
|
|
163
|
+
print(ids.pmid, ids.pmcid)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Batch: one session, a scope per paper
|
|
167
|
+
|
|
168
|
+
The one-shot functions above each open a throwaway session. For many papers,
|
|
169
|
+
hold one `Session` (pooled connection, shared pacing) and open a `scope` per
|
|
170
|
+
paper — the scope caches within itself, so a duplicate upstream call (e.g.
|
|
171
|
+
Unpaywall for both licence and PDF) is fetched once:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from litfetch import ArticleIds, Session
|
|
175
|
+
|
|
176
|
+
async with Session() as session:
|
|
177
|
+
for pmid in pmids:
|
|
178
|
+
async with session.scope() as s:
|
|
179
|
+
blob = await s.fetch_body(ArticleIds(pmid=pmid))
|
|
180
|
+
access = await s.resolve_access(ArticleIds(pmid=pmid))
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Extending
|
|
184
|
+
|
|
185
|
+
- **A new body fetcher:** implement the `Fetcher` protocol — a `name`, a
|
|
186
|
+
`requires: frozenset[str]` of the `ArticleIds` fields it needs, and an async
|
|
187
|
+
`fetch(ids, *, credentials, http)` returning a body `Blob` or `None`.
|
|
188
|
+
Add it to a `fetchers=` list (or your own `default_fetchers`).
|
|
189
|
+
- **A new file source:** implement the `FileSource` protocol — a `name`, and
|
|
190
|
+
async `list_files(ids, ...)` / `fetch_file(file, ...)` — to enumerate and
|
|
191
|
+
materialise an article's file-set (body renditions and supplementary alike).
|
|
192
|
+
- **A new resolver:** write an async `(ArticleIds, Http) -> ArticleIds` that fills gaps
|
|
193
|
+
via `ArticleIds.merge` and never overwrites a known id.
|
|
194
|
+
|
|
195
|
+
## Development
|
|
196
|
+
|
|
197
|
+
```sh
|
|
198
|
+
uv sync
|
|
199
|
+
uv run ruff check . && uv run ruff format --check .
|
|
200
|
+
uv run pyright
|
|
201
|
+
uv run pytest
|
|
202
|
+
```
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""litfetch: identifier -> the retrievable artifacts of a scholarly article.
|
|
2
|
+
|
|
3
|
+
Hand :func:`fetch_body` an :class:`ArticleIds` bundle (any of pmid / pmcid / doi)
|
|
4
|
+
and, optionally, a :data:`~litfetch.resolvers.Resolver` to fill in missing
|
|
5
|
+
identifiers on demand. A :class:`~litfetch.fetchers.Fetcher` ladder is tried in
|
|
6
|
+
priority order; the first to serve the body yields a :class:`Blob` (a
|
|
7
|
+
:class:`File` plus its bytes). Supplementary material is discovered with
|
|
8
|
+
:func:`list_files` and fetched with :func:`fetch_file`.
|
|
9
|
+
|
|
10
|
+
An article is modelled as a *file-set*: a collection of :class:`File` references
|
|
11
|
+
(body renditions and supplementary material, by :class:`FileKind`) sharing one
|
|
12
|
+
identity, each hosted upstream. litfetch fetches the raw artifacts and reports
|
|
13
|
+
their access terms (:class:`SourceMetadata`); rendering them (e.g. XML ->
|
|
14
|
+
markdown via litdown) and storing them are the consumer's concern. The bundled
|
|
15
|
+
identifier resolvers (Europe PMC, NCBI ID Converter, Semantic Scholar) live in
|
|
16
|
+
:mod:`litfetch.resolvers`; file-set listing and fetching live in
|
|
17
|
+
:mod:`litfetch.fetchers`.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from litfetch._http import Http, Rate, RetryPolicy
|
|
23
|
+
from litfetch.artifacts import (
|
|
24
|
+
INSTITUTIONAL,
|
|
25
|
+
Blob,
|
|
26
|
+
File,
|
|
27
|
+
FileKind,
|
|
28
|
+
SourceMetadata,
|
|
29
|
+
)
|
|
30
|
+
from litfetch.fetchers import (
|
|
31
|
+
BiorxivFetcher,
|
|
32
|
+
CrossrefFileSource,
|
|
33
|
+
ElsevierFetcher,
|
|
34
|
+
EuropePmcFetcher,
|
|
35
|
+
Fetcher,
|
|
36
|
+
FileSource,
|
|
37
|
+
PmcOaFetcher,
|
|
38
|
+
SemanticScholarFileSource,
|
|
39
|
+
SpringerFetcher,
|
|
40
|
+
SpringerFileSource,
|
|
41
|
+
UnpaywallFileSource,
|
|
42
|
+
default_fetchers,
|
|
43
|
+
default_file_sources,
|
|
44
|
+
)
|
|
45
|
+
from litfetch.ids import ArticleIds
|
|
46
|
+
from litfetch.relations import Related, RelationType
|
|
47
|
+
from litfetch.sessions import (
|
|
48
|
+
Session,
|
|
49
|
+
fetch_body,
|
|
50
|
+
fetch_file,
|
|
51
|
+
list_files,
|
|
52
|
+
related_ids,
|
|
53
|
+
resolve_access,
|
|
54
|
+
)
|
|
55
|
+
from litfetch.source_metadata import extract_source_metadata
|
|
56
|
+
|
|
57
|
+
__version__ = '0.1.0'
|
|
58
|
+
|
|
59
|
+
__all__ = [
|
|
60
|
+
'INSTITUTIONAL',
|
|
61
|
+
'ArticleIds',
|
|
62
|
+
'BiorxivFetcher',
|
|
63
|
+
'Blob',
|
|
64
|
+
'CrossrefFileSource',
|
|
65
|
+
'ElsevierFetcher',
|
|
66
|
+
'EuropePmcFetcher',
|
|
67
|
+
'Fetcher',
|
|
68
|
+
'File',
|
|
69
|
+
'FileKind',
|
|
70
|
+
'FileSource',
|
|
71
|
+
'Http',
|
|
72
|
+
'PmcOaFetcher',
|
|
73
|
+
'Rate',
|
|
74
|
+
'Related',
|
|
75
|
+
'RelationType',
|
|
76
|
+
'RetryPolicy',
|
|
77
|
+
'SemanticScholarFileSource',
|
|
78
|
+
'Session',
|
|
79
|
+
'SourceMetadata',
|
|
80
|
+
'SpringerFetcher',
|
|
81
|
+
'SpringerFileSource',
|
|
82
|
+
'UnpaywallFileSource',
|
|
83
|
+
'__version__',
|
|
84
|
+
'default_fetchers',
|
|
85
|
+
'default_file_sources',
|
|
86
|
+
'extract_source_metadata',
|
|
87
|
+
'fetch_body',
|
|
88
|
+
'fetch_file',
|
|
89
|
+
'list_files',
|
|
90
|
+
'related_ids',
|
|
91
|
+
'resolve_access',
|
|
92
|
+
]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""DOI validation and URL-safe path encoding.
|
|
2
|
+
|
|
3
|
+
Several sources interpolate a DOI into an upstream URL path (Unpaywall,
|
|
4
|
+
Crossref, Semantic Scholar, bioRxiv, the future doi.org resolve). Doing so
|
|
5
|
+
raw is wrong twice over: a DOI suffix may contain ``?``, ``#``, spaces, or
|
|
6
|
+
``/`` -- which truncate or reshape the URL -- and a crafted ``.``/``..``
|
|
7
|
+
segment is a path-traversal vector. :func:`encode_doi_path` is the one safe
|
|
8
|
+
way to place a DOI in a URL path; :func:`normalize_and_validate_doi` is the
|
|
9
|
+
shape gate it builds on.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import urllib.parse
|
|
16
|
+
|
|
17
|
+
# Shape gate for a bare DOI, ``10.<registrant>/<suffix>``:
#
# * the registrant is a run of digits, optionally followed by dot-separated
#   numeric sub-elements (e.g. ``10.1000.10``);
# * the suffix is any non-empty string.
#
# No digit-count limit is imposed: the familiar 4-9 digit range describes
# Crossref's observed corpus rather than a rule of the spec, so an unusual
# registrant still passes. The pattern only checks shape; callers hand back the
# matched value unchanged because suffix case is significant for many
# registrants. ``re.IGNORECASE`` mirrors the case-insensitivity of DOIs, though
# the pattern itself contains no letters, so it does not alter what matches.
_DOI_RE = re.compile(
    r'^10\.\d+(?:\.\d+)*/.+$',
    re.IGNORECASE,
)

# Resolver URL prefixes a caller-supplied DOI may be wrapped in. They are
# written in lowercase because they are compared against a lowercased copy of
# the input, and are removed before the shape check.
_RESOLVER_PREFIXES = (
    'https://doi.org/',
    'http://doi.org/',
    'https://dx.doi.org/',
    'http://dx.doi.org/',
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def normalize_and_validate_doi(doi: str) -> str:
    """Strip common decorations from a DOI and check that it is well-formed.

    The input may carry surrounding whitespace, a resolver URL prefix (one of
    ``_RESOLVER_PREFIXES``, matched case-insensitively), and/or a ``doi:``
    scheme (also case-insensitive). These are removed in that order and the
    remainder is checked against ``_DOI_RE``. The case of the returned DOI is
    left exactly as supplied.

    Args:
        doi: The DOI to normalise, possibly decorated.

    Returns:
        The bare ``10.<registrant>/<suffix>`` DOI.

    Raises:
        ValueError: If what remains after stripping does not look like a DOI.
    """
    bare = doi.strip()

    # Compare against a lowercased copy, but slice the original so the DOI's
    # own casing survives. Only the first matching prefix is removed.
    folded = bare.lower()
    matched_prefix = next(
        (prefix for prefix in _RESOLVER_PREFIXES if folded.startswith(prefix)),
        None,
    )
    if matched_prefix is not None:
        bare = bare[len(matched_prefix) :]

    # A ``doi:`` scheme may be followed by whitespace, hence the second strip.
    scheme = 'doi:'
    if bare.lower().startswith(scheme):
        bare = bare[len(scheme) :].strip()

    if _DOI_RE.match(bare) is None:
        raise ValueError(f'not a valid DOI: {doi!r}')
    return bare
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def encode_doi_path(doi: str) -> str:
    """Return a DOI percent-encoded for interpolation into a URL path.

    The DOI is first normalised and validated with
    :func:`normalize_and_validate_doi`. It is then split on ``/`` and every
    segment is percent-encoded with no characters marked safe, so characters
    such as ``?``, ``#``, or a space inside a segment cannot reshape the URL.
    The ``/`` separators themselves are kept as path separators. A segment
    that is exactly ``.`` or ``..`` is rejected as a path-traversal attempt.

    Args:
        doi: The DOI to encode, possibly decorated.

    Returns:
        The encoded DOI, ready to interpolate after a URL's path separator.

    Raises:
        ValueError: If the DOI is invalid or contains a dot-segment.
    """
    parts = normalize_and_validate_doi(doi).split('/')

    # Reject dot-segments up front, before any segment is encoded.
    if '.' in parts or '..' in parts:
        raise ValueError(f'DOI contains a path-traversal segment: {doi!r}')

    encoded_parts = [urllib.parse.quote(part, safe='') for part in parts]
    return '/'.join(encoded_parts)
|