litfetch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litfetch-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Centre for Population Genomics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,230 @@
1
+ Metadata-Version: 2.4
2
+ Name: litfetch
3
+ Version: 0.1.0
4
+ Summary: Identifier -> the retrievable artifacts of a scholarly article: a pluggable source ladder + identifier resolvers.
5
+ Author: Toby Sargeant
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/populationgenomics/litfetch
8
+ Project-URL: Issues, https://github.com/populationgenomics/litfetch/issues
9
+ Keywords: pubmed,pmc,full-text,jats,elsevier,supplementary
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: defusedxml>=0.7
24
+ Requires-Dist: httpx>=0.28
25
+ Provides-Extra: biorxiv
26
+ Requires-Dist: curl_cffi>=0.7; extra == "biorxiv"
27
+ Dynamic: license-file
28
+
29
+ # litfetch
30
+
31
+ Resolve a scholarly article identifier to its **retrievable artifacts** — the
32
+ full-text body and any supplementary material — and fetch their bytes.
33
+
34
+ litfetch is two cooperating seams:
35
+
36
+ - a **fetch ladder** — pluggable `Fetcher` backends (PMC Open Access S3, Europe
37
+ PMC, Elsevier OA) tried in priority order; the first to serve the body wins,
38
+ returning a `Blob` (a `File` plus its bytes);
39
+ - an optional **resolver layer** — pluggable `Resolver`s that enrich what you
40
+ know about a paper (`pmid` → `pmcid`/`doi`, etc.) so the ladder can act.
41
+
42
+ You hand it an `ArticleIds` bundle (any of `pmid` / `pmcid` / `doi`). Resolution
43
+ is **demand-driven**: a resolver only runs when the next fetcher needs an
44
+ identifier you don't yet have, and runs at most once.
45
+
46
+ An article is modelled as a **file-set**: a collection of `File` references (the
47
+ body in its various media types, plus supplementary material, distinguished by
48
+ `FileKind`), each hosted upstream. litfetch fetches the raw artifacts and reports
49
+ their access terms; it does **not** render them. To turn a fetched JATS/Elsevier
50
+ body into markdown, run [litdown](https://github.com/populationgenomics/litdown)
51
+ on the bytes yourself (see [Render to markdown](#render-to-markdown)).
52
+
53
+ The examples below are a tour; [`docs/api.md`](docs/api.md) is the full
54
+ reference for the public surface.
55
+
56
+ ## Install
57
+
58
+ ```sh
59
+ pip install litfetch
60
+ ```
61
+
62
+ bioRxiv / medRxiv preprint full text needs a browser-fingerprint HTTP client,
63
+ enabled by the `biorxiv` extra:
64
+
65
+ ```sh
66
+ pip install 'litfetch[biorxiv]'
67
+ ```
68
+
69
+ ## Usage
70
+
71
+ ### Fetch the body
72
+
73
+ Hand `fetch_body` an `ArticleIds`; the default ladder serves the first available
74
+ body as a `Blob`:
75
+
76
+ ```python
77
+ from litfetch import ArticleIds, fetch_body
78
+
79
+ blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
80
+ if blob:
81
+ print(blob.file.source, blob.file.media_type, len(blob.content))
82
+ ```
83
+
84
+ ### Render to markdown
85
+
86
+ litfetch returns raw bytes, not markdown. Convert a JATS/Elsevier body with
87
+ [litdown](https://github.com/populationgenomics/litdown) — you pick and pin the
88
+ converter:
89
+
90
+ ```python
91
+ import io
92
+ import litdown
93
+ from litfetch import ArticleIds, fetch_body
94
+
95
+ blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
96
+ if blob:
97
+ markdown = litdown.convert(io.BytesIO(blob.content))
98
+ ```
99
+
100
+ ### Inject your own resolver
101
+
102
+ A resolver is an async `(ArticleIds, Http) -> ArticleIds` — the session running
103
+ it supplies the `Http`. Enrich from whatever you have — a corpus client, a local
104
+ cache, an API — and `merge` it in (this one ignores `Http`, hence `_http`):
105
+
106
+ ```python
107
+ from litfetch import ArticleIds, Http, fetch_body
108
+
109
+ async def my_resolver(ids: ArticleIds, _http: Http) -> ArticleIds:
110
+ if not ids.pmid:
111
+ return ids
112
+ pmcid, doi = await my_corpus.lookup(ids.pmid)
113
+ return ids.merge(ArticleIds(pmcid=pmcid, doi=doi))
114
+
115
+ blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=my_resolver)
116
+ ```
117
+
118
+ ### Use a bundled resolver
119
+
120
+ Bundled resolvers are constructed with their config, then passed in the same
121
+ slot. `chain(...)` composes several (yours first, fallbacks after); it stops
122
+ once every identifier is known:
123
+
124
+ ```python
125
+ from litfetch import ArticleIds, fetch_body
126
+ from litfetch.resolvers import SemanticScholarResolver, NcbiIdConverterResolver, chain
127
+
128
+ resolver = chain(
129
+ my_resolver, # your own
130
+ SemanticScholarResolver(api_key=S2_KEY), # bundled
131
+ NcbiIdConverterResolver(tool='myapp'), # bundled
132
+ )
133
+ blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=resolver)
134
+ ```
135
+
136
+ Polite-pool identification (NCBI/Crossref `email`, Unpaywall's required `email`)
137
+ comes from a session `contact`, not a hardcoded default — set it on the session:
138
+ `async with litfetch.Session(contact='you@example.org') as s: await s.fetch_body(...)`.
139
+
140
+ `default_resolver()` is a batteries-included, keyless chain
141
+ (Europe PMC search + NCBI ID Converter).
142
+
143
+ ### No resolver — you already hold the IDs
144
+
145
+ A non-PubMed paper you only have a DOI for, plus your own Elsevier key:
146
+
147
+ ```python
148
+ blob = await fetch_body(
149
+ ArticleIds(doi='10.1016/j.cell.2020.01.001'),
150
+ credentials={'elsevier_api_key': key},
151
+ )
152
+ ```
153
+
154
+ ### Supplementary material
155
+
156
+ `list_files` enumerates the file-set (references, no bytes); `fetch_file`
157
+ materialises one:
158
+
159
+ ```python
160
+ from litfetch import ArticleIds, FileKind, list_files, fetch_file
161
+
162
+ files = await list_files(ArticleIds(pmcid='PMC5334499'), kind=FileKind.SUPPLEMENTARY)
163
+ for file in files:
164
+ blob = await fetch_file(file)
165
+ ```
166
+
167
+ ### Access terms
168
+
169
+ Read the licence from the fetched bytes, falling back to an access authority
170
+ (Unpaywall) when the bytes carry none:
171
+
172
+ ```python
173
+ from litfetch import ArticleIds, extract_source_metadata, resolve_access
174
+
175
+ meta = extract_source_metadata(blob) # from the JATS/Elsevier bytes
176
+ if meta.licence is None:
177
+ meta = await resolve_access(ArticleIds(doi='10.1016/j.cell.2020.01.001'))
178
+ ```
179
+
180
+ ### Resolvers stand alone
181
+
182
+ Each resolver is usable on its own as a cross-reference tool, independent of
183
+ fetching. A resolver is given the `Http` to use, so run it inside a session:
184
+
185
+ ```python
186
+ from litfetch import ArticleIds, Session
187
+ from litfetch.resolvers import SemanticScholarResolver
188
+
189
+ async with Session() as s:
190
+ ids = await SemanticScholarResolver()(ArticleIds(doi='10.1016/j.cell.2020.01.001'), s)
191
+ print(ids.pmid, ids.pmcid)
192
+ ```
193
+
194
+ ### Batch: one session, a scope per paper
195
+
196
+ The one-shot functions above each open a throwaway session. For many papers,
197
+ hold one `Session` (pooled connection, shared pacing) and open a `scope` per
198
+ paper — the scope caches within itself, so a duplicate upstream call (e.g.
199
+ Unpaywall for both licence and PDF) is fetched once:
200
+
201
+ ```python
202
+ from litfetch import ArticleIds, Session
203
+
204
+ async with Session() as session:
205
+ for pmid in pmids:
206
+ async with session.scope() as s:
207
+ blob = await s.fetch_body(ArticleIds(pmid=pmid))
208
+ access = await s.resolve_access(ArticleIds(pmid=pmid))
209
+ ```
210
+
211
+ ## Extending
212
+
213
+ - **A new body fetcher:** implement the `Fetcher` protocol — a `name`, a
214
+ `requires: frozenset[str]` of the `ArticleIds` fields it needs, and an async
215
+ `fetch(ids, *, credentials, http)` returning a body `Blob` or `None`.
216
+ Add it to a `fetchers=` list (or your own `default_fetchers`).
217
+ - **A new file source:** implement the `FileSource` protocol — a `name`, and
218
+ async `list_files(ids, ...)` / `fetch_file(file, ...)` — to enumerate and
219
+ materialise an article's file-set (body renditions and supplementary alike).
220
+ - **A new resolver:** write an async `(ArticleIds, Http) -> ArticleIds` that fills gaps
221
+ via `ArticleIds.merge` and never overwrites a known id.
222
+
223
+ ## Development
224
+
225
+ ```sh
226
+ uv sync
227
+ uv run ruff check . && uv run ruff format --check .
228
+ uv run pyright
229
+ uv run pytest
230
+ ```
@@ -0,0 +1,202 @@
1
+ # litfetch
2
+
3
+ Resolve a scholarly article identifier to its **retrievable artifacts** — the
4
+ full-text body and any supplementary material — and fetch their bytes.
5
+
6
+ litfetch is two cooperating seams:
7
+
8
+ - a **fetch ladder** — pluggable `Fetcher` backends (PMC Open Access S3, Europe
9
+ PMC, Elsevier OA) tried in priority order; the first to serve the body wins,
10
+ returning a `Blob` (a `File` plus its bytes);
11
+ - an optional **resolver layer** — pluggable `Resolver`s that enrich what you
12
+ know about a paper (`pmid` → `pmcid`/`doi`, etc.) so the ladder can act.
13
+
14
+ You hand it an `ArticleIds` bundle (any of `pmid` / `pmcid` / `doi`). Resolution
15
+ is **demand-driven**: a resolver only runs when the next fetcher needs an
16
+ identifier you don't yet have, and runs at most once.
17
+
18
+ An article is modelled as a **file-set**: a collection of `File` references (the
19
+ body in its various media types, plus supplementary material, distinguished by
20
+ `FileKind`), each hosted upstream. litfetch fetches the raw artifacts and reports
21
+ their access terms; it does **not** render them. To turn a fetched JATS/Elsevier
22
+ body into markdown, run [litdown](https://github.com/populationgenomics/litdown)
23
+ on the bytes yourself (see [Render to markdown](#render-to-markdown)).
24
+
25
+ The examples below are a tour; [`docs/api.md`](docs/api.md) is the full
26
+ reference for the public surface.
27
+
28
+ ## Install
29
+
30
+ ```sh
31
+ pip install litfetch
32
+ ```
33
+
34
+ bioRxiv / medRxiv preprint full text needs a browser-fingerprint HTTP client,
35
+ enabled by the `biorxiv` extra:
36
+
37
+ ```sh
38
+ pip install 'litfetch[biorxiv]'
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ ### Fetch the body
44
+
45
+ Hand `fetch_body` an `ArticleIds`; the default ladder serves the first available
46
+ body as a `Blob`:
47
+
48
+ ```python
49
+ from litfetch import ArticleIds, fetch_body
50
+
51
+ blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
52
+ if blob:
53
+ print(blob.file.source, blob.file.media_type, len(blob.content))
54
+ ```
55
+
56
+ ### Render to markdown
57
+
58
+ litfetch returns raw bytes, not markdown. Convert a JATS/Elsevier body with
59
+ [litdown](https://github.com/populationgenomics/litdown) — you pick and pin the
60
+ converter:
61
+
62
+ ```python
63
+ import io
64
+ import litdown
65
+ from litfetch import ArticleIds, fetch_body
66
+
67
+ blob = await fetch_body(ArticleIds(pmcid='PMC5334499'))
68
+ if blob:
69
+ markdown = litdown.convert(io.BytesIO(blob.content))
70
+ ```
71
+
72
+ ### Inject your own resolver
73
+
74
+ A resolver is an async `(ArticleIds, Http) -> ArticleIds` — the session running
75
+ it supplies the `Http`. Enrich from whatever you have — a corpus client, a local
76
+ cache, an API — and `merge` it in (this one ignores `Http`, hence `_http`):
77
+
78
+ ```python
79
+ from litfetch import ArticleIds, Http, fetch_body
80
+
81
+ async def my_resolver(ids: ArticleIds, _http: Http) -> ArticleIds:
82
+ if not ids.pmid:
83
+ return ids
84
+ pmcid, doi = await my_corpus.lookup(ids.pmid)
85
+ return ids.merge(ArticleIds(pmcid=pmcid, doi=doi))
86
+
87
+ blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=my_resolver)
88
+ ```
89
+
90
+ ### Use a bundled resolver
91
+
92
+ Bundled resolvers are constructed with their config, then passed in the same
93
+ slot. `chain(...)` composes several (yours first, fallbacks after); it stops
94
+ once every identifier is known:
95
+
96
+ ```python
97
+ from litfetch import ArticleIds, fetch_body
98
+ from litfetch.resolvers import SemanticScholarResolver, NcbiIdConverterResolver, chain
99
+
100
+ resolver = chain(
101
+ my_resolver, # your own
102
+ SemanticScholarResolver(api_key=S2_KEY), # bundled
103
+ NcbiIdConverterResolver(tool='myapp'), # bundled
104
+ )
105
+ blob = await fetch_body(ArticleIds(pmid='29622564'), resolver=resolver)
106
+ ```
107
+
108
+ Polite-pool identification (NCBI/Crossref `email`, Unpaywall's required `email`)
109
+ comes from a session `contact`, not a hardcoded default — set it on the session:
110
+ `async with litfetch.Session(contact='you@example.org') as s: await s.fetch_body(...)`.
111
+
112
+ `default_resolver()` is a batteries-included, keyless chain
113
+ (Europe PMC search + NCBI ID Converter).
114
+
115
+ ### No resolver — you already hold the IDs
116
+
117
+ A non-PubMed paper you only have a DOI for, plus your own Elsevier key:
118
+
119
+ ```python
120
+ blob = await fetch_body(
121
+ ArticleIds(doi='10.1016/j.cell.2020.01.001'),
122
+ credentials={'elsevier_api_key': key},
123
+ )
124
+ ```
125
+
126
+ ### Supplementary material
127
+
128
+ `list_files` enumerates the file-set (references, no bytes); `fetch_file`
129
+ materialises one:
130
+
131
+ ```python
132
+ from litfetch import ArticleIds, FileKind, list_files, fetch_file
133
+
134
+ files = await list_files(ArticleIds(pmcid='PMC5334499'), kind=FileKind.SUPPLEMENTARY)
135
+ for file in files:
136
+ blob = await fetch_file(file)
137
+ ```
138
+
139
+ ### Access terms
140
+
141
+ Read the licence from the fetched bytes, falling back to an access authority
142
+ (Unpaywall) when the bytes carry none:
143
+
144
+ ```python
145
+ from litfetch import ArticleIds, extract_source_metadata, resolve_access
146
+
147
+ meta = extract_source_metadata(blob) # from the JATS/Elsevier bytes
148
+ if meta.licence is None:
149
+ meta = await resolve_access(ArticleIds(doi='10.1016/j.cell.2020.01.001'))
150
+ ```
151
+
152
+ ### Resolvers stand alone
153
+
154
+ Each resolver is usable on its own as a cross-reference tool, independent of
155
+ fetching. A resolver is given the `Http` to use, so run it inside a session:
156
+
157
+ ```python
158
+ from litfetch import ArticleIds, Session
159
+ from litfetch.resolvers import SemanticScholarResolver
160
+
161
+ async with Session() as s:
162
+ ids = await SemanticScholarResolver()(ArticleIds(doi='10.1016/j.cell.2020.01.001'), s)
163
+ print(ids.pmid, ids.pmcid)
164
+ ```
165
+
166
+ ### Batch: one session, a scope per paper
167
+
168
+ The one-shot functions above each open a throwaway session. For many papers,
169
+ hold one `Session` (pooled connection, shared pacing) and open a `scope` per
170
+ paper — the scope caches within itself, so a duplicate upstream call (e.g.
171
+ Unpaywall for both licence and PDF) is fetched once:
172
+
173
+ ```python
174
+ from litfetch import ArticleIds, Session
175
+
176
+ async with Session() as session:
177
+ for pmid in pmids:
178
+ async with session.scope() as s:
179
+ blob = await s.fetch_body(ArticleIds(pmid=pmid))
180
+ access = await s.resolve_access(ArticleIds(pmid=pmid))
181
+ ```
182
+
183
+ ## Extending
184
+
185
+ - **A new body fetcher:** implement the `Fetcher` protocol — a `name`, a
186
+ `requires: frozenset[str]` of the `ArticleIds` fields it needs, and an async
187
+ `fetch(ids, *, credentials, http)` returning a body `Blob` or `None`.
188
+ Add it to a `fetchers=` list (or your own `default_fetchers`).
189
+ - **A new file source:** implement the `FileSource` protocol — a `name`, and
190
+ async `list_files(ids, ...)` / `fetch_file(file, ...)` — to enumerate and
191
+ materialise an article's file-set (body renditions and supplementary alike).
192
+ - **A new resolver:** write an async `(ArticleIds, Http) -> ArticleIds` that fills gaps
193
+ via `ArticleIds.merge` and never overwrites a known id.
194
+
195
+ ## Development
196
+
197
+ ```sh
198
+ uv sync
199
+ uv run ruff check . && uv run ruff format --check .
200
+ uv run pyright
201
+ uv run pytest
202
+ ```
@@ -0,0 +1,92 @@
1
+ """litfetch: identifier -> the retrievable artifacts of a scholarly article.
2
+
3
+ Hand :func:`fetch_body` an :class:`ArticleIds` bundle (any of pmid / pmcid / doi)
4
+ and, optionally, a :data:`~litfetch.resolvers.Resolver` to fill in missing
5
+ identifiers on demand. A :class:`~litfetch.fetchers.Fetcher` ladder is tried in
6
+ priority order; the first to serve the body yields a :class:`Blob` (a
7
+ :class:`File` plus its bytes). Supplementary material is discovered with
8
+ :func:`list_files` and fetched with :func:`fetch_file`.
9
+
10
+ An article is modelled as a *file-set*: a collection of :class:`File` references
11
+ (body renditions and supplementary material, by :class:`FileKind`) sharing one
12
+ identity, each hosted upstream. litfetch fetches the raw artifacts and reports
13
+ their access terms (:class:`SourceMetadata`); rendering them (e.g. XML ->
14
+ markdown via litdown) and storing them are the consumer's concern. The bundled
15
+ identifier resolvers (Europe PMC, NCBI ID Converter, Semantic Scholar) live in
16
+ :mod:`litfetch.resolvers`; file-set listing and fetching live in
17
+ :mod:`litfetch.fetchers`.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from litfetch._http import Http, Rate, RetryPolicy
23
+ from litfetch.artifacts import (
24
+ INSTITUTIONAL,
25
+ Blob,
26
+ File,
27
+ FileKind,
28
+ SourceMetadata,
29
+ )
30
+ from litfetch.fetchers import (
31
+ BiorxivFetcher,
32
+ CrossrefFileSource,
33
+ ElsevierFetcher,
34
+ EuropePmcFetcher,
35
+ Fetcher,
36
+ FileSource,
37
+ PmcOaFetcher,
38
+ SemanticScholarFileSource,
39
+ SpringerFetcher,
40
+ SpringerFileSource,
41
+ UnpaywallFileSource,
42
+ default_fetchers,
43
+ default_file_sources,
44
+ )
45
+ from litfetch.ids import ArticleIds
46
+ from litfetch.relations import Related, RelationType
47
+ from litfetch.sessions import (
48
+ Session,
49
+ fetch_body,
50
+ fetch_file,
51
+ list_files,
52
+ related_ids,
53
+ resolve_access,
54
+ )
55
+ from litfetch.source_metadata import extract_source_metadata
56
+
57
# Package version string, exposed at the package root and re-exported via
# ``__all__`` below.
__version__ = '0.1.0'

# Public surface of the package root: the names a ``from litfetch import *``
# exposes. Every entry is either imported at the top of this module or defined
# here (``__version__``).
__all__ = [
    'INSTITUTIONAL',
    'ArticleIds',
    'BiorxivFetcher',
    'Blob',
    'CrossrefFileSource',
    'ElsevierFetcher',
    'EuropePmcFetcher',
    'Fetcher',
    'File',
    'FileKind',
    'FileSource',
    'Http',
    'PmcOaFetcher',
    'Rate',
    'Related',
    'RelationType',
    'RetryPolicy',
    'SemanticScholarFileSource',
    'Session',
    'SourceMetadata',
    'SpringerFetcher',
    'SpringerFileSource',
    'UnpaywallFileSource',
    '__version__',
    'default_fetchers',
    'default_file_sources',
    'extract_source_metadata',
    'fetch_body',
    'fetch_file',
    'list_files',
    'related_ids',
    'resolve_access',
]
@@ -0,0 +1,78 @@
1
+ """DOI validation and URL-safe path encoding.
2
+
3
+ Several sources interpolate a DOI into an upstream URL path (Unpaywall,
4
+ Crossref, Semantic Scholar, bioRxiv, the future doi.org resolve). Doing so
5
+ raw is wrong twice over: a DOI suffix may contain ``?``, ``#``, spaces, or
6
+ ``/`` -- which truncate or reshape the URL -- and a crafted ``.``/``..``
7
+ segment is a path-traversal vector. :func:`encode_doi_path` is the one safe
8
+ way to place a DOI in a URL path; :func:`normalize_and_validate_doi` is the
9
+ shape gate it builds on.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ import urllib.parse
16
+
17
# A DOI is ``10.<registrant>/<suffix>``: the registrant is one or more digits
# with optional dot-separated sub-elements (e.g. ``10.1000.10``); the suffix is
# any non-empty string. The digit count is left open -- the common 4-9 range is
# Crossref's observed corpus, not a spec rule -- so an unusual registrant is not
# rejected. Digits are spelled ``[0-9]`` rather than ``\d`` because, in a ``str``
# pattern, ``\d`` also matches non-ASCII decimal digits (Arabic-Indic,
# fullwidth, ...), which are never part of a DOI prefix. DOIs are
# case-insensitive, so the prefix match is too; the value is returned unchanged
# (suffixes are case-sensitive for many registrants).
_DOI_RE = re.compile(r'^10\.[0-9]+(?:\.[0-9]+)*/.+$', re.IGNORECASE)

# Decorations a caller-supplied DOI may arrive with; stripped before validation.
# These are compared against the lower-cased input, so each entry must itself
# be lower-case.
_RESOLVER_PREFIXES = ('https://doi.org/', 'http://doi.org/', 'https://dx.doi.org/', 'http://dx.doi.org/')


def normalize_and_validate_doi(doi: str) -> str:
    """Return the bare, validated DOI, stripping common decorations.

    Accepts a DOI carrying surrounding whitespace, a ``doi:`` scheme, or a
    resolver URL prefix (``https://doi.org/``, ``http://dx.doi.org/``) and
    returns the bare ``10.xxxx/suffix`` form. Decorations are recognised
    case-insensitively; the DOI itself is returned with its original casing.

    Args:
        doi: The DOI to normalise, possibly decorated.

    Returns:
        The bare DOI.

    Raises:
        ValueError: If the result is not a syntactically valid DOI (including
            a registrant written with non-ASCII digits).
    """
    candidate = doi.strip()
    # Match prefixes on a lower-cased copy, but slice the original so the
    # suffix keeps its casing.
    lowered = candidate.lower()
    for prefix in _RESOLVER_PREFIXES:
        if lowered.startswith(prefix):
            candidate = candidate[len(prefix) :]
            break
    # A ``doi:`` scheme may stand alone or follow a resolver prefix; whitespace
    # after the colon (``doi: 10.x/y``) is tolerated.
    if candidate.lower().startswith('doi:'):
        candidate = candidate[len('doi:') :].strip()
    if not _DOI_RE.match(candidate):
        raise ValueError(f'not a valid DOI: {doi!r}')
    return candidate
56
+
57
+
58
def encode_doi_path(doi: str) -> str:
    """Return ``doi`` percent-encoded so it can be spliced into a URL path.

    The DOI is first normalised and shape-checked by
    :func:`normalize_and_validate_doi`. It is then split on ``/`` and each
    segment is percent-encoded with ``safe=''``, so ``?``, ``#``, ``%``, and
    spaces inside a segment cannot alter the URL's structure; the ``/``
    separators between segments are kept. A segment that is exactly ``.`` or
    ``..`` is refused, since it would act as path traversal.

    Args:
        doi: The DOI to encode, possibly decorated.

    Returns:
        The encoded DOI, ready to interpolate after a URL's path separator.

    Raises:
        ValueError: If the DOI is invalid or contains a dot-segment.
    """
    parts = normalize_and_validate_doi(doi).split('/')
    # Refuse dot-segments before encoding anything.
    if '.' in parts or '..' in parts:
        raise ValueError(f'DOI contains a path-traversal segment: {doi!r}')
    encoded_parts = [urllib.parse.quote(part, safe='') for part in parts]
    return '/'.join(encoded_parts)