litfetch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litfetch/__init__.py ADDED
@@ -0,0 +1,92 @@
1
+ """litfetch: identifier -> the retrievable artifacts of a scholarly article.
2
+
3
+ Hand :func:`fetch_body` an :class:`ArticleIds` bundle (any of pmid / pmcid / doi)
4
+ and, optionally, a :data:`~litfetch.resolvers.Resolver` to fill in missing
5
+ identifiers on demand. A :class:`~litfetch.fetchers.Fetcher` ladder is tried in
6
+ priority order; the first to serve the body yields a :class:`Blob` (a
7
+ :class:`File` plus its bytes). Supplementary material is discovered with
8
+ :func:`list_files` and fetched with :func:`fetch_file`.
9
+
10
+ An article is modelled as a *file-set*: a collection of :class:`File` references
11
+ (body renditions and supplementary material, by :class:`FileKind`) sharing one
12
+ identity, each hosted upstream. litfetch fetches the raw artifacts and reports
13
+ their access terms (:class:`SourceMetadata`); rendering them (e.g. XML ->
14
+ markdown via litdown) and storing them are the consumer's concern. The bundled
15
+ identifier resolvers (Europe PMC, NCBI ID Converter, Semantic Scholar) live in
16
+ :mod:`litfetch.resolvers`; file-set listing and fetching live in
17
+ :mod:`litfetch.fetchers`.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from litfetch._http import Http, Rate, RetryPolicy
23
+ from litfetch.artifacts import (
24
+ INSTITUTIONAL,
25
+ Blob,
26
+ File,
27
+ FileKind,
28
+ SourceMetadata,
29
+ )
30
+ from litfetch.fetchers import (
31
+ BiorxivFetcher,
32
+ CrossrefFileSource,
33
+ ElsevierFetcher,
34
+ EuropePmcFetcher,
35
+ Fetcher,
36
+ FileSource,
37
+ PmcOaFetcher,
38
+ SemanticScholarFileSource,
39
+ SpringerFetcher,
40
+ SpringerFileSource,
41
+ UnpaywallFileSource,
42
+ default_fetchers,
43
+ default_file_sources,
44
+ )
45
+ from litfetch.ids import ArticleIds
46
+ from litfetch.relations import Related, RelationType
47
+ from litfetch.sessions import (
48
+ Session,
49
+ fetch_body,
50
+ fetch_file,
51
+ list_files,
52
+ related_ids,
53
+ resolve_access,
54
+ )
55
+ from litfetch.source_metadata import extract_source_metadata
56
+
57
+ __version__ = '0.1.0'
58
+
59
+ __all__ = [
60
+ 'INSTITUTIONAL',
61
+ 'ArticleIds',
62
+ 'BiorxivFetcher',
63
+ 'Blob',
64
+ 'CrossrefFileSource',
65
+ 'ElsevierFetcher',
66
+ 'EuropePmcFetcher',
67
+ 'Fetcher',
68
+ 'File',
69
+ 'FileKind',
70
+ 'FileSource',
71
+ 'Http',
72
+ 'PmcOaFetcher',
73
+ 'Rate',
74
+ 'Related',
75
+ 'RelationType',
76
+ 'RetryPolicy',
77
+ 'SemanticScholarFileSource',
78
+ 'Session',
79
+ 'SourceMetadata',
80
+ 'SpringerFetcher',
81
+ 'SpringerFileSource',
82
+ 'UnpaywallFileSource',
83
+ '__version__',
84
+ 'default_fetchers',
85
+ 'default_file_sources',
86
+ 'extract_source_metadata',
87
+ 'fetch_body',
88
+ 'fetch_file',
89
+ 'list_files',
90
+ 'related_ids',
91
+ 'resolve_access',
92
+ ]
litfetch/_doi.py ADDED
@@ -0,0 +1,78 @@
1
+ """DOI validation and URL-safe path encoding.
2
+
3
+ Several sources interpolate a DOI into an upstream URL path (Unpaywall,
4
+ Crossref, Semantic Scholar, bioRxiv, the future doi.org resolve). Doing so
5
+ raw is wrong twice over: a DOI suffix may contain ``?``, ``#``, spaces, or
6
+ ``/`` -- which truncate or reshape the URL -- and a crafted ``.``/``..``
7
+ segment is a path-traversal vector. :func:`encode_doi_path` is the one safe
8
+ way to place a DOI in a URL path; :func:`normalize_and_validate_doi` is the
9
+ shape gate it builds on.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ import urllib.parse
16
+
17
# Shape of a DOI: ``10.<registrant>/<suffix>``. The registrant is a run of
# digits, optionally followed by dot-separated numeric sub-elements (e.g.
# ``10.1000.10``); the suffix is any non-empty string. The registrant length is
# deliberately unbounded so an unusual registrant is not rejected. The matched
# value is returned untouched, because suffix case can be significant.
_DOI_RE = re.compile(r'^10\.\d+(?:\.\d+)*/.+$', re.IGNORECASE)

# Resolver URL prefixes a caller-supplied DOI may carry; removed before validation.
_RESOLVER_PREFIXES = ('https://doi.org/', 'http://doi.org/', 'https://dx.doi.org/', 'http://dx.doi.org/')


def normalize_and_validate_doi(doi: str) -> str:
    """Strip common decorations from ``doi`` and return the bare, validated DOI.

    Surrounding whitespace, one resolver URL prefix (``https://doi.org/``,
    ``http://dx.doi.org/``, ... matched case-insensitively) and a ``doi:``
    scheme are removed, in that order, leaving the ``10.xxxx/suffix`` form.

    Args:
        doi: The DOI to normalise, possibly decorated.

    Returns:
        The bare DOI, with the case of its characters preserved.

    Raises:
        ValueError: If what remains is not a syntactically valid DOI.
    """
    bare = doi.strip()
    folded = bare.lower()
    # Only the first matching resolver prefix is removed.
    resolver = next((prefix for prefix in _RESOLVER_PREFIXES if folded.startswith(prefix)), None)
    if resolver is not None:
        bare = bare[len(resolver) :]
    if bare.lower().startswith('doi:'):
        # Whitespace after the scheme ("doi: 10.1/x") is tolerated.
        bare = bare[len('doi:') :].strip()
    if _DOI_RE.match(bare) is None:
        raise ValueError(f'not a valid DOI: {doi!r}')
    return bare
56
+
57
+
58
def encode_doi_path(doi: str) -> str:
    """Return ``doi`` percent-encoded for interpolation into a URL path.

    The DOI is first normalised and validated by
    :func:`normalize_and_validate_doi`. It is then split on ``/``; a ``.`` or
    ``..`` segment is refused (path traversal), and every segment is
    percent-encoded with no safe characters, so ``?``, ``#`` or a space inside
    a segment cannot reshape the URL. The ``/`` separators are kept.

    Args:
        doi: The DOI to encode, possibly decorated.

    Returns:
        The encoded DOI, ready to place after a URL's path separator.

    Raises:
        ValueError: If the DOI is invalid or contains a dot-segment.
    """
    parts = normalize_and_validate_doi(doi).split('/')
    for part in parts:
        if part == '.' or part == '..':
            raise ValueError(f'DOI contains a path-traversal segment: {doi!r}')
    encoded = [urllib.parse.quote(part, safe='') for part in parts]
    return '/'.join(encoded)
litfetch/_http.py ADDED
@@ -0,0 +1,194 @@
1
+ """Low-level HTTP primitives shared across litfetch.
2
+
3
+ The public vocabulary the source and resolver layers depend on -- the
4
+ :class:`Http` request protocol and the :class:`Rate` politeness levels -- lives
5
+ here, together with the retrying GET primitive (:func:`get`) they are built on.
6
+ Keeping these here (and not in :mod:`litfetch.sessions`) lets ``fetchers`` and
7
+ ``resolvers`` depend on the protocol without importing the concrete
8
+ :class:`~litfetch.sessions.Session`, which in turn imports them.
9
+
10
+ :func:`get` is the single choke point for outbound GETs: it adds retry with
11
+ exponential backoff and honours a 429/503 ``Retry-After``. It operates over a
12
+ raw client and knows nothing of pacing or caching; :class:`Session` layers those
13
+ on top and is the concrete :class:`Http` fetchers and resolvers actually call.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import dataclasses
20
+ import enum
21
+ import random
22
+ from collections.abc import Mapping
23
+ from typing import Protocol
24
+
25
+ import httpx
26
+
27
# Default request timeout (presumably seconds -- confirm where it is consumed).
DEFAULT_TIMEOUT = 30.0

# The base User-Agent carries no contact address. When a caller sets
# Session(contact=...), a `(mailto:...)` suffix is appended and the same address
# feeds the polite-pool params; litfetch ships no default contact of its own.
USER_AGENT = 'litfetch/0.1'

# Statuses that justify another attempt: 429 (rate limited) plus the transient
# 5xx family. Any other 4xx is the caller's fault and retrying will not fix it.
_RETRYABLE_STATUS = frozenset((429, 500, 502, 503, 504))
36
+
37
+
38
class Rate(enum.Enum):
    """A named politeness level, picked by the code issuing the request.

    ``*_KEYED`` members are for callers holding an API key for that host (a
    higher allowance); ``*_UNKEYED`` members are the polite public rate.
    ``DEFAULT`` applies no throttle, for hosts (S3, publisher CDNs) without a
    tight per-client limit. :attr:`min_interval` gives the corresponding
    minimum spacing between requests, in seconds.
    """

    DEFAULT = 'default'
    NCBI_UNKEYED = 'ncbi_unkeyed'
    NCBI_KEYED = 'ncbi_keyed'
    S2_UNKEYED = 's2_unkeyed'
    S2_KEYED = 's2_keyed'

    @property
    def min_interval(self) -> float:
        """Return the minimum number of seconds between requests at this rate."""
        return _MIN_INTERVALS[self]


# Request spacing, in seconds, for each rate. The intervals are kept in a
# mapping and not used as the enum values: two members may share an interval
# (both keyed allowances sit near 10 req/s), and equal enum values would turn
# those members into aliases of one another.
_MIN_INTERVALS = {
    Rate.DEFAULT: 0.0,
    Rate.NCBI_UNKEYED: 0.34,  # ~3 req/s, NCBI's keyless allowance
    Rate.NCBI_KEYED: 0.1,  # ~10 req/s with an NCBI API key
    Rate.S2_UNKEYED: 1.0,  # Semantic Scholar's shared public pool: stay conservative
    Rate.S2_KEYED: 0.1,  # with a Semantic Scholar API key
}
70
+
71
+
72
class Http(Protocol):
    """Structural type for what a source or resolver needs: a paced, retrying GET.

    :class:`~litfetch.sessions.Session` satisfies this protocol. Sources are
    handed an ``Http`` and not the Session's lifecycle, so they depend on a
    single method and a single attribute and are easy to fake in a test.

    ``contact`` is the caller-configured identity (an email address) used for
    polite-pool parameters -- Unpaywall's required ``email``, Crossref's
    ``mailto``, NCBI's ``email``. It is ``None`` when the caller configured
    none; sources read it instead of hardcoding an address.
    """

    contact: str | None

    async def get(
        self,
        url: str,
        *,
        params: Mapping[str, str | int] | None = None,
        headers: Mapping[str, str] | None = None,
        rate: Rate = Rate.DEFAULT,
        follow_redirects: bool = False,
    ) -> httpx.Response:
        """GET ``url``, paced per ``rate`` and retried per the session policy."""
        ...
98
+
99
+
100
@dataclasses.dataclass(frozen=True)
class RetryPolicy:
    """How :func:`get` retries a transient failure.

    A transient failure is an ``httpx.TransportError`` (timeout, connection
    reset) or a retryable status (429, 500, 502, 503, 504). Backoff is
    exponential with full jitter -- ``uniform(0, base_delay * 2**attempt)`` --
    capped at ``max_delay``. A numeric ``Retry-After`` header on a retryable
    response replaces the jitter and is capped the same way. ``max_attempts``
    counts total tries, so ``max_attempts=1`` disables retrying.

    Raises:
        ValueError: If ``max_attempts`` is below 1, or if ``base_delay`` or
            ``max_delay`` is negative or NaN.
    """

    # Total number of tries, including the first one.
    max_attempts: int = 3
    # Seconds; upper bound of the jitter window for the first retry.
    base_delay: float = 0.5
    # Seconds; hard cap on any single wait (jittered or Retry-After).
    max_delay: float = 8.0

    def __post_init__(self) -> None:
        # >= 1 so the get() loop always runs at least once (0 would fall through
        # to its `unreachable` guard).
        if self.max_attempts < 1:
            raise ValueError(f'max_attempts must be >= 1, got {self.max_attempts}')
        # A negative (or NaN) delay would turn every wait into a negative/NaN
        # sleep, silently disabling backoff; reject it up front. `not x >= 0`
        # is deliberately used instead of `x < 0` so NaN is rejected too.
        for name in ('base_delay', 'max_delay'):
            value = getattr(self, name)
            if not value >= 0:
                raise ValueError(f'{name} must be >= 0, got {value}')


# The policy get() applies when the caller does not pass one.
DEFAULT_RETRY = RetryPolicy()
124
+
125
+
126
async def get(
    client: httpx.AsyncClient,
    url: str,
    *,
    params: Mapping[str, str | int] | None = None,
    headers: Mapping[str, str] | None = None,
    retry: RetryPolicy = DEFAULT_RETRY,
    follow_redirects: bool = False,
) -> httpx.Response:
    """Issue a GET for ``url``, retrying transient failures per ``retry``.

    An ``httpx.TransportError`` or a retryable status (see
    :class:`RetryPolicy`) triggers a backoff sleep and another attempt. The
    final response is returned even when its status is still a failing one,
    so the caller keeps its own status handling. When the last attempt fails
    at the transport layer, that error is re-raised.

    Args:
        client: The httpx client to issue the request on.
        url: The absolute URL to GET.
        params: Query parameters, if any.
        headers: Request headers, if any.
        retry: The backoff/attempt policy.
        follow_redirects: Follow 3xx redirects (off by default; file downloads
            enable it to follow publisher PDF redirects).

    Returns:
        The final :class:`httpx.Response`: either one with a non-retryable
        status, or the response of the last permitted attempt.

    Raises:
        httpx.TransportError: If the last attempt fails at the transport layer.
    """
    final_index = retry.max_attempts - 1
    for attempt in range(retry.max_attempts):
        is_final = attempt == final_index
        # Set only when a retryable *response* arrived; a transport error has
        # no headers, so it always falls back to jittered backoff.
        server_hint: float | None = None
        try:
            response = await client.get(url, params=params, headers=headers, follow_redirects=follow_redirects)
        except httpx.TransportError:
            if is_final:
                raise
        else:
            retryable = response.status_code in _RETRYABLE_STATUS
            if is_final or not retryable:
                return response
            server_hint = _retry_after_seconds(response)
        await asyncio.sleep(_backoff(attempt, server_hint, retry))
    raise AssertionError('unreachable: the loop returns or raises on the last attempt')
172
+
173
+
174
def _retry_after_seconds(response: httpx.Response) -> float | None:
    """Parse a ``Retry-After`` header as a number of seconds; ``None`` otherwise.

    Only a finite, non-negative number is accepted. ``float()`` would also
    parse ``nan``, ``inf`` and negative values; those are treated as absent so
    the caller falls back to jittered backoff instead of sleeping for a NaN or
    negative duration.

    The HTTP-date form is accepted by the spec but not used by the APIs
    litfetch talks to; it also falls through to ``None``.

    Args:
        response: The response whose ``Retry-After`` header is read.

    Returns:
        The delay in seconds, or ``None`` when the header is missing or is not
        a finite, non-negative number.
    """
    value = response.headers.get('Retry-After')
    if value is None:
        return None
    try:
        seconds = float(value)
    except ValueError:
        return None
    # The chained comparison is False for NaN, for negatives and for +inf.
    if not 0 <= seconds < float('inf'):
        return None
    return seconds
187
+
188
+
189
def _backoff(attempt: int, retry_after: float | None, policy: RetryPolicy) -> float:
    """Return how many seconds to wait before the next attempt.

    A server-supplied ``retry_after`` is used as-is; without one the wait is
    drawn uniformly from ``[0, base_delay * 2**attempt]``. Either way the
    result is capped at ``policy.max_delay``.
    """
    if retry_after is None:
        ceiling = policy.base_delay * 2**attempt
        wait = random.uniform(0, ceiling)  # noqa: S311 -- jitter, not crypto
    else:
        wait = retry_after
    return min(wait, policy.max_delay)
litfetch/artifacts.py ADDED
@@ -0,0 +1,91 @@
1
+ """The data types that flow through the fetch seam.
2
+
3
+ An article is modelled as a *file-set*: a collection of :class:`File` references
4
+ sharing one identity. A :class:`File` is either the article **body** (in one of
5
+ its media types) or a piece of **supplementary** material, hosted upstream with a
6
+ ``uri`` and the ``credential_key`` a fetch needs. A :class:`Blob` is a File once
7
+ its bytes are in hand.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import dataclasses
13
+ import enum
14
+ from typing import Final
15
+
16
# Media types an artifact is commonly served as: the JATS and Elsevier body
# dialects, and a PDF rendition. These stay plain ``str`` constants and are not
# an enum: a File's media_type is an open domain (Crossref/publisher links
# carry arbitrary content-types), so a closed enum would be wrong. The closed
# sets in this package (FileKind, Rate, RelationType) are enums; these are not.
JATS_XML: Final[str] = 'application/jats+xml'
ELSEVIER_XML: Final[str] = 'application/vnd.elsevier+xml'
PDF: Final[str] = 'application/pdf'

# ``File.credential_key`` sentinel: the fetch needs *institutional entitlement*
# (a subscription reached via an EZproxy-style client) and not a key from the
# ``credentials`` map. The ``litfetch:`` prefix keeps it from colliding with a
# user-supplied credentials key, which could otherwise be literally
# ``institutional``. The consumer routes such a file through its entitled
# client; an openly-fetchable file leaves ``credential_key`` as ``None``.
INSTITUTIONAL: Final[str] = 'litfetch:institutional'
32
+
33
+
34
class FileKind(enum.Enum):
    """The role a :class:`File` plays within an article's file-set.

    ``BODY`` is the article full text itself, in one of its media types.
    ``SUPPLEMENTARY`` is additional material (figures, datasets, tables) that
    is not the body.
    """

    BODY = 'body'
    SUPPLEMENTARY = 'supplementary'
44
+
45
+
46
@dataclasses.dataclass(frozen=True)
class File:
    """One file in an article's file-set, by reference -- its bytes are not held.

    ``source`` names the source able to retrieve the file (it routes
    :func:`~litfetch.fetchers.fetch_file`). The file is hosted upstream, so it
    carries a ``uri`` plus the ``credential_key`` a fetch needs, which is
    ``None`` for an openly accessible file. A non-``None`` ``credential_key``
    is either a key in the caller's ``credentials`` map (e.g. a publisher API
    key) or :data:`INSTITUTIONAL`, marking a fetch that needs institutional
    entitlement (an EZproxy-style client) instead of a map key. ``uri`` is
    fetched on demand, never eagerly.
    """

    # Required: the file's role in the file-set and the source that serves it.
    kind: FileKind
    source: str
    # Optional details; each defaults to None.
    media_type: str | None = None
    uri: str | None = None
    filename: str | None = None
    credential_key: str | None = None
    size_bytes: int | None = None
    description: str | None = None
67
+
68
+
69
@dataclasses.dataclass(frozen=True)
class SourceMetadata:
    """The access terms of a fetched artifact: its licence and how it was known.

    The licence is reported *raw* -- the CC URL, JATS ``license-type``, or
    licence text exactly as found upstream; mapping it to an SPDX id is left to
    the consumer (describe, don't own). ``basis`` records provenance:
    ``'artifact'`` when the licence was read from the fetched bytes
    (authoritative for exactly those bytes), or an authority name (e.g.
    ``'unpaywall'``) when it is asserted for a paper whose bytes ship no
    licence (a PDF). Any field left ``None`` means unknown.
    """

    # All three fields are optional and default to None (unknown).
    licence: str | None = None
    access: str | None = None
    basis: str | None = None
84
+
85
+
86
@dataclasses.dataclass(frozen=True)
class Blob:
    """A :class:`File` whose bytes are in hand: the reference plus its content."""

    # The reference that was fetched.
    file: File
    # The fetched bytes.
    content: bytes
litfetch/crossref.py ADDED
@@ -0,0 +1,51 @@
1
+ """Crossref works fetch, shared by the Elsevier link locator, relations, and TDM.
2
+
3
+ One DOI-keyed GET against the Crossref works API returns a ``message`` that
4
+ several callers read differently: the Elsevier full-text fetcher for its
5
+ ``text/xml`` text-mining link, :mod:`litfetch.relations` for ``relation``
6
+ entries, and :class:`~litfetch.fetchers.CrossrefFileSource` for the
7
+ text-mining ``link[]`` renditions. All go through :func:`fetch_work`; inside a
8
+ :meth:`~litfetch.sessions.Session.scope` the duplicate GET is served from cache.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+
15
+ import httpx
16
+
17
+ from litfetch import _doi, _http
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ _CROSSREF_BASE = 'https://api.crossref.org/works'
22
+
23
+
24
async def fetch_work(doi: str, *, http: _http.Http, mailto: str | None = None) -> dict | None:
    """Return the Crossref ``message`` object for ``doi``, or ``None``.

    Args:
        doi: The DOI to look up.
        http: The :class:`~litfetch._http.Http` to issue the request on.
        mailto: Identifies the caller for Crossref's polite pool; defaults to
            ``http.contact``. Omitted (Crossref still answers, just not in the
            polite pool) when neither is set.

    Returns:
        The parsed ``message`` object, or ``None`` when the request fails, the
        status is not 200, the body is not JSON, or the JSON does not carry an
        object under ``message``.

    Raises:
        ValueError: If ``doi`` is not a valid DOI (raised by
            :func:`litfetch._doi.encode_doi_path`; it is not swallowed here).
    """
    mailto = mailto or http.contact
    params = {'mailto': mailto} if mailto else {}
    # Built outside the try: only the network call can raise httpx.HTTPError.
    url = f'{_CROSSREF_BASE}/{_doi.encode_doi_path(doi)}'
    try:
        resp = await http.get(url, params=params)
    except httpx.HTTPError:
        logger.exception('Crossref lookup failed for %s', doi)
        return None
    if resp.status_code != 200:
        return None
    try:
        payload = resp.json()
    except ValueError:
        logger.warning('Crossref returned a non-JSON response for %s', doi)
        return None
    # Valid JSON is not necessarily an object, and ``message`` is not
    # necessarily one either (Crossref error payloads carry a list there).
    # Guard both so callers only ever see ``dict | None``.
    message = payload.get('message') if isinstance(payload, dict) else None
    if not isinstance(message, dict):
        logger.warning('Crossref returned no message object for %s', doi)
        return None
    return message