litfetch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litfetch/sessions.py ADDED
@@ -0,0 +1,344 @@
1
+ """The Session facade: the object callers hold to run litfetch.
2
+
3
+ See [ADR 0001](../docs/adr/0001-http-session-seam.md). A :class:`Session` owns
4
+ one ``httpx.AsyncClient`` (built by an injectable ``client_factory``) and the
5
+ per-host pacing state, and it is the concrete :class:`~litfetch._http.Http` the
6
+ source and resolver layers issue requests on. The library's operations --
7
+ :meth:`~Session.fetch_body`, :meth:`~Session.list_files`,
8
+ :meth:`~Session.fetch_file`, :meth:`~Session.resolve_access`,
9
+ :meth:`~Session.related_ids` -- are methods on it, so a caller threads no HTTP
10
+ argument: the object it holds *is* the context.
11
+
12
+ :meth:`Session.scope` returns a child sharing the parent's client and pacing but
13
+ with its own short-lived response cache -- open one per logical unit of work (a
14
+ paper) so a duplicate GET within that unit is served once and the cache cannot
15
+ grow across the run::
16
+
17
+ async with litfetch.Session() as session: # long-lived: pool + pacing
18
+ for pid in paper_ids:
19
+ async with session.scope() as s: # short-lived: cache
20
+ blob = await s.fetch_body(ArticleIds(pmid=pid), resolver=resolver)
21
+
22
+ Module-level functions of the same names are one-shot conveniences: each opens
23
+ an ephemeral session for a single call.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import asyncio
29
+ import dataclasses
30
+ import urllib.parse
31
+ from collections.abc import Callable, Mapping, Sequence
32
+
33
+ import httpx
34
+
35
+ from litfetch import _http, artifacts, ids, relations, resolvers, source_metadata
36
+ from litfetch import fetchers as fetchers_
37
+
38
+
39
def _default_client_factory(timeout: float, contact: str | None) -> Callable[[], httpx.AsyncClient]:
    """Return a zero-argument factory for litfetch's stock ``httpx.AsyncClient``.

    Every client the factory builds uses ``timeout`` and sends a litfetch
    User-Agent. When ``contact`` (an email) is truthy, ``(mailto:<contact>)`` is
    appended to that User-Agent for polite-pool identification; otherwise the
    bare ``_http.USER_AGENT`` is sent.
    """
    if contact:
        user_agent = f'{_http.USER_AGENT} (mailto:{contact})'
    else:
        user_agent = _http.USER_AGENT

    def build_client() -> httpx.AsyncClient:
        # The User-Agent is computed once above; each call builds a new client.
        default_headers = {'User-Agent': user_agent}
        return httpx.AsyncClient(timeout=timeout, headers=default_headers)

    return build_client
51
+
52
+
53
@dataclasses.dataclass
class _HostPacer:
    """Pacing bookkeeping for one host: a lock and the next permitted send time."""

    # Serialises the senders targeting this host; a new lock is made per instance.
    lock: asyncio.Lock = dataclasses.field(default_factory=asyncio.Lock)
    # Earliest monotonic time at which the next request may go out (0.0 = no wait yet).
    next_allowed: float = dataclasses.field(default=0.0)
59
+
60
+
61
def _is_cacheable(response: httpx.Response) -> bool:
    """Tell whether ``response`` is a stable answer that a scope may cache.

    Any status below 500 other than 429 counts as stable -- a 2xx, or a 4xx such
    as a 404 "no record". A 5xx or a 429 is transient: the retry layer owns it,
    and caching it would poison the whole scope with one blip.
    """
    status = response.status_code
    if status == 429:
        return False
    return status < 500
69
+
70
+
71
def _cache_key(
    url: str,
    params: Mapping[str, str | int] | None,
    headers: Mapping[str, str] | None,
    follow_redirects: bool,
) -> tuple[str, tuple[tuple[str, str | int], ...], tuple[tuple[str, str], ...], bool]:
    """Build the hashable identity of one GET.

    Params and headers are flattened to key-sorted item tuples, so mapping order
    does not matter; a missing or empty mapping contributes ``()``. Headers and
    the redirect mode are part of the key, so requests differing in either are
    cached separately.
    """
    param_part: tuple[tuple[str, str | int], ...] = ()
    if params:
        param_part = tuple(sorted(params.items()))

    header_part: tuple[tuple[str, str], ...] = ()
    if headers:
        header_part = tuple(sorted(headers.items()))

    return url, param_part, header_part, follow_redirects
84
+
85
+
86
class Session:
    """Owns the HTTP client and per-host pacing, and exposes litfetch's operations.

    Use as an async context manager: it builds the client on entry (via
    ``client_factory``) and closes it on exit. ``client_factory`` is the
    injection point for a proxy, an institutional EZproxy, or CA-cert
    configuration; the default builds a litfetch-configured client. :attr:`client`
    exposes the raw client for needs :meth:`get` does not cover (POST, streaming).

    A bare session does not cache; :meth:`scope` returns a child that does.

    Args:
        client_factory: Zero-argument callable that builds the
            ``httpx.AsyncClient``. When omitted, a default factory is built from
            ``timeout`` and ``contact``.
        retry: Retry policy passed to every :meth:`get`.
        timeout: Timeout for the default client; unused when ``client_factory``
            is supplied.
        contact: Contact email. Stored on :attr:`contact`; the default factory
            also appends it to the User-Agent.
    """

    def __init__(
        self,
        *,
        client_factory: Callable[[], httpx.AsyncClient] | None = None,
        retry: _http.RetryPolicy = _http.DEFAULT_RETRY,
        timeout: float = _http.DEFAULT_TIMEOUT,
        contact: str | None = None,
    ) -> None:
        self.contact = contact
        self._factory = client_factory or _default_client_factory(timeout, contact)
        self._retry = retry
        # Built in __aenter__, cleared in __aexit__.
        self._client: httpx.AsyncClient | None = None
        # Pacing state keyed by the URL's netloc; scopes share this dict by reference.
        self._pacers: dict[str, _HostPacer] = {}
        # None means "do not cache" (a bare session); a dict means "cache" (a scope).
        self._cache: dict[object, httpx.Response] | None = None
        # Set only on a scope, by _adopt().
        self._parent: Session | None = None

    def scope(self) -> Session:
        """Return a child session with its own response cache, entered per unit of work.

        The child shares this session's client factory, pacing state, and retry
        policy; on exit it drops its cache and leaves the client open. Caching
        applies only inside a scope.
        """
        child = Session(client_factory=self._factory, retry=self._retry)
        child._adopt(self)
        return child

    def _adopt(self, parent: Session) -> None:
        """Bind this scope to ``parent``: share its client, pacing, and contact; keep own cache."""
        self._parent = parent
        self._pacers = parent._pacers  # share the pacing dict by reference, so pacing spans the run
        self.contact = parent.contact
        self._cache = {}

    async def __aenter__(self) -> Session:  # noqa: PYI034 -- Self needs py311; project targets py310
        """Enter the context: build the client (root) or borrow the parent's (scope).

        Raises:
            RuntimeError: When this is a scope and its parent has not been entered.
        """
        if self._parent is None:
            self._client = self._factory()
            return self
        # A scope shares the parent's client; the parent must already be entered.
        # Its `client` property raises when it is not -- re-raise with the message
        # that names the actual mistake.
        try:
            self._client = self._parent.client
        except RuntimeError as e:
            raise RuntimeError('enter the parent session before entering its scope()') from e
        # __aexit__ drops the cache, so start a fresh one here; otherwise a scope
        # object that is entered a second time would silently stop caching.
        self._cache = {}
        return self

    async def __aexit__(self, *exc: object) -> None:
        """Leave the context: a root session closes its client; every session drops its cache."""
        if self._parent is None and self._client is not None:
            await self._client.aclose()
        self._client = None
        self._cache = None

    @property
    def client(self) -> httpx.AsyncClient:
        """The underlying httpx client (escape hatch); valid only inside the context.

        Raises:
            RuntimeError: When accessed outside the ``async with`` block.
        """
        if self._client is None:
            raise RuntimeError('Session.client is only available inside the context manager')
        return self._client

    async def get(
        self,
        url: str,
        *,
        params: Mapping[str, str | int] | None = None,
        headers: Mapping[str, str] | None = None,
        rate: _http.Rate = _http.Rate.DEFAULT,
        follow_redirects: bool = False,
    ) -> httpx.Response:
        """GET ``url``, paced per ``rate`` then retried per the session policy.

        ``follow_redirects`` is off by default (an API move should surface, not be
        chased silently); file downloads pass it through to follow publisher
        redirects. In a :meth:`scope`, a deterministic response (see
        :func:`_is_cacheable`) is cached by ``url`` + params + headers +
        redirect-mode for the scope's life and a repeat is served without a
        round-trip.

        Raises:
            RuntimeError: When called outside the ``async with`` block and the
                request is not served from the cache.
        """
        key = _cache_key(url, params, headers, follow_redirects)
        if self._cache is not None and key in self._cache:
            return self._cache[key]
        await self._pace(url, rate)
        response = await _http.get(
            self.client, url, params=params, headers=headers, retry=self._retry, follow_redirects=follow_redirects
        )
        # Re-check the cache: the scope may have been exited while the request was in flight.
        if self._cache is not None and _is_cacheable(response):
            self._cache[key] = response
        return response

    async def _pace(self, url: str, rate: _http.Rate) -> None:
        """Wait until the per-host minimum interval since the last send has elapsed.

        The lock is held across the wait, so concurrent requests to one host
        queue and space out; different hosts pace independently. A non-positive
        ``rate.min_interval`` disables pacing for the call.
        """
        interval = rate.min_interval
        if interval <= 0:
            return
        host = urllib.parse.urlsplit(url).netloc
        # Create a pacer only the first time a host is seen, rather than building
        # a throwaway one (and its lock) on every call. There is no await between
        # the lookup and the insert, so two tasks cannot both create one.
        pacer = self._pacers.get(host)
        if pacer is None:
            pacer = self._pacers[host] = _HostPacer()
        async with pacer.lock:
            loop = asyncio.get_running_loop()
            wait = pacer.next_allowed - loop.time()
            if wait > 0:
                await asyncio.sleep(wait)
            pacer.next_allowed = loop.time() + interval

    async def fetch_body(
        self,
        article_ids: ids.ArticleIds,
        *,
        resolver: resolvers.Resolver | None = None,
        fetchers: Sequence[fetchers_.Fetcher] | None = None,
        credentials: Mapping[str, object] | None = None,
    ) -> artifacts.Blob | None:
        """Walk the fetcher ladder, resolving identifiers on demand, return the first hit.

        When the next fetcher needs an identifier ``article_ids`` lacks, invokes
        ``resolver`` once (memoised) to enrich the bundle, then continues. Returns
        the first non-``None`` body :class:`~litfetch.artifacts.Blob`, or ``None``
        when nothing serves it. The blob carries raw bytes; rendering it (e.g.
        XML -> markdown) is the caller's concern. ``fetchers`` defaults to
        :func:`~litfetch.fetchers.default_fetchers`.
        """
        chosen = tuple(fetchers) if fetchers is not None else fetchers_.default_fetchers()
        resolved = False
        for fetcher in chosen:
            if not article_ids.has(fetcher.requires):
                if resolver is not None and not resolved:
                    article_ids = article_ids.merge(await resolver(article_ids, self))
                    resolved = True
                # Still missing after the single resolution attempt: skip this fetcher.
                if not article_ids.has(fetcher.requires):
                    continue
            blob = await fetcher.fetch(article_ids, credentials=credentials, http=self)
            if blob is not None:
                return blob
        return None

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        sources: Sequence[fetchers_.FileSource] | None = None,
        kind: artifacts.FileKind | None = None,
        credentials: Mapping[str, object] | None = None,
    ) -> tuple[artifacts.File, ...]:
        """Enumerate an article's file-set across every source (a union, not first-wins).

        Pass ``kind`` to keep only body renditions or only supplementary material.
        ``sources`` defaults to :func:`~litfetch.fetchers.default_file_sources`.
        """
        chosen = tuple(sources) if sources is not None else fetchers_.default_file_sources()
        found: list[artifacts.File] = []
        for source in chosen:
            found.extend(await source.list_files(article_ids, credentials=credentials, http=self))
        if kind is not None:
            found = [file for file in found if file.kind is kind]
        return tuple(found)

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        sources: Sequence[fetchers_.FileSource] | None = None,
        credentials: Mapping[str, object] | None = None,
    ) -> artifacts.Blob | None:
        """Download one file's bytes, routing to the source whose ``name`` owns it.

        Returns ``None`` when no registered source claims the file. ``sources``
        defaults to :func:`~litfetch.fetchers.default_file_sources`; the first
        source whose ``name`` equals ``file.source`` handles the download.
        """
        chosen = tuple(sources) if sources is not None else fetchers_.default_file_sources()
        for source in chosen:
            if source.name == file.source:
                return await source.fetch_file(file, credentials=credentials, http=self)
        return None

    async def resolve_access(
        self,
        article_ids: ids.ArticleIds,
        *,
        email: str | None = None,
    ) -> artifacts.SourceMetadata:
        """Resolve licence / OA status from Unpaywall (see :func:`~litfetch.source_metadata.resolve_access`).

        Unpaywall requires an email; it defaults to the session ``contact`` and
        can be overridden here. Without either, the lookup is skipped.
        """
        return await source_metadata.resolve_access(article_ids, http=self, email=email)

    async def related_ids(self, article_ids: ids.ArticleIds) -> tuple[relations.Related, ...]:
        """Find preprint / published counterparts (see :func:`~litfetch.relations.related_ids`)."""
        return await relations.related_ids(article_ids, http=self)
289
+
290
+
291
async def fetch_body(
    article_ids: ids.ArticleIds,
    *,
    resolver: resolvers.Resolver | None = None,
    fetchers: Sequence[fetchers_.Fetcher] | None = None,
    credentials: Mapping[str, object] | None = None,
) -> artifacts.Blob | None:
    """Fetch a body without holding a session: a one-shot :meth:`Session.fetch_body`.

    A default :class:`Session` is opened for this single call and closed before
    the result is returned.
    """
    async with Session() as ephemeral:
        blob = await ephemeral.fetch_body(
            article_ids,
            resolver=resolver,
            fetchers=fetchers,
            credentials=credentials,
        )
    return blob
301
+
302
+
303
async def list_files(
    article_ids: ids.ArticleIds,
    *,
    sources: Sequence[fetchers_.FileSource] | None = None,
    kind: artifacts.FileKind | None = None,
    credentials: Mapping[str, object] | None = None,
) -> tuple[artifacts.File, ...]:
    """List an article's files without holding a session: a one-shot :meth:`Session.list_files`.

    A default :class:`Session` is opened for this single call and closed before
    the result is returned.
    """
    async with Session() as ephemeral:
        files = await ephemeral.list_files(
            article_ids,
            sources=sources,
            kind=kind,
            credentials=credentials,
        )
    return files
313
+
314
+
315
async def fetch_file(
    file: artifacts.File,
    *,
    sources: Sequence[fetchers_.FileSource] | None = None,
    credentials: Mapping[str, object] | None = None,
) -> artifacts.Blob | None:
    """Download one file without holding a session: a one-shot :meth:`Session.fetch_file`.

    A default :class:`Session` is opened for this single call and closed before
    the result is returned.
    """
    async with Session() as ephemeral:
        blob = await ephemeral.fetch_file(file, sources=sources, credentials=credentials)
    return blob
324
+
325
+
326
async def resolve_access(
    article_ids: ids.ArticleIds,
    *,
    email: str | None = None,
) -> artifacts.SourceMetadata:
    """Resolve access terms without holding a session: a one-shot :meth:`Session.resolve_access`.

    ``email`` is the Unpaywall identity (Unpaywall requires it; without one the
    lookup is skipped). It is handed only to Unpaywall: the ephemeral session is
    built without a ``contact``, so the email never reaches the User-Agent --
    hold a :class:`Session` for that.
    """
    async with Session() as ephemeral:
        metadata = await ephemeral.resolve_access(article_ids, email=email)
    return metadata
339
+
340
+
341
async def related_ids(article_ids: ids.ArticleIds) -> tuple[relations.Related, ...]:
    """Find counterparts without holding a session: a one-shot :meth:`Session.related_ids`.

    A default :class:`Session` is opened for this single call and closed before
    the result is returned.
    """
    async with Session() as ephemeral:
        related = await ephemeral.related_ids(article_ids)
    return related
@@ -0,0 +1,118 @@
1
+ """Access terms (licence / OA status) for a fetched artifact.
2
+
3
+ litfetch owns access terms (see ``CONTEXT.md``): the licence under which the
4
+ bytes it fetched may be used. Two paths, distinguished by
5
+ :attr:`~litfetch.artifacts.SourceMetadata.basis`:
6
+
7
+ * :func:`extract_source_metadata` reads the licence *from the artifact itself* --
8
+ the JATS ``<permissions>/<license>`` or the Elsevier ``<openaccessUserLicense>``
9
+ -- with ``basis='artifact'`` (authoritative for exactly those bytes).
10
+ * :func:`resolve_access` asserts the licence / OA status from **Unpaywall**,
11
+ keyed on the DOI, with ``basis='unpaywall'`` -- for a paper whose bytes carry
12
+ none (a PDF).
13
+
14
+ Both return the licence *raw*; mapping it to an SPDX id is the consumer's responsibility.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+
21
+ import defusedxml.ElementTree
22
+
23
+ from litfetch import _http, artifacts, ids, unpaywall
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def _localname(tag: str) -> str:
    """Strip the ``{namespace}`` prefix from an XML tag or attribute name.

    Everything after the last ``}`` is returned; a name with no ``}`` comes back
    unchanged.
    """
    _namespace, _brace, local = tag.rpartition('}')
    return local
31
+
32
+
33
def extract_source_metadata(blob: artifacts.Blob) -> artifacts.SourceMetadata:
    """Pull access terms out of a body ``blob``, choosing the parser by media type.

    JATS XML and Elsevier XML are handed to their own extractors, which report
    ``basis='artifact'`` when the bytes carry a licence. Any other media type
    (e.g. PDF) yields an empty :class:`~litfetch.artifacts.SourceMetadata`.
    """
    content_type = blob.file.media_type
    if content_type == artifacts.JATS_XML:
        parse = _from_jats
    elif content_type == artifacts.ELSEVIER_XML:
        parse = _from_elsevier
    else:
        # No licence is read from the bytes of any other format.
        return artifacts.SourceMetadata()
    return parse(blob.content)
46
+
47
+
48
def _from_jats(content: bytes) -> artifacts.SourceMetadata:
    """Read the licence from the first informative JATS ``<license>`` element.

    The licence value is, in order of preference: the ``href`` attribute (the
    canonical CC URL, matched ignoring its namespace), the ``license-type``
    attribute, then the element's text. ``access`` is set to ``'open-access'``
    only when ``license-type`` itself contains "open" -- OA status proper comes
    from an authority. Unparseable XML is logged and yields empty metadata.
    """
    try:
        tree = defusedxml.ElementTree.fromstring(content)
    except Exception:
        logger.exception('JATS source-metadata parse failed')
        return artifacts.SourceMetadata()

    licence_nodes = (node for node in tree.iter() if _localname(node.tag) == 'license')
    for node in licence_nodes:
        href = None
        for attr_name, attr_value in node.attrib.items():
            if _localname(attr_name) == 'href':
                href = attr_value
                break
        declared_type = node.attrib.get('license-type')

        licence = href or declared_type
        if not licence:
            # Last resort: the licence paragraph text, or None when it is blank.
            licence = ' '.join(node.itertext()).strip() or None

        access = None
        if declared_type and 'open' in declared_type.lower():
            access = 'open-access'

        if licence or access:
            return artifacts.SourceMetadata(licence=licence, access=access, basis='artifact')
    return artifacts.SourceMetadata()
71
+
72
+
73
def _from_elsevier(content: bytes) -> artifacts.SourceMetadata:
    """Read the licence and OA flag from Elsevier XML.

    The whole tree is scanned: the last non-blank ``<openaccessUserLicense>``
    text becomes the licence, and any ``<openaccess>`` whose text is ``1`` or
    ``true`` marks the article ``'open-access'``. Unparseable XML is logged and
    yields empty metadata, as does a document carrying neither element.
    """
    try:
        tree = defusedxml.ElementTree.fromstring(content)
    except Exception:
        logger.exception('Elsevier source-metadata parse failed')
        return artifacts.SourceMetadata()

    licence: str | None = None
    access: str | None = None
    for node in tree.iter():
        tag = _localname(node.tag)
        value = (node.text or '').strip()
        if not value:
            continue
        if tag == 'openaccessUserLicense':
            licence = value
        elif tag == 'openaccess' and value in ('1', 'true'):
            access = 'open-access'

    if not (licence or access):
        return artifacts.SourceMetadata()
    return artifacts.SourceMetadata(licence=licence, access=access, basis='artifact')
91
+
92
+
93
async def resolve_access(
    article_ids: ids.ArticleIds,
    *,
    http: _http.Http,
    email: str | None = None,
) -> artifacts.SourceMetadata:
    """Assert a paper's licence / OA status from Unpaywall, keyed on its DOI.

    Intended for papers whose bytes carry no licence (a PDF). The record comes
    from :func:`litfetch.unpaywall.fetch_record`, issued on ``http``; ``email``
    identifies the caller per Unpaywall's policy and defaults to the session
    ``contact`` -- without either, the lookup is skipped.

    Returns:
        A :class:`~litfetch.artifacts.SourceMetadata` with ``basis='unpaywall'``,
        where ``licence`` is the best OA location's raw ``license`` and
        ``access`` is the raw ``oa_status``. An empty one is returned when no
        record is available or the record carries neither value.
    """
    record = await unpaywall.fetch_record(article_ids, http=http, email=email)
    if record is None:
        return artifacts.SourceMetadata()

    # `best_oa_location` may be absent or null; treat both as "no location".
    location = record.get('best_oa_location') or {}
    licence = location.get('license') or None
    oa_status = record.get('oa_status') or None

    if licence is not None or oa_status is not None:
        return artifacts.SourceMetadata(licence=licence, access=oa_status, basis='unpaywall')
    return artifacts.SourceMetadata()
litfetch/unpaywall.py ADDED
@@ -0,0 +1,58 @@
1
+ """Unpaywall record fetch, shared by access resolution and the file-set.
2
+
3
+ One DOI-keyed GET returns a record that yields two things litfetch wants: the
4
+ licence / OA status (:mod:`litfetch.source_metadata`) and the best OA location's
5
+ PDF URL (a ``BODY`` file-set rendition, via
6
+ :class:`~litfetch.fetchers.UnpaywallFileSource`). Both callers go through
7
+ :func:`fetch_record`; inside a :meth:`~litfetch.sessions.Session.scope` the
8
+ duplicate GET is served from cache rather than hitting Unpaywall twice.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+
15
+ import httpx
16
+
17
+ from litfetch import _doi, _http, ids
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ _UNPAYWALL_BASE = 'https://api.unpaywall.org/v2'
22
+
23
+
24
async def fetch_record(
    article_ids: ids.ArticleIds,
    *,
    http: _http.Http,
    email: str | None = None,
) -> dict | None:
    """Return the parsed Unpaywall record for ``article_ids.doi``.

    Args:
        article_ids: The identifiers; only the DOI is used.
        http: The :class:`~litfetch._http.Http` to issue the request on.
        email: Identifies the caller per Unpaywall's policy; defaults to
            ``http.contact``. Unpaywall requires it, so the request is skipped
            when neither is set.

    Returns:
        The parsed JSON record, or ``None`` when there is no DOI, no email, the
        lookup fails, the status is not 200, or the body is not a JSON object.
    """
    email = email or http.contact
    if not article_ids.doi or not email:
        return None
    url = f'{_UNPAYWALL_BASE}/{_doi.encode_doi_path(article_ids.doi)}'
    try:
        resp = await http.get(url, params={'email': email})
    except httpx.HTTPError:
        logger.exception('Unpaywall request failed for %s', article_ids.doi)
        return None
    if resp.status_code != 200:
        return None
    try:
        record = resp.json()
    except ValueError:
        logger.warning('Unpaywall returned a non-JSON response for %s', article_ids.doi)
        return None
    # Valid JSON need not be an object. Callers call `.get` on the result, so a
    # list/string/number payload must become None rather than an AttributeError.
    if not isinstance(record, dict):
        logger.warning('Unpaywall returned a non-object JSON payload for %s', article_ids.doi)
        return None
    return record
+ return None