litfetch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litfetch/fetchers.py ADDED
@@ -0,0 +1,888 @@
1
+ """The fetch seam: the source backends behind the file-set.
2
+
3
+ A :class:`Fetcher` declares the identifiers it needs (``requires``) and returns
4
+ the article **body** as a :class:`~litfetch.artifacts.Blob` -- a
5
+ :class:`~litfetch.artifacts.File` plus its bytes -- without converting to
6
+ markdown. A :class:`FileSource` lists the article's files (body renditions
7
+ *and* supplementary material, distinguished by
8
+ :class:`~litfetch.artifacts.FileKind`) and fetches any one of them, with
9
+ per-source authentication. PMC's Open Access bucket is openly listable and
10
+ fetchable; publisher assets reuse the same ``credentials`` as their full text.
11
+
12
+ Both take the :class:`~litfetch._http.Http` to issue requests on; the ladder
13
+ walk and file-set union that drive them live as methods on
14
+ :class:`~litfetch.sessions.Session` (which supplies ``http=self``).
15
+
16
+ Registered fetchers, in priority order (:func:`default_fetchers`):
17
+
18
+ * :class:`PmcOaFetcher` -- the PMC Open Access S3 bucket (JATS body and open
19
+ file-set). Needs a ``pmcid``.
20
+ * :class:`EuropePmcFetcher` -- Europe PMC's REST endpoint. Needs a ``pmcid``.
21
+ * :class:`ElsevierFetcher` -- Elsevier's article API, keyed on the caller's own
22
+ ``credentials['elsevier_api_key']``. Needs a ``doi``.
23
+ * :class:`SpringerFetcher` -- Springer Nature's OpenAccess JATS API, keyed on
24
+ ``credentials['springer_api_key']``. Needs a ``doi``.
25
+
26
+ :class:`BiorxivFetcher` (bioRxiv/medRxiv preprints; needs ``litfetch[biorxiv]``)
27
+ is *not* registered in :func:`default_fetchers` -- it uses browser-fingerprint
28
+ impersonation and an extra dependency, so a caller adds it explicitly.
29
+
30
+ PMC's S3 layout is article-versioned: each article lives at
31
+ ``s3://pmc-oa-opendata/PMC{id}.{version}/``. We probe ``PMC{id}.1.xml`` first
32
+ (the vast majority have a single version) and fall through to ``.2`` / ``.3``
33
+ for the rare correction case.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import logging
39
+ import mimetypes
40
+ import re
41
+ import urllib.parse
42
+ from collections.abc import Mapping
43
+ from typing import Protocol
44
+
45
+ import defusedxml.ElementTree
46
+ import httpx
47
+
48
+ from litfetch import _doi, _http, artifacts, crossref, ids, semantic_scholar, unpaywall
49
+
50
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the PMC Open Access bucket; article keys and listings hang off it.
_PMC_S3_BASE = 'https://pmc-oa-opendata.s3.amazonaws.com'
# Base URL of the Europe PMC REST service.
_EUROPE_PMC_BASE = 'https://www.ebi.ac.uk/europepmc/webservices/rest'
# Host a Crossref ``text/xml`` link must point at to be treated as Elsevier-hosted.
_ELSEVIER_HOST = 'api.elsevier.com'
# Names of the entries looked up in the caller-supplied ``credentials`` mapping.
_ELSEVIER_CREDENTIAL_KEY = 'elsevier_api_key'
_S2_CREDENTIAL_KEY = 'semantic_scholar_api_key'
# Springer Nature OpenAccess JATS endpoint and the credential entry that keys it.
_SPRINGER_BASE = 'https://api.springernature.com/openaccess/jats'
_SPRINGER_CREDENTIAL_KEY = 'springer_api_key'
# Springer Nature Meta endpoint and its own, separate credential entry.
_SPRINGER_META_BASE = 'https://api.springernature.com/meta/v2/json'
_SPRINGER_META_CREDENTIAL_KEY = 'springer_meta_api_key'
# bioRxiv/medRxiv details API base and the server names tried, in order.
_BIORXIV_DETAILS_BASE = 'https://api.biorxiv.org/details'
_BIORXIV_SERVERS = ('biorxiv', 'medrxiv')
# Cold Spring Harbor preprint DOI prefixes (bioRxiv/medRxiv): older 10.1101, newer 10.64898.
_BIORXIV_DOI_PREFIXES = ('10.1101/', '10.64898/')
# Default browser profile handed to curl_cffi's ``impersonate`` argument.
_BIORXIV_IMPERSONATE = 'chrome'

# The default XML namespace on an S3 ListObjectsV2 response.
_S3_NS = '{http://s3.amazonaws.com/doc/2006-03-01/}'

# Versions to probe under the article-versioned layout. PMC documents that
# "the majority of articles in PMC have a single version and it is version 1";
# the cap is a generous bound on the rare correction case.
_PMC_OA_MAX_VERSION = 3
74
+
75
+
76
class Fetcher(Protocol):
    """A pluggable body-retrieval backend.

    ``requires`` names the :class:`~litfetch.ids.ArticleIds` fields the fetcher
    needs to even attempt a fetch; the ladder skips it when they are absent.
    ``credentials`` carries the caller's per-user publisher keys. ``http`` is
    the :class:`~litfetch._http.Http` to issue requests on -- the session running
    the ladder supplies it.
    """

    # Source label; the concrete fetchers in this module stamp it on the
    # ``File`` they return (``source=self.name``).
    name: str
    # Names of the ``ArticleIds`` fields that must be present before a fetch
    # is worth attempting (e.g. ``{'pmcid'}`` or ``{'doi'}``).
    requires: frozenset[str]

    async def fetch(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Fetch the body Blob; return ``None`` to defer to the next fetcher."""
        ...
98
+
99
+
100
class FileSource(Protocol):
    """Enumerates and materialises the files in an article's file-set.

    ``list_files`` returns both body renditions and supplementary material as
    :class:`~litfetch.artifacts.File` references, each tagged with its
    :class:`~litfetch.artifacts.FileKind`; ``fetch_file`` downloads one of them.
    Authentication is per-source and reuses the same ``credentials`` map as body
    fetching; :func:`fetch_file` routes a file back to its source by
    :attr:`~litfetch.artifacts.File.source`.
    """

    # Source label; the implementations in this module write it into
    # ``File.source`` for every file they list.
    name: str

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> tuple[artifacts.File, ...]:
        """Enumerate the article's file references (no bytes)."""
        ...

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Download one file's bytes as a Blob; ``None`` when unavailable."""
        ...
132
+
133
+
134
+ def _pmc_numeric(pmc_id: str) -> str:
135
+ """Return the PMC ID with any leading ``PMC`` stripped."""
136
+ s = pmc_id.strip()
137
+ if s.upper().startswith('PMC'):
138
+ return s[3:]
139
+ return s
140
+
141
+
142
def _pmc_versioned_xml_url(numeric: str, version: int) -> str:
    """Build the JATS XML URL for version ``version`` of article ``numeric``.

    The folder and the file share the stem ``PMC{numeric}.{version}``, so the
    result is ``<bucket>/<stem>/<stem>.xml``.
    """
    folder = 'PMC{}.{}'.format(numeric, version)
    return '/'.join((_PMC_S3_BASE, folder, folder + '.xml'))
146
+
147
+
148
+ def _is_article_rendition(key: str) -> bool:
149
+ """Report whether an S3 key is an alternate rendition of the article body.
150
+
151
+ PMC stores the body under ``PMC{id}.{v}/`` as several stem-named renditions
152
+ -- ``PMC{id}.{v}.xml`` (JATS), ``.pdf``, ``.txt``, ``.json`` -- each sharing
153
+ the folder's stem. Those are body Files; anything else under the prefix
154
+ (figures, datasets, media) is supplementary.
155
+ """
156
+ parts = key.split('/')
157
+ if len(parts) != 2:
158
+ return False
159
+ folder, filename = parts
160
+ return filename.rsplit('.', 1)[0] == folder
161
+
162
+
163
async def fetch_jats_xml(
    pmc_id: str,
    *,
    http: _http.Http,
) -> tuple[bytes, str] | None:
    """Fetch the JATS XML for ``pmc_id`` from PMC's public S3 bucket.

    Walks the article-versioned layout from ``.1`` up to
    ``_PMC_OA_MAX_VERSION``. The first 200 wins and is returned as
    ``(xml_bytes, source_url)``. A transport error is logged and the next
    version tried; any other status also moves on, with a warning unless it is
    the expected 404. Returns ``None`` when no version answered 200.
    """
    numeric = _pmc_numeric(pmc_id)
    candidate_urls = (
        _pmc_versioned_xml_url(numeric, version) for version in range(1, _PMC_OA_MAX_VERSION + 1)
    )
    for url in candidate_urls:
        try:
            resp = await http.get(url)
        except httpx.HTTPError:
            logger.exception('PMC OA fetch failed for %s', url)
            continue
        status = resp.status_code
        if status == 200:
            return resp.content, url
        if status != 404:
            logger.warning('Unexpected status %d from %s', status, url)
    return None
187
+
188
+
189
async def crossref_elsevier_xml_link(http: _http.Http, doi: str) -> str | None:
    """Return the Elsevier text/xml TDM link for ``doi`` via Crossref.

    Crossref records publisher text-mining links in ``message.link[]``;
    Elsevier-hosted articles carry a ``text/xml`` entry pointing at
    ``api.elsevier.com/content/article/PII:...``. This both identifies the
    article as Elsevier-hosted and hands us the exact fetch URL.

    The URL returned here is later requested with the caller's Elsevier API
    key, so the host is matched strictly: the parsed hostname (lower-cased,
    without port or userinfo) must be the Elsevier API host itself or a
    subdomain of it. A bare suffix test on the netloc would also accept
    look-alike labels such as ``xapi.elsevier.com`` and would reject a netloc
    that carries an explicit port.

    Args:
        http: The client the Crossref lookup is issued on.
        doi: The DOI to look up.

    Returns:
        The first matching link URL, or ``None`` when Crossref has no record,
        or no ``text/xml`` link points at the Elsevier API host (non-Elsevier
        DOIs).
    """
    message = await crossref.fetch_work(doi, http=http)
    if message is None:
        return None
    for link in message.get('link', []) or []:
        url = link.get('URL')
        # A missing or null URL cannot be fetched; skip it instead of parsing it.
        if link.get('content-type') != 'text/xml' or not isinstance(url, str):
            continue
        try:
            host = urllib.parse.urlparse(url).hostname or ''
        except ValueError:
            # urlparse rejects some malformed netlocs (e.g. an unbalanced IPv6 bracket).
            continue
        if host == _ELSEVIER_HOST or host.endswith('.' + _ELSEVIER_HOST):
            return url
    return None
206
+
207
+
208
+ def _elsevier_has_body(xml_bytes: bytes) -> bool:
209
+ """Report whether an Elsevier article XML response carries full text.
210
+
211
+ Full text is wrapped in ``<ce:sections>`` containing ``<ce:para>``
212
+ elements; an unentitled response (e.g. fetched from a non-institutional IP)
213
+ is coredata + a ``<dc:description>`` abstract only. Body presence -- not
214
+ the ``openaccess`` flag -- is the gate: the OA-only guarantee is enforced
215
+ at the deploy layer (the caller's egress IP).
216
+ """
217
+ return b'<ce:sections' in xml_bytes or xml_bytes.count(b'<ce:para') >= 3
218
+
219
+
220
+ def _extract_jats_article(content: bytes) -> bytes | None:
221
+ """Slice the JATS ``<article>`` out of a Springer OpenAccess response.
222
+
223
+ The OpenAccess ``/jats`` payload wraps the article in a ``<response>``
224
+ envelope behind a DOCTYPE that declares parameter entities -- which
225
+ ``defusedxml`` (and any hardened downstream parser) refuses. Slicing the
226
+ ``<article>`` element out by bytes yields self-contained JATS (its
227
+ namespaces live on the element) and drops both the envelope and the
228
+ entity-declaring DOCTYPE. Body presence is the gate, not an OA flag,
229
+ matching the Elsevier path; returns ``None`` for a non-OA/absent article.
230
+ """
231
+ # Anchor on `<article` followed by whitespace or `>` so a wrapper element
232
+ # like `<article-set>` (or `<article-meta>`) can't be mistaken for the root.
233
+ opening = re.search(rb'<article[\s>]', content)
234
+ end = content.rfind(b'</article>')
235
+ if opening is None or end < opening.start():
236
+ return None
237
+ article = content[opening.start() : end + len(b'</article>')]
238
+ if b'<body' not in article:
239
+ return None
240
+ return b"<?xml version='1.0' encoding='UTF-8'?>\n" + article
241
+
242
+
243
async def _download(http: _http.Http, file: artifacts.File, *, what: str) -> artifacts.Blob | None:
    """GET ``file.uri`` and wrap the bytes as a Blob; ``None`` on error or non-200.

    Used by every file source in this module whose download is a plain GET of
    the file's ``uri``. ``what`` labels the log line written on a transport
    error. A file without a ``uri`` is never requested.
    """
    target = file.uri
    if target is None:
        return None
    try:
        # Publisher PDF links commonly redirect (openURL -> content/pdf, ...); follow them.
        resp = await http.get(target, follow_redirects=True)
    except httpx.HTTPError:
        logger.exception('%s fetch failed for %s', what, target)
        return None
    if resp.status_code == 200:
        return artifacts.Blob(file=file, content=resp.content)
    return None
260
+
261
+
262
class PmcOaFetcher:
    """The PMC Open Access S3 bucket: JATS body and open file-set.

    Implements both :class:`Fetcher` and :class:`FileSource`. The bucket is
    openly accessible, so neither path consults ``credentials``.
    """

    name = 'pmc_oa_s3'
    requires = frozenset({'pmcid'})

    async def fetch(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Fetch the article-versioned JATS body for ``article_ids.pmcid``.

        Returns ``None`` when there is no ``pmcid`` or no version of the
        article answered 200 in the bucket.
        """
        del credentials  # open bucket, no key
        if article_ids.pmcid is None:
            return None
        fetched = await fetch_jats_xml(article_ids.pmcid, http=http)
        if fetched is None:
            return None
        xml_bytes, source_url = fetched
        return artifacts.Blob(
            file=artifacts.File(
                kind=artifacts.FileKind.BODY,
                source=self.name,
                media_type=artifacts.JATS_XML,
                uri=source_url,
            ),
            content=xml_bytes,
        )

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> tuple[artifacts.File, ...]:
        """List every file under the article's S3 prefix, tagged by kind.

        Stem renditions (``PMC{id}.{v}.xml`` / ``.pdf`` / ...) are ``BODY``;
        everything else under the prefix is ``SUPPLEMENTARY``. One listing
        serves both axes. The prefix is ``PMC{id}.``, so keys of every version
        folder are returned. Returns an empty tuple without a ``pmcid``.
        """
        del credentials  # open bucket
        if article_ids.pmcid is None:
            return ()
        numeric = _pmc_numeric(article_ids.pmcid)
        keys = await self._list_keys(http, f'PMC{numeric}.')
        files: list[artifacts.File] = []
        for key, size in keys:
            rendition = _is_article_rendition(key)
            media_type = mimetypes.guess_type(key)[0]
            files.append(
                artifacts.File(
                    kind=artifacts.FileKind.BODY if rendition else artifacts.FileKind.SUPPLEMENTARY,
                    source=self.name,
                    # An unrecognised body rendition still gets a generic type;
                    # an unrecognised supplementary file is left untyped.
                    media_type=media_type or ('application/octet-stream' if rendition else None),
                    uri=f'{_PMC_S3_BASE}/{urllib.parse.quote(key)}',
                    filename=key.rsplit('/', 1)[-1],
                    credential_key=None,
                    size_bytes=size,
                )
            )
        return tuple(files)

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Download an open PMC file (body rendition or supplementary) by ``uri``."""
        del credentials  # open bucket
        return await _download(http, file, what='PMC file')

    @staticmethod
    def _parse_size(text: str | None) -> int | None:
        """Parse a ``<Size>`` value; ``None`` when absent or not an integer."""
        if not text:
            return None
        try:
            return int(text)
        except ValueError:
            # A size is advisory; an unreadable one must not lose the key itself.
            return None

    async def _list_keys(self, http: _http.Http, prefix: str) -> list[tuple[str, int | None]]:
        """List ``(key, size)`` under ``prefix`` via S3 ListObjectsV2, paging fully.

        Best-effort throughout: a transport error, an unexpected status, or a
        page that cannot be parsed ends the listing and returns the keys
        gathered so far, so a broken listing never raises into ``list_files``.
        """
        keys: list[tuple[str, int | None]] = []
        token: str | None = None
        while True:
            params = {'list-type': '2', 'prefix': prefix}
            if token:
                params['continuation-token'] = token
            try:
                resp = await http.get(f'{_PMC_S3_BASE}/', params=params)
            except httpx.HTTPError:
                logger.exception('PMC OA list failed for prefix %s', prefix)
                return keys
            if resp.status_code != 200:
                logger.warning('Unexpected status %d listing PMC OA prefix %s', resp.status_code, prefix)
                return keys
            try:
                root = defusedxml.ElementTree.fromstring(resp.content)
            except (defusedxml.ElementTree.ParseError, ValueError):
                # ParseError: malformed XML. ValueError: defusedxml refusing a
                # forbidden construct (its exceptions derive from ValueError).
                logger.exception('PMC OA list returned unparseable XML for prefix %s', prefix)
                return keys
            for contents in root.findall(f'{_S3_NS}Contents'):
                key_el = contents.find(f'{_S3_NS}Key')
                if key_el is None or not key_el.text:
                    continue
                size_el = contents.find(f'{_S3_NS}Size')
                size = self._parse_size(size_el.text if size_el is not None else None)
                keys.append((key_el.text, size))
            token = root.findtext(f'{_S3_NS}NextContinuationToken')
            # Stop unless S3 says there is more AND handed us a token to continue with.
            if root.findtext(f'{_S3_NS}IsTruncated') != 'true' or not token:
                return keys
370
+
371
+
372
class EuropePmcFetcher:
    """The Europe PMC REST source.

    One GET against ``/{pmc_id}/fullTextXML``. Europe PMC mirrors PMC and
    additionally serves UK funder-deposited Author Manuscripts plus articles
    with direct EBI publisher arrangements. pmid -> pmcid resolution lives in
    :class:`~litfetch.resolvers.EuropePmcResolver`, not here.
    """

    name = 'europe_pmc'
    requires = frozenset({'pmcid'})

    async def fetch(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Fetch the Europe PMC full-text body for ``article_ids.pmcid``.

        Returns ``None`` without a ``pmcid``, on a transport error, on any
        non-200 status (a warning is logged unless it is 404), or on an empty
        200 response.
        """
        del credentials  # unused by this source
        pmcid = article_ids.pmcid
        if pmcid is None:
            return None
        url = f'{_EUROPE_PMC_BASE}/PMC{_pmc_numeric(pmcid)}/fullTextXML'
        try:
            resp = await http.get(url)
        except httpx.HTTPError:
            logger.exception('Europe PMC fetch failed for %s', url)
            return None
        status = resp.status_code
        if status == 200 and resp.content:
            body_file = artifacts.File(
                kind=artifacts.FileKind.BODY,
                source=self.name,
                media_type=artifacts.JATS_XML,
                uri=url,
            )
            return artifacts.Blob(file=body_file, content=resp.content)
        # 404 is the ordinary "not held here" answer; an empty 200 is silent too.
        if status not in (200, 404):
            logger.warning('Unexpected status %d from Europe PMC for %s', status, url)
        return None
415
+
416
+
417
class ElsevierFetcher:
    """Elsevier full-text source via the article TDM API.

    Looks up the Elsevier ``text/xml`` link through Crossref (which also
    confirms the article is Elsevier-hosted), requests it with the caller's
    own API key (``credentials['elsevier_api_key']`` -- a self-serve
    dev.elsevier.com key, per-user, no service-level shared key), and returns
    the ce:/ja: XML body for later conversion. Returns ``None`` for
    non-Elsevier DOIs, when the caller supplied no Elsevier key, or when the
    response carries no body.
    """

    name = 'elsevier_oa'
    requires = frozenset({'doi'})

    async def fetch(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Fetch the Elsevier article body for ``article_ids.doi``."""
        supplied = (credentials or {}).get(_ELSEVIER_CREDENTIAL_KEY)
        doi = article_ids.doi
        # Only a non-empty string counts as a key; anything else means "no key".
        if not (isinstance(supplied, str) and supplied) or doi is None:
            return None
        link = await crossref_elsevier_xml_link(http, doi)
        if link is None:
            return None
        request_headers = {'X-ELS-APIKey': supplied, 'Accept': 'text/xml'}
        try:
            resp = await http.get(link, headers=request_headers)
        except httpx.HTTPError:
            logger.exception('Elsevier fetch failed for %s', link)
            return None
        has_full_text = resp.status_code == 200 and bool(resp.content) and _elsevier_has_body(resp.content)
        if not has_full_text:
            return None
        body_file = artifacts.File(
            kind=artifacts.FileKind.BODY,
            source=self.name,
            media_type=artifacts.ELSEVIER_XML,
            uri=link,
        )
        return artifacts.Blob(file=body_file, content=resp.content)
463
+
464
+
465
class SpringerFetcher:
    """Springer Nature Open Access full text (JATS) via the OpenAccess API.

    Keyed on the caller's own dev.springernature.com key
    (``credentials['springer_api_key']`` -- per-user, no shared service key).
    Queries the OpenAccess JATS endpoint by DOI and returns the sliced-out
    ``<article>`` bytes when they carry a ``<body>``; ``None`` for a
    non-OA/absent article, no key, or no DOI. Body presence is the gate, not an
    OA flag, matching the Elsevier path.
    """

    name = 'springer_oa'
    requires = frozenset({'doi'})

    async def fetch(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Fetch the Springer OA article body for ``article_ids.doi``."""
        supplied = (credentials or {}).get(_SPRINGER_CREDENTIAL_KEY)
        doi = article_ids.doi
        # Only a non-empty string counts as a key; anything else means "no key".
        if not (isinstance(supplied, str) and supplied) or doi is None:
            return None
        query = f'doi:{doi}'
        request_params = {'q': query, 'api_key': supplied}
        try:
            resp = await http.get(_SPRINGER_BASE, params=request_params)
        except httpx.HTTPError:
            logger.exception('Springer fetch failed for %s', doi)
            return None
        if resp.status_code != 200 or not resp.content:
            return None
        article = _extract_jats_article(resp.content)
        if article is None:
            return None
        body_file = artifacts.File(
            kind=artifacts.FileKind.BODY,
            source=self.name,
            media_type=artifacts.JATS_XML,
            # The request URL carries the secret api_key; record the key-free query instead.
            uri=f'{_SPRINGER_BASE}?q={query}',
        )
        return artifacts.Blob(file=body_file, content=article)
512
+
513
+
514
async def _fetch_impersonated(url: str, *, impersonate: str) -> bytes | None:
    """GET ``url`` with a browser TLS fingerprint via curl_cffi.

    bioRxiv's JATS host sits behind Cloudflare's fingerprint gate, which a plain
    httpx client trips; curl_cffi impersonates a real browser's TLS/HTTP-2
    fingerprint to pass it.

    Raises:
        RuntimeError: when the optional ``curl_cffi`` extra is not installed.

    Returns:
        The response bytes, or ``None`` on any error during the request, a
        non-200 status, or an empty body.
    """
    try:
        from curl_cffi import requests  # noqa: PLC0415 -- optional dep, imported lazily
    except ImportError as exc:
        raise RuntimeError('curl_cffi is not installed; install litfetch[biorxiv]') from exc
    try:
        async with requests.AsyncSession() as session:
            resp = await session.get(url, impersonate=impersonate, timeout=_http.DEFAULT_TIMEOUT)  # type: ignore[arg-type]
    except Exception:
        # Deliberately broad: whatever the request raises, log it and give up.
        logger.exception('bioRxiv impersonated fetch failed for %s', url)
        return None
    payload = resp.content if resp.status_code == 200 else None
    # An empty body is treated the same as a failed request.
    return payload or None
535
+
536
+
537
class BiorxivFetcher:
    """bioRxiv / medRxiv preprint full text (opt-in; needs ``litfetch[biorxiv]``).

    Preprints carry a Cold Spring Harbor DOI (``10.1101/`` or ``10.64898/``). The
    details API yields the latest version's ``jatsxml`` link; that JATS host is
    Cloudflare-gated, so the body is fetched with a browser TLS fingerprint
    (curl_cffi). The XML is HighWire-produced structured JATS, so a litdown
    rendering is ``xml-faithful`` -- though the conversion is bioRxiv's
    best-effort, a provenance the ``biorxiv`` source records. Kept *off*
    :func:`default_fetchers` (impersonation + an extra dependency): add it
    explicitly. Returns ``None`` for a non-preprint DOI or when no JATS is on
    offer; raises if curl_cffi is absent when a fetch is actually attempted.
    """

    name = 'biorxiv'
    requires = frozenset({'doi'})

    def __init__(self, *, impersonate: str = _BIORXIV_IMPERSONATE) -> None:
        # Browser profile passed through to the impersonated fetch.
        self._impersonate = impersonate

    async def fetch(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Fetch the latest-version JATS for a CSH-prefix ``article_ids.doi``."""
        del credentials  # open preprint server, no key
        doi = article_ids.doi
        is_preprint = doi is not None and doi.startswith(_BIORXIV_DOI_PREFIXES)
        if not is_preprint:
            return None
        jats_url = await self._jats_url(http, doi)
        if jats_url is None:
            return None
        content = await _fetch_impersonated(jats_url, impersonate=self._impersonate)
        if content is None:
            return None
        body_file = artifacts.File(
            kind=artifacts.FileKind.BODY,
            source=self.name,
            media_type=artifacts.JATS_XML,
            uri=jats_url,
        )
        return artifacts.Blob(file=body_file, content=content)

    async def _jats_url(self, http: _http.Http, doi: str) -> str | None:
        """Return the latest version's ``jatsxml`` link, trying biorxiv then medrxiv.

        The details API is not Cloudflare-gated, so this uses the shared session;
        only the JATS body itself needs the impersonated fetch. A server that
        errors, answers non-200, returns non-JSON, or has no ``jatsxml`` on its
        last collection entry is skipped in favour of the next one.
        """
        for server in _BIORXIV_SERVERS:
            url = f'{_BIORXIV_DETAILS_BASE}/{server}/{_doi.encode_doi_path(doi)}'
            try:
                resp = await http.get(url)
            except httpx.HTTPError:
                logger.exception('bioRxiv details lookup failed for %s', url)
                continue
            if resp.status_code != 200:
                continue
            try:
                collection = resp.json().get('collection') or []
            except ValueError:
                logger.warning('bioRxiv details returned a non-JSON response for %s', url)
                continue
            if not collection:
                continue
            # The last collection entry is taken as the latest version.
            jats_link = collection[-1].get('jatsxml')
            if jats_link:
                return jats_link
        return None
608
+
609
+
610
class UnpaywallFileSource:
    """Unpaywall's best-OA-location PDF as a ``BODY`` file-set rendition.

    Reuses the DOI-keyed Unpaywall record (shared with
    :func:`~litfetch.source_metadata.resolve_access` -- inside a session scope
    the record is fetched once). Lists a single ``application/pdf`` ``BODY``
    :class:`~litfetch.artifacts.File` when the record has a
    ``best_oa_location.url_for_pdf``, empty otherwise. Needs no credential;
    ``email`` identifies the caller per Unpaywall's policy and defaults to the
    session ``contact`` (Unpaywall requires it -- no email, no listing).

    A discovered PDF is an *additional* file-set member, never the goal of the
    XML body ladder (see ``docs/source-expansion-plan.md``).
    """

    name = 'unpaywall'

    def __init__(self, *, email: str | None = None) -> None:
        # Forwarded unchanged to ``unpaywall.fetch_record`` on every listing.
        self._email = email

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> tuple[artifacts.File, ...]:
        """List the best-OA PDF as a BODY File, or nothing when there is none."""
        del credentials  # no key; email identifies the caller
        record = await unpaywall.fetch_record(article_ids, http=http, email=self._email)
        if record is None:
            return ()
        # ``best_oa_location`` may be missing or null; both read as "no location".
        pdf_url = (record.get('best_oa_location') or {}).get('url_for_pdf')
        if not pdf_url:
            return ()
        pdf_file = artifacts.File(
            kind=artifacts.FileKind.BODY,
            source=self.name,
            media_type=artifacts.PDF,
            uri=pdf_url,
        )
        return (pdf_file,)

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Download the OA PDF by ``uri``."""
        del credentials  # openly hosted OA PDF
        return await _download(http, file, what='Unpaywall PDF')
665
+
666
+
667
class SemanticScholarFileSource:
    """Semantic Scholar's open-access PDF as a ``BODY`` file-set rendition.

    A paper lookup for the ``openAccessPdf`` field yields a PDF URL when S2 knows
    an OA copy. Lists a single ``application/pdf`` ``BODY``
    :class:`~litfetch.artifacts.File`, empty otherwise. An optional S2 API key
    in ``credentials['semantic_scholar_api_key']`` raises the request pace; the
    keyless endpoint works without it.

    A discovered PDF is an *additional* file-set member, never the goal of the
    XML body ladder (see ``docs/source-expansion-plan.md``).
    """

    name = 'semantic_scholar'

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> tuple[artifacts.File, ...]:
        """List S2's open-access PDF as a BODY File, or nothing when there is none."""
        supplied = (credentials or {}).get(_S2_CREDENTIAL_KEY)
        # The key is optional: anything but a non-empty string is passed on as None.
        api_key = supplied if isinstance(supplied, str) and supplied else None
        record = await semantic_scholar.fetch_paper(
            article_ids,
            http=http,
            fields='openAccessPdf',
            api_key=api_key,
        )
        if record is None:
            return ()
        # ``openAccessPdf`` may be missing or null; both read as "no PDF".
        open_access_pdf = record.get('openAccessPdf') or {}
        pdf_url = open_access_pdf.get('url')
        if not pdf_url:
            return ()
        pdf_file = artifacts.File(
            kind=artifacts.FileKind.BODY,
            source=self.name,
            media_type=artifacts.PDF,
            uri=pdf_url,
        )
        return (pdf_file,)

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Download the OA PDF by ``uri``."""
        del credentials  # openly hosted OA PDF
        return await _download(http, file, what='Semantic Scholar PDF')
717
+
718
+
719
class CrossrefFileSource:
    """Crossref text-mining links as ``BODY`` file-set renditions.

    Crossref records publisher text-mining links in ``message.link[]`` flagged
    ``intended-application: text-mining`` -- a full-text PDF and/or XML URL.
    Lists each as a ``BODY`` :class:`~litfetch.artifacts.File` with
    ``media_type`` from its ``content-type``; fetching one may need the
    publisher entitlement (egress IP or a TDM token), enforced upstream, so the
    ref is surfaced regardless. Needs a ``doi``.
    """

    name = 'crossref_tdm'

    @staticmethod
    def _media_type(content_type: object) -> object:
        """Map a link's ``content-type`` to a media type; ``None`` when unknown.

        An absent or empty value, or the literal ``'unspecified'``, carries no
        usable type.
        """
        if content_type and content_type != 'unspecified':
            return content_type
        return None

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> tuple[artifacts.File, ...]:
        """List the text-mining links Crossref advertises, or nothing."""
        del credentials  # the Crossref lookup itself is open
        doi = article_ids.doi
        if not doi:
            return ()
        message = await crossref.fetch_work(doi, http=http)
        if message is None:
            return ()
        listed = []
        for link in message.get('link', []) or []:
            url = link.get('URL')
            is_tdm = link.get('intended-application') == 'text-mining'
            if is_tdm and url:
                listed.append(
                    artifacts.File(
                        kind=artifacts.FileKind.BODY,
                        source=self.name,
                        media_type=self._media_type(link.get('content-type')),
                        uri=url,
                    )
                )
        return tuple(listed)

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Download the TDM link by ``uri`` (may 403 without entitlement)."""
        del credentials  # entitlement is by egress IP / upstream token, not a litfetch key
        return await _download(http, file, what='Crossref TDM file')
772
+
773
+
774
async def _springer_meta_pdf(http: _http.Http, doi: str, api_key: str) -> tuple[str, bool] | None:
    """Return ``(pdf_url, is_open_access)`` from the Springer Meta record, or ``None``.

    The Meta record's ``url`` list carries an ``openURL`` PDF entry
    (``link.springer.com/openurl/pdf?id=doi:<doi>``); ``openaccess`` is the OA
    flag, true only when it is the string ``'true'``. Only the first record of
    the response is consulted. Returns ``None`` when the lookup fails, answers
    non-200 or non-JSON, has no records, or lists no PDF url.
    """
    request_params = {'q': f'doi:{doi}', 'api_key': api_key}
    try:
        resp = await http.get(_SPRINGER_META_BASE, params=request_params)
    except httpx.HTTPError:
        logger.exception('Springer Meta request failed for %s', doi)
        return None
    if resp.status_code != 200:
        return None
    try:
        records = resp.json().get('records') or []
    except ValueError:
        logger.warning('Springer Meta returned a non-JSON response for %s', doi)
        return None
    if not records:
        return None
    first = records[0]
    # Take the first url entry that is a PDF and actually has a value.
    pdf_url = None
    for entry in first.get('url') or []:
        if entry.get('format') == 'pdf' and entry.get('value'):
            pdf_url = entry.get('value')
            break
    if not pdf_url:
        return None
    return pdf_url, first.get('openaccess') == 'true'
802
+
803
+
804
class SpringerFileSource:
    """Springer's article PDF as a ``BODY`` file-set rendition, via the Meta API.

    The Meta API (``credentials['springer_meta_api_key']`` -- distinct from the
    OpenAccess key) yields a stable openURL PDF link plus the ``openaccess`` flag
    for any Springer DOI, OA or subscription. Lists an ``application/pdf``
    ``BODY`` :class:`~litfetch.artifacts.File`; an OA article is openly fetchable
    (``credential_key=None``), a subscription one is marked
    :data:`~litfetch.artifacts.INSTITUTIONAL` so the consumer routes the fetch
    through its entitled (EZproxy-style) client. The openURL redirects to the
    PDF, so :func:`_download` follows redirects.
    """

    name = 'springer'

    async def list_files(
        self,
        article_ids: ids.ArticleIds,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> tuple[artifacts.File, ...]:
        """List the Springer PDF, marking it entitled when the article is not OA."""
        supplied = (credentials or {}).get(_SPRINGER_META_CREDENTIAL_KEY)
        doi = article_ids.doi
        # Only a non-empty string counts as a key; anything else means "no key".
        if not (isinstance(supplied, str) and supplied) or doi is None:
            return ()
        found = await _springer_meta_pdf(http, doi, supplied)
        if found is None:
            return ()
        pdf_url, is_open_access = found
        required_credential = None if is_open_access else artifacts.INSTITUTIONAL
        pdf_file = artifacts.File(
            kind=artifacts.FileKind.BODY,
            source=self.name,
            media_type=artifacts.PDF,
            uri=pdf_url,
            credential_key=required_credential,
        )
        return (pdf_file,)

    async def fetch_file(
        self,
        file: artifacts.File,
        *,
        credentials: Mapping[str, object] | None = None,
        http: _http.Http,
    ) -> artifacts.Blob | None:
        """Download the PDF by ``uri`` (following the openURL redirect).

        An OA article fetches directly; a subscription one succeeds only when
        ``http`` is an entitled client (the ``INSTITUTIONAL`` marker is the
        consumer's cue to route it so).
        """
        del credentials  # entitlement is by the routed client, not a litfetch key
        return await _download(http, file, what='Springer PDF')
860
+
861
+
862
def default_fetchers() -> tuple[Fetcher, ...]:
    """Return the production fetcher list, in priority order.

    Kept as a function so callers can prepend their own fetcher (e.g. a
    read-only cache) without import-time side effects. The two open PMC
    sources come first; the keyed publisher fetchers -- Elsevier, then
    Springer -- sit last and read their keys from ``credentials``, so a caller
    without the relevant key makes that fetcher a no-op.
    """
    return (PmcOaFetcher(), EuropePmcFetcher(), ElsevierFetcher(), SpringerFetcher())
871
+
872
+
873
def default_file_sources() -> tuple[FileSource, ...]:
    """Return the file sources a session queries by default.

    In order: PMC's Open Access bucket (JATS body, PDF rendition, and
    supplementary material); Unpaywall and Semantic Scholar (a best-OA PDF for
    the non-PMC long tail); Crossref TDM links; and Springer (an openURL PDF
    via the Meta API, marked :data:`~litfetch.artifacts.INSTITUTIONAL` when the
    article is not OA). A source with no usable identifier or credential is
    simply a no-op. Fresh instances are built on every call.
    """
    source_types = (
        PmcOaFetcher,
        UnpaywallFileSource,
        SemanticScholarFileSource,
        CrossrefFileSource,
        SpringerFileSource,
    )
    return tuple(source_type() for source_type in source_types)