litfetch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litfetch/ids.py ADDED
@@ -0,0 +1,41 @@
1
+ """The identifier bundle shared across resolvers and full-text sources."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import dataclasses
6
+ from collections.abc import Iterable
7
+
8
+
9
@dataclasses.dataclass(frozen=True)
class ArticleIds:
    """An immutable bundle of the identifiers litfetch can act on.

    Every field is optional: a caller may enter with only a PMID, only a DOI (a
    non-PubMed paper), or a fully-populated bundle. Resolvers enrich a bundle;
    sources consume whichever identifier they declare in ``requires``.
    """

    # Deliberately a thin record. The priority orders callers apply over these
    # fields (canonical_key prefers doi; NCBI idconv prefers pmid; S2 prefers
    # doi) are independent caller policy, not a domain ordering -- there is no
    # single intrinsic specificity ranking -- so they stay at the call sites
    # rather than being centralised here behind a generic picker.
    pmid: str | None = None
    pmcid: str | None = None
    doi: str | None = None

    def merge(self, other: ArticleIds) -> ArticleIds:
        """Return a bundle that fills this one's gaps from ``other``.

        Known identifiers are never overwritten: a resolver can add a DOI but
        cannot correct a PMCID the caller supplied. A field counts as a gap
        when it is falsy, so an empty string is filled just like ``None``.
        """
        return ArticleIds(
            pmid=self.pmid or other.pmid,
            pmcid=self.pmcid or other.pmcid,
            doi=self.doi or other.doi,
        )

    def has(self, fields: Iterable[str]) -> bool:
        """Return whether every identifier named in ``fields`` is present.

        ``fields`` is an iterable of attribute names (``'pmid'``, ``'pmcid'``,
        ``'doi'``). A bare string is treated as a single field name rather than
        iterated character by character, so ``has('doi')`` behaves like
        ``has(['doi'])``. An empty iterable is vacuously satisfied.

        Raises:
            AttributeError: If a name is not an attribute of the bundle.
        """
        # ``str`` is itself an ``Iterable[str]``; without this guard a lone
        # field name would be split into characters and fail in ``getattr``.
        if isinstance(fields, str):
            fields = (fields,)
        return all(getattr(self, field) for field in fields)
litfetch/py.typed ADDED
File without changes
litfetch/relations.py ADDED
@@ -0,0 +1,114 @@
1
+ """Cross-version relations between identifiers: preprint <-> published.
2
+
3
+ A work can exist as a preprint (bioRxiv / medRxiv, ...) and later as a published
4
+ version of record -- two distinct DOIs for one paper. :func:`related_ids` takes
5
+ whatever :class:`~litfetch.ids.ArticleIds` you hold and returns the related works
6
+ it can find, each as its own :class:`~litfetch.ids.ArticleIds` tagged with how it
7
+ relates -- so the caller need not know whether what it holds is a preprint or a
8
+ published DOI. The equivalence decision ("same paper") is the consumer's;
9
+ litfetch only surfaces the links. The returned bundles are single-DOI and can be
10
+ fed straight back through a :data:`~litfetch.resolvers.Resolver` to fill the rest.
11
+
12
+ Sources: the bioRxiv / medRxiv details API (reliable preprint -> published) and
13
+ Crossref relations (``has-preprint`` / ``is-preprint-of``, both directions,
14
+ best-effort on publisher metadata).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import enum
20
+ import logging
21
+ from typing import NamedTuple
22
+
23
+ import httpx
24
+
25
+ from litfetch import _doi, _http, crossref, ids
26
+
27
logger = logging.getLogger(__name__)

# Root of the bioRxiv "details" endpoint; a server name and the encoded DOI are
# appended as path segments when a preprint is looked up.
_BIORXIV_DETAILS_BASE = 'https://api.biorxiv.org/details'
# Server path segments, tried in this order for each preprint DOI.
_BIORXIV_SERVERS = ('biorxiv', 'medrxiv')
# Cold Spring Harbor preprint DOI prefixes (bioRxiv/medRxiv): the older 10.1101
# and the newer 10.64898.
_PREPRINT_DOI_PREFIXES = ('10.1101/', '10.64898/')
34
+
35
+
36
class RelationType(enum.Enum):
    """How a related work relates to the one you asked about.

    ``PREPRINT`` -- the related bundle is a preprint of the input; ``PUBLISHED``
    -- the related bundle is the published version of record of the input.
    """

    # The related bundle is a preprint of the work that was queried.
    PREPRINT = 'preprint'
    # The related bundle is the published version of record of the queried work.
    PUBLISHED = 'published'
45
+
46
+
47
class Related(NamedTuple):
    """A related work: its relationship to the input, and its identifiers."""

    # How this work relates to the bundle that was queried.
    relation: RelationType
    # Identifiers of the related work; ``related_ids`` fills in only ``doi``.
    ids: ids.ArticleIds
52
+
53
+
54
async def related_ids(article_ids: ids.ArticleIds, *, http: _http.Http) -> tuple[Related, ...]:
    """Look up the preprint / published counterparts of ``article_ids`` by DOI.

    Every counterpart comes back as a :class:`Related` pairing a
    :class:`RelationType` with a single-DOI :class:`~litfetch.ids.ArticleIds`.
    The result is empty when the bundle has no DOI or when nothing links to it.
    A DOI carrying a known preprint prefix is first followed to its published
    version through the bioRxiv details API; Crossref relations are then
    consulted regardless of which side of the link the input is on.
    """
    source_doi = article_ids.doi
    if not source_doi:
        return ()

    # Keyed on (relation, case-folded DOI): DOIs are case-insensitive, so the
    # same link reported by bioRxiv and by Crossref in different casing must
    # collapse to one entry. The first spelling seen is the one kept.
    seen: dict[tuple[RelationType, str], Related] = {}

    def remember(kind: RelationType, linked_doi: str) -> None:
        """Record ``linked_doi`` under ``kind`` unless it is already known."""
        seen.setdefault((kind, linked_doi.casefold()), Related(kind, ids.ArticleIds(doi=linked_doi)))

    if source_doi.startswith(_PREPRINT_DOI_PREFIXES):
        version_of_record = await _biorxiv_published(http, source_doi)
        if version_of_record:
            remember(RelationType.PUBLISHED, version_of_record)

    for kind, linked_doi in await _crossref_relations(http, source_doi):
        remember(kind, linked_doi)

    return tuple(seen.values())
78
+
79
+
80
async def _biorxiv_published(http: _http.Http, doi: str) -> str | None:
    """Return the published DOI bioRxiv/medRxiv records for preprint ``doi``.

    Each server in ``_BIORXIV_SERVERS`` is queried in order and the first
    usable answer wins. The lookup is best-effort: transport errors, non-200
    statuses, non-JSON bodies and payloads of an unexpected shape all make the
    server count as "no answer" rather than raising.

    Args:
        http: The client the GET requests are issued on.
        doi: The preprint DOI to look up.

    Returns:
        The ``published`` value of the last ``collection`` entry, or ``None``
        when no server supplies a non-empty string other than ``'NA'``.
    """
    # The encoded DOI does not depend on the server, so build it once.
    encoded = _doi.encode_doi_path(doi)
    for server in _BIORXIV_SERVERS:
        url = f'{_BIORXIV_DETAILS_BASE}/{server}/{encoded}'
        try:
            resp = await http.get(url)
        except httpx.HTTPError:
            logger.exception('bioRxiv details lookup failed for %s', url)
            continue
        if resp.status_code != 200:
            continue
        try:
            payload = resp.json()
        except ValueError:
            logger.warning('bioRxiv details returned a non-JSON response for %s', url)
            continue
        # Guard the shape before indexing: a JSON array or scalar has no
        # ``.get`` and would otherwise escape as an AttributeError.
        collection = payload.get('collection') if isinstance(payload, dict) else None
        if not isinstance(collection, list) or not collection:
            continue
        # NOTE(review): the last entry is presumably the most recent preprint
        # version -- confirm against the bioRxiv API documentation.
        latest = collection[-1]
        published = latest.get('published') if isinstance(latest, dict) else None
        # The literal 'NA' is treated the same as a missing published DOI.
        if isinstance(published, str) and published and published != 'NA':
            return published
    return None
101
+
102
+
103
async def _crossref_relations(http: _http.Http, doi: str) -> list[tuple[RelationType, str]]:
    """Return ``(RelationType, doi)`` for Crossref ``has-preprint`` / ``is-preprint-of``.

    Fetches the Crossref work for ``doi`` and reads its ``relation`` object.
    ``has-preprint`` entries become :attr:`RelationType.PREPRINT` and
    ``is-preprint-of`` entries become :attr:`RelationType.PUBLISHED`. Only
    entries whose ``id-type`` is ``'doi'`` and whose ``id`` is a non-empty
    string are kept.

    Returns:
        The links in the order found (all ``has-preprint`` first), or an empty
        list when the work cannot be fetched or carries no usable relation.
    """
    message = await crossref.fetch_work(doi, http=http)
    if message is None:
        return []
    # ``relation`` may be absent, null, or (defensively) not an object at all;
    # a plain ``.get('relation', {})`` would let an explicit null through.
    relation = message.get('relation')
    if not isinstance(relation, dict):
        return []
    out: list[tuple[RelationType, str]] = []
    for key, kind in (('has-preprint', RelationType.PREPRINT), ('is-preprint-of', RelationType.PUBLISHED)):
        for entry in relation.get(key) or []:
            if not isinstance(entry, dict):
                continue
            linked = entry.get('id')
            # Callers case-fold the DOI, so only real, non-empty strings pass.
            if isinstance(linked, str) and linked and entry.get('id-type') == 'doi':
                out.append((kind, linked))
    return out
litfetch/resolvers.py ADDED
@@ -0,0 +1,202 @@
1
+ """Identifier resolvers: enrich an :class:`~litfetch.ids.ArticleIds` bundle.
2
+
3
+ A resolver is any async callable ``(ArticleIds, Http) -> ArticleIds`` (the
4
+ :data:`Resolver` alias). It takes what is known and the
5
+ :class:`~litfetch._http.Http` to issue requests on, and returns a bundle filled
6
+ with whatever more it could find; it must never overwrite a known identifier
7
+ (use :meth:`~litfetch.ids.ArticleIds.merge`). Resolvers are usable on their
8
+ own as a cross-reference toolkit, independent of the fetch ladder::
9
+
10
+ async with litfetch.Session() as s:
11
+ ids = await SemanticScholarResolver()(ArticleIds(doi='10.1016/...'), s)
12
+ print(ids.pmcid)
13
+
14
+ The bundled resolvers are general (no pubmedifier coupling): Europe PMC search,
15
+ NCBI's ID Converter, and Semantic Scholar. Consumer-specific resolvers (a
16
+ local cache, a corpus client) belong in the consumer and slot into the same
17
+ :func:`chain`.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+ from collections.abc import Awaitable, Callable, Mapping
24
+
25
+ import httpx
26
+
27
+ from litfetch import _http, ids, semantic_scholar
28
+
29
logger = logging.getLogger(__name__)

# Root of the Europe PMC REST web services; EuropePmcResolver appends ``/search``.
_EUROPE_PMC_BASE = 'https://www.ebi.ac.uk/europepmc/webservices/rest'
# NCBI ID Converter endpoint queried by NcbiIdConverterResolver.
_NCBI_IDCONV_BASE = 'https://pmc.ncbi.nlm.nih.gov/tools/idconv/api/v1/articles/'

# A resolver enriches a bundle: given the known ids and an Http, it returns a
# (possibly) fuller ArticleIds and preserves every identifier it was given.
Resolver = Callable[[ids.ArticleIds, _http.Http], Awaitable[ids.ArticleIds]]
37
+
38
+
39
+ def _pmcid_with_prefix(value: str | None) -> str | None:
40
+ """Normalise a bare or prefixed PMC id to the ``PMC...`` form."""
41
+ if not value:
42
+ return None
43
+ value = value.strip()
44
+ if value.upper().startswith('PMC'):
45
+ return value
46
+ return f'PMC{value}'
47
+
48
+
49
async def _get_json(
    http: _http.Http,
    url: str,
    *,
    params: Mapping[str, str | int],
    context: str,
    rate: _http.Rate = _http.Rate.DEFAULT,
) -> dict | None:
    """GET ``url`` and parse a JSON object, logging and swallowing failures.

    Args:
        http: The client the request is issued on.
        url: The endpoint to GET.
        params: Query parameters for the request.
        context: A human-readable service name used as the log-message prefix.
        rate: The rate setting passed through to ``http.get``.

    Returns:
        The parsed JSON object, or ``None`` when the request raises
        ``httpx.HTTPError``, the status is not 200, the body is not JSON, or
        the JSON is not an object.
    """
    try:
        resp = await http.get(url, params=params, rate=rate)
    except httpx.HTTPError:
        logger.exception('%s request failed', context)
        return None
    if resp.status_code != 200:
        return None
    try:
        payload = resp.json()
    except ValueError:
        logger.warning('%s returned a non-JSON response', context)
        return None
    # Callers immediately call ``.get`` on the result; honour the declared
    # ``dict | None`` contract instead of leaking a list or scalar to them.
    if not isinstance(payload, dict):
        logger.warning('%s returned JSON that is not an object', context)
        return None
    return payload
70
+
71
+
72
class EuropePmcResolver:
    """Look up a ``pmcid`` for a known ``pmid`` through Europe PMC's search API.

    Europe PMC occasionally records a PMC id for an article a PubMed-XML-sourced
    corpus does not -- typically UKPMC-only author-manuscript deposits. The
    resolver does nothing when the bundle already has a ``pmcid`` or lacks a
    ``pmid`` to search on.
    """

    async def __call__(self, article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        """Return ``article_ids``, with a ``pmcid`` added when Europe PMC knows one."""
        # Nothing to do: the target id is already known, or there is no key.
        if article_ids.pmcid or not article_ids.pmid:
            return article_ids

        search_url = f'{_EUROPE_PMC_BASE}/search'
        query = {
            'query': f'EXT_ID:{article_ids.pmid} AND SRC:MED',
            'format': 'json',
            'pageSize': 1,
            'resultType': 'lite',
        }
        payload = await _get_json(http, search_url, params=query, context='Europe PMC search')
        if payload is None:
            return article_ids

        hits = payload.get('resultList', {}).get('result', [])
        if not hits:
            return article_ids

        # Only the first hit is read (``pageSize`` is 1); merge keeps every
        # identifier the caller already supplied.
        found = _pmcid_with_prefix(hits[0].get('pmcid'))
        return article_ids.merge(ids.ArticleIds(pmcid=found))
97
+
98
+
99
class NcbiIdConverterResolver:
    """Map between ``pmid``, ``pmcid`` and ``doi`` with NCBI's ID Converter.

    One keyless request translates whichever identifier is sent into the other
    two. ``tool`` names the caller to NCBI; an ``email`` parameter is added from
    the session contact (``http.contact``) when one is set and left out
    otherwise. The resolver does nothing for a bundle with none of the three.
    """

    def __init__(self, *, tool: str = 'litfetch') -> None:
        """Remember the ``tool`` name sent with every request."""
        self._tool = tool

    async def __call__(self, article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        """Return ``article_ids`` filled in with whatever the ID Converter maps."""
        picked = _idconv_query(article_ids)
        if picked is None:
            return article_ids

        params = {'ids': picked[0], 'idtype': picked[1], 'format': 'json', 'tool': self._tool}
        contact = http.contact
        if contact:
            params['email'] = contact

        payload = await _get_json(
            http, _NCBI_IDCONV_BASE, params=params, context='NCBI ID Converter', rate=_http.Rate.NCBI_UNKEYED
        )
        if payload is None:
            return article_ids

        records = payload.get('records', [])
        if not records:
            return article_ids
        first = records[0]
        if first.get('status') == 'error':
            return article_ids

        # The migrated endpoint returns pmid as an int; ArticleIds holds strings.
        raw_pmid = first.get('pmid')
        mapped = ids.ArticleIds(
            pmid=str(raw_pmid) if raw_pmid else None,
            pmcid=_pmcid_with_prefix(first.get('pmcid')),
            doi=first.get('doi') or None,
        )
        return article_ids.merge(mapped)
137
+
138
+
139
class SemanticScholarResolver:
    """Fill in identifiers from Semantic Scholar's ``externalIds``.

    A single lookup requests the paper's ``externalIds``; the DOI, PubMed and
    PubMedCentral entries are read from it. ``api_key`` is optional (the public
    endpoint is rate-limited but keyless). The resolver does nothing when the
    bundle carries no identifier S2 can key on.
    """

    def __init__(self, *, api_key: str | None = None) -> None:
        """Remember the optional S2 ``api_key`` used for every lookup."""
        self._api_key = api_key

    async def __call__(self, article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        """Return ``article_ids`` filled in from Semantic Scholar's external ids."""
        record = await semantic_scholar.fetch_paper(
            article_ids, http=http, fields='externalIds', api_key=self._api_key
        )
        if record is None:
            return article_ids

        # ``externalIds`` may be missing or null; treat both as "nothing known".
        external = record.get('externalIds') or {}
        raw_pmid = external.get('PubMed')
        discovered = ids.ArticleIds(
            pmid=str(raw_pmid) if raw_pmid else None,
            pmcid=_pmcid_with_prefix(external.get('PubMedCentral')),
            doi=external.get('DOI') or None,
        )
        return article_ids.merge(discovered)
163
+
164
+
165
def chain(*resolvers: Resolver) -> Resolver:
    """Combine resolvers into a single one that runs them in the given order.

    Every resolver receives the bundle as enriched so far. As soon as all three
    identifiers (``pmid``, ``pmcid``, ``doi``) are known, the remaining
    resolvers are skipped, so later ones only run while something is missing.
    """

    async def _run(article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        bundle = article_ids
        for step in resolvers:
            if all((bundle.pmid, bundle.pmcid, bundle.doi)):
                return bundle
            enriched = await step(bundle, http)
            # Merging (rather than trusting the result) keeps known ids intact
            # even if a resolver returns a bundle that changed one.
            bundle = bundle.merge(enriched)
        return bundle

    return _run
181
+
182
+
183
def default_resolver() -> Resolver:
    """Return a ready-made resolver chain that needs no API key.

    The chain runs Europe PMC search first and NCBI's ID Converter second --
    neither requires authentication -- covering the common
    ``pmid -> pmcid``/``doi`` paths. For broader coverage, build a custom
    :func:`chain` that also includes :class:`SemanticScholarResolver` or a
    consumer's own resolver.
    """
    keyless_steps = (EuropePmcResolver(), NcbiIdConverterResolver())
    return chain(*keyless_steps)
192
+
193
+
194
def _idconv_query(article_ids: ids.ArticleIds) -> tuple[str, str] | None:
    """Choose the ``(identifier, idtype)`` pair to send to NCBI's ID Converter.

    The first non-empty field wins, checked in the order ``pmid``, ``pmcid``,
    ``doi``; ``None`` means the bundle carries none of them.
    """
    for idtype in ('pmid', 'pmcid', 'doi'):
        identifier = getattr(article_ids, idtype)
        if identifier:
            return identifier, idtype
    return None
litfetch/semantic_scholar.py ADDED
@@ -0,0 +1,71 @@
1
+ """Semantic Scholar paper fetch, shared by identifier resolution and the file-set.
2
+
3
+ One paper lookup returns whichever fields are asked for: ``externalIds`` (for
4
+ :class:`~litfetch.resolvers.SemanticScholarResolver`'s cross-referencing) or
5
+ ``openAccessPdf`` (for a ``BODY`` PDF rendition, via
6
+ :class:`~litfetch.fetchers.SemanticScholarFileSource`). Both go through
7
+ :func:`fetch_paper`; the paper id is built from the most specific identifier the
8
+ bundle carries. ``api_key`` is optional -- the public endpoint is keyless but
9
+ rate-limited -- and selects the keyed vs unkeyed pace.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+
16
+ import httpx
17
+
18
+ from litfetch import _doi, _http, ids
19
+
20
logger = logging.getLogger(__name__)

# Base URL of the Semantic Scholar Graph API paper endpoint; ``fetch_paper``
# appends ``/<paper id>`` to it.
_PAPER_BASE = 'https://api.semanticscholar.org/graph/v1/paper'
23
+
24
+
25
def paper_id(article_ids: ids.ArticleIds) -> str | None:
    """Build an S2 paper id from the most specific identifier available.

    The first non-empty field wins, checked in the order ``doi``, ``pmid``,
    ``pmcid``, and is rendered as ``DOI:...``, ``PMID:...`` or ``PMCID:...``.
    Bundles in this package hold PMC ids in the prefixed ``PMC123`` form, while
    Semantic Scholar documents the lookup key as ``PMCID:<digits>``; the
    ``PMC`` prefix is therefore removed (case-insensitively) before the id is
    built.

    Returns:
        The paper id, or ``None`` when the bundle carries no usable identifier.
    """
    if article_ids.doi:
        return f'DOI:{_doi.encode_doi_path(article_ids.doi)}'
    if article_ids.pmid:
        return f'PMID:{article_ids.pmid}'
    if article_ids.pmcid:
        bare = article_ids.pmcid.strip()
        if bare[:3].upper() == 'PMC':
            bare = bare[3:]
        # A pmcid that was only whitespace or only the prefix has no id left.
        if bare:
            return f'PMCID:{bare}'
    return None
34
+
35
+
36
async def fetch_paper(
    article_ids: ids.ArticleIds,
    *,
    http: _http.Http,
    fields: str,
    api_key: str | None = None,
) -> dict | None:
    """Return the parsed S2 record for ``fields``, or ``None``.

    Args:
        article_ids: The identifiers; the most specific keys the request.
        http: The :class:`~litfetch._http.Http` to issue the request on.
        fields: The S2 ``fields`` selector (e.g. ``'externalIds'``).
        api_key: An optional S2 API key; its presence selects the keyed pace.

    Returns:
        The parsed JSON record, or ``None`` when the bundle carries no id S2 can
        key on, the lookup fails or returns a non-200 status, or the response
        is not a JSON object.
    """
    pid = paper_id(article_ids)
    if pid is None:
        return None
    # The key, when given, is sent as a header and also selects the rate.
    headers = {'x-api-key': api_key} if api_key else None
    rate = _http.Rate.S2_KEYED if api_key else _http.Rate.S2_UNKEYED
    try:
        resp = await http.get(f'{_PAPER_BASE}/{pid}', params={'fields': fields}, headers=headers, rate=rate)
    except httpx.HTTPError:
        logger.exception('Semantic Scholar request failed')
        return None
    if resp.status_code != 200:
        return None
    try:
        record = resp.json()
    except ValueError:
        logger.warning('Semantic Scholar returned a non-JSON response for %s', pid)
        return None
    # Callers call ``.get`` on the record; honour the declared ``dict | None``
    # contract rather than passing a JSON array or scalar through.
    if not isinstance(record, dict):
        logger.warning('Semantic Scholar returned JSON that is not an object for %s', pid)
        return None
    return record
litfetch/serde.py ADDED
@@ -0,0 +1,67 @@
1
+ """Canonical, backend-agnostic (de)serialisation of the file-set model.
2
+
3
+ litfetch owns the *structure* of an article's identity and files -- their fields
4
+ and how each round-trips through a plain JSON-able ``dict`` -- but not the wire
5
+ format nor where they are stored. litfetch ships no cache backend; a consumer
6
+ composes these mappings into its own record envelope (status, leases, placement)
7
+ and never re-lists a dataclass's fields itself.
8
+
9
+ Every ``*_to_dict`` returns a dict of JSON primitives; every ``*_from_dict``
10
+ reconstructs the dataclass from one. ``from_dict`` inputs are typed ``Any`` --
11
+ they sit at the untyped parse boundary (``json.loads`` and friends).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import dataclasses
17
+ from typing import Any
18
+
19
+ from litfetch import artifacts, ids
20
+
21
+
22
def article_ids_to_dict(value: ids.ArticleIds) -> dict[str, Any]:
    """Serialise an :class:`~litfetch.ids.ArticleIds` to a field-name -> value dict."""
    # ``asdict`` walks the dataclass fields, so no field name is listed here.
    mapping: dict[str, Any] = dataclasses.asdict(value)
    return mapping
25
+
26
+
27
def article_ids_from_dict(data: dict[str, Any]) -> ids.ArticleIds:
    """Rebuild an :class:`~litfetch.ids.ArticleIds` from its dict form.

    The dict's keys are passed as keyword arguments, so they must be field
    names of the dataclass.
    """
    build = ids.ArticleIds
    return build(**data)
30
+
31
+
32
def file_to_dict(file: artifacts.File) -> dict[str, Any]:
    """Serialise a :class:`~litfetch.artifacts.File` to a dict.

    ``kind`` is stored as the enum member's ``value``; every other field is
    copied across unchanged.
    """
    payload: dict[str, Any] = {'kind': file.kind.value}
    # Remaining fields are plain attributes, copied in declaration order.
    for name in (
        'source',
        'media_type',
        'uri',
        'filename',
        'credential_key',
        'size_bytes',
        'description',
    ):
        payload[name] = getattr(file, name)
    return payload
44
+
45
+
46
def file_from_dict(data: dict[str, Any]) -> artifacts.File:
    """Rebuild a :class:`~litfetch.artifacts.File` from its dict form.

    ``kind`` is converted back through :class:`~litfetch.artifacts.FileKind`;
    every other key is passed through as-is. All eight keys must be present.
    """
    kind = artifacts.FileKind(data['kind'])
    passthrough = {
        name: data[name]
        for name in (
            'source',
            'media_type',
            'uri',
            'filename',
            'credential_key',
            'size_bytes',
            'description',
        )
    }
    return artifacts.File(kind=kind, **passthrough)
58
+
59
+
60
def source_metadata_to_dict(meta: artifacts.SourceMetadata) -> dict[str, Any]:
    """Serialise a :class:`~litfetch.artifacts.SourceMetadata` to a dict.

    The ``licence``, ``access`` and ``basis`` attributes are copied unchanged.
    """
    return {name: getattr(meta, name) for name in ('licence', 'access', 'basis')}
63
+
64
+
65
def source_metadata_from_dict(data: dict[str, Any]) -> artifacts.SourceMetadata:
    """Rebuild a :class:`~litfetch.artifacts.SourceMetadata` from its dict form.

    The ``licence``, ``access`` and ``basis`` keys must all be present.
    """
    licence, access, basis = data['licence'], data['access'], data['basis']
    return artifacts.SourceMetadata(licence=licence, access=access, basis=basis)