PyPI - litfetch - Versions diffs - 0.1.0__py3-none-any.whl - Mend

litfetch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

litfetch/__init__.py +92 -0
litfetch/_doi.py +78 -0
litfetch/_http.py +194 -0
litfetch/artifacts.py +91 -0
litfetch/crossref.py +51 -0
litfetch/fetchers.py +888 -0
litfetch/ids.py +41 -0
litfetch/py.typed +0 -0
litfetch/relations.py +114 -0
litfetch/resolvers.py +202 -0
litfetch/semantic_scholar.py +71 -0
litfetch/serde.py +67 -0
litfetch/sessions.py +344 -0
litfetch/source_metadata.py +118 -0
litfetch/unpaywall.py +58 -0
litfetch-0.1.0.dist-info/METADATA +230 -0
litfetch-0.1.0.dist-info/RECORD +20 -0
litfetch-0.1.0.dist-info/WHEEL +5 -0
litfetch-0.1.0.dist-info/licenses/LICENSE +21 -0
litfetch-0.1.0.dist-info/top_level.txt +1 -0

litfetch/ids.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""The identifier bundle shared across resolvers and full-text sources."""
+from __future__ import annotations
+import dataclasses
+from collections.abc import Iterable
@dataclasses.dataclass(frozen=True)
class ArticleIds:
    """A frozen record of the article identifiers litfetch knows how to use.

    No field is mandatory.  A bundle may start out holding just a PMID, just a
    DOI (for instance a paper that is not in PubMed), or all three.  Resolvers
    fill in what is missing; sources read whichever identifiers they list in
    ``requires``.
    """

    # Kept intentionally minimal.  Which identifier wins when several are
    # present is decided by each caller (canonical_key favours doi, NCBI idconv
    # favours pmid, S2 favours doi).  Those preferences are separate policies
    # rather than one inherent ranking, so no shared "pick the best id" helper
    # lives on this class.
    pmid: str | None = None
    pmcid: str | None = None
    doi: str | None = None

    def merge(self, other: ArticleIds) -> ArticleIds:
        """Build a new bundle, taking identifiers missing here from ``other``.

        An identifier already set on ``self`` always wins, so a resolver may
        contribute a DOI but can never replace a caller-supplied PMCID.
        """
        combined = {
            spec.name: getattr(self, spec.name) or getattr(other, spec.name)
            for spec in dataclasses.fields(ArticleIds)
        }
        return ArticleIds(**combined)

    def has(self, fields: Iterable[str]) -> bool:
        """Tell whether each identifier listed in ``fields`` is set (truthy)."""
        for name in fields:
            if not getattr(self, name):
                return False
        return True

litfetch/py.typed ADDED Viewed

File without changes

litfetch/relations.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Cross-version relations between identifiers: preprint <-> published.
+A work can exist as a preprint (bioRxiv / medRxiv, ...) and later as a published
+version of record -- two distinct DOIs for one paper.  :func:`related_ids` takes
+whatever :class:`~litfetch.ids.ArticleIds` you hold and returns the related works
+it can find, each as its own :class:`~litfetch.ids.ArticleIds` tagged with how it
+relates -- so the caller need not know whether what it holds is a preprint or a
+published DOI.  The equivalence decision ("same paper") is the consumer's;
+litfetch only surfaces the links.  The returned bundles are single-DOI and can be
+fed straight back through a :data:`~litfetch.resolvers.Resolver` to fill the rest.
+Sources: the bioRxiv / medRxiv details API (reliable preprint -> published) and
+Crossref relations (``has-preprint`` / ``is-preprint-of``, both directions,
+best-effort on publisher metadata).
+"""
+from __future__ import annotations
+import enum
+import logging
+from typing import NamedTuple
+import httpx
+from litfetch import _doi, _http, crossref, ids
logger = logging.getLogger(__name__)

# Root URL of the bioRxiv / medRxiv "details" API; _biorxiv_published appends a
# server name and the encoded DOI to it as path segments.
_BIORXIV_DETAILS_BASE = 'https://api.biorxiv.org/details'
# Server path segments tried by _biorxiv_published, in this order.
_BIORXIV_SERVERS = ('biorxiv', 'medrxiv')
# Cold Spring Harbor preprint DOI prefixes (bioRxiv/medRxiv): the older 10.1101
# and the newer 10.64898.  related_ids only queries bioRxiv for DOIs that start
# with one of these.
_PREPRINT_DOI_PREFIXES = ('10.1101/', '10.64898/')
class RelationType(enum.Enum):
    """The direction of a link from the queried work to a related one.

    ``PREPRINT`` means the related bundle is a preprint of the work that was
    asked about.  ``PUBLISHED`` means the related bundle is the published
    version of record of the work that was asked about.
    """

    PREPRINT = 'preprint'
    PUBLISHED = 'published'
class Related(NamedTuple):
    """One related work: how it relates to the input, plus its identifiers."""

    # Direction of the link, seen from the work that was queried.
    relation: RelationType
    # Identifier bundle of the related work.
    ids: ids.ArticleIds
async def related_ids(article_ids: ids.ArticleIds, *, http: _http.Http) -> tuple[Related, ...]:
    """Look up preprint / published counterparts of ``article_ids`` via its DOI.

    Args:
        article_ids: The bundle to start from; only its ``doi`` is consulted.
        http: The :class:`~litfetch._http.Http` the lookups are issued on.

    Returns:
        One :class:`Related` per distinct ``(relation, DOI)`` pair, each holding
        a DOI-only :class:`~litfetch.ids.ArticleIds`.  Empty when the bundle has
        no DOI or no link was found.  A DOI with a known preprint prefix is
        first followed to its published version through the bioRxiv details
        API; Crossref relations are then consulted in every case.
    """
    doi = article_ids.doi
    if not doi:
        return ()

    # Keyed on the case-folded DOI because DOIs compare case-insensitively.  The
    # first spelling seen is the one kept, so the same DOI reported in different
    # case by bioRxiv and by Crossref produces a single entry.
    seen: dict[tuple[RelationType, str], Related] = {}

    def _record(relation: RelationType, linked_doi: str) -> None:
        key = (relation, linked_doi.casefold())
        if key not in seen:
            seen[key] = Related(relation, ids.ArticleIds(doi=linked_doi))

    if doi.startswith(_PREPRINT_DOI_PREFIXES):
        published = await _biorxiv_published(http, doi)
        if published:
            _record(RelationType.PUBLISHED, published)

    for relation, linked in await _crossref_relations(http, doi):
        _record(relation, linked)
    return tuple(seen.values())
async def _biorxiv_published(http: _http.Http, doi: str) -> str | None:
    """Ask bioRxiv, then medRxiv, for the published DOI of preprint ``doi``.

    Each server in ``_BIORXIV_SERVERS`` is tried in turn.  A transport error,
    a non-200 status, a non-JSON body or an empty ``collection`` moves on to
    the next server.

    Returns:
        The ``published`` value of the last ``collection`` entry from the first
        server that reports one other than ``'NA'``, otherwise ``None``.
    """
    for server in _BIORXIV_SERVERS:
        encoded = _doi.encode_doi_path(doi)
        url = f'{_BIORXIV_DETAILS_BASE}/{server}/{encoded}'

        try:
            resp = await http.get(url)
        except httpx.HTTPError:
            logger.exception('bioRxiv details lookup failed for %s', url)
            continue
        if resp.status_code != 200:
            continue

        try:
            payload = resp.json()
            collection = payload.get('collection') or []
        except ValueError:
            logger.warning('bioRxiv details returned a non-JSON response for %s', url)
            continue
        if not collection:
            continue

        # Only the final entry of the collection is inspected.
        published = collection[-1].get('published')
        if published and published != 'NA':
            return published
    return None
async def _crossref_relations(http: _http.Http, doi: str) -> list[tuple[RelationType, str]]:
    """Return ``(RelationType, doi)`` for Crossref ``has-preprint`` / ``is-preprint-of``.

    Args:
        http: The :class:`~litfetch._http.Http` the Crossref lookup is issued on.
        doi: The DOI whose Crossref work record is fetched.

    Returns:
        One pair per relation entry that has a non-empty ``id`` and an
        ``id-type`` of ``'doi'``: ``has-preprint`` entries are tagged
        ``PREPRINT`` and ``is-preprint-of`` entries ``PUBLISHED``.  Empty when
        the work could not be fetched or declares no such relation.
    """
    message = await crossref.fetch_work(doi, http=http)
    if message is None:
        return []

    # ``or {}`` rather than a ``.get`` default: a default only covers a missing
    # key, while an explicit JSON null would otherwise reach ``.get`` below and
    # raise.  The per-key lists get the same treatment.
    relation = message.get('relation') or {}

    wanted = (('has-preprint', RelationType.PREPRINT), ('is-preprint-of', RelationType.PUBLISHED))
    out: list[tuple[RelationType, str]] = []
    for key, kind in wanted:
        out.extend(
            (kind, entry['id'])
            for entry in relation.get(key) or []
            if entry.get('id') and entry.get('id-type') == 'doi'
        )
    return out

litfetch/resolvers.py ADDED Viewed

@@ -0,0 +1,202 @@
+"""Identifier resolvers: enrich an :class:`~litfetch.ids.ArticleIds` bundle.
+A resolver is any async callable ``(ArticleIds, Http) -> ArticleIds`` (the
+:data:`Resolver` alias).  It takes what is known and the
+:class:`~litfetch._http.Http` to issue requests on, and returns a bundle filled
+with whatever more it could find; it must never overwrite a known identifier
+(use :meth:`~litfetch.ids.ArticleIds.merge`).  Resolvers are usable on their
+own as a cross-reference toolkit, independent of the fetch ladder::
+    async with litfetch.Session() as s:
+        ids = await SemanticScholarResolver()(ArticleIds(doi='10.1016/...'), s)
+    print(ids.pmcid)
+The bundled resolvers are general (no pubmedifier coupling): Europe PMC search,
+NCBI's ID Converter, and Semantic Scholar.  Consumer-specific resolvers (a
+local cache, a corpus client) belong in the consumer and slot into the same
+:func:`chain`.
+"""
+from __future__ import annotations
+import logging
+from collections.abc import Awaitable, Callable, Mapping
+import httpx
+from litfetch import _http, ids, semantic_scholar
logger = logging.getLogger(__name__)

# Base URL of the Europe PMC REST web services; EuropePmcResolver appends
# ``/search`` to it.
_EUROPE_PMC_BASE = 'https://www.ebi.ac.uk/europepmc/webservices/rest'
# Endpoint of NCBI's ID Converter API, queried by NcbiIdConverterResolver.
_NCBI_IDCONV_BASE = 'https://pmc.ncbi.nlm.nih.gov/tools/idconv/api/v1/articles/'

# A resolver enriches a bundle: given the known ids and an Http, it returns a
# (possibly) fuller ArticleIds and preserves every identifier it was given.
# It is awaited, so any async callable with this signature qualifies.
Resolver = Callable[[ids.ArticleIds, _http.Http], Awaitable[ids.ArticleIds]]
+def _pmcid_with_prefix(value: str | None) -> str | None:
+    """Normalise a bare or prefixed PMC id to the ``PMC...`` form."""
+    if not value:
+        return None
+    value = value.strip()
+    if value.upper().startswith('PMC'):
+        return value
+    return f'PMC{value}'
async def _get_json(
    http: _http.Http,
    url: str,
    *,
    params: Mapping[str, str | int],
    context: str,
    rate: _http.Rate = _http.Rate.DEFAULT,
) -> dict | None:
    """GET ``url`` and return its JSON object, or ``None`` on any failure.

    Args:
        http: The :class:`~litfetch._http.Http` the request is issued on.
        url: The endpoint to query.
        params: Query-string parameters.
        context: Human-readable service name used in log messages.
        rate: The pacing bucket passed through to ``http.get``.

    Returns:
        The parsed JSON object.  ``None`` when the request raises
        ``httpx.HTTPError`` (logged with traceback), the status is not 200,
        the body is not JSON (logged), or the JSON is not an object (logged).
    """
    try:
        resp = await http.get(url, params=params, rate=rate)
    except httpx.HTTPError:
        logger.exception('%s request failed', context)
        return None
    if resp.status_code != 200:
        return None
    try:
        payload = resp.json()
    except ValueError:
        logger.warning('%s returned a non-JSON response', context)
        return None
    # Callers index the result with ``.get``; a JSON array or scalar would make
    # them raise, so honour the declared ``dict | None`` contract here.
    if not isinstance(payload, dict):
        logger.warning('%s returned a non-object JSON payload', context)
        return None
    return payload
class EuropePmcResolver:
    """Resolve ``pmid -> pmcid`` via Europe PMC's search API.

    Europe PMC occasionally records a PMC id for an article a PubMed-XML-sourced
    corpus does not -- typically UKPMC-only author-manuscript deposits.  A no-op
    when the bundle already has a ``pmcid`` or has no ``pmid``.
    """

    async def __call__(self, article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        """Return ``article_ids`` enriched with a ``pmcid`` where Europe PMC has one.

        Args:
            article_ids: The identifiers known so far.
            http: The :class:`~litfetch._http.Http` the search is issued on.

        Returns:
            ``article_ids`` unchanged when there is nothing to look up, the
            request fails, or no record comes back; otherwise the bundle merged
            with the ``pmcid`` of the first search hit.
        """
        if article_ids.pmcid or not article_ids.pmid:
            return article_ids
        params = {
            'query': f'EXT_ID:{article_ids.pmid} AND SRC:MED',
            'format': 'json',
            'pageSize': 1,
            'resultType': 'lite',
        }
        data = await _get_json(http, f'{_EUROPE_PMC_BASE}/search', params=params, context='Europe PMC search')
        if data is None:
            return article_ids
        # ``or`` guards instead of ``.get`` defaults: a default only covers a
        # missing key, whereas an explicit JSON null for ``resultList`` or
        # ``result`` would otherwise raise on the next access.
        records = (data.get('resultList') or {}).get('result') or []
        if not records:
            return article_ids
        return article_ids.merge(ids.ArticleIds(pmcid=_pmcid_with_prefix(records[0].get('pmcid'))))
class NcbiIdConverterResolver:
    """Map between ``pmid``, ``pmcid`` and ``doi`` with NCBI's ID Converter.

    One keyless request translates whichever identifier is sent into the
    others.  ``tool`` names the caller to NCBI.  The ``email`` parameter is
    taken from the session contact (``http.contact``) and left out when that is
    unset.  Does nothing for a bundle holding none of the three identifiers.
    """

    def __init__(self, *, tool: str = 'litfetch') -> None:
        self._tool = tool

    async def __call__(self, article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        """Return ``article_ids`` merged with the ids the ID Converter reports."""
        query = _idconv_query(article_ids)
        if query is None:
            return article_ids
        identifier, idtype = query

        params = dict(ids=identifier, idtype=idtype, format='json', tool=self._tool)
        if http.contact:
            params.update(email=http.contact)

        data = await _get_json(
            http,
            _NCBI_IDCONV_BASE,
            params=params,
            context='NCBI ID Converter',
            rate=_http.Rate.NCBI_UNKEYED,
        )
        if data is None:
            return article_ids

        records = data.get('records', [])
        if not records:
            return article_ids
        record = records[0]
        if record.get('status') == 'error':
            return article_ids

        # pmid can arrive as an int from the migrated endpoint, whereas
        # ArticleIds stores strings, hence the explicit str().
        pmid = record.get('pmid')
        discovered = ids.ArticleIds(
            pmid=str(pmid) if pmid else None,
            pmcid=_pmcid_with_prefix(record.get('pmcid')),
            doi=record.get('doi') or None,
        )
        return article_ids.merge(discovered)
class SemanticScholarResolver:
    """Fill in identifiers from the ``externalIds`` of a Semantic Scholar paper.

    A single lookup yields the paper's DOI / PubMed / PubMedCentral / arXiv ids.
    ``api_key`` may be omitted: the public endpoint works without a key, at a
    lower rate limit.  Does nothing when the bundle holds no identifier S2 can
    be queried with.
    """

    def __init__(self, *, api_key: str | None = None) -> None:
        self._api_key = api_key

    async def __call__(self, article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        """Return ``article_ids`` merged with the ids Semantic Scholar reports."""
        data = await semantic_scholar.fetch_paper(
            article_ids,
            http=http,
            fields='externalIds',
            api_key=self._api_key,
        )
        if data is None:
            return article_ids

        external = data.get('externalIds') or {}
        pubmed = external.get('PubMed')
        discovered = ids.ArticleIds(
            pmid=str(pubmed) if pubmed else None,
            pmcid=_pmcid_with_prefix(external.get('PubMedCentral')),
            doi=external.get('DOI') or None,
        )
        return article_ids.merge(discovered)
def chain(*resolvers: Resolver) -> Resolver:
    """Combine several resolvers into a single one that runs them in sequence.

    Every resolver receives the bundle as enriched so far, and its result is
    merged back in.  As soon as ``pmid``, ``pmcid`` and ``doi`` are all known
    the remaining resolvers are skipped, so later ones only run while something
    is still missing.
    """

    async def _run(article_ids: ids.ArticleIds, http: _http.Http) -> ids.ArticleIds:
        current = article_ids
        for resolve in resolvers:
            if all((current.pmid, current.pmcid, current.doi)):
                break
            enriched = await resolve(current, http)
            current = current.merge(enriched)
        return current

    return _run
def default_resolver() -> Resolver:
    """Return a ready-to-use resolver chain that needs no API key.

    It runs the Europe PMC search first and NCBI's ID Converter second.
    Neither requires authentication, and together they handle the usual
    ``pmid -> pmcid``/``doi`` lookups.  For wider coverage, build your own
    :func:`chain` that also includes :class:`SemanticScholarResolver` or a
    consumer-specific resolver.
    """
    stages = (EuropePmcResolver(), NcbiIdConverterResolver())
    return chain(*stages)
+def _idconv_query(article_ids: ids.ArticleIds) -> tuple[str, str] | None:
+    """Pick the identifier and ``idtype`` to send to NCBI's ID Converter."""
+    if article_ids.pmid:
+        return article_ids.pmid, 'pmid'
+    if article_ids.pmcid:
+        return article_ids.pmcid, 'pmcid'
+    if article_ids.doi:
+        return article_ids.doi, 'doi'
+    return None

litfetch/semantic_scholar.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""Semantic Scholar paper fetch, shared by identifier resolution and the file-set.
+One paper lookup returns whichever fields are asked for: ``externalIds`` (for
+:class:`~litfetch.resolvers.SemanticScholarResolver`'s cross-referencing) or
+``openAccessPdf`` (for a ``BODY`` PDF rendition, via
+:class:`~litfetch.fetchers.SemanticScholarFileSource`).  Both go through
+:func:`fetch_paper`; the paper id is built from the most specific identifier the
+bundle carries.  ``api_key`` is optional -- the public endpoint is keyless but
+rate-limited -- and selects the keyed vs unkeyed pace.
+"""
+from __future__ import annotations
+import logging
+import httpx
+from litfetch import _doi, _http, ids
logger = logging.getLogger(__name__)

# Base URL of the Semantic Scholar Graph API paper endpoint; fetch_paper appends
# ``/<paper id>`` to it.
_PAPER_BASE = 'https://api.semanticscholar.org/graph/v1/paper'
def paper_id(article_ids: ids.ArticleIds) -> str | None:
    """Derive the S2 paper id for a bundle, or ``None`` if it holds no usable id.

    The DOI is used when present (path-encoded and prefixed ``DOI:``), then the
    PMID (``PMID:``), then the PMCID (``PMCID:``).
    """
    if article_ids.doi:
        encoded = _doi.encode_doi_path(article_ids.doi)
        return f'DOI:{encoded}'

    pmid = article_ids.pmid
    if pmid:
        return f'PMID:{pmid}'

    pmcid = article_ids.pmcid
    return f'PMCID:{pmcid}' if pmcid else None
async def fetch_paper(
    article_ids: ids.ArticleIds,
    *,
    http: _http.Http,
    fields: str,
    api_key: str | None = None,
) -> dict | None:
    """Fetch one paper from Semantic Scholar and return the parsed record.

    Args:
        article_ids: The identifier bundle; :func:`paper_id` picks the one
            that keys the request.
        http: The :class:`~litfetch._http.Http` the request is issued on.
        fields: Value of the S2 ``fields`` query parameter (for example
            ``'externalIds'``).
        api_key: Optional S2 API key.  When given it is sent as the
            ``x-api-key`` header and the keyed rate is used.

    Returns:
        The parsed JSON body, or ``None`` when the bundle has no identifier S2
        can be queried with, the request raises ``httpx.HTTPError``, the status
        is not 200, or the body is not JSON.
    """
    pid = paper_id(article_ids)
    if pid is None:
        return None

    if api_key:
        headers = {'x-api-key': api_key}
        rate = _http.Rate.S2_KEYED
    else:
        headers = None
        rate = _http.Rate.S2_UNKEYED

    url = f'{_PAPER_BASE}/{pid}'
    try:
        resp = await http.get(url, params={'fields': fields}, headers=headers, rate=rate)
    except httpx.HTTPError:
        logger.exception('Semantic Scholar request failed')
        return None

    if resp.status_code != 200:
        return None
    try:
        return resp.json()
    except ValueError:
        logger.warning('Semantic Scholar returned a non-JSON response for %s', pid)
        return None

litfetch/serde.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Canonical, backend-agnostic (de)serialisation of the file-set model.
+litfetch owns the *structure* of an article's identity and files -- their fields
+and how each round-trips through a plain JSON-able ``dict`` -- but not the wire
+format nor where they are stored.  litfetch ships no cache backend; a consumer
+composes these mappings into its own record envelope (status, leases, placement)
+and never re-lists a dataclass's fields itself.
+Every ``*_to_dict`` returns a dict of JSON primitives; every ``*_from_dict``
+reconstructs the dataclass from one.  ``from_dict`` inputs are typed ``Any`` --
+they sit at the untyped parse boundary (``json.loads`` and friends).
+"""
+from __future__ import annotations
+import dataclasses
+from typing import Any
+from litfetch import artifacts, ids
def article_ids_to_dict(value: ids.ArticleIds) -> dict[str, Any]:
    """Convert an :class:`~litfetch.ids.ArticleIds` into a plain dict of its fields."""
    as_mapping = dataclasses.asdict(value)
    return as_mapping
def article_ids_from_dict(data: dict[str, Any]) -> ids.ArticleIds:
    """Build an :class:`~litfetch.ids.ArticleIds` from a dict of its fields.

    The dict's entries are passed as keyword arguments, so absent fields take
    the dataclass defaults.
    """
    bundle = ids.ArticleIds(**data)
    return bundle
def file_to_dict(file: artifacts.File) -> dict[str, Any]:
    """Convert a :class:`~litfetch.artifacts.File` into a dict.

    ``kind`` is stored as ``file.kind.value``; every other field is copied
    across under its own name.
    """
    mapping: dict[str, Any] = {'kind': file.kind.value}
    copied_as_is = (
        'source',
        'media_type',
        'uri',
        'filename',
        'credential_key',
        'size_bytes',
        'description',
    )
    for name in copied_as_is:
        mapping[name] = getattr(file, name)
    return mapping
def file_from_dict(data: dict[str, Any]) -> artifacts.File:
    """Build a :class:`~litfetch.artifacts.File` from a dict.

    ``data['kind']`` is turned back into a :class:`~litfetch.artifacts.FileKind`;
    the remaining fields are read by name, so every key must be present.
    """
    # Convert ``kind`` before reading the other keys, so a bad or missing kind
    # is the first error reported.
    kind = artifacts.FileKind(data['kind'])
    copied_as_is = (
        'source',
        'media_type',
        'uri',
        'filename',
        'credential_key',
        'size_bytes',
        'description',
    )
    remaining = {name: data[name] for name in copied_as_is}
    return artifacts.File(kind=kind, **remaining)
def source_metadata_to_dict(meta: artifacts.SourceMetadata) -> dict[str, Any]:
    """Convert a :class:`~litfetch.artifacts.SourceMetadata` into a dict."""
    return {name: getattr(meta, name) for name in ('licence', 'access', 'basis')}
def source_metadata_from_dict(data: dict[str, Any]) -> artifacts.SourceMetadata:
    """Build a :class:`~litfetch.artifacts.SourceMetadata` from a dict.

    All three keys (``licence``, ``access``, ``basis``) must be present.
    """
    licence = data['licence']
    access = data['access']
    basis = data['basis']
    return artifacts.SourceMetadata(licence=licence, access=access, basis=basis)