PyPI - litfetch - Versions diffs - 0.1.0__py3-none-any.whl - Mend

litfetch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

litfetch/__init__.py +92 -0
litfetch/_doi.py +78 -0
litfetch/_http.py +194 -0
litfetch/artifacts.py +91 -0
litfetch/crossref.py +51 -0
litfetch/fetchers.py +888 -0
litfetch/ids.py +41 -0
litfetch/py.typed +0 -0
litfetch/relations.py +114 -0
litfetch/resolvers.py +202 -0
litfetch/semantic_scholar.py +71 -0
litfetch/serde.py +67 -0
litfetch/sessions.py +344 -0
litfetch/source_metadata.py +118 -0
litfetch/unpaywall.py +58 -0
litfetch-0.1.0.dist-info/METADATA +230 -0
litfetch-0.1.0.dist-info/RECORD +20 -0
litfetch-0.1.0.dist-info/WHEEL +5 -0
litfetch-0.1.0.dist-info/licenses/LICENSE +21 -0
litfetch-0.1.0.dist-info/top_level.txt +1 -0

litfetch/__init__.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""litfetch: identifier -> the retrievable artifacts of a scholarly article.
+Hand :func:`fetch_body` an :class:`ArticleIds` bundle (any of pmid / pmcid / doi)
+and, optionally, a :data:`~litfetch.resolvers.Resolver` to fill in missing
+identifiers on demand.  A :class:`~litfetch.fetchers.Fetcher` ladder is tried in
+priority order; the first to serve the body yields a :class:`Blob` (a
+:class:`File` plus its bytes).  Supplementary material is discovered with
+:func:`list_files` and fetched with :func:`fetch_file`.
+An article is modelled as a *file-set*: a collection of :class:`File` references
+(body renditions and supplementary material, by :class:`FileKind`) sharing one
+identity, each hosted upstream.  litfetch fetches the raw artifacts and reports
+their access terms (:class:`SourceMetadata`); rendering them (e.g. XML ->
+markdown via litdown) and storing them are the consumer's concern.  The bundled
+identifier resolvers (Europe PMC, NCBI ID Converter, Semantic Scholar) live in
+:mod:`litfetch.resolvers`; file-set listing and fetching live in
+:mod:`litfetch.fetchers`.
+"""
+from __future__ import annotations
+from litfetch._http import Http, Rate, RetryPolicy
+from litfetch.artifacts import (
+    INSTITUTIONAL,
+    Blob,
+    File,
+    FileKind,
+    SourceMetadata,
+)
+from litfetch.fetchers import (
+    BiorxivFetcher,
+    CrossrefFileSource,
+    ElsevierFetcher,
+    EuropePmcFetcher,
+    Fetcher,
+    FileSource,
+    PmcOaFetcher,
+    SemanticScholarFileSource,
+    SpringerFetcher,
+    SpringerFileSource,
+    UnpaywallFileSource,
+    default_fetchers,
+    default_file_sources,
+)
+from litfetch.ids import ArticleIds
+from litfetch.relations import Related, RelationType
+from litfetch.sessions import (
+    Session,
+    fetch_body,
+    fetch_file,
+    list_files,
+    related_ids,
+    resolve_access,
+)
+from litfetch.source_metadata import extract_source_metadata
+# Package version; the wheel's ``litfetch-0.1.0.dist-info`` directory carries the same number.
+__version__ = '0.1.0'
+# The public API: every name imported above is re-exported, plus ``__version__``.
+# Ordering is the upper-case constant first, then the CamelCase names, then the
+# remaining names -- alphabetical within each group.
+__all__ = [
+    'INSTITUTIONAL',
+    'ArticleIds',
+    'BiorxivFetcher',
+    'Blob',
+    'CrossrefFileSource',
+    'ElsevierFetcher',
+    'EuropePmcFetcher',
+    'Fetcher',
+    'File',
+    'FileKind',
+    'FileSource',
+    'Http',
+    'PmcOaFetcher',
+    'Rate',
+    'Related',
+    'RelationType',
+    'RetryPolicy',
+    'SemanticScholarFileSource',
+    'Session',
+    'SourceMetadata',
+    'SpringerFetcher',
+    'SpringerFileSource',
+    'UnpaywallFileSource',
+    '__version__',
+    'default_fetchers',
+    'default_file_sources',
+    'extract_source_metadata',
+    'fetch_body',
+    'fetch_file',
+    'list_files',
+    'related_ids',
+    'resolve_access',
+]

litfetch/_doi.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""DOI validation and URL-safe path encoding.
+Several sources interpolate a DOI into an upstream URL path (Unpaywall,
+Crossref, Semantic Scholar, bioRxiv, and the planned doi.org resolve step).  Doing so
+raw is wrong twice over: a DOI suffix may contain ``?``, ``#``, spaces, or
+``/`` -- which truncate or reshape the URL -- and a crafted ``.``/``..``
+segment is a path-traversal vector.  :func:`encode_doi_path` is the one safe
+way to place a DOI in a URL path; :func:`normalize_and_validate_doi` is the
+shape gate it builds on.
+"""
+from __future__ import annotations
+import re
+import urllib.parse
+# A DOI is ``10.<registrant>/<suffix>``: the registrant is one or more digits
+# with optional dot-separated sub-elements (e.g. ``10.1000.10``); the suffix is
+# any non-empty string.  The digit count is left open -- the common 4-9 range is
+# Crossref's observed corpus, not a spec rule -- so an unusual registrant is not
+# rejected.  DOIs are case-insensitive, so the prefix match is too; the value is
+# returned unchanged (suffixes are case-sensitive for many registrants).
+_DOI_RE = re.compile(r'^10\.\d+(?:\.\d+)*/.+$', re.IGNORECASE)
+# Decorations a caller-supplied DOI may arrive with; stripped before validation.
+_RESOLVER_PREFIXES = ('https://doi.org/', 'http://doi.org/', 'https://dx.doi.org/', 'http://dx.doi.org/')
+def normalize_and_validate_doi(doi: str) -> str:
+    """Return the bare, validated DOI, stripping common decorations.
+    Accepts a DOI carrying surrounding whitespace, a ``doi:`` scheme, or a
+    resolver URL prefix (``https://doi.org/``, ``http://dx.doi.org/``) and
+    returns the bare ``10.xxxx/suffix`` form.
+    Args:
+        doi: The DOI to normalise, possibly decorated.
+    Returns:
+        The bare DOI.
+    Raises:
+        ValueError: If the result is not a syntactically valid DOI.
+    """
+    candidate = doi.strip()
+    lowered = candidate.lower()
+    for prefix in _RESOLVER_PREFIXES:
+        if lowered.startswith(prefix):
+            candidate = candidate[len(prefix) :]
+            break
+    if candidate.lower().startswith('doi:'):
+        candidate = candidate[len('doi:') :].strip()
+    if not _DOI_RE.match(candidate):
+        raise ValueError(f'not a valid DOI: {doi!r}')
+    return candidate
+def encode_doi_path(doi: str) -> str:
+    """Percent-encode a validated DOI for safe interpolation into a URL path.
+    Validates via :func:`normalize_and_validate_doi`, then percent-encodes each
+    ``/``-separated segment -- so a suffix ``/``, ``?``, ``#``, or space cannot
+    reshape the URL -- and rejects a ``.`` or ``..`` segment (path traversal).
+    Args:
+        doi: The DOI to encode, possibly decorated.
+    Returns:
+        The encoded DOI, ready to interpolate after a URL's path separator.
+    Raises:
+        ValueError: If the DOI is invalid or contains a dot-segment.
+    """
+    normalized = normalize_and_validate_doi(doi)
+    segments = normalized.split('/')
+    if any(segment in ('.', '..') for segment in segments):
+        raise ValueError(f'DOI contains a path-traversal segment: {doi!r}')
+    return '/'.join(urllib.parse.quote(segment, safe='') for segment in segments)

litfetch/_http.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""Low-level HTTP primitives shared across litfetch.
+The public vocabulary the source and resolver layers depend on -- the
+:class:`Http` request protocol and the :class:`Rate` politeness levels -- lives
+here, together with the retrying GET primitive (:func:`get`) they are built on.
+Keeping these here (and not in :mod:`litfetch.sessions`) lets ``fetchers`` and
+``resolvers`` depend on the protocol without importing the concrete
+:class:`~litfetch.sessions.Session`, which in turn imports them.
+:func:`get` is the single choke point for outbound GETs: it adds retry with
+exponential backoff and honours a ``Retry-After`` sent with any retryable
+status (in practice a 429 or 503).  It operates over a raw client and knows
+nothing of pacing or caching; :class:`Session` layers those on top and is the
+concrete :class:`Http` fetchers and resolvers actually call.
+"""
+from __future__ import annotations
+import asyncio
+import dataclasses
+import enum
+import random
+from collections.abc import Mapping
+from typing import Protocol
+import httpx
+# Default request timeout.  Not referenced by the code visible in this module;
+# presumably seconds, applied where the httpx client is built -- TODO confirm
+# against litfetch.sessions.
+DEFAULT_TIMEOUT = 30.0
+# Base User-Agent, no contact. A caller who sets Session(contact=...) gets a
+# `(mailto:...)` appended and that address fed to the polite-pool params; litfetch
+# ships no default contact of its own.
+USER_AGENT = 'litfetch/0.1'
+# Status codes worth retrying: 429 (rate limited) and the transient 5xx family.
+# A 4xx other than 429 is the caller's fault and will not fix itself on retry.
+# Consulted by get() to decide whether a response is final.
+_RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})
+class Rate(enum.Enum):
+    """A named politeness rate, chosen at the call site.
+    The ``KEYED`` variants apply when the caller holds an API key for that host
+    (a higher allowance), the ``UNKEYED`` variants are the polite public rate.
+    ``DEFAULT`` imposes no throttle -- for hosts (S3, publisher CDNs) with no
+    tight per-client limit.  :attr:`min_interval` is the resulting minimum
+    inter-request interval in seconds.
+    The member values are plain labels; the numeric interval lives in the
+    module-level ``_MIN_INTERVALS`` table, not in the enum value.
+    """
+    DEFAULT = 'default'
+    NCBI_UNKEYED = 'ncbi_unkeyed'
+    NCBI_KEYED = 'ncbi_keyed'
+    S2_UNKEYED = 's2_unkeyed'
+    S2_KEYED = 's2_keyed'
+    @property
+    def min_interval(self) -> float:
+        """Minimum seconds between requests to one host at this rate.
+        Looked up in ``_MIN_INTERVALS``, which is defined after this class (the
+        name is resolved at call time, so the ordering is fine).  A member with
+        no entry in that table would raise ``KeyError`` here.
+        """
+        return _MIN_INTERVALS[self]
+# Seconds between requests per rate.  Distinct members may share an interval
+# (NCBI and S2 keyed allowances both land near 10 req/s), so the interval is a
+# mapping, not the enum value -- equal values would alias the members.
+# Keep one entry per Rate member: Rate.min_interval indexes this dict directly.
+_MIN_INTERVALS = {
+    Rate.DEFAULT: 0.0,
+    Rate.NCBI_UNKEYED: 0.34,  # ~3 req/s, NCBI's keyless allowance
+    Rate.NCBI_KEYED: 0.1,  # ~10 req/s with an NCBI API key
+    Rate.S2_UNKEYED: 1.0,  # Semantic Scholar's shared public pool: stay conservative
+    Rate.S2_KEYED: 0.1,  # with a Semantic Scholar API key
+}
+class Http(Protocol):
+    """The request surface a source or resolver needs: a paced, retrying GET.
+    :class:`~litfetch.sessions.Session` satisfies this.  A source is handed an
+    ``Http``, never the Session's lifecycle, so it depends on one method (and one
+    attribute) and is trivially faked in a test.
+    ``contact`` is the caller-configured identity (an email) for polite-pool
+    parameters -- Unpaywall's required ``email``, Crossref's ``mailto``, NCBI's
+    ``email`` -- and ``None`` when the caller set none; a source reads it rather
+    than carrying a hardcoded address.
+    """
+    # Caller-supplied contact email, or None when none was configured.
+    contact: str | None
+    async def get(
+        self,
+        url: str,
+        *,
+        params: Mapping[str, str | int] | None = None,
+        headers: Mapping[str, str] | None = None,
+        rate: Rate = Rate.DEFAULT,
+        follow_redirects: bool = False,
+    ) -> httpx.Response:
+        """GET ``url``, paced per ``rate`` and retried per the session policy.
+        Args:
+            url: The URL to request.
+            params: Query parameters, if any.
+            headers: Request headers, if any.
+            rate: The politeness level to pace this request at; ``Rate.DEFAULT``
+                (no throttle) unless the call site names another.
+            follow_redirects: Whether to follow 3xx redirects; off by default.
+        Returns:
+            The :class:`httpx.Response`.  How statuses and transport errors
+            surface is up to the implementation -- this protocol body is only a
+            signature.
+        """
+        ...
+@dataclasses.dataclass(frozen=True)
+class RetryPolicy:
+    """How :func:`get` retries a transient failure.
+    A transient failure is an ``httpx.TransportError`` (timeout, connection
+    reset) or a retryable status (429, 500, 502, 503, 504).  Backoff is
+    exponential with full jitter -- ``uniform(0, base_delay * 2**attempt)`` --
+    capped at ``max_delay``; a 429/503 ``Retry-After`` in integer seconds
+    overrides the jitter, also capped.  ``max_attempts`` counts total tries, so
+    ``max_attempts=1`` disables retrying.
+    """
+    max_attempts: int = 3
+    base_delay: float = 0.5
+    max_delay: float = 8.0
+    def __post_init__(self) -> None:
+        # >= 1 so the get() loop always runs at least once (0 would fall through
+        # to its `unreachable` guard).
+        if self.max_attempts < 1:
+            raise ValueError(f'max_attempts must be >= 1, got {self.max_attempts}')
+DEFAULT_RETRY = RetryPolicy()
+async def get(
+    client: httpx.AsyncClient,
+    url: str,
+    *,
+    params: Mapping[str, str | int] | None = None,
+    headers: Mapping[str, str] | None = None,
+    retry: RetryPolicy = DEFAULT_RETRY,
+    follow_redirects: bool = False,
+) -> httpx.Response:
+    """GET ``url``, retrying a transient failure per ``retry``.
+    Retries an ``httpx.TransportError`` or a retryable status (see
+    :class:`RetryPolicy`) with backoff, then returns the final response --
+    including a still-failing status, so the caller keeps its own status
+    handling.  Re-raises the last transport error when every attempt fails.
+    Args:
+        client: The httpx client to issue the request on.
+        url: The absolute URL to GET.
+        params: Query parameters, if any.
+        headers: Request headers, if any.
+        retry: The backoff/attempt policy.
+        follow_redirects: Follow 3xx redirects (off by default; file downloads
+            enable it to follow publisher PDF redirects).
+    Returns:
+        The final :class:`httpx.Response` (a non-retryable status, or the last
+        response after exhausting retries).
+    Raises:
+        httpx.TransportError: If every attempt fails at the transport layer.
+    """
+    for attempt in range(retry.max_attempts):
+        last_attempt = attempt == retry.max_attempts - 1
+        retry_after: float | None = None
+        try:
+            response = await client.get(url, params=params, headers=headers, follow_redirects=follow_redirects)
+        except httpx.TransportError:
+            if last_attempt:
+                raise
+        else:
+            if response.status_code not in _RETRYABLE_STATUS or last_attempt:
+                return response
+            retry_after = _retry_after_seconds(response)
+        await asyncio.sleep(_backoff(attempt, retry_after, retry))
+    raise AssertionError('unreachable: the loop returns or raises on the last attempt')
+def _retry_after_seconds(response: httpx.Response) -> float | None:
+    """Parse a ``Retry-After`` header as integer seconds; ``None`` otherwise.
+    The HTTP-date form is accepted by the spec but not used by the APIs
+    litfetch talks to; it falls through to ``None`` (jittered backoff).
+    """
+    value = response.headers.get('Retry-After')
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except ValueError:
+        return None
+def _backoff(attempt: int, retry_after: float | None, policy: RetryPolicy) -> float:
+    """Seconds to wait before the next attempt: ``Retry-After`` or jittered backoff."""
+    if retry_after is not None:
+        return min(retry_after, policy.max_delay)
+    jittered = random.uniform(0, policy.base_delay * 2**attempt)  # noqa: S311 -- jitter, not crypto
+    return min(jittered, policy.max_delay)

litfetch/artifacts.py ADDED Viewed

@@ -0,0 +1,91 @@
+"""The data types that flow through the fetch seam.
+An article is modelled as a *file-set*: a collection of :class:`File` references
+sharing one identity.  A :class:`File` is either the article **body** (in one of
+its media types) or a piece of **supplementary** material, hosted upstream with a
+``uri`` and the ``credential_key`` a fetch needs.  A :class:`Blob` is a File once
+its bytes are in hand.
+"""
+from __future__ import annotations
+import dataclasses
+import enum
+from typing import Final
+# Well-known media types an artifact can carry: the JATS or Elsevier body
+# dialects, or a PDF rendition.  These are ordinary strings, usable as a
+# ``File.media_type``.  Left as open ``str`` (not an enum): a File's
+# media_type is an open domain -- Crossref/publisher links carry arbitrary
+# content-types -- so a closed enum would be wrong.  The closed sets in this
+# package (FileKind, Rate, RelationType) are enums; these are not.
+JATS_XML: Final[str] = 'application/jats+xml'
+ELSEVIER_XML: Final[str] = 'application/vnd.elsevier+xml'
+PDF: Final[str] = 'application/pdf'
+# A ``File.credential_key`` value meaning the fetch needs *institutional
+# entitlement* (a subscription reached via an EZproxy-style client), rather than
+# a key in the ``credentials`` map.  The ``litfetch:`` prefix makes it
+# un-collidable with a user-supplied credentials key (which could otherwise be
+# literally ``institutional``).  The consumer routes such a file through its
+# entitled client; an openly-fetchable file leaves ``credential_key`` ``None``.
+# So a ``credential_key`` is one of three things: ``None``, this sentinel, or a
+# key into the caller's ``credentials`` map.
+INSTITUTIONAL: Final[str] = 'litfetch:institutional'
+class FileKind(enum.Enum):
+    """What a :class:`File` is within the article's file-set.
+    ``BODY`` -- the article full text itself, in one of its media types.
+    ``SUPPLEMENTARY`` -- additional material (figures, datasets, tables), not
+    the body.
+    A closed set, hence an enum; each member's value is its lower-case name.
+    """
+    BODY = 'body'
+    SUPPLEMENTARY = 'supplementary'
+@dataclasses.dataclass(frozen=True)
+class File:
+    """A reference to one file in an article's file-set -- not its bytes.
+    ``source`` names the source that can retrieve it (routes
+    :func:`~litfetch.fetchers.fetch_file`).  A File is hosted upstream: it carries
+    a ``uri`` and the ``credential_key`` a fetch needs (``None`` when openly
+    accessible).  ``credential_key`` is either a key in the caller's
+    ``credentials`` map (e.g. a publisher API key) or :data:`INSTITUTIONAL`, which
+    marks a fetch that needs institutional entitlement (an EZproxy-style client)
+    rather than a map key.  ``uri`` is fetched on demand, never eagerly.
+    Instances are immutable (``frozen=True``).  Only ``kind`` and ``source`` are
+    required; every other field defaults to ``None``.
+    """
+    # NOTE(review): the package __init__ imports ``fetch_file`` from
+    # ``litfetch.sessions``, so the ``litfetch.fetchers.fetch_file`` cross-reference
+    # in the docstring above may not resolve -- confirm where it is defined.
+    # Whether this is the article body or supplementary material.
+    kind: FileKind
+    # Name of the source able to retrieve the file.
+    source: str
+    # Media type as an open string (e.g. JATS_XML, PDF); None presumably means
+    # the upstream did not state one -- TODO confirm.
+    media_type: str | None = None
+    # Upstream location of the file; fetched on demand.
+    uri: str | None = None
+    # Presumably a suggested or upstream file name -- TODO confirm against the
+    # sources that populate it.
+    filename: str | None = None
+    # None (open), INSTITUTIONAL, or a key into the caller's credentials map.
+    credential_key: str | None = None
+    # Presumably the size reported upstream, in bytes -- TODO confirm.
+    size_bytes: int | None = None
+    # Presumably a human-readable caption for the file -- TODO confirm.
+    description: str | None = None
+@dataclasses.dataclass(frozen=True)
+class SourceMetadata:
+    """Access terms for a fetched artifact: its licence and how that was known.
+    litfetch returns the licence *raw* (the CC URL, JATS ``license-type``, or
+    licence text as found upstream); mapping to an SPDX id is the consumer's --
+    describe, don't own.  ``basis`` records provenance: ``'artifact'`` when read
+    from the fetched bytes (authoritative for exactly those bytes), or an
+    authority name (e.g. ``'unpaywall'``) when asserted for a paper whose bytes
+    ship no licence (a PDF).  A ``None`` field means unknown.
+    Instances are immutable (``frozen=True``) and every field is optional.
+    """
+    # Raw licence string exactly as found upstream; not normalised.
+    licence: str | None = None
+    # NOTE(review): not described in the docstring above; presumably an
+    # access-status label for the artifact -- confirm against the code that
+    # fills it in.
+    access: str | None = None
+    # Provenance of the licence: 'artifact' or the name of the asserting authority.
+    basis: str | None = None
+@dataclasses.dataclass(frozen=True)
+class Blob:
+    """A materialised :class:`File`: its reference plus the fetched bytes.
+    Immutable (``frozen=True``); both fields are required.
+    """
+    # The reference describing what was fetched.
+    file: File
+    # The raw bytes as fetched; this type does no decoding or rendering.
+    content: bytes

litfetch/crossref.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""Crossref works fetch, shared by the Elsevier link locator, relations, and TDM.
+One DOI-keyed GET against the Crossref works API returns a ``message`` that
+several callers read differently: the Elsevier full-text fetcher for its
+``text/xml`` text-mining link, :mod:`litfetch.relations` for ``relation``
+entries, and :class:`~litfetch.fetchers.CrossrefFileSource` for the
+text-mining ``link[]`` renditions.  All go through :func:`fetch_work`; inside a
+:meth:`~litfetch.sessions.Session.scope` the duplicate GET is served from cache.
+"""
+from __future__ import annotations
+import logging
+import httpx
+from litfetch import _doi, _http
+logger = logging.getLogger(__name__)
+_CROSSREF_BASE = 'https://api.crossref.org/works'
+async def fetch_work(doi: str, *, http: _http.Http, mailto: str | None = None) -> dict | None:
+    """Return the Crossref ``message`` object for ``doi``, or ``None``.
+    Args:
+        doi: The DOI to look up.
+        http: The :class:`~litfetch._http.Http` to issue the request on.
+        mailto: Identifies the caller for Crossref's polite pool; defaults to
+            ``http.contact``. Omitted (Crossref still answers, just not in the
+            polite pool) when neither is set.
+    Returns:
+        The parsed ``message`` object, or ``None`` when the lookup fails or the
+        response is not JSON.
+    """
+    mailto = mailto or http.contact
+    params = {'mailto': mailto} if mailto else {}
+    try:
+        resp = await http.get(f'{_CROSSREF_BASE}/{_doi.encode_doi_path(doi)}', params=params)
+    except httpx.HTTPError:
+        logger.exception('Crossref lookup failed for %s', doi)
+        return None
+    if resp.status_code != 200:
+        return None
+    try:
+        return resp.json().get('message')
+    except ValueError:
+        logger.warning('Crossref returned a non-JSON response for %s', doi)
+        return None