PyPI - afdb-query - Versions diffs - 0.1.0__py3-none-any.whl - Mend

afdb-query 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

afdb_query/__init__.py +16 -0
afdb_query/batch.py +135 -0
afdb_query/client.py +107 -0
afdb_query/errors.py +19 -0
afdb_query/models.py +102 -0
afdb_query/py.typed +0 -0
afdb_query/sequences.py +20 -0
afdb_query-0.1.0.dist-info/METADATA +80 -0
afdb_query-0.1.0.dist-info/RECORD +10 -0
afdb_query-0.1.0.dist-info/WHEEL +4 -0

afdb_query/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""afdb-query: sequence-based programmatic access to the AlphaFold DB."""
+from .client import AlphaFold
+from .errors import AFDBError, InvalidSequenceError
+from .models import Plddt, Structure, confidence_url
+from .sequences import filter_reason
# Names re-exported as the package's public API; this is also what
# ``from afdb_query import *`` binds.
__all__ = [
    "AlphaFold",
    "Structure",
    "Plddt",
    "filter_reason",
    "confidence_url",
    "AFDBError",
    "InvalidSequenceError",
]

afdb_query/batch.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""Concurrent, resumable batch lookups over many sequences."""
+from __future__ import annotations
+import json
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import NamedTuple
+import httpx
+from .sequences import filter_reason
+def _normalize_inputs(inputs) -> list[tuple[str, str]]:
+    pairs: list[tuple[str, str]] = []
+    for item in inputs:
+        if isinstance(item, dict):
+            pairs.append((item["id"], item["sequence"]))
+        else:
+            id_, seq = item
+            pairs.append((id_, seq))
+    return pairs
+def _chunked(items: list, size: int):
+    for i in range(0, len(items), size):
+        yield items[i : i + size]
class _Result(NamedTuple):
    """Outcome of one worker query, returned to the caller that writes the cache files."""

    summary_path: Path  # cache file for this record's summary document
    plddt_path: Path | None  # cache file for the pLDDT array; None when pLDDT was not requested
    outcome: str  # "found" | "notfound" | "error"
    summary_data: dict | None  # summary document; populated only for "found"
    plddt_values: list | None  # truncated per-residue pLDDT values, when they were fetched
def _write_json_atomic(path: Path, payload) -> None:
    """Write ``payload`` as JSON to ``path`` so the file appears complete or not at all."""
    # Write a sibling temp file, then rename it over the target. An interrupted
    # run therefore never leaves a truncated cache file that the next run would
    # mistake for a finished record.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(payload))
    tmp_path.replace(path)


def search_many(
    client,
    inputs,
    out_dir,
    *,
    concurrency: int = 6,
    rows: int = 10,
    plddt_first_n: int | None = None,
) -> dict:
    """Query each queryable input's sequence concurrently, caching to disk.

    ``inputs`` is a list of ``(id, sequence)`` tuples or ``{"id":..., "sequence":...}``
    dicts. Results are cached under ``out_dir``:

    * ``out_dir/summaries/{id}.json`` — a hit stores the AFDB summary document; a
      404 miss stores ``{"structures": []}`` so re-runs skip it. An existing file
      is left untouched (resumability).
    * ``out_dir/plddt/{id}.json`` (only when ``plddt_first_n`` is set) — the raw
      first-n per-residue pLDDT array for the first/best structure (<= n values).

    A real per-query HTTP error, or a response body that is not valid JSON, is
    counted under ``"errors"`` and NOT saved, so it retries next run.

    Cache files are written atomically (temp file + rename), and for a hit the
    pLDDT file is written before the summary file. The summary file is the
    resume key, so it only appears once everything for that record is on disk.

    Returns a counts report (dict) with the keys ``total``, ``filtered``,
    ``internal_stop``, ``too_short``, ``nonstandard_aa``, ``skipped``, ``hits``,
    ``misses``, ``errors`` and ``queried``.

    Raises ``ValueError`` before any request is made if a queryable record's
    ``id`` cannot be used as a single file name (for example it contains a path
    separator), since such an id would land outside the cache directories.

    Note: resumability keys on the summary file. If a record's summary file already
    exists, it is skipped entirely and ``plddt`` is not back-filled for it.
    """
    out_dir = Path(out_dir)
    summaries_dir = out_dir / "summaries"
    plddt_dir = out_dir / "plddt"
    want_plddt = plddt_first_n is not None
    pairs = _normalize_inputs(inputs)
    counts = {
        "internal_stop": 0,
        "too_short": 0,
        "nonstandard_aa": 0,
        "skipped": 0,
        "hits": 0,
        "misses": 0,
        "errors": 0,
    }

    pending: list[tuple[Path, Path | None, str]] = []
    for id_, seq in pairs:
        reason = filter_reason(seq)
        if reason is not None:
            counts[reason] += 1
            continue
        file_name = f"{id_}.json"
        # The id becomes a file name; reject anything that would resolve to a
        # different directory ("../x", "a/b", absolute paths, drive prefixes).
        if Path(file_name).name != file_name:
            raise ValueError(f"id {id_!r} cannot be used as a cache file name")
        summary_path = summaries_dir / file_name
        if summary_path.exists():
            counts["skipped"] += 1
            continue
        plddt_path = (plddt_dir / file_name) if want_plddt else None
        pending.append((summary_path, plddt_path, seq))

    def _query(item: tuple[Path, Path | None, str]) -> _Result:
        """Run the network requests for one record; never touches the disk."""
        summary_path, plddt_path, seq = item
        try:
            data = client._fetch_summary(seq, rows)
        except (httpx.HTTPError, json.JSONDecodeError):
            # JSONDecodeError: a 2xx response whose body is not JSON (e.g. an
            # HTML error page). Treat it as a retryable per-record error rather
            # than letting it abort the whole batch.
            return _Result(summary_path, plddt_path, "error", None, None)
        if data is None:
            return _Result(summary_path, plddt_path, "notfound", None, None)
        plddt_values = None
        if want_plddt:
            structures = data.get("structures") or []
            if structures:
                try:
                    conf = client._fetch_confidence(structures[0]["summary"]["model_url"])
                except (httpx.HTTPError, json.JSONDecodeError):
                    return _Result(summary_path, plddt_path, "error", None, None)
                plddt_values = conf.get("confidenceScore", [])[:plddt_first_n]
        return _Result(summary_path, plddt_path, "found", data, plddt_values)

    if pending:
        summaries_dir.mkdir(parents=True, exist_ok=True)
        if want_plddt:
            plddt_dir.mkdir(parents=True, exist_ok=True)
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            # Submit in bounded chunks so only a limited number of results is
            # held in memory at once; all writes happen here, in this thread.
            for chunk in _chunked(pending, concurrency * 50):
                for res in pool.map(_query, chunk):
                    if res.outcome == "error":
                        counts["errors"] += 1
                    elif res.outcome == "notfound":
                        _write_json_atomic(res.summary_path, {"structures": []})
                        counts["misses"] += 1
                    else:
                        # pLDDT first, summary last: the summary file marks the
                        # record as done, so it must be the final thing written.
                        if res.plddt_values is not None and res.plddt_path is not None:
                            _write_json_atomic(res.plddt_path, res.plddt_values)
                        _write_json_atomic(res.summary_path, res.summary_data)
                        counts["hits"] += 1

    return {
        "total": len(pairs),
        "filtered": counts["internal_stop"] + counts["too_short"] + counts["nonstandard_aa"],
        **counts,
        "queried": counts["hits"] + counts["misses"] + counts["errors"],
    }

afdb_query/client.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""The AlphaFold client: HTTP session and AFDB endpoint access."""
+from __future__ import annotations
+import httpx
+from .batch import search_many as _search_many
+from .errors import InvalidSequenceError
+from .models import Structure, confidence_url
+from .sequences import filter_reason
# Default AFDB host; used as the ``base_url`` default of ``AlphaFold.__init__``.
DEFAULT_BASE_URL = "https://alphafold.ebi.ac.uk"
# Path of the sequence-summary endpoint requested by ``AlphaFold._fetch_summary``.
SUMMARY_PATH = "/api/sequence/summary"
class AlphaFold:
    """Client for sequence-based access to the AlphaFold Protein Structure Database.

    Wraps a shared, thread-safe ``httpx.Client``. Use as a context manager, or
    call :meth:`close` when done.
    """

    def __init__(
        self,
        *,
        timeout: float = 30.0,
        base_url: str = DEFAULT_BASE_URL,
        max_retries: int = 2,
    ) -> None:
        """Create the underlying HTTP session.

        ``timeout`` and ``base_url`` are passed to ``httpx.Client``;
        ``max_retries`` is passed as ``retries`` to ``httpx.HTTPTransport``.
        """
        transport = httpx.HTTPTransport(retries=max_retries)
        self._client = httpx.Client(base_url=base_url, timeout=timeout, transport=transport)

    # -- lifecycle ---------------------------------------------------------
    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self) -> "AlphaFold":
        return self

    def __exit__(self, *exc: object) -> None:
        # Returns None, so an exception raised inside the ``with`` body propagates.
        self.close()

    # -- low-level fetch ---------------------------------------------------
    def _get(self, url: str, params: dict | None = None) -> httpx.Response:
        """GET ``url`` through the shared client, asking for a JSON response."""
        return self._client.get(url, params=params, headers={"Accept": "application/json"})

    def _fetch_summary(self, sequence: str, rows: int = 10) -> dict | None:
        """Tier 1: query the sequence-summary endpoint.

        Returns the parsed ``{"entry": ..., "structures": [...]}`` document, or
        ``None`` when AFDB has no entry (HTTP 404 — a clean "not found"). Raises
        ``ValueError`` when ``rows`` is below 2, and raises on any other HTTP
        error.
        """
        if rows < 2:
            raise ValueError("rows must be > 1 (AFDB rejects rows <= 1)")
        resp = self._get(
            SUMMARY_PATH, params={"id": sequence, "type": "sequence", "rows": rows}
        )
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
        return resp.json()

    def _fetch_confidence(self, model_url: str) -> dict:
        """Tier 2: fetch the per-residue confidence JSON for a model URL."""
        resp = self._get(confidence_url(model_url))
        resp.raise_for_status()
        return resp.json()

    # -- public API --------------------------------------------------------
    def search(self, sequence: str, rows: int = 10) -> list[Structure]:
        """Tier 1: find AFDB structures matching ``sequence``, in AFDB's returned order.

        Results are ranked by sequence identity, but ``hits[0]`` is not guaranteed to
        be the canonical ``AF-<accession>-F1`` model — for some sequences a multi-chain
        or AB-INITIO model ranks first. Select by ``model_identifier`` if you need a
        specific entry.

        Raises :class:`InvalidSequenceError` if the sequence is not queryable.
        Returns ``[]`` when AFDB has no entry for it, or when the summary
        document carries no structures.
        """
        reason = filter_reason(sequence)
        if reason is not None:
            raise InvalidSequenceError(reason)
        data = self._fetch_summary(sequence, rows)
        if data is None:
            return []
        # ``or []`` also covers an explicit ``"structures": null``, matching how
        # the batch path reads the same document.
        structures = data.get("structures") or []
        return [Structure(item["summary"], self) for item in structures]

    def search_many(
        self,
        inputs,
        out_dir,
        *,
        concurrency: int = 6,
        rows: int = 10,
        plddt_first_n: int | None = None,
    ) -> dict:
        """Concurrent, resumable batch lookup. See ``afdb_query.batch.search_many``."""
        return _search_many(
            self,
            inputs,
            out_dir,
            concurrency=concurrency,
            rows=rows,
            plddt_first_n=plddt_first_n,
        )

afdb_query/errors.py ADDED Viewed

@@ -0,0 +1,19 @@
+"""Exception types for afdb-query."""
+from __future__ import annotations
class AFDBError(Exception):
    """Root of the afdb-query exception hierarchy."""


class InvalidSequenceError(AFDBError):
    """Signals that a sequence cannot be queried against AFDB.

    The ``reason`` attribute holds one of ``"internal_stop"``, ``"too_short"``
    or ``"nonstandard_aa"`` (see ``afdb_query.sequences.filter_reason``).
    """

    def __init__(self, reason: str) -> None:
        message = f"sequence not queryable: {reason}"
        super().__init__(message)
        self.reason = reason

afdb_query/models.py ADDED Viewed

@@ -0,0 +1,102 @@
+"""Result objects and helpers for afdb-query."""
+from __future__ import annotations
+from dataclasses import dataclass, field
def confidence_url(model_url: str) -> str:
    """Map a model (CIF) URL to its per-residue confidence-JSON URL.

    AFDB names files ``...-model_vN.cif`` and ``...-confidence_vN.json`` in the
    same directory, so the confidence URL is a pure string rewrite of the model
    URL: ``-model_`` becomes ``-confidence_`` and a trailing ``.bcif`` or
    ``.cif`` becomes ``.json``. A URL with any other extension keeps it.
    """
    swapped = model_url.replace("-model_", "-confidence_")
    for extension in (".bcif", ".cif"):
        if swapped.endswith(extension):
            return swapped.removesuffix(extension) + ".json"
    return swapped
@dataclass(frozen=True)
class Plddt:
    """Per-residue pLDDT for one structure.

    ``scores`` and ``residue_numbers`` are parallel lists. ``raw`` is the full
    confidence-JSON document (escape hatch).
    """

    scores: list[float]
    residue_numbers: list[int]
    raw: dict = field(repr=False)

    @classmethod
    def from_dict(cls, data: dict) -> "Plddt":
        """Build a ``Plddt`` from a confidence-JSON document.

        Reads the ``"confidenceScore"`` and ``"residueNumber"`` keys and raises
        ``KeyError`` if either is missing. The lists are stored as-is, not copied.
        """
        return cls(
            scores=data["confidenceScore"],
            residue_numbers=data["residueNumber"],
            raw=data,
        )

    def first(self, n: int) -> list[float]:
        """First ``n`` per-residue pLDDT values, or all of them if fewer than ``n``.

        Never pads and never raises on short structures: returns ``scores[:n]``.
        Raises ``ValueError`` for a negative ``n``, because a negative slice
        bound would silently drop values from the end instead of taking them
        from the start.
        """
        if n < 0:
            raise ValueError(f"n must be non-negative, got {n}")
        return self.scores[:n]
@dataclass(frozen=True)
class Structure:
    """One AFDB structure match for a queried sequence.

    Thin typed wrapper over the endpoint's ``summary`` dict. ``raw`` is the full
    summary (escape hatch). ``plddt()`` lazily fetches per-residue pLDDT.
    Every property returns ``None`` when the summary lacks the field.
    """

    raw: dict
    # Client used for the lazy pLDDT fetch; excluded from repr and equality.
    _client: "AlphaFold" = field(repr=False, compare=False)  # noqa: F821
    # Per-instance memo for ``plddt()``. The dataclass is frozen, but the dict
    # itself stays mutable, which is what makes the caching possible.
    _cache: dict = field(default_factory=dict, repr=False, compare=False)

    @property
    def model_identifier(self) -> str | None:
        return self.raw.get("model_identifier")

    @property
    def model_url(self) -> str | None:
        return self.raw.get("model_url")

    @property
    def global_plddt(self) -> float | None:
        # The summary exposes the model-wide mean under this key.
        return self.raw.get("confidence_avg_local_score")

    @property
    def sequence_identity(self) -> float | None:
        return self.raw.get("sequence_identity")

    @property
    def coverage(self) -> float | None:
        return self.raw.get("coverage")

    @property
    def uniprot_accession(self) -> str | None:
        """Identifier of the first entity whose category is ``"UNIPROT"``, if any."""
        for entity in self.raw.get("entities") or []:
            if entity.get("identifier_category") == "UNIPROT":
                return entity.get("identifier")
        return None

    @property
    def description(self) -> str | None:
        """First non-empty entity description, if any."""
        for entity in self.raw.get("entities") or []:
            if entity.get("description"):
                return entity["description"]
        return None

    def plddt(self) -> Plddt:
        """Tier 2: per-residue pLDDT for this structure (fetched once, then cached).

        Raises ``AFDBError`` without making a request when the summary has no
        ``model_url``, since the confidence URL is derived from it.
        """
        if "plddt" not in self._cache:
            model_url = self.model_url
            if not model_url:
                # Imported here so this block brings its own dependency along.
                from .errors import AFDBError

                raise AFDBError(
                    f"structure {self.model_identifier!r} has no model_url; "
                    "cannot derive its confidence URL"
                )
            data = self._client._fetch_confidence(model_url)
            self._cache["plddt"] = Plddt.from_dict(data)
        return self._cache["plddt"]

afdb_query/py.typed ADDED Viewed

File without changes

afdb_query/sequences.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Sequence validation for AFDB queries (ported from the original pipeline)."""
+from __future__ import annotations
# One-letter codes accepted in a query sequence (upper case only).
STANDARD_AA = frozenset("ACDEFGHIKLMNPQRSTVWY")
# Sequences shorter than this are reported as "too_short".
MIN_LENGTH = 20


def filter_reason(seq: str) -> str | None:
    """Explain why ``seq`` cannot be queried against AFDB; ``None`` means it can.

    The first failing check wins, in this order: ``"internal_stop"`` (contains
    ``*``), ``"too_short"`` (fewer than ``MIN_LENGTH`` residues),
    ``"nonstandard_aa"`` (any character outside ``STANDARD_AA``).
    """
    if "*" in seq:
        verdict = "internal_stop"
    elif len(seq) < MIN_LENGTH:
        verdict = "too_short"
    elif STANDARD_AA.issuperset(seq):
        verdict = None
    else:
        verdict = "nonstandard_aa"
    return verdict

afdb_query-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,80 @@
+Metadata-Version: 2.4
+Name: afdb-query
+Version: 0.1.0
+Summary: Sequence-based programmatic access to the AlphaFold Protein Structure Database
+License: MIT
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.27
+Provides-Extra: dev
+Requires-Dist: pytest>=8; extra == 'dev'
+Requires-Dist: respx>=0.21; extra == 'dev'
+Description-Content-Type: text/markdown
+# afdb-query
+Sequence-based programmatic access to the [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/) (AFDB). Query a protein by its amino-acid sequence, then pull per-residue pLDDT — including "the first n values" — without hand-rolling URL derivation and JSON fetching.
+## Install
+```bash
+pip install afdb-query
+```
+## Quickstart
+```python
+from afdb_query import AlphaFold
+with AlphaFold() as af:
+    hits = af.search(sequence)        # Tier 1: list[Structure], in AFDB's returned order
+    s = hits[0]
+    s.global_plddt        # mean pLDDT for the model (cheap, from the summary)
+    s.sequence_identity   # 1.0 == exact match, < 1.0 == near hit
+    s.uniprot_accession   # e.g. "P12345", or None
+    p = s.plddt()         # Tier 2: per-residue pLDDT (fetched once, then cached)
+    p.scores              # full per-residue list[float]
+    p.first(50)           # first 50 values — or all of them if the model is shorter
+```
+`search` raises `InvalidSequenceError` for sequences that cannot be queried
+(internal stop `*`, shorter than 20 residues, or non-standard amino acids), and
+returns `[]` when AFDB has no entry for a valid sequence.
+Results come back in AFDB's returned order (ranked by sequence identity). Note that
+`hits[0]` is **not** guaranteed to be the canonical `AF-<accession>-F1` model — for
+some sequences a multi-chain or AB-INITIO model ranks first — so pick the hit whose
+`model_identifier` you want if you need a specific entry.
+## Batch lookups
+`search_many` runs many sequences concurrently with resumable on-disk caching:
+```python
+report = af.search_many(
+    [{"id": "rec1", "sequence": seq1}, {"id": "rec2", "sequence": seq2}],
+    out_dir="afdb_cache",
+    concurrency=6,
+    plddt_first_n=50,   # optional: also save the first 50 per-residue pLDDT per hit
+)
+# report -> {"total":..., "hits":..., "misses":..., "errors":..., "skipped":..., ...}
+```
+- You supply a generic `id` per sequence; it keys the cache file and maps back to
+  your own records.
+- `out_dir/summaries/{id}.json` stores each hit (a 404 miss stores
+  `{"structures": []}`); existing files are left untouched, so re-runs resume.
+- With `plddt_first_n` set, `out_dir/plddt/{id}.json` stores the raw first-n
+  per-residue pLDDT array for the best structure.
+- Real HTTP errors are counted but not saved, so they retry on the next run.
+- Resumability keys on the summary file: if you run once without
+  `plddt_first_n` and again with it, already-cached records are skipped and their
+  pLDDT is not back-filled.
+## Not (yet) supported
+- UniProt-accession lookup (sequence-only for now)
+- PAE (Predicted Aligned Error)
+- Statistics helpers — the package returns raw values; downstream math is yours.

afdb_query-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+afdb_query/__init__.py,sha256=RupH4A7p_l7dUJOTru97O7kl0YLQtEgAiyNwrdSicn0,397
+afdb_query/batch.py,sha256=aMJ89EUwE9Ups8a_TXSdy_vcKSyA5v4eETIBZe-qFvk,4835
+afdb_query/client.py,sha256=YVfDNq-cLE0E-vbXPdSkU123Sj3Atzlo34ZgRIzSezg,3808
+afdb_query/errors.py,sha256=3DRg8y1pxCf7Zv08J1RNgq48vuDEmmGI8e9iENu6vEs,535
+afdb_query/models.py,sha256=NN1_MdlbNhJ_T06Tc47Sjp_Wr6evfhJ9nX4XQtaIH2k,3194
+afdb_query/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+afdb_query/sequences.py,sha256=r097SESzEg6NUXzBjcVoO54D5BQ_KSDfqqamTvHpBjs,585
+afdb_query-0.1.0.dist-info/METADATA,sha256=a0eYp1948KZi_ppK9zAaZwS7CbwG4NoXUicggLOVuOA,3105
+afdb_query-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+afdb_query-0.1.0.dist-info/RECORD,,

afdb_query-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any