afdb-query 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
afdb_query/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """afdb-query: sequence-based programmatic access to the AlphaFold DB."""
2
+
3
+ from .client import AlphaFold
4
+ from .errors import AFDBError, InvalidSequenceError
5
+ from .models import Plddt, Structure, confidence_url
6
+ from .sequences import filter_reason
7
+
8
+ __all__ = [
9
+ "AlphaFold",
10
+ "Structure",
11
+ "Plddt",
12
+ "filter_reason",
13
+ "confidence_url",
14
+ "AFDBError",
15
+ "InvalidSequenceError",
16
+ ]
afdb_query/batch.py ADDED
@@ -0,0 +1,135 @@
1
+ """Concurrent, resumable batch lookups over many sequences."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from pathlib import Path
8
+ from typing import NamedTuple
9
+
10
+ import httpx
11
+
12
+ from .sequences import filter_reason
13
+
14
+
15
+ def _normalize_inputs(inputs) -> list[tuple[str, str]]:
16
+ pairs: list[tuple[str, str]] = []
17
+ for item in inputs:
18
+ if isinstance(item, dict):
19
+ pairs.append((item["id"], item["sequence"]))
20
+ else:
21
+ id_, seq = item
22
+ pairs.append((id_, seq))
23
+ return pairs
24
+
25
+
26
+ def _chunked(items: list, size: int):
27
+ for i in range(0, len(items), size):
28
+ yield items[i : i + size]
29
+
30
+
31
+ class _Result(NamedTuple):
32
+ summary_path: Path
33
+ plddt_path: Path | None
34
+ outcome: str # "found" | "notfound" | "error"
35
+ summary_data: dict | None
36
+ plddt_values: list | None
37
+
38
+
39
def search_many(
    client,
    inputs,
    out_dir,
    *,
    concurrency: int = 6,
    rows: int = 10,
    plddt_first_n: int | None = None,
) -> dict:
    """Query each queryable input's sequence concurrently, caching to disk.

    ``inputs`` is a list of ``(id, sequence)`` tuples or ``{"id":..., "sequence":...}``
    dicts. Results are cached under ``out_dir``:

    * ``out_dir/summaries/{id}.json`` — a hit stores the AFDB summary document; a
      404 miss stores ``{"structures": []}`` so re-runs skip it. An existing file
      is left untouched (resumability).
    * ``out_dir/plddt/{id}.json`` (only when ``plddt_first_n`` is set) — the raw
      first-n per-residue pLDDT array for the first returned structure (<= n values).

    A real per-query HTTP error is counted and NOT saved, so it retries next run.

    Cache files are written to a temporary sibling (``{name}.tmp``) and then
    renamed into place, and for a hit the pLDDT file is written *before* the
    summary. Because resumability keys on the summary file, this ordering means
    an interrupted run cannot leave behind a summary whose pLDDT is missing, or
    a truncated summary that later runs would treat as complete.

    Args:
        client: Object exposing ``_fetch_summary(sequence, rows)`` and
            ``_fetch_confidence(model_url)`` (the ``AlphaFold`` client).
        inputs: Records to look up, as described above.
        out_dir: Cache root directory (created on demand).
        concurrency: Number of worker threads.
        rows: ``rows`` argument forwarded to ``client._fetch_summary``.
        plddt_first_n: When set, also cache the first ``n`` pLDDT values per hit.

    Returns:
        A counts report: ``total``, ``filtered``, the per-reason filter counts,
        ``skipped``, ``hits``, ``misses``, ``errors`` and ``queried``.

    Raises:
        ValueError: If a queryable record's ``id`` does not form a plain file
            name (for example it contains a path separator). Raised before any
            request is sent.

    Note: resumability keys on the summary file. If a record's summary file already
    exists, it is skipped entirely and ``plddt`` is not back-filled for it.
    """
    out_dir = Path(out_dir)
    summaries_dir = out_dir / "summaries"
    plddt_dir = out_dir / "plddt"

    pairs = _normalize_inputs(inputs)
    counts = {
        "internal_stop": 0,
        "too_short": 0,
        "nonstandard_aa": 0,
        "skipped": 0,
        "hits": 0,
        "misses": 0,
        "errors": 0,
    }

    pending: list[tuple[Path, Path | None, str]] = []
    for id_, seq in pairs:
        reason = filter_reason(seq)
        if reason is not None:
            counts[reason] += 1
            continue
        filename = f"{id_}.json"
        # The id becomes a file name. Reject anything that would resolve to a
        # different directory ("../x", "a/b") instead of writing outside the
        # cache or failing only after the network work has been done.
        if Path(filename).name != filename:
            raise ValueError(f"id {id_!r} cannot be used as a cache file name")
        summary_path = summaries_dir / filename
        if summary_path.exists():
            counts["skipped"] += 1
            continue
        plddt_path = (plddt_dir / filename) if plddt_first_n is not None else None
        pending.append((summary_path, plddt_path, seq))

    def _query(item: tuple[Path, Path | None, str]) -> _Result:
        # Runs on a worker thread: network only, no file writes.
        summary_path, plddt_path, seq = item
        try:
            data = client._fetch_summary(seq, rows)
        except httpx.HTTPError:
            return _Result(summary_path, plddt_path, "error", None, None)
        if data is None:
            return _Result(summary_path, plddt_path, "notfound", None, None)
        plddt_values = None
        if plddt_first_n is not None:
            structures = data.get("structures") or []
            if structures:
                try:
                    conf = client._fetch_confidence(structures[0]["summary"]["model_url"])
                except httpx.HTTPError:
                    # Report the whole record as an error so nothing is cached
                    # and the next run retries both requests.
                    return _Result(summary_path, plddt_path, "error", None, None)
                plddt_values = conf.get("confidenceScore", [])[:plddt_first_n]
        return _Result(summary_path, plddt_path, "found", data, plddt_values)

    def _write_json(path: Path, payload) -> None:
        # Write-then-rename so a crash mid-write never leaves a partial file
        # under the final name (the rename is atomic on POSIX filesystems).
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(json.dumps(payload))
        tmp_path.replace(path)

    if pending:
        summaries_dir.mkdir(parents=True, exist_ok=True)
        if plddt_first_n is not None:
            plddt_dir.mkdir(parents=True, exist_ok=True)
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            # Submit in bounded chunks so results are flushed to disk regularly
            # rather than after the entire input has been queried.
            for chunk in _chunked(pending, concurrency * 50):
                for res in pool.map(_query, chunk):
                    if res.outcome == "error":
                        counts["errors"] += 1
                    elif res.outcome == "notfound":
                        _write_json(res.summary_path, {"structures": []})
                        counts["misses"] += 1
                    else:
                        # pLDDT first: the summary file is the resume marker, so
                        # it must only appear once everything else is on disk.
                        if res.plddt_values is not None and res.plddt_path is not None:
                            _write_json(res.plddt_path, res.plddt_values)
                        _write_json(res.summary_path, res.summary_data)
                        counts["hits"] += 1

    return {
        "total": len(pairs),
        "filtered": counts["internal_stop"] + counts["too_short"] + counts["nonstandard_aa"],
        **counts,
        "queried": counts["hits"] + counts["misses"] + counts["errors"],
    }
afdb_query/client.py ADDED
@@ -0,0 +1,107 @@
1
+ """The AlphaFold client: HTTP session and AFDB endpoint access."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import httpx
6
+
7
+ from .batch import search_many as _search_many
8
+ from .errors import InvalidSequenceError
9
+ from .models import Structure, confidence_url
10
+ from .sequences import filter_reason
11
+
12
+ DEFAULT_BASE_URL = "https://alphafold.ebi.ac.uk"
13
+ SUMMARY_PATH = "/api/sequence/summary"
14
+
15
+
16
class AlphaFold:
    """Client for sequence-based access to the AlphaFold Protein Structure Database.

    Wraps a shared, thread-safe ``httpx.Client``. Use as a context manager, or
    call :meth:`close` when done.
    """

    def __init__(
        self,
        *,
        timeout: float = 30.0,
        base_url: str = DEFAULT_BASE_URL,
        max_retries: int = 2,
    ) -> None:
        """Create the underlying HTTP client.

        Args:
            timeout: Timeout passed to ``httpx.Client``.
            base_url: Base URL that relative request paths are resolved against.
            max_retries: ``retries`` value passed to ``httpx.HTTPTransport``.
        """
        transport = httpx.HTTPTransport(retries=max_retries)
        self._client = httpx.Client(base_url=base_url, timeout=timeout, transport=transport)

    # -- lifecycle ---------------------------------------------------------
    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self) -> "AlphaFold":
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()

    # -- low-level fetch ---------------------------------------------------
    def _get(self, url: str, params: dict | None = None) -> httpx.Response:
        """Issue a GET that asks for JSON; the response status is not checked here."""
        return self._client.get(url, params=params, headers={"Accept": "application/json"})

    def _fetch_summary(self, sequence: str, rows: int = 10) -> dict | None:
        """Tier 1: query the sequence-summary endpoint.

        Returns the parsed ``{"entry": ..., "structures": [...]}`` document, or
        ``None`` when AFDB has no entry (HTTP 404 — a clean "not found"). Raises
        on any other HTTP error, and raises ``ValueError`` when ``rows < 2``.
        """
        if rows < 2:
            raise ValueError("rows must be > 1 (AFDB rejects rows <= 1)")
        resp = self._get(
            SUMMARY_PATH, params={"id": sequence, "type": "sequence", "rows": rows}
        )
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
        return resp.json()

    def _fetch_confidence(self, model_url: str) -> dict:
        """Tier 2: fetch the per-residue confidence JSON for a model URL."""
        resp = self._get(confidence_url(model_url))
        resp.raise_for_status()
        return resp.json()

    # -- public API --------------------------------------------------------
    def search(self, sequence: str, rows: int = 10) -> list[Structure]:
        """Tier 1: find AFDB structures matching ``sequence``, in AFDB's returned order.

        Results are ranked by sequence identity, but ``hits[0]`` is not guaranteed to
        be the canonical ``AF-<accession>-F1`` model — for some sequences a multi-chain
        or AB-INITIO model ranks first. Select by ``model_identifier`` if you need a
        specific entry.

        Raises :class:`InvalidSequenceError` if the sequence is not queryable.
        Returns ``[]`` when AFDB has no entry for it.
        """
        reason = filter_reason(sequence)
        if reason is not None:
            raise InvalidSequenceError(reason)
        data = self._fetch_summary(sequence, rows)
        if data is None:
            return []
        # ``or []`` also covers a present-but-null "structures" value, matching
        # how the batch path reads the same document.
        return [Structure(item["summary"], self) for item in data.get("structures") or []]

    def search_many(
        self,
        inputs,
        out_dir,
        *,
        concurrency: int = 6,
        rows: int = 10,
        plddt_first_n: int | None = None,
    ) -> dict:
        """Concurrent, resumable batch lookup. See ``afdb_query.batch.search_many``."""
        return _search_many(
            self,
            inputs,
            out_dir,
            concurrency=concurrency,
            rows=rows,
            plddt_first_n=plddt_first_n,
        )
afdb_query/errors.py ADDED
@@ -0,0 +1,19 @@
1
+ """Exception types for afdb-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class AFDBError(Exception):
    """Base class for all afdb-query errors."""


class InvalidSequenceError(AFDBError):
    """Raised when a sequence cannot be queried against AFDB.

    ``reason`` is one of ``"internal_stop"``, ``"too_short"``,
    ``"nonstandard_aa"`` (see ``afdb_query.sequences.filter_reason``).
    """

    def __init__(self, reason: str) -> None:
        self.reason = reason
        super().__init__(f"sequence not queryable: {reason}")

    def __reduce__(self):
        # The default exception reduce re-calls the constructor with ``self.args``
        # (the already formatted message), so a copied or unpickled instance would
        # read "sequence not queryable: sequence not queryable: ...". Rebuild from
        # ``reason`` instead, and carry the instance dict along as state.
        return (type(self), (self.reason,), self.__dict__)
afdb_query/models.py ADDED
@@ -0,0 +1,102 @@
1
+ """Result objects and helpers for afdb-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
def confidence_url(model_url: str) -> str:
    """Derive the per-residue confidence-JSON URL from a model (CIF) URL.

    The transform is purely textual: every ``-model_`` becomes ``-confidence_``
    and a trailing ``.bcif`` or ``.cif`` extension becomes ``.json``. A URL with
    any other ending keeps its extension.
    """
    swapped = model_url.replace("-model_", "-confidence_")
    # ".bcif" has to be tried first: every ".bcif" name also ends in ".cif".
    for extension in (".bcif", ".cif"):
        if swapped.endswith(extension):
            stem = swapped[: len(swapped) - len(extension)]
            return stem + ".json"
    return swapped
21
+
22
+
23
@dataclass(frozen=True)
class Plddt:
    """Per-residue pLDDT for one structure.

    ``scores`` and ``residue_numbers`` are parallel lists taken from the
    confidence JSON; ``raw`` keeps the whole document for callers that need
    fields this wrapper does not expose.
    """

    scores: list[float]
    residue_numbers: list[int]
    raw: dict = field(repr=False)

    @classmethod
    def from_dict(cls, data: dict) -> "Plddt":
        """Build an instance from a parsed confidence-JSON document.

        Raises ``KeyError`` when ``confidenceScore`` or ``residueNumber`` is absent.
        """
        confidence = data["confidenceScore"]
        numbering = data["residueNumber"]
        return cls(scores=confidence, residue_numbers=numbering, raw=data)

    def first(self, n: int) -> list[float]:
        """Return ``scores[:n]``: the leading ``n`` values, or all of them if fewer exist.

        The result is never padded and a short structure never raises.
        """
        leading = slice(None, n)
        return self.scores[leading]
49
+
50
+
51
+ @dataclass(frozen=True)
52
+ class Structure:
53
+ """One AFDB structure match for a queried sequence.
54
+
55
+ Thin typed wrapper over the endpoint's ``summary`` dict. ``raw`` is the full
56
+ summary (escape hatch). ``plddt()`` lazily fetches per-residue pLDDT.
57
+ """
58
+
59
+ raw: dict
60
+ _client: "AlphaFold" = field(repr=False, compare=False) # noqa: F821
61
+ _cache: dict = field(default_factory=dict, repr=False, compare=False)
62
+
63
+ @property
64
+ def model_identifier(self) -> str | None:
65
+ return self.raw.get("model_identifier")
66
+
67
+ @property
68
+ def model_url(self) -> str | None:
69
+ return self.raw.get("model_url")
70
+
71
+ @property
72
+ def global_plddt(self) -> float | None:
73
+ return self.raw.get("confidence_avg_local_score")
74
+
75
+ @property
76
+ def sequence_identity(self) -> float | None:
77
+ return self.raw.get("sequence_identity")
78
+
79
+ @property
80
+ def coverage(self) -> float | None:
81
+ return self.raw.get("coverage")
82
+
83
+ @property
84
+ def uniprot_accession(self) -> str | None:
85
+ for entity in self.raw.get("entities") or []:
86
+ if entity.get("identifier_category") == "UNIPROT":
87
+ return entity.get("identifier")
88
+ return None
89
+
90
+ @property
91
+ def description(self) -> str | None:
92
+ for entity in self.raw.get("entities") or []:
93
+ if entity.get("description"):
94
+ return entity["description"]
95
+ return None
96
+
97
+ def plddt(self) -> Plddt:
98
+ """Tier 2: per-residue pLDDT for this structure (fetched once, then cached)."""
99
+ if "plddt" not in self._cache:
100
+ data = self._client._fetch_confidence(self.model_url)
101
+ self._cache["plddt"] = Plddt.from_dict(data)
102
+ return self._cache["plddt"]
afdb_query/py.typed ADDED
File without changes
@@ -0,0 +1,20 @@
1
+ """Sequence validation for AFDB queries (ported from the original pipeline)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ STANDARD_AA = frozenset("ACDEFGHIKLMNPQRSTVWY")
6
+ MIN_LENGTH = 20
7
+
8
+
9
+ def filter_reason(seq: str) -> str | None:
10
+ """Why a sequence cannot be queried against AFDB, or None if it is queryable.
11
+
12
+ Checked in priority order: internal stop, length, non-standard residues.
13
+ """
14
+ if "*" in seq:
15
+ return "internal_stop"
16
+ if len(seq) < MIN_LENGTH:
17
+ return "too_short"
18
+ if not set(seq) <= STANDARD_AA:
19
+ return "nonstandard_aa"
20
+ return None
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: afdb-query
3
+ Version: 0.1.0
4
+ Summary: Sequence-based programmatic access to the AlphaFold Protein Structure Database
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: httpx>=0.27
8
+ Provides-Extra: dev
9
+ Requires-Dist: pytest>=8; extra == 'dev'
10
+ Requires-Dist: respx>=0.21; extra == 'dev'
11
+ Description-Content-Type: text/markdown
12
+
13
+ # afdb-query
14
+
15
+ Sequence-based programmatic access to the [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/) (AFDB). Query a protein by its amino-acid sequence, then pull per-residue pLDDT — including "the first n values" — without hand-rolling URL derivation and JSON fetching.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install afdb-query
21
+ ```
22
+
23
+ ## Quickstart
24
+
25
+ ```python
26
+ from afdb_query import AlphaFold
27
+
28
+ with AlphaFold() as af:
29
+ hits = af.search(sequence) # Tier 1: list[Structure], in AFDB's returned order
30
+ s = hits[0]
31
+
32
+ s.global_plddt # mean pLDDT for the model (cheap, from the summary)
33
+ s.sequence_identity # 1.0 == exact match, < 1.0 == near hit
34
+ s.uniprot_accession # e.g. "P12345", or None
35
+
36
+ p = s.plddt() # Tier 2: per-residue pLDDT (fetched once, then cached)
37
+ p.scores # full per-residue list[float]
38
+ p.first(50) # first 50 values — or all of them if the model is shorter
39
+ ```
40
+
41
+ `search` raises `InvalidSequenceError` for sequences that cannot be queried
42
+ (internal stop `*`, shorter than 20 residues, or non-standard amino acids), and
43
+ returns `[]` when AFDB has no entry for a valid sequence.
44
+
45
+ Results come back in AFDB's returned order (ranked by sequence identity). Note that
46
+ `hits[0]` is **not** guaranteed to be the canonical `AF-<accession>-F1` model — for
47
+ some sequences a multi-chain or AB-INITIO model ranks first — so pick the hit whose
48
+ `model_identifier` you want if you need a specific entry.
49
+
50
+ ## Batch lookups
51
+
52
+ `search_many` runs many sequences concurrently with resumable on-disk caching:
53
+
54
+ ```python
55
+ report = af.search_many(
56
+ [{"id": "rec1", "sequence": seq1}, {"id": "rec2", "sequence": seq2}],
57
+ out_dir="afdb_cache",
58
+ concurrency=6,
59
+ plddt_first_n=50, # optional: also save the first 50 per-residue pLDDT per hit
60
+ )
61
+ # report -> {"total":..., "hits":..., "misses":..., "errors":..., "skipped":..., ...}
62
+ ```
63
+
64
+ - You supply a generic `id` per sequence; it keys the cache file and maps back to
65
+ your own records.
66
+ - `out_dir/summaries/{id}.json` stores each hit (a 404 miss stores
67
+ `{"structures": []}`); existing files are left untouched, so re-runs resume.
68
+ - With `plddt_first_n` set, `out_dir/plddt/{id}.json` stores the raw first-n
69
+ per-residue pLDDT array for the first structure AFDB returns (`structures[0]`).
70
+ - Real HTTP errors are counted but not saved, so they retry on the next run.
71
+
72
+ Note: resumability keys on the summary file. If you run once without
73
+ `plddt_first_n` and again with it, already-cached records are skipped and their
74
+ pLDDT is not back-filled.
75
+
76
+ ## Not (yet) supported
77
+
78
+ - UniProt-accession lookup (sequence-only for now)
79
+ - PAE (Predicted Aligned Error)
80
+ - Statistics helpers — the package returns raw values; downstream math is yours.
@@ -0,0 +1,10 @@
1
+ afdb_query/__init__.py,sha256=RupH4A7p_l7dUJOTru97O7kl0YLQtEgAiyNwrdSicn0,397
2
+ afdb_query/batch.py,sha256=aMJ89EUwE9Ups8a_TXSdy_vcKSyA5v4eETIBZe-qFvk,4835
3
+ afdb_query/client.py,sha256=YVfDNq-cLE0E-vbXPdSkU123Sj3Atzlo34ZgRIzSezg,3808
4
+ afdb_query/errors.py,sha256=3DRg8y1pxCf7Zv08J1RNgq48vuDEmmGI8e9iENu6vEs,535
5
+ afdb_query/models.py,sha256=NN1_MdlbNhJ_T06Tc47Sjp_Wr6evfhJ9nX4XQtaIH2k,3194
6
+ afdb_query/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ afdb_query/sequences.py,sha256=r097SESzEg6NUXzBjcVoO54D5BQ_KSDfqqamTvHpBjs,585
8
+ afdb_query-0.1.0.dist-info/METADATA,sha256=a0eYp1948KZi_ppK9zAaZwS7CbwG4NoXUicggLOVuOA,3105
9
+ afdb_query-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
10
+ afdb_query-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any