afdb-query 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- afdb_query-0.1.0/.gitattributes +2 -0
- afdb_query-0.1.0/.gitignore +6 -0
- afdb_query-0.1.0/PKG-INFO +80 -0
- afdb_query-0.1.0/README.md +68 -0
- afdb_query-0.1.0/docs/superpowers/plans/2026-06-02-afdb-query.md +1424 -0
- afdb_query-0.1.0/docs/superpowers/specs/2026-06-02-afdb-query-package-design.md +233 -0
- afdb_query-0.1.0/pyproject.toml +25 -0
- afdb_query-0.1.0/src/afdb_query/__init__.py +16 -0
- afdb_query-0.1.0/src/afdb_query/batch.py +135 -0
- afdb_query-0.1.0/src/afdb_query/client.py +107 -0
- afdb_query-0.1.0/src/afdb_query/errors.py +19 -0
- afdb_query-0.1.0/src/afdb_query/models.py +102 -0
- afdb_query-0.1.0/src/afdb_query/py.typed +0 -0
- afdb_query-0.1.0/src/afdb_query/sequences.py +20 -0
- afdb_query-0.1.0/tests/test_batch.py +94 -0
- afdb_query-0.1.0/tests/test_client.py +91 -0
- afdb_query-0.1.0/tests/test_integration.py +39 -0
- afdb_query-0.1.0/tests/test_models.py +117 -0
- afdb_query-0.1.0/tests/test_sequences.py +30 -0
- afdb_query-0.1.0/tests/test_smoke.py +13 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: afdb-query
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Sequence-based programmatic access to the AlphaFold Protein Structure Database
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: httpx>=0.27
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
10
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# afdb-query
|
|
14
|
+
|
|
15
|
+
Sequence-based programmatic access to the [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/) (AFDB). Query a protein by its amino-acid sequence, then pull per-residue pLDDT — including "the first n values" — without hand-rolling URL derivation and JSON fetching.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install afdb-query
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quickstart
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from afdb_query import AlphaFold
|
|
27
|
+
|
|
28
|
+
with AlphaFold() as af:
|
|
29
|
+
    hits = af.search(sequence)  # Tier 1: list[Structure], in AFDB's returned order
|
|
30
|
+
    s = hits[0]
|
|
31
|
+
|
|
32
|
+
    s.global_plddt  # mean pLDDT for the model (cheap, from the summary)
|
|
33
|
+
    s.sequence_identity  # 1.0 == exact match, < 1.0 == near hit
|
|
34
|
+
    s.uniprot_accession  # e.g. "P12345", or None
|
|
35
|
+
|
|
36
|
+
    p = s.plddt()  # Tier 2: per-residue pLDDT (fetched once, then cached)
|
|
37
|
+
    p.scores  # full per-residue list[float]
|
|
38
|
+
    p.first(50)  # first 50 values — or all of them if the model is shorter
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
`search` raises `InvalidSequenceError` for sequences that cannot be queried
|
|
42
|
+
(internal stop `*`, shorter than 20 residues, or non-standard amino acids), and
|
|
43
|
+
returns `[]` when AFDB has no entry for a valid sequence.
|
|
44
|
+
|
|
45
|
+
Results are returned in the order AFDB provides them (ranked by sequence identity). Note that
|
|
46
|
+
`hits[0]` is **not** guaranteed to be the canonical `AF-<accession>-F1` model — for
|
|
47
|
+
some sequences a multi-chain or AB-INITIO model ranks first — so pick the hit whose
|
|
48
|
+
`model_identifier` you want if you need a specific entry.
|
|
49
|
+
|
|
50
|
+
## Batch lookups
|
|
51
|
+
|
|
52
|
+
`search_many` runs many sequences concurrently with resumable on-disk caching:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
report = af.search_many(
|
|
56
|
+
[{"id": "rec1", "sequence": seq1}, {"id": "rec2", "sequence": seq2}],
|
|
57
|
+
out_dir="afdb_cache",
|
|
58
|
+
concurrency=6,
|
|
59
|
+
plddt_first_n=50, # optional: also save the first 50 per-residue pLDDT values per hit
|
|
60
|
+
)
|
|
61
|
+
# report -> {"total":..., "hits":..., "misses":..., "errors":..., "skipped":..., ...}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
- You supply a generic `id` per sequence; it keys the cache file and maps back to
|
|
65
|
+
your own records.
|
|
66
|
+
- `out_dir/summaries/{id}.json` stores each hit (a 404 miss stores
|
|
67
|
+
`{"structures": []}`); existing files are left untouched, so re-runs resume.
|
|
68
|
+
- With `plddt_first_n` set, `out_dir/plddt/{id}.json` stores the raw first-n
|
|
69
|
+
per-residue pLDDT array for the best structure.
|
|
70
|
+
- Real HTTP errors are counted but not saved, so they retry on the next run.
|
|
71
|
+
|
|
72
|
+
Note: resumability keys on the summary file. If you run once without
|
|
73
|
+
`plddt_first_n` and again with it, already-cached records are skipped and their
|
|
74
|
+
pLDDT is not back-filled.
|
|
75
|
+
|
|
76
|
+
## Not (yet) supported
|
|
77
|
+
|
|
78
|
+
- UniProt-accession lookup (sequence-only for now)
|
|
79
|
+
- PAE (Predicted Aligned Error)
|
|
80
|
+
- Statistics helpers — the package returns raw values; downstream math is yours.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# afdb-query
|
|
2
|
+
|
|
3
|
+
Sequence-based programmatic access to the [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/) (AFDB). Query a protein by its amino-acid sequence, then pull per-residue pLDDT — including "the first n values" — without hand-rolling URL derivation and JSON fetching.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install afdb-query
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quickstart
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from afdb_query import AlphaFold
|
|
15
|
+
|
|
16
|
+
with AlphaFold() as af:
|
|
17
|
+
    hits = af.search(sequence)  # Tier 1: list[Structure], in AFDB's returned order
|
|
18
|
+
    s = hits[0]
|
|
19
|
+
|
|
20
|
+
    s.global_plddt  # mean pLDDT for the model (cheap, from the summary)
|
|
21
|
+
    s.sequence_identity  # 1.0 == exact match, < 1.0 == near hit
|
|
22
|
+
    s.uniprot_accession  # e.g. "P12345", or None
|
|
23
|
+
|
|
24
|
+
    p = s.plddt()  # Tier 2: per-residue pLDDT (fetched once, then cached)
|
|
25
|
+
    p.scores  # full per-residue list[float]
|
|
26
|
+
    p.first(50)  # first 50 values — or all of them if the model is shorter
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`search` raises `InvalidSequenceError` for sequences that cannot be queried
|
|
30
|
+
(internal stop `*`, shorter than 20 residues, or non-standard amino acids), and
|
|
31
|
+
returns `[]` when AFDB has no entry for a valid sequence.
|
|
32
|
+
|
|
33
|
+
Results are returned in the order AFDB provides them (ranked by sequence identity). Note that
|
|
34
|
+
`hits[0]` is **not** guaranteed to be the canonical `AF-<accession>-F1` model — for
|
|
35
|
+
some sequences a multi-chain or AB-INITIO model ranks first — so pick the hit whose
|
|
36
|
+
`model_identifier` you want if you need a specific entry.
|
|
37
|
+
|
|
38
|
+
## Batch lookups
|
|
39
|
+
|
|
40
|
+
`search_many` runs many sequences concurrently with resumable on-disk caching:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
report = af.search_many(
|
|
44
|
+
[{"id": "rec1", "sequence": seq1}, {"id": "rec2", "sequence": seq2}],
|
|
45
|
+
out_dir="afdb_cache",
|
|
46
|
+
concurrency=6,
|
|
47
|
+
plddt_first_n=50, # optional: also save the first 50 per-residue pLDDT values per hit
|
|
48
|
+
)
|
|
49
|
+
# report -> {"total":..., "hits":..., "misses":..., "errors":..., "skipped":..., ...}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
- You supply a generic `id` per sequence; it keys the cache file and maps back to
|
|
53
|
+
your own records.
|
|
54
|
+
- `out_dir/summaries/{id}.json` stores each hit (a 404 miss stores
|
|
55
|
+
`{"structures": []}`); existing files are left untouched, so re-runs resume.
|
|
56
|
+
- With `plddt_first_n` set, `out_dir/plddt/{id}.json` stores the raw first-n
|
|
57
|
+
per-residue pLDDT array for the best structure.
|
|
58
|
+
- Real HTTP errors are counted but not saved, so they retry on the next run.
|
|
59
|
+
|
|
60
|
+
Note: resumability keys on the summary file. If you run once without
|
|
61
|
+
`plddt_first_n` and again with it, already-cached records are skipped and their
|
|
62
|
+
pLDDT is not back-filled.
|
|
63
|
+
|
|
64
|
+
## Not (yet) supported
|
|
65
|
+
|
|
66
|
+
- UniProt-accession lookup (sequence-only for now)
|
|
67
|
+
- PAE (Predicted Aligned Error)
|
|
68
|
+
- Statistics helpers — the package returns raw values; downstream math is yours.
|