epiquire 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epiquire-0.1.0/PKG-INFO +200 -0
- epiquire-0.1.0/README.md +169 -0
- epiquire-0.1.0/pyproject.toml +49 -0
- epiquire-0.1.0/setup.cfg +4 -0
- epiquire-0.1.0/src/epiquire/__init__.py +18 -0
- epiquire-0.1.0/src/epiquire/acquisition.py +135 -0
- epiquire-0.1.0/src/epiquire/advisor.py +158 -0
- epiquire-0.1.0/src/epiquire/bench/__init__.py +17 -0
- epiquire-0.1.0/src/epiquire/bench/falsification.py +95 -0
- epiquire-0.1.0/src/epiquire/bench/harness.py +134 -0
- epiquire-0.1.0/src/epiquire/bench/stats.py +46 -0
- epiquire-0.1.0/src/epiquire/campaign.py +257 -0
- epiquire-0.1.0/src/epiquire/candidates.py +341 -0
- epiquire-0.1.0/src/epiquire/cli.py +1017 -0
- epiquire-0.1.0/src/epiquire/conformal.py +160 -0
- epiquire-0.1.0/src/epiquire/design_space.py +141 -0
- epiquire-0.1.0/src/epiquire/embeddings/__init__.py +6 -0
- epiquire-0.1.0/src/epiquire/embeddings/esm2.py +76 -0
- epiquire-0.1.0/src/epiquire/embeddings/esmc.py +94 -0
- epiquire-0.1.0/src/epiquire/embeddings/provider.py +105 -0
- epiquire-0.1.0/src/epiquire/features/__init__.py +22 -0
- epiquire-0.1.0/src/epiquire/features/base.py +46 -0
- epiquire-0.1.0/src/epiquire/features/defaults.py +11 -0
- epiquire-0.1.0/src/epiquire/features/embedding.py +80 -0
- epiquire-0.1.0/src/epiquire/features/inverse_folding.py +62 -0
- epiquire-0.1.0/src/epiquire/features/matrix.py +50 -0
- epiquire-0.1.0/src/epiquire/features/naturalness.py +65 -0
- epiquire-0.1.0/src/epiquire/features/onehot.py +55 -0
- epiquire-0.1.0/src/epiquire/features/pairwise.py +64 -0
- epiquire-0.1.0/src/epiquire/funclib.py +331 -0
- epiquire-0.1.0/src/epiquire/gating.py +181 -0
- epiquire-0.1.0/src/epiquire/loop.py +136 -0
- epiquire-0.1.0/src/epiquire/msa.py +106 -0
- epiquire-0.1.0/src/epiquire/naturalness.py +142 -0
- epiquire-0.1.0/src/epiquire/oracle.py +94 -0
- epiquire-0.1.0/src/epiquire/plm.py +145 -0
- epiquire-0.1.0/src/epiquire/prereg.py +41 -0
- epiquire-0.1.0/src/epiquire/protocols.py +87 -0
- epiquire-0.1.0/src/epiquire/rank.py +101 -0
- epiquire-0.1.0/src/epiquire/report.py +132 -0
- epiquire-0.1.0/src/epiquire/selector.py +63 -0
- epiquire-0.1.0/src/epiquire/sitefinder.py +424 -0
- epiquire-0.1.0/src/epiquire/structure.py +353 -0
- epiquire-0.1.0/src/epiquire/surrogate.py +300 -0
- epiquire-0.1.0/src/epiquire/types.py +66 -0
- epiquire-0.1.0/src/epiquire/zero_shot.py +160 -0
- epiquire-0.1.0/src/epiquire.egg-info/PKG-INFO +200 -0
- epiquire-0.1.0/src/epiquire.egg-info/SOURCES.txt +78 -0
- epiquire-0.1.0/src/epiquire.egg-info/dependency_links.txt +1 -0
- epiquire-0.1.0/src/epiquire.egg-info/entry_points.txt +2 -0
- epiquire-0.1.0/src/epiquire.egg-info/requires.txt +17 -0
- epiquire-0.1.0/src/epiquire.egg-info/top_level.txt +1 -0
- epiquire-0.1.0/tests/test_acquisition.py +77 -0
- epiquire-0.1.0/tests/test_advisor.py +184 -0
- epiquire-0.1.0/tests/test_bench.py +51 -0
- epiquire-0.1.0/tests/test_campaign.py +215 -0
- epiquire-0.1.0/tests/test_candidates.py +83 -0
- epiquire-0.1.0/tests/test_cli.py +132 -0
- epiquire-0.1.0/tests/test_conformal.py +199 -0
- epiquire-0.1.0/tests/test_defaults.py +21 -0
- epiquire-0.1.0/tests/test_design_space.py +56 -0
- epiquire-0.1.0/tests/test_embeddings.py +149 -0
- epiquire-0.1.0/tests/test_encode_fast.py +82 -0
- epiquire-0.1.0/tests/test_falsification.py +58 -0
- epiquire-0.1.0/tests/test_features.py +54 -0
- epiquire-0.1.0/tests/test_funclib.py +285 -0
- epiquire-0.1.0/tests/test_gating.py +161 -0
- epiquire-0.1.0/tests/test_heldout.py +37 -0
- epiquire-0.1.0/tests/test_msa.py +54 -0
- epiquire-0.1.0/tests/test_naturalness.py +139 -0
- epiquire-0.1.0/tests/test_oracle_loop.py +57 -0
- epiquire-0.1.0/tests/test_pairwise.py +50 -0
- epiquire-0.1.0/tests/test_plm.py +139 -0
- epiquire-0.1.0/tests/test_prereg.py +20 -0
- epiquire-0.1.0/tests/test_rank_report.py +120 -0
- epiquire-0.1.0/tests/test_selector.py +43 -0
- epiquire-0.1.0/tests/test_sitefinder.py +122 -0
- epiquire-0.1.0/tests/test_structure.py +245 -0
- epiquire-0.1.0/tests/test_surrogate.py +158 -0
- epiquire-0.1.0/tests/test_zero_shot.py +100 -0
epiquire-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: epiquire
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Low-N, acquisition-driven, epistasis-aware protein mutation recommendation engine
|
|
5
|
+
Author: gyuminlee-repo
|
|
6
|
+
Keywords: directed-evolution,protein-engineering,active-learning,bayesian-optimization,epistasis,mutation-recommendation
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: scipy
|
|
19
|
+
Requires-Dist: scikit-learn
|
|
20
|
+
Requires-Dist: pandas
|
|
21
|
+
Provides-Extra: plm
|
|
22
|
+
Requires-Dist: torch; extra == "plm"
|
|
23
|
+
Requires-Dist: esm; extra == "plm"
|
|
24
|
+
Provides-Extra: plm2
|
|
25
|
+
Requires-Dist: fair-esm; extra == "plm2"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest; extra == "dev"
|
|
28
|
+
Requires-Dist: pyright; extra == "dev"
|
|
29
|
+
Requires-Dist: build; extra == "dev"
|
|
30
|
+
Requires-Dist: twine; extra == "dev"
|
|
31
|
+
|
|
32
|
+
# epiquire
|
|
33
|
+
|
|
34
|
+
Low-N, acquisition-driven protein mutation recommendation. The spine is one measurement campaign:
|
|
35
|
+
|
|
36
|
+
```text
|
|
37
|
+
R0 seed to measure -> fit surrogate on all measured data -> acquire next plate -> measure -> repeat
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
`epiquire` is not a magic zero-shot "better protein" predictor. Zero-shot, MSA, inverse-folding,
|
|
41
|
+
structure, and ddG signals are used as priors, constraints, or features. The function-aligned signal
|
|
42
|
+
is still measurement. The tool's job is to spend a small measurement budget better.
|
|
43
|
+
|
|
44
|
+
## Easiest entry point: `epiquire round`
|
|
45
|
+
|
|
46
|
+
One command per round. It auto-resolves the active-site design region (the `resolve-site` evidence
|
|
47
|
+
ladder: UniProt -> linked PDB cocrystal -> AlphaFold) and then runs the campaign round, so there is
|
|
48
|
+
no position bookkeeping by hand.
|
|
49
|
+
|
|
50
|
+
**Run it in your browser, no install:** open [`notebooks/epiquire_round_colab.ipynb`](https://colab.research.google.com/github/gyuminlee-repo/epiquire/blob/master/notebooks/epiquire_round_colab.ipynb)
|
|
51
|
+
in Google Colab -- a form-driven front end over `epiquire round` (paste the WT sequence + an accession,
|
|
52
|
+
optionally upload measured results, get the plate CSV). The core loop is numpy/scikit-learn (seconds);
|
|
53
|
+
only the optional PLM path is heavy, and Colab's free GPU covers that.
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# R0: no --measured -> the funclib seed plate to MEASURE (needs a signal: --msa / --ddg / --if-logprobs)
|
|
57
|
+
epiquire round WT.fasta --uniprot Q50L36 --msa alignment.a3m --plate 95 --outdir outputs/round
|
|
58
|
+
|
|
59
|
+
# Rk: add each round's measured results -> the next AL plate over the full saturation universe
|
|
60
|
+
epiquire round WT.fasta --uniprot Q50L36 --measured round0_measured.csv --plate 95 --outdir outputs/round
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Define the design region any one way: `--uniprot`/`--query`/`--pdb` (auto ladder), a local `--holo`
|
|
64
|
+
structure, or explicit `--positions`. If the ladder cannot map evidence onto your WT it abstains
|
|
65
|
+
(supply `--positions` or `--holo`) rather than guessing. `round` is a thin wrapper: `resolve-site`
|
|
66
|
+
and `campaign` below remain the explicit, fully-configurable commands it calls.
|
|
67
|
+
|
|
68
|
+
## The campaign engine (what `round` calls)
|
|
69
|
+
|
|
70
|
+
Use `campaign` when starting an active-site campaign. It combines the two validated pieces:
|
|
71
|
+
|
|
72
|
+
1. **R0 cold-start seed**: FuncLib-style active-site library construction. This proposes variants to
|
|
73
|
+
measure; it does not predict winners.
|
|
74
|
+
2. **R1+ active learning**: after measurements arrive, the AL loop searches the full saturation space
|
|
75
|
+
of the design positions, not just the R0 seed.
|
|
76
|
+
|
|
77
|
+
### R0: generate the seed plate
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
epiquire campaign WT.fasta \
|
|
81
|
+
--holo holo_structure.pdb --ligand-resnames LIG \
|
|
82
|
+
--msa alignment.a3m \
|
|
83
|
+
--plate 95 \
|
|
84
|
+
--outdir outputs/campaign
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
This writes `outputs/campaign/round0_plate.csv` with `variant,mutations,n_mut,seed_score`. Measure
|
|
88
|
+
those variants and save a `variant,fitness` CSV.
|
|
89
|
+
|
|
90
|
+
You can use explicit design positions instead of a holo structure:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
epiquire campaign WT.fasta \
|
|
94
|
+
--positions 183,184,227,228 \
|
|
95
|
+
--msa alignment.a3m \
|
|
96
|
+
--plate 95 \
|
|
97
|
+
--outdir outputs/campaign
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### R1+: propose the next plate from measured data
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
epiquire campaign WT.fasta \
|
|
104
|
+
--positions 183,184,227,228 \
|
|
105
|
+
--measured round0_measured.csv \
|
|
106
|
+
--plate 95 \
|
|
107
|
+
--outdir outputs/campaign
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
With multiple rounds, pass all accumulated CSVs:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
epiquire campaign WT.fasta \
|
|
114
|
+
--positions 183,184,227,228 \
|
|
115
|
+
--measured round0_measured.csv round1_measured.csv \
|
|
116
|
+
--plate 95 \
|
|
117
|
+
--outdir outputs/campaign
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Simulate against a complete landscape
|
|
121
|
+
|
|
122
|
+
For validation or retrospective benchmarks:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
epiquire campaign WT.fasta \
|
|
126
|
+
--simulate data/trpb.csv \
|
|
127
|
+
--positions 183,184,227,228 \
|
|
128
|
+
--msa isps_run/msa_trpb.a3m \
|
|
129
|
+
--strategies funclib singles random \
|
|
130
|
+
--plate 95 --rounds 3 --seeds 20 --jobs 4 \
|
|
131
|
+
--model-class ridge \
|
|
132
|
+
--outdir artifacts/campaign_trpb
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
`--jobs` parallelizes independent seeds with spawn processes; serial output and parallel output are
|
|
136
|
+
byte-identical by test.
|
|
137
|
+
|
|
138
|
+
## Other commands
|
|
139
|
+
|
|
140
|
+
- `epiquire recommend measured.csv`: rank next candidates from accumulated measurements. This is the
|
|
141
|
+
AL round engine exposed directly. If you pass `--if-logprobs`, epiquire CV-gates that signal by
|
|
142
|
+
default and uses it only if it improves held-out performance on this protein. Use
|
|
143
|
+
`--no-auto-signals` only as an expert override. `--recombine` also adds recombinations of your
|
|
144
|
+
MEASURED beneficials (variants beating WT) to the candidate pool -- the strategy the real-data
|
|
145
|
+
evidence supports (measure broadly, then recombine confirmed wins).
|
|
146
|
+
- `epiquire resolve-site WT.fasta --uniprot ACC` (or `--query`): resolve active-site/binding positions
|
|
147
|
+
from UniProt experimental curation, sequence-aligned onto YOUR WT numbering (handles transit-peptide
|
|
148
|
+
/ species offsets), with provenance + confidence per position. If UniProt features do not map it
|
|
149
|
+
falls back to a UniProt-linked PDB cocrystal (active-site shells); `--pdb ID` forces the structure
|
|
150
|
+
path. If neither maps, it fetches the AlphaFold model (predicted, apo). Optional `--apo-pocket`
|
|
151
|
+
(fpocket) guesses pocket positions but is LOW-confidence -- it missed the catalytic site on ispS,
|
|
152
|
+
so it is warned, not default. Abstains on positions rather than guessing -- then provide
|
|
153
|
+
`--holo`/`--positions`. Feed resolved positions to `campaign`.
|
|
154
|
+
- `epiquire funclib WT.fasta ...`: build only the active-site R0 measurement library. Useful when you
|
|
155
|
+
want the seed plate without the campaign wrapper.
|
|
156
|
+
- `epiquire advise measured.csv`: readiness check on measured data.
|
|
157
|
+
- `epiquire bench`: synthetic alpha-spectrum benchmark.
|
|
158
|
+
- `epiquire report`: summarize a JSON report.
|
|
159
|
+
|
|
160
|
+
## What the opt-ins mean
|
|
161
|
+
|
|
162
|
+
Most optional flags are not feature clutter. They are either:
|
|
163
|
+
|
|
164
|
+
- **resources you must provide**: MSA, PDB, inverse-folding table, ddG table, PLM cache; or
|
|
165
|
+
- **expert overrides** for known regimes.
|
|
166
|
+
|
|
167
|
+
The default stays conservative because low-N model selection overfits. When epiquire can decide from
|
|
168
|
+
this protein's measured data, it should decide itself: optional IF signals are now CV-gated by default;
|
|
169
|
+
pairwise epistasis has its own data gate (`--auto-gate-pairwise`). See `docs/OPTIN_AUDIT.md`.
|
|
170
|
+
|
|
171
|
+
## Honest scope
|
|
172
|
+
|
|
173
|
+
Evidence so far supports these claims:
|
|
174
|
+
|
|
175
|
+
- active-site FuncLib-style seeds enrich measurable libraries and improve low-budget discovery;
|
|
176
|
+
- the global activity winner can be outside the seed, so AL must search beyond the seed;
|
|
177
|
+
- stability/ddG signals are best used as constraints, while activity improvement is measurement-bound;
|
|
178
|
+
- high-order activity epistasis makes zero-shot winner prediction unreliable.
|
|
179
|
+
|
|
180
|
+
Evidence does **not** support: "epiquire predicts the best protein from sequence/structure alone." It
|
|
181
|
+
increases the probability and efficiency of finding a better variant per measured plate, conditional on
|
|
182
|
+
the design space and assay.
|
|
183
|
+
|
|
184
|
+
## When NOT to use epiquire
|
|
185
|
+
|
|
186
|
+
The focused active-site campaign helps only when the wins plausibly live in a region you can name
|
|
187
|
+
and measure combinatorially. Do NOT reach for it when:
|
|
188
|
+
|
|
189
|
+
- you do not know where beneficial mutations are (no mechanistic/structural prior on the target site);
|
|
190
|
+
- the wins are likely distal/distributed across the protein (use a broad whole-protein single-mutant
|
|
191
|
+
scan instead -- that is a different tool class; epiquire's additive default cannot rank
|
|
192
|
+
whole-protein singles because one-hot has no cross-position transfer);
|
|
193
|
+
- you cannot measure a focused combinatorial library (no assay throughput at the chosen site).
|
|
194
|
+
|
|
195
|
+
Real-data caution (PtIspS, `artifacts/ispS_wetlab_validation.md`): a campaign run as a broad
|
|
196
|
+
single-mutant scan with the active site barely sampled and the global winner distal is exactly the
|
|
197
|
+
regime epiquire is NOT for. Pick broad-vs-focused from your biology; epiquire does not decide it.
|
|
198
|
+
|
|
199
|
+
More detail: `ARCHITECTURE.md`, `artifacts/DECISION_EVIDENCE.md`, and the Obsidian decision log named
|
|
200
|
+
there.
|
epiquire-0.1.0/README.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# epiquire
|
|
2
|
+
|
|
3
|
+
Low-N, acquisition-driven protein mutation recommendation. The spine is one measurement campaign:
|
|
4
|
+
|
|
5
|
+
```text
|
|
6
|
+
R0 seed to measure -> fit surrogate on all measured data -> acquire next plate -> measure -> repeat
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
`epiquire` is not a magic zero-shot "better protein" predictor. Zero-shot, MSA, inverse-folding,
|
|
10
|
+
structure, and ddG signals are used as priors, constraints, or features. The function-aligned signal
|
|
11
|
+
is still measurement. The tool's job is to spend a small measurement budget better.
|
|
12
|
+
|
|
13
|
+
## Easiest entry point: `epiquire round`
|
|
14
|
+
|
|
15
|
+
One command per round. It auto-resolves the active-site design region (the `resolve-site` evidence
|
|
16
|
+
ladder: UniProt -> linked PDB cocrystal -> AlphaFold) and then runs the campaign round, so there is
|
|
17
|
+
no position bookkeeping by hand.
|
|
18
|
+
|
|
19
|
+
**Run it in your browser, no install:** open [`notebooks/epiquire_round_colab.ipynb`](https://colab.research.google.com/github/gyuminlee-repo/epiquire/blob/master/notebooks/epiquire_round_colab.ipynb)
|
|
20
|
+
in Google Colab -- a form-driven front end over `epiquire round` (paste the WT sequence + an accession,
|
|
21
|
+
optionally upload measured results, get the plate CSV). The core loop is numpy/scikit-learn (seconds);
|
|
22
|
+
only the optional PLM path is heavy, and Colab's free GPU covers that.
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# R0: no --measured -> the funclib seed plate to MEASURE (needs a signal: --msa / --ddg / --if-logprobs)
|
|
26
|
+
epiquire round WT.fasta --uniprot Q50L36 --msa alignment.a3m --plate 95 --outdir outputs/round
|
|
27
|
+
|
|
28
|
+
# Rk: add each round's measured results -> the next AL plate over the full saturation universe
|
|
29
|
+
epiquire round WT.fasta --uniprot Q50L36 --measured round0_measured.csv --plate 95 --outdir outputs/round
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Define the design region any one way: `--uniprot`/`--query`/`--pdb` (auto ladder), a local `--holo`
|
|
33
|
+
structure, or explicit `--positions`. If the ladder cannot map evidence onto your WT it abstains
|
|
34
|
+
(supply `--positions` or `--holo`) rather than guessing. `round` is a thin wrapper: `resolve-site`
|
|
35
|
+
and `campaign` below remain the explicit, fully-configurable commands it calls.
|
|
36
|
+
|
|
37
|
+
## The campaign engine (what `round` calls)
|
|
38
|
+
|
|
39
|
+
Use `campaign` when starting an active-site campaign. It combines the two validated pieces:
|
|
40
|
+
|
|
41
|
+
1. **R0 cold-start seed**: FuncLib-style active-site library construction. This proposes variants to
|
|
42
|
+
measure; it does not predict winners.
|
|
43
|
+
2. **R1+ active learning**: after measurements arrive, the AL loop searches the full saturation space
|
|
44
|
+
of the design positions, not just the R0 seed.
|
|
45
|
+
|
|
46
|
+
### R0: generate the seed plate
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
epiquire campaign WT.fasta \
|
|
50
|
+
--holo holo_structure.pdb --ligand-resnames LIG \
|
|
51
|
+
--msa alignment.a3m \
|
|
52
|
+
--plate 95 \
|
|
53
|
+
--outdir outputs/campaign
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
This writes `outputs/campaign/round0_plate.csv` with `variant,mutations,n_mut,seed_score`. Measure
|
|
57
|
+
those variants and save a `variant,fitness` CSV.
|
|
58
|
+
|
|
59
|
+
You can use explicit design positions instead of a holo structure:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
epiquire campaign WT.fasta \
|
|
63
|
+
--positions 183,184,227,228 \
|
|
64
|
+
--msa alignment.a3m \
|
|
65
|
+
--plate 95 \
|
|
66
|
+
--outdir outputs/campaign
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### R1+: propose the next plate from measured data
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
epiquire campaign WT.fasta \
|
|
73
|
+
--positions 183,184,227,228 \
|
|
74
|
+
--measured round0_measured.csv \
|
|
75
|
+
--plate 95 \
|
|
76
|
+
--outdir outputs/campaign
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
With multiple rounds, pass all accumulated CSVs:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
epiquire campaign WT.fasta \
|
|
83
|
+
--positions 183,184,227,228 \
|
|
84
|
+
--measured round0_measured.csv round1_measured.csv \
|
|
85
|
+
--plate 95 \
|
|
86
|
+
--outdir outputs/campaign
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Simulate against a complete landscape
|
|
90
|
+
|
|
91
|
+
For validation or retrospective benchmarks:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
epiquire campaign WT.fasta \
|
|
95
|
+
--simulate data/trpb.csv \
|
|
96
|
+
--positions 183,184,227,228 \
|
|
97
|
+
--msa isps_run/msa_trpb.a3m \
|
|
98
|
+
--strategies funclib singles random \
|
|
99
|
+
--plate 95 --rounds 3 --seeds 20 --jobs 4 \
|
|
100
|
+
--model-class ridge \
|
|
101
|
+
--outdir artifacts/campaign_trpb
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
`--jobs` parallelizes independent seeds with spawn processes; serial output and parallel output are
|
|
105
|
+
byte-identical by test.
|
|
106
|
+
|
|
107
|
+
## Other commands
|
|
108
|
+
|
|
109
|
+
- `epiquire recommend measured.csv`: rank next candidates from accumulated measurements. This is the
|
|
110
|
+
AL round engine exposed directly. If you pass `--if-logprobs`, epiquire CV-gates that signal by
|
|
111
|
+
default and uses it only if it improves held-out performance on this protein. Use
|
|
112
|
+
`--no-auto-signals` only as an expert override. `--recombine` also adds recombinations of your
|
|
113
|
+
MEASURED beneficials (variants beating WT) to the candidate pool -- the strategy the real-data
|
|
114
|
+
evidence supports (measure broadly, then recombine confirmed wins).
|
|
115
|
+
- `epiquire resolve-site WT.fasta --uniprot ACC` (or `--query`): resolve active-site/binding positions
|
|
116
|
+
from UniProt experimental curation, sequence-aligned onto YOUR WT numbering (handles transit-peptide
|
|
117
|
+
/ species offsets), with provenance + confidence per position. If UniProt features do not map it
|
|
118
|
+
falls back to a UniProt-linked PDB cocrystal (active-site shells); `--pdb ID` forces the structure
|
|
119
|
+
path. If neither maps, it fetches the AlphaFold model (predicted, apo). Optional `--apo-pocket`
|
|
120
|
+
(fpocket) guesses pocket positions but is LOW-confidence -- it missed the catalytic site on ispS,
|
|
121
|
+
so it is warned, not default. Abstains on positions rather than guessing -- then provide
|
|
122
|
+
`--holo`/`--positions`. Feed resolved positions to `campaign`.
|
|
123
|
+
- `epiquire funclib WT.fasta ...`: build only the active-site R0 measurement library. Useful when you
|
|
124
|
+
want the seed plate without the campaign wrapper.
|
|
125
|
+
- `epiquire advise measured.csv`: readiness check on measured data.
|
|
126
|
+
- `epiquire bench`: synthetic alpha-spectrum benchmark.
|
|
127
|
+
- `epiquire report`: summarize a JSON report.
|
|
128
|
+
|
|
129
|
+
## What the opt-ins mean
|
|
130
|
+
|
|
131
|
+
Most optional flags are not feature clutter. They are either:
|
|
132
|
+
|
|
133
|
+
- **resources you must provide**: MSA, PDB, inverse-folding table, ddG table, PLM cache; or
|
|
134
|
+
- **expert overrides** for known regimes.
|
|
135
|
+
|
|
136
|
+
The default stays conservative because low-N model selection overfits. When epiquire can decide from
|
|
137
|
+
this protein's measured data, it should decide itself: optional IF signals are now CV-gated by default;
|
|
138
|
+
pairwise epistasis has its own data gate (`--auto-gate-pairwise`). See `docs/OPTIN_AUDIT.md`.
|
|
139
|
+
|
|
140
|
+
## Honest scope
|
|
141
|
+
|
|
142
|
+
Evidence so far supports these claims:
|
|
143
|
+
|
|
144
|
+
- active-site FuncLib-style seeds enrich measurable libraries and improve low-budget discovery;
|
|
145
|
+
- the global activity winner can be outside the seed, so AL must search beyond the seed;
|
|
146
|
+
- stability/ddG signals are best used as constraints, while activity improvement is measurement-bound;
|
|
147
|
+
- high-order activity epistasis makes zero-shot winner prediction unreliable.
|
|
148
|
+
|
|
149
|
+
Evidence does **not** support: "epiquire predicts the best protein from sequence/structure alone." It
|
|
150
|
+
increases the probability and efficiency of finding a better variant per measured plate, conditional on
|
|
151
|
+
the design space and assay.
|
|
152
|
+
|
|
153
|
+
## When NOT to use epiquire
|
|
154
|
+
|
|
155
|
+
The focused active-site campaign helps only when the wins plausibly live in a region you can name
|
|
156
|
+
and measure combinatorially. Do NOT reach for it when:
|
|
157
|
+
|
|
158
|
+
- you do not know where beneficial mutations are (no mechanistic/structural prior on the target site);
|
|
159
|
+
- the wins are likely distal/distributed across the protein (use a broad whole-protein single-mutant
|
|
160
|
+
scan instead -- that is a different tool class; epiquire's additive default cannot rank
|
|
161
|
+
whole-protein singles because one-hot has no cross-position transfer);
|
|
162
|
+
- you cannot measure a focused combinatorial library (no assay throughput at the chosen site).
|
|
163
|
+
|
|
164
|
+
Real-data caution (PtIspS, `artifacts/ispS_wetlab_validation.md`): a campaign run as a broad
|
|
165
|
+
single-mutant scan with the active site barely sampled and the global winner distal is exactly the
|
|
166
|
+
regime epiquire is NOT for. Pick broad-vs-focused from your biology; epiquire does not decide it.
|
|
167
|
+
|
|
168
|
+
More detail: `ARCHITECTURE.md`, `artifacts/DECISION_EVIDENCE.md`, and the Obsidian decision log named
|
|
169
|
+
there.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "epiquire"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Low-N, acquisition-driven, epistasis-aware protein mutation recommendation engine"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "gyuminlee-repo" }]
|
|
12
|
+
keywords = [
|
|
13
|
+
"directed-evolution",
|
|
14
|
+
"protein-engineering",
|
|
15
|
+
"active-learning",
|
|
16
|
+
"bayesian-optimization",
|
|
17
|
+
"epistasis",
|
|
18
|
+
"mutation-recommendation",
|
|
19
|
+
]
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 4 - Beta",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"numpy",
|
|
32
|
+
"scipy",
|
|
33
|
+
"scikit-learn",
|
|
34
|
+
"pandas",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
plm = ["torch", "esm"] # ESM-C (EvolutionaryScale) + on-demand embeddings
|
|
39
|
+
plm2 = ["fair-esm"] # ESM-2 (Meta) alternative provider
|
|
40
|
+
dev = ["pytest", "pyright", "build", "twine"]
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
epiquire = "epiquire.cli:main"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
47
|
+
|
|
48
|
+
[tool.pytest.ini_options]
|
|
49
|
+
testpaths = ["tests"]
|
epiquire-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""epiquire: low-N, acquisition-driven, epistasis-aware mutation recommendation."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
# Must match `version` in pyproject.toml / PKG-INFO for this release (0.1.0).
__version__ = "0.1.0"
|
|
5
|
+
|
|
6
|
+
from epiquire.funclib import (
|
|
7
|
+
FuncLibLibrary,
|
|
8
|
+
combine_tolerances,
|
|
9
|
+
design_library,
|
|
10
|
+
tolerance_from_logprob_table,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"FuncLibLibrary",
|
|
15
|
+
"combine_tolerances",
|
|
16
|
+
"design_library",
|
|
17
|
+
"tolerance_from_logprob_table",
|
|
18
|
+
]
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Acquisition layer: the experimental-design lever (the tool's organizing axis).
|
|
2
|
+
|
|
3
|
+
Score-based acquisitions (Greedy / UCB / MaxVariance) rank candidates by a scalar;
|
|
4
|
+
``select`` returns the top-k. DiverseBatch wraps a score acquisition and applies
|
|
5
|
+
KURO greedy-maximin so a batch is both high-value and spread out (no near-duplicate
|
|
6
|
+
wells). All satisfy ``protocols.Acquisition`` and compose freely:
|
|
7
|
+
|
|
8
|
+
DiverseBatch(UCB(beta=1.0)) # diverse, uncertainty-aware exploitation
|
|
9
|
+
DiverseBatch(MaxVariance()) # diverse pure exploration (resolve uncertainty)
|
|
10
|
+
Greedy() # pure exploitation (the additive-greedy baseline)
|
|
11
|
+
|
|
12
|
+
UCB / MaxVariance are only meaningful because the surrogate std is CALIBRATED (P1).
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from collections.abc import Sequence
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
from epiquire.design_space import DesignSpace, to_assignment
|
|
22
|
+
from epiquire.types import Prediction
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _ScoreAcquisition(ABC):
    """Acquisitions that rank candidates by a per-candidate scalar score.

    Subclasses implement ``score``; ``select`` turns those scores into the
    indices of the ``k`` highest-scoring candidates, best first.
    """

    @abstractmethod
    def score(self, pred: Prediction) -> np.ndarray:
        """Per-candidate acquisition value (higher = more desirable)."""

    def select(
        self,
        pred: Prediction,
        k: int,
        *,
        candidates: Sequence[str] | None = None,
        space: DesignSpace | None = None,
    ) -> list[int]:
        """Return the indices of the top-``k`` candidates by ``score``, best first.

        Args:
            pred: Prediction passed straight to ``score``.
            k: Number of candidates wanted; clipped to the number of scores.
            candidates: Unused here; accepted so the signature matches
                acquisitions that need the variant strings.
            space: Unused here; accepted for the same reason.

        Returns:
            Up to ``k`` integer indices into the score array, ordered from
            highest to lowest score. Empty when ``k <= 0`` or there are no
            scores. Candidates whose score is NaN are ranked last.
        """
        s = self.score(pred)
        if k <= 0 or s.size == 0:
            return []
        # np.argsort places NaN at the END of the ascending order, so the
        # reversal below would otherwise put NaN-scored candidates FIRST.
        # Demote them to -inf; NaN-free inputs are left exactly as they were.
        nan_mask = np.isnan(s)
        if nan_mask.any():
            s = np.where(nan_mask, -np.inf, s)
        k = min(k, s.size)
        return np.argsort(s)[::-1][:k].astype(int).tolist()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Greedy(_ScoreAcquisition):
    """Exploitation-only acquisition (the additive-greedy lever).

    The acquisition value of a candidate is just its predicted mean.
    """

    def score(self, pred: Prediction) -> np.ndarray:
        """Return ``pred.mean`` as a float array."""
        predicted_mean = np.asarray(pred.mean, dtype=float)
        return predicted_mean
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class UCB(_ScoreAcquisition):
    """Upper-confidence-bound acquisition: mean plus ``beta`` times the calibrated std."""

    def __init__(self, beta: float = 1.0) -> None:
        # Weight on the std term relative to the mean term.
        self.beta = beta

    def score(self, pred: Prediction) -> np.ndarray:
        """Return ``mean + beta * std`` element-wise as a float array."""
        center = np.asarray(pred.mean, dtype=float)
        spread = np.asarray(pred.std, dtype=float)
        return center + self.beta * spread
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class MaxVariance(_ScoreAcquisition):
    """Pure-exploration (BALD-style, information-targeted) acquisition.

    The score is the calibrated predictive std, so the most uncertain
    candidates are acquired first -- the measurements expected to shrink
    predictive uncertainty the most. With a calibrated std this is a principled
    exploration lever (and the seed of resolving epistasis once interaction
    uncertainty is exposed).
    """

    def score(self, pred: Prediction) -> np.ndarray:
        """Return ``pred.std`` as a float array."""
        uncertainty = np.asarray(pred.std, dtype=float)
        return uncertainty
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _combo_vectors(candidates: Sequence[str], space: DesignSpace) -> list[tuple[str, ...]]:
    """Turn each candidate into a tuple of its residues at the design positions.

    The tuples are ordered like ``space.positions`` so two of them can be
    compared position-by-position for a Hamming distance.
    """

    def residues_of(variant: str) -> tuple[str, ...]:
        assignment = to_assignment(variant, space)
        return tuple(assignment[pos] for pos in space.positions)

    return [residues_of(variant) for variant in candidates]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _hamming(a: tuple[str, ...], b: tuple[str, ...]) -> int:
|
|
86
|
+
return sum(1 for x, y in zip(a, b, strict=True) if x != y)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class DiverseBatch:
    """KURO greedy-maximin diversity over a top-scored pool of a base acquisition.

    Ranks all candidates by the base acquisition, keeps the top ``pool_factor * k``
    (never fewer than ``k``, never more than the number of candidates), then
    greedily builds a batch that maximizes the minimum design-space (Hamming)
    distance to the already-chosen set, seeded by the single best-scoring
    candidate. The result is a high-value batch that is spread out over the
    design space.
    """

    def __init__(self, base: _ScoreAcquisition, pool_factor: float = 4.0) -> None:
        # Acquisition whose ``score`` defines the value ranking.
        self.base = base
        # Pool size as a multiple of the requested batch size ``k``.
        self.pool_factor = pool_factor

    def select(
        self,
        pred: Prediction,
        k: int,
        *,
        candidates: Sequence[str],
        space: DesignSpace,
    ) -> list[int]:
        """Pick up to ``k`` candidate indices that are high-scoring and mutually distant.

        Args:
            pred: Prediction handed to ``self.base.score``.
            k: Requested batch size; clipped to ``len(candidates)``.
            candidates: Variant strings, index-aligned with the scores.
            space: Design space used to turn variants into per-position residues.

        Returns:
            Indices into ``candidates`` in selection order: the best-scoring
            candidate first, then each greedy maximin pick. Empty when there are
            no candidates or ``k <= 0``. Candidates whose base score is NaN are
            ranked last when the pool is formed.
        """
        n = len(candidates)
        if n == 0 or k <= 0:
            return []
        k = min(k, n)
        scores = self.base.score(pred)
        # np.argsort sorts NaN to the end of the ascending order, so reversing
        # it would put NaN-scored candidates at the head of the pool (and make
        # one of them the seed). Demote them to -inf; NaN-free scores are
        # left untouched.
        nan_mask = np.isnan(scores)
        if nan_mask.any():
            scores = np.where(nan_mask, -np.inf, scores)
        order = np.argsort(scores)[::-1]
        pool_size = min(n, max(k, int(self.pool_factor * k)))
        pool: list[int] = [int(j) for j in order[:pool_size]]
        vecs = _combo_vectors([candidates[i] for i in pool], space)

        chosen = [0]  # local index into pool; pool[0] is the highest base score
        # mind[t] = min Hamming distance from pool entry t to the chosen set;
        # -1 marks entries that have already been chosen.
        mind = [_hamming(vec, vecs[0]) for vec in vecs]
        mind[0] = -1
        while len(chosen) < k:
            j = int(np.argmax(mind))  # max-min distance; ties -> lowest index = higher base score
            if mind[j] < 0:
                # Every pool entry is already chosen (pool smaller than k).
                # Distance-0 duplicates are NOT excluded: they stay eligible
                # and are simply picked after all more distant entries.
                break
            chosen.append(j)
            mind[j] = -1
            vj = vecs[j]
            for t, vt in enumerate(vecs):
                if mind[t] >= 0:
                    d = _hamming(vt, vj)
                    if d < mind[t]:
                        mind[t] = d
        return [pool[j] for j in chosen]
|