epiquire 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. epiquire-0.1.0/PKG-INFO +200 -0
  2. epiquire-0.1.0/README.md +169 -0
  3. epiquire-0.1.0/pyproject.toml +49 -0
  4. epiquire-0.1.0/setup.cfg +4 -0
  5. epiquire-0.1.0/src/epiquire/__init__.py +18 -0
  6. epiquire-0.1.0/src/epiquire/acquisition.py +135 -0
  7. epiquire-0.1.0/src/epiquire/advisor.py +158 -0
  8. epiquire-0.1.0/src/epiquire/bench/__init__.py +17 -0
  9. epiquire-0.1.0/src/epiquire/bench/falsification.py +95 -0
  10. epiquire-0.1.0/src/epiquire/bench/harness.py +134 -0
  11. epiquire-0.1.0/src/epiquire/bench/stats.py +46 -0
  12. epiquire-0.1.0/src/epiquire/campaign.py +257 -0
  13. epiquire-0.1.0/src/epiquire/candidates.py +341 -0
  14. epiquire-0.1.0/src/epiquire/cli.py +1017 -0
  15. epiquire-0.1.0/src/epiquire/conformal.py +160 -0
  16. epiquire-0.1.0/src/epiquire/design_space.py +141 -0
  17. epiquire-0.1.0/src/epiquire/embeddings/__init__.py +6 -0
  18. epiquire-0.1.0/src/epiquire/embeddings/esm2.py +76 -0
  19. epiquire-0.1.0/src/epiquire/embeddings/esmc.py +94 -0
  20. epiquire-0.1.0/src/epiquire/embeddings/provider.py +105 -0
  21. epiquire-0.1.0/src/epiquire/features/__init__.py +22 -0
  22. epiquire-0.1.0/src/epiquire/features/base.py +46 -0
  23. epiquire-0.1.0/src/epiquire/features/defaults.py +11 -0
  24. epiquire-0.1.0/src/epiquire/features/embedding.py +80 -0
  25. epiquire-0.1.0/src/epiquire/features/inverse_folding.py +62 -0
  26. epiquire-0.1.0/src/epiquire/features/matrix.py +50 -0
  27. epiquire-0.1.0/src/epiquire/features/naturalness.py +65 -0
  28. epiquire-0.1.0/src/epiquire/features/onehot.py +55 -0
  29. epiquire-0.1.0/src/epiquire/features/pairwise.py +64 -0
  30. epiquire-0.1.0/src/epiquire/funclib.py +331 -0
  31. epiquire-0.1.0/src/epiquire/gating.py +181 -0
  32. epiquire-0.1.0/src/epiquire/loop.py +136 -0
  33. epiquire-0.1.0/src/epiquire/msa.py +106 -0
  34. epiquire-0.1.0/src/epiquire/naturalness.py +142 -0
  35. epiquire-0.1.0/src/epiquire/oracle.py +94 -0
  36. epiquire-0.1.0/src/epiquire/plm.py +145 -0
  37. epiquire-0.1.0/src/epiquire/prereg.py +41 -0
  38. epiquire-0.1.0/src/epiquire/protocols.py +87 -0
  39. epiquire-0.1.0/src/epiquire/rank.py +101 -0
  40. epiquire-0.1.0/src/epiquire/report.py +132 -0
  41. epiquire-0.1.0/src/epiquire/selector.py +63 -0
  42. epiquire-0.1.0/src/epiquire/sitefinder.py +424 -0
  43. epiquire-0.1.0/src/epiquire/structure.py +353 -0
  44. epiquire-0.1.0/src/epiquire/surrogate.py +300 -0
  45. epiquire-0.1.0/src/epiquire/types.py +66 -0
  46. epiquire-0.1.0/src/epiquire/zero_shot.py +160 -0
  47. epiquire-0.1.0/src/epiquire.egg-info/PKG-INFO +200 -0
  48. epiquire-0.1.0/src/epiquire.egg-info/SOURCES.txt +78 -0
  49. epiquire-0.1.0/src/epiquire.egg-info/dependency_links.txt +1 -0
  50. epiquire-0.1.0/src/epiquire.egg-info/entry_points.txt +2 -0
  51. epiquire-0.1.0/src/epiquire.egg-info/requires.txt +17 -0
  52. epiquire-0.1.0/src/epiquire.egg-info/top_level.txt +1 -0
  53. epiquire-0.1.0/tests/test_acquisition.py +77 -0
  54. epiquire-0.1.0/tests/test_advisor.py +184 -0
  55. epiquire-0.1.0/tests/test_bench.py +51 -0
  56. epiquire-0.1.0/tests/test_campaign.py +215 -0
  57. epiquire-0.1.0/tests/test_candidates.py +83 -0
  58. epiquire-0.1.0/tests/test_cli.py +132 -0
  59. epiquire-0.1.0/tests/test_conformal.py +199 -0
  60. epiquire-0.1.0/tests/test_defaults.py +21 -0
  61. epiquire-0.1.0/tests/test_design_space.py +56 -0
  62. epiquire-0.1.0/tests/test_embeddings.py +149 -0
  63. epiquire-0.1.0/tests/test_encode_fast.py +82 -0
  64. epiquire-0.1.0/tests/test_falsification.py +58 -0
  65. epiquire-0.1.0/tests/test_features.py +54 -0
  66. epiquire-0.1.0/tests/test_funclib.py +285 -0
  67. epiquire-0.1.0/tests/test_gating.py +161 -0
  68. epiquire-0.1.0/tests/test_heldout.py +37 -0
  69. epiquire-0.1.0/tests/test_msa.py +54 -0
  70. epiquire-0.1.0/tests/test_naturalness.py +139 -0
  71. epiquire-0.1.0/tests/test_oracle_loop.py +57 -0
  72. epiquire-0.1.0/tests/test_pairwise.py +50 -0
  73. epiquire-0.1.0/tests/test_plm.py +139 -0
  74. epiquire-0.1.0/tests/test_prereg.py +20 -0
  75. epiquire-0.1.0/tests/test_rank_report.py +120 -0
  76. epiquire-0.1.0/tests/test_selector.py +43 -0
  77. epiquire-0.1.0/tests/test_sitefinder.py +122 -0
  78. epiquire-0.1.0/tests/test_structure.py +245 -0
  79. epiquire-0.1.0/tests/test_surrogate.py +158 -0
  80. epiquire-0.1.0/tests/test_zero_shot.py +100 -0
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.4
2
+ Name: epiquire
3
+ Version: 0.1.0
4
+ Summary: Low-N, acquisition-driven, epistasis-aware protein mutation recommendation engine
5
+ Author: gyuminlee-repo
6
+ Keywords: directed-evolution,protein-engineering,active-learning,bayesian-optimization,epistasis,mutation-recommendation
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
14
+ Classifier: Operating System :: OS Independent
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: numpy
18
+ Requires-Dist: scipy
19
+ Requires-Dist: scikit-learn
20
+ Requires-Dist: pandas
21
+ Provides-Extra: plm
22
+ Requires-Dist: torch; extra == "plm"
23
+ Requires-Dist: esm; extra == "plm"
24
+ Provides-Extra: plm2
25
+ Requires-Dist: fair-esm; extra == "plm2"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest; extra == "dev"
28
+ Requires-Dist: pyright; extra == "dev"
29
+ Requires-Dist: build; extra == "dev"
30
+ Requires-Dist: twine; extra == "dev"
31
+
32
+ # epiquire
33
+
34
+ Low-N, acquisition-driven protein mutation recommendation. The spine is one measurement campaign:
35
+
36
+ ```text
37
+ R0 seed to measure -> fit surrogate on all measured data -> acquire next plate -> measure -> repeat
38
+ ```
39
+
40
+ `epiquire` is not a magic zero-shot "better protein" predictor. Zero-shot, MSA, inverse-folding,
41
+ structure, and ddG signals are used as priors, constraints, or features. The function-aligned signal
42
+ is still measurement. The tool's job is to spend a small measurement budget better.
43
+
44
+ ## Easiest entry point: `epiquire round`
45
+
46
+ One command per round. It auto-resolves the active-site design region (the `resolve-site` evidence
47
+ ladder: UniProt -> linked PDB cocrystal -> AlphaFold) and then runs the campaign round, so there is
48
+ no position bookkeeping by hand.
49
+
50
+ **Run it in your browser, no install:** open [`notebooks/epiquire_round_colab.ipynb`](https://colab.research.google.com/github/gyuminlee-repo/epiquire/blob/master/notebooks/epiquire_round_colab.ipynb)
51
+ in Google Colab -- a form-driven front end over `epiquire round` (paste the WT sequence + an accession,
52
+ optionally upload measured results, get the plate CSV). The core loop is numpy/scikit-learn (seconds);
53
+ only the optional PLM path is heavy, and Colab's free GPU covers that.
54
+
55
+ ```bash
56
+ # R0: no --measured -> the funclib seed plate to MEASURE (needs a signal: --msa / --ddg / --if-logprobs)
57
+ epiquire round WT.fasta --uniprot Q50L36 --msa alignment.a3m --plate 95 --outdir outputs/round
58
+
59
+ # Rk: add each round's measured results -> the next AL plate over the full saturation universe
60
+ epiquire round WT.fasta --uniprot Q50L36 --measured round0_measured.csv --plate 95 --outdir outputs/round
61
+ ```
62
+
63
+ Define the design region in any one of these ways: `--uniprot`/`--query`/`--pdb` (auto ladder), a local `--holo`
64
+ structure, or explicit `--positions`. If the ladder cannot map evidence onto your WT it abstains
65
+ (supply `--positions` or `--holo`) rather than guessing. `round` is a thin wrapper: `resolve-site`
66
+ and `campaign` below remain the explicit, fully-configurable commands it calls.
67
+
68
+ ## The campaign engine (what `round` calls)
69
+
70
+ Use `campaign` when starting an active-site campaign. It combines the two validated pieces:
71
+
72
+ 1. **R0 cold-start seed**: FuncLib-style active-site library construction. This proposes variants to
73
+ measure; it does not predict winners.
74
+ 2. **R1+ active learning**: after measurements arrive, the AL loop searches the full saturation space
75
+ of the design positions, not just the R0 seed.
76
+
77
+ ### R0: generate the seed plate
78
+
79
+ ```bash
80
+ epiquire campaign WT.fasta \
81
+ --holo holo_structure.pdb --ligand-resnames LIG \
82
+ --msa alignment.a3m \
83
+ --plate 95 \
84
+ --outdir outputs/campaign
85
+ ```
86
+
87
+ This writes `outputs/campaign/round0_plate.csv` with `variant,mutations,n_mut,seed_score`. Measure
88
+ those variants and save a `variant,fitness` CSV.
89
+
90
+ You can use explicit design positions instead of a holo structure:
91
+
92
+ ```bash
93
+ epiquire campaign WT.fasta \
94
+ --positions 183,184,227,228 \
95
+ --msa alignment.a3m \
96
+ --plate 95 \
97
+ --outdir outputs/campaign
98
+ ```
99
+
100
+ ### R1+: propose the next plate from measured data
101
+
102
+ ```bash
103
+ epiquire campaign WT.fasta \
104
+ --positions 183,184,227,228 \
105
+ --measured round0_measured.csv \
106
+ --plate 95 \
107
+ --outdir outputs/campaign
108
+ ```
109
+
110
+ With multiple rounds, pass all accumulated CSVs:
111
+
112
+ ```bash
113
+ epiquire campaign WT.fasta \
114
+ --positions 183,184,227,228 \
115
+ --measured round0_measured.csv round1_measured.csv \
116
+ --plate 95 \
117
+ --outdir outputs/campaign
118
+ ```
119
+
120
+ ### Simulate against a complete landscape
121
+
122
+ For validation or retrospective benchmarks:
123
+
124
+ ```bash
125
+ epiquire campaign WT.fasta \
126
+ --simulate data/trpb.csv \
127
+ --positions 183,184,227,228 \
128
+ --msa isps_run/msa_trpb.a3m \
129
+ --strategies funclib singles random \
130
+ --plate 95 --rounds 3 --seeds 20 --jobs 4 \
131
+ --model-class ridge \
132
+ --outdir artifacts/campaign_trpb
133
+ ```
134
+
135
+ `--jobs` parallelizes independent seeds with spawn processes; serial output and parallel output are
136
+ byte-identical (this is enforced by a test).
137
+
138
+ ## Other commands
139
+
140
+ - `epiquire recommend measured.csv`: rank next candidates from accumulated measurements. This is the
141
+ AL round engine exposed directly. If you pass `--if-logprobs`, epiquire CV-gates that signal by
142
+ default and uses it only if it improves held-out performance on this protein. Use
143
+ `--no-auto-signals` only as an expert override. `--recombine` also adds recombinations of your
144
+ MEASURED beneficials (variants beating WT) to the candidate pool -- the strategy the real-data
145
+ evidence supports (measure broadly, then recombine confirmed wins).
146
+ - `epiquire resolve-site WT.fasta --uniprot ACC` (or `--query`): resolve active-site/binding positions
147
+ from UniProt experimental curation, sequence-aligned onto YOUR WT numbering (handles transit-peptide
148
+ / species offsets), with provenance + confidence per position. If UniProt features do not map it
149
+ falls back to a UniProt-linked PDB cocrystal (active-site shells); `--pdb ID` forces the structure
150
+ path. If neither maps, it fetches the AlphaFold model (predicted, apo). Optional `--apo-pocket`
151
+ (fpocket) guesses pocket positions but is LOW-confidence -- it missed the catalytic site on ispS,
152
+ so it carries a warning and is not the default. The command abstains rather than guessing positions -- in that case provide
153
+ `--holo`/`--positions`. Feed resolved positions to `campaign`.
154
+ - `epiquire funclib WT.fasta ...`: build only the active-site R0 measurement library. Useful when you
155
+ want the seed plate without the campaign wrapper.
156
+ - `epiquire advise measured.csv`: readiness check on measured data.
157
+ - `epiquire bench`: synthetic alpha-spectrum benchmark.
158
+ - `epiquire report`: summarize a JSON report.
159
+
160
+ ## What the opt-ins mean
161
+
162
+ Most optional flags are not feature clutter. They are either:
163
+
164
+ - **resources you must provide**: MSA, PDB, inverse-folding table, ddG table, PLM cache; or
165
+ - **expert overrides** for known regimes.
166
+
167
+ The default stays conservative because low-N model selection overfits. When epiquire can decide from
168
+ this protein's measured data, it should decide itself: optional IF signals are now CV-gated by default;
169
+ pairwise epistasis has its own data gate (`--auto-gate-pairwise`). See `docs/OPTIN_AUDIT.md`.
170
+
171
+ ## Honest scope
172
+
173
+ Evidence so far supports these claims:
174
+
175
+ - active-site FuncLib-style seeds enrich measurable libraries and improve low-budget discovery;
176
+ - the global activity winner can be outside the seed, so AL must search beyond the seed;
177
+ - stability/ddG signals are best used as constraints, while activity improvement is measurement-bound;
178
+ - high-order activity epistasis makes zero-shot winner prediction unreliable.
179
+
180
+ Evidence does **not** support: "epiquire predicts the best protein from sequence/structure alone." It
181
+ increases the probability and efficiency of finding a better variant per measured plate, conditional on
182
+ the design space and assay.
183
+
184
+ ## When NOT to use epiquire
185
+
186
+ The focused active-site campaign helps only when the wins plausibly live in a region you can name
187
+ and measure combinatorially. Do NOT reach for it when:
188
+
189
+ - you do not know where beneficial mutations are (no mechanistic/structural prior on the target site);
190
+ - the wins are likely distal/distributed across the protein (use a broad whole-protein single-mutant
191
+ scan instead -- that is a different tool class; epiquire's additive default cannot rank
192
+ whole-protein singles because one-hot has no cross-position transfer);
193
+ - you cannot measure a focused combinatorial library (no assay throughput at the chosen site).
194
+
195
+ Real-data caution (PtIspS, `artifacts/ispS_wetlab_validation.md`): a campaign run as a broad
196
+ single-mutant scan with the active site barely sampled and the global winner distal is exactly the
197
+ regime epiquire is NOT for. Pick broad-vs-focused from your biology; epiquire does not decide it.
198
+
199
+ More detail: `ARCHITECTURE.md`, `artifacts/DECISION_EVIDENCE.md`, and the Obsidian decision log named
200
+ there.
@@ -0,0 +1,169 @@
1
+ # epiquire
2
+
3
+ Low-N, acquisition-driven protein mutation recommendation. The spine is one measurement campaign:
4
+
5
+ ```text
6
+ R0 seed to measure -> fit surrogate on all measured data -> acquire next plate -> measure -> repeat
7
+ ```
8
+
9
+ `epiquire` is not a magic zero-shot "better protein" predictor. Zero-shot, MSA, inverse-folding,
10
+ structure, and ddG signals are used as priors, constraints, or features. The function-aligned signal
11
+ is still measurement. The tool's job is to spend a small measurement budget better.
12
+
13
+ ## Easiest entry point: `epiquire round`
14
+
15
+ One command per round. It auto-resolves the active-site design region (the `resolve-site` evidence
16
+ ladder: UniProt -> linked PDB cocrystal -> AlphaFold) and then runs the campaign round, so there is
17
+ no position bookkeeping by hand.
18
+
19
+ **Run it in your browser, no install:** open [`notebooks/epiquire_round_colab.ipynb`](https://colab.research.google.com/github/gyuminlee-repo/epiquire/blob/master/notebooks/epiquire_round_colab.ipynb)
20
+ in Google Colab -- a form-driven front end over `epiquire round` (paste the WT sequence + an accession,
21
+ optionally upload measured results, get the plate CSV). The core loop is numpy/scikit-learn (seconds);
22
+ only the optional PLM path is heavy, and Colab's free GPU covers that.
23
+
24
+ ```bash
25
+ # R0: no --measured -> the funclib seed plate to MEASURE (needs a signal: --msa / --ddg / --if-logprobs)
26
+ epiquire round WT.fasta --uniprot Q50L36 --msa alignment.a3m --plate 95 --outdir outputs/round
27
+
28
+ # Rk: add each round's measured results -> the next AL plate over the full saturation universe
29
+ epiquire round WT.fasta --uniprot Q50L36 --measured round0_measured.csv --plate 95 --outdir outputs/round
30
+ ```
31
+
32
+ Define the design region in any one of these ways: `--uniprot`/`--query`/`--pdb` (auto ladder), a local `--holo`
33
+ structure, or explicit `--positions`. If the ladder cannot map evidence onto your WT it abstains
34
+ (supply `--positions` or `--holo`) rather than guessing. `round` is a thin wrapper: `resolve-site`
35
+ and `campaign` below remain the explicit, fully-configurable commands it calls.
36
+
37
+ ## The campaign engine (what `round` calls)
38
+
39
+ Use `campaign` when starting an active-site campaign. It combines the two validated pieces:
40
+
41
+ 1. **R0 cold-start seed**: FuncLib-style active-site library construction. This proposes variants to
42
+ measure; it does not predict winners.
43
+ 2. **R1+ active learning**: after measurements arrive, the AL loop searches the full saturation space
44
+ of the design positions, not just the R0 seed.
45
+
46
+ ### R0: generate the seed plate
47
+
48
+ ```bash
49
+ epiquire campaign WT.fasta \
50
+ --holo holo_structure.pdb --ligand-resnames LIG \
51
+ --msa alignment.a3m \
52
+ --plate 95 \
53
+ --outdir outputs/campaign
54
+ ```
55
+
56
+ This writes `outputs/campaign/round0_plate.csv` with `variant,mutations,n_mut,seed_score`. Measure
57
+ those variants and save a `variant,fitness` CSV.
58
+
59
+ You can use explicit design positions instead of a holo structure:
60
+
61
+ ```bash
62
+ epiquire campaign WT.fasta \
63
+ --positions 183,184,227,228 \
64
+ --msa alignment.a3m \
65
+ --plate 95 \
66
+ --outdir outputs/campaign
67
+ ```
68
+
69
+ ### R1+: propose the next plate from measured data
70
+
71
+ ```bash
72
+ epiquire campaign WT.fasta \
73
+ --positions 183,184,227,228 \
74
+ --measured round0_measured.csv \
75
+ --plate 95 \
76
+ --outdir outputs/campaign
77
+ ```
78
+
79
+ With multiple rounds, pass all accumulated CSVs:
80
+
81
+ ```bash
82
+ epiquire campaign WT.fasta \
83
+ --positions 183,184,227,228 \
84
+ --measured round0_measured.csv round1_measured.csv \
85
+ --plate 95 \
86
+ --outdir outputs/campaign
87
+ ```
88
+
89
+ ### Simulate against a complete landscape
90
+
91
+ For validation or retrospective benchmarks:
92
+
93
+ ```bash
94
+ epiquire campaign WT.fasta \
95
+ --simulate data/trpb.csv \
96
+ --positions 183,184,227,228 \
97
+ --msa isps_run/msa_trpb.a3m \
98
+ --strategies funclib singles random \
99
+ --plate 95 --rounds 3 --seeds 20 --jobs 4 \
100
+ --model-class ridge \
101
+ --outdir artifacts/campaign_trpb
102
+ ```
103
+
104
+ `--jobs` parallelizes independent seeds with spawn processes; serial output and parallel output are
105
+ byte-identical (this is enforced by a test).
106
+
107
+ ## Other commands
108
+
109
+ - `epiquire recommend measured.csv`: rank next candidates from accumulated measurements. This is the
110
+ AL round engine exposed directly. If you pass `--if-logprobs`, epiquire CV-gates that signal by
111
+ default and uses it only if it improves held-out performance on this protein. Use
112
+ `--no-auto-signals` only as an expert override. `--recombine` also adds recombinations of your
113
+ MEASURED beneficials (variants beating WT) to the candidate pool -- the strategy the real-data
114
+ evidence supports (measure broadly, then recombine confirmed wins).
115
+ - `epiquire resolve-site WT.fasta --uniprot ACC` (or `--query`): resolve active-site/binding positions
116
+ from UniProt experimental curation, sequence-aligned onto YOUR WT numbering (handles transit-peptide
117
+ / species offsets), with provenance + confidence per position. If UniProt features do not map it
118
+ falls back to a UniProt-linked PDB cocrystal (active-site shells); `--pdb ID` forces the structure
119
+ path. If neither maps, it fetches the AlphaFold model (predicted, apo). Optional `--apo-pocket`
120
+ (fpocket) guesses pocket positions but is LOW-confidence -- it missed the catalytic site on ispS,
121
+ so it carries a warning and is not the default. The command abstains rather than guessing positions -- in that case provide
122
+ `--holo`/`--positions`. Feed resolved positions to `campaign`.
123
+ - `epiquire funclib WT.fasta ...`: build only the active-site R0 measurement library. Useful when you
124
+ want the seed plate without the campaign wrapper.
125
+ - `epiquire advise measured.csv`: readiness check on measured data.
126
+ - `epiquire bench`: synthetic alpha-spectrum benchmark.
127
+ - `epiquire report`: summarize a JSON report.
128
+
129
+ ## What the opt-ins mean
130
+
131
+ Most optional flags are not feature clutter. They are either:
132
+
133
+ - **resources you must provide**: MSA, PDB, inverse-folding table, ddG table, PLM cache; or
134
+ - **expert overrides** for known regimes.
135
+
136
+ The default stays conservative because low-N model selection overfits. When epiquire can decide from
137
+ this protein's measured data, it should decide itself: optional IF signals are now CV-gated by default;
138
+ pairwise epistasis has its own data gate (`--auto-gate-pairwise`). See `docs/OPTIN_AUDIT.md`.
139
+
140
+ ## Honest scope
141
+
142
+ Evidence so far supports these claims:
143
+
144
+ - active-site FuncLib-style seeds enrich measurable libraries and improve low-budget discovery;
145
+ - the global activity winner can be outside the seed, so AL must search beyond the seed;
146
+ - stability/ddG signals are best used as constraints, while activity improvement is measurement-bound;
147
+ - high-order activity epistasis makes zero-shot winner prediction unreliable.
148
+
149
+ Evidence does **not** support: "epiquire predicts the best protein from sequence/structure alone." It
150
+ increases the probability and efficiency of finding a better variant per measured plate, conditional on
151
+ the design space and assay.
152
+
153
+ ## When NOT to use epiquire
154
+
155
+ The focused active-site campaign helps only when the wins plausibly live in a region you can name
156
+ and measure combinatorially. Do NOT reach for it when:
157
+
158
+ - you do not know where beneficial mutations are (no mechanistic/structural prior on the target site);
159
+ - the wins are likely distal/distributed across the protein (use a broad whole-protein single-mutant
160
+ scan instead -- that is a different tool class; epiquire's additive default cannot rank
161
+ whole-protein singles because one-hot has no cross-position transfer);
162
+ - you cannot measure a focused combinatorial library (no assay throughput at the chosen site).
163
+
164
+ Real-data caution (PtIspS, `artifacts/ispS_wetlab_validation.md`): a campaign run as a broad
165
+ single-mutant scan with the active site barely sampled and the global winner distal is exactly the
166
+ regime epiquire is NOT for. Pick broad-vs-focused from your biology; epiquire does not decide it.
167
+
168
+ More detail: `ARCHITECTURE.md`, `artifacts/DECISION_EVIDENCE.md`, and the Obsidian decision log named
169
+ there.
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "epiquire"
7
+ version = "0.1.0"
8
+ description = "Low-N, acquisition-driven, epistasis-aware protein mutation recommendation engine"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "gyuminlee-repo" }]
12
+ keywords = [
13
+ "directed-evolution",
14
+ "protein-engineering",
15
+ "active-learning",
16
+ "bayesian-optimization",
17
+ "epistasis",
18
+ "mutation-recommendation",
19
+ ]
20
+ classifiers = [
21
+ "Development Status :: 4 - Beta",
22
+ "Intended Audience :: Science/Research",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
28
+ "Operating System :: OS Independent",
29
+ ]
30
+ dependencies = [
31
+ "numpy",
32
+ "scipy",
33
+ "scikit-learn",
34
+ "pandas",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ plm = ["torch", "esm"] # ESM-C (EvolutionaryScale) + on-demand embeddings
39
+ plm2 = ["fair-esm"] # ESM-2 (Meta) alternative provider
40
+ dev = ["pytest", "pyright", "build", "twine"]
41
+
42
+ [project.scripts]
43
+ epiquire = "epiquire.cli:main"
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
47
+
48
+ [tool.pytest.ini_options]
49
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,18 @@
1
+ """epiquire: low-N, acquisition-driven, epistasis-aware mutation recommendation."""
2
+ from __future__ import annotations
3
+
4
# Keep in sync with ``version`` in pyproject.toml (the published release is 0.1.0).
__version__ = "0.1.0"
5
+
6
+ from epiquire.funclib import (
7
+ FuncLibLibrary,
8
+ combine_tolerances,
9
+ design_library,
10
+ tolerance_from_logprob_table,
11
+ )
12
+
13
+ __all__ = [
14
+ "FuncLibLibrary",
15
+ "combine_tolerances",
16
+ "design_library",
17
+ "tolerance_from_logprob_table",
18
+ ]
@@ -0,0 +1,135 @@
1
+ """Acquisition layer: the experimental-design lever (the tool's organizing axis).
2
+
3
+ Score-based acquisitions (Greedy / UCB / MaxVariance) rank candidates by a scalar;
4
+ ``select`` returns the top-k. DiverseBatch wraps a score acquisition and applies
5
+ KURO greedy-maximin so a batch is both high-value and spread out (no near-duplicate
6
+ wells). All satisfy ``protocols.Acquisition`` and compose freely:
7
+
8
+ DiverseBatch(UCB(beta=1.0)) # diverse, uncertainty-aware exploitation
9
+ DiverseBatch(MaxVariance()) # diverse pure exploration (resolve uncertainty)
10
+ Greedy() # pure exploitation (the additive-greedy baseline)
11
+
12
+ UCB / MaxVariance are only meaningful because the surrogate std is CALIBRATED (P1).
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from abc import ABC, abstractmethod
17
+ from collections.abc import Sequence
18
+
19
+ import numpy as np
20
+
21
+ from epiquire.design_space import DesignSpace, to_assignment
22
+ from epiquire.types import Prediction
23
+
24
+
25
class _ScoreAcquisition(ABC):
    """Acquisitions that rank candidates by a per-candidate scalar score.

    Subclasses implement ``score``; ``select`` turns those scores into the
    indices of the best ``k`` candidates.
    """

    @abstractmethod
    def score(self, pred: Prediction) -> np.ndarray:
        """Per-candidate acquisition value (higher = more desirable)."""

    def select(
        self,
        pred: Prediction,
        k: int,
        *,
        candidates: Sequence[str] | None = None,
        space: DesignSpace | None = None,
    ) -> list[int]:
        """Return the indices of the ``k`` highest-scoring candidates, best first.

        ``candidates`` and ``space`` are accepted only so that this method can be
        called the same way as ``DiverseBatch.select``; they are not used here.

        Returns an empty list when ``k <= 0`` or there are no scores. ``k`` is
        clipped to the number of scores. NaN scores are ranked last, never first.
        """
        s = np.asarray(self.score(pred), dtype=float)
        if k <= 0 or s.size == 0:
            return []
        # np.argsort sorts NaN to the END of the ascending order, so the reversed
        # order below would otherwise put NaN-scored candidates at the top of the
        # ranking. Demote them to -inf; NaN-free input is ranked exactly as before.
        s = np.where(np.isnan(s), -np.inf, s)
        k = min(k, s.size)
        return np.argsort(s)[::-1][:k].astype(int).tolist()
45
+
46
+
47
class Greedy(_ScoreAcquisition):
    """Exploitation-only acquisition (the additive-greedy lever).

    Each candidate's score is simply its predicted mean.
    """

    def score(self, pred: Prediction) -> np.ndarray:
        predicted_mean = np.asarray(pred.mean, dtype=float)
        return predicted_mean
52
+
53
+
54
class UCB(_ScoreAcquisition):
    """Upper-confidence-bound acquisition.

    Each candidate's score is ``mean + beta * std``, where ``std`` is the
    calibrated predictive standard deviation and ``beta`` weights exploration.
    """

    def __init__(self, beta: float = 1.0) -> None:
        self.beta = beta

    def score(self, pred: Prediction) -> np.ndarray:
        mean = np.asarray(pred.mean, dtype=float)
        std = np.asarray(pred.std, dtype=float)
        return mean + self.beta * std
62
+
63
+
64
class MaxVariance(_ScoreAcquisition):
    """Pure-exploration, information-targeted (BALD-style) acquisition.

    Each candidate's score is its calibrated predictive std, so the batch goes
    where the model is least certain -- i.e. where a measurement most reduces
    predictive uncertainty. With calibrated std this is a principled exploration
    lever (and the seed of resolving epistasis once interaction uncertainty is
    exposed).
    """

    def score(self, pred: Prediction) -> np.ndarray:
        predictive_std = np.asarray(pred.std, dtype=float)
        return predictive_std
74
+
75
+
76
def _combo_vectors(candidates: Sequence[str], space: DesignSpace) -> list[tuple[str, ...]]:
    """Turn each candidate into a tuple of residues ordered by ``space.positions``.

    The tuples are the inputs to the Hamming-distance computation over the
    design space.
    """

    def _as_vector(variant: str) -> tuple[str, ...]:
        # One residue per design position, in the order given by the design space.
        assignment = to_assignment(variant, space)
        return tuple(assignment[pos] for pos in space.positions)

    return [_as_vector(variant) for variant in candidates]
83
+
84
+
85
+ def _hamming(a: tuple[str, ...], b: tuple[str, ...]) -> int:
86
+ return sum(1 for x, y in zip(a, b, strict=True) if x != y)
87
+
88
+
89
class DiverseBatch:
    """KURO greedy-maximin diversity over a top-scored pool of a base acquisition.

    Ranks all candidates by the base acquisition, keeps the top ``pool_factor * k``
    (never fewer than ``k``), then greedily builds a batch that maximizes the
    minimum design-space (Hamming) distance to the already-chosen set, seeded by
    the single best-scoring candidate. The result is a high-value batch that
    favors spread-out wells over near-duplicates.

    NaN base scores are ranked last, so a NaN-scored candidate can never seed the
    batch ahead of a candidate with a real score.
    """

    def __init__(self, base: _ScoreAcquisition, pool_factor: float = 4.0) -> None:
        self.base = base
        self.pool_factor = pool_factor

    def select(
        self,
        pred: Prediction,
        k: int,
        *,
        candidates: Sequence[str],
        space: DesignSpace,
    ) -> list[int]:
        """Return candidate indices of a diverse batch of up to ``k`` wells.

        The first index is the best base-scored candidate; the rest follow in
        greedy-maximin order. Returns an empty list when there are no candidates
        or ``k <= 0``. ``k`` is clipped to ``len(candidates)``.
        """
        n = len(candidates)
        if n == 0 or k <= 0:
            return []
        k = min(k, n)
        scores = np.asarray(self.base.score(pred), dtype=float)
        # np.argsort sorts NaN to the END of the ascending order, so reversing it
        # would put NaN-scored candidates at the head of the pool (and make one of
        # them the seed). Demote NaN to -inf; NaN-free input is ranked as before.
        scores = np.where(np.isnan(scores), -np.inf, scores)
        order = np.argsort(scores)[::-1]
        pool_size = min(n, max(k, int(self.pool_factor * k)))
        pool: list[int] = [int(j) for j in order[:pool_size]]
        vecs = _combo_vectors([candidates[i] for i in pool], space)

        chosen = [0]  # local index into pool; pool[0] is the highest base score
        # mind[t] = smallest Hamming distance from pool entry t to any chosen entry;
        # -1 marks entries that are already in the batch.
        mind = [_hamming(vecs[j], vecs[0]) for j in range(len(pool))]
        mind[0] = -1
        while len(chosen) < k:
            j = int(np.argmax(mind))  # max-min distance; ties -> lowest index = higher base score
            if mind[j] < 0:  # every pool entry is already chosen
                break
            chosen.append(j)
            mind[j] = -1
            vj = vecs[j]
            # Fold the newly chosen point into each remaining entry's min-distance.
            for t in range(len(pool)):
                if mind[t] >= 0:
                    d = _hamming(vecs[t], vj)
                    if d < mind[t]:
                        mind[t] = d
        return [pool[j] for j in chosen]