evoseq 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evoseq-0.1.0/LICENSE +21 -0
- evoseq-0.1.0/PKG-INFO +230 -0
- evoseq-0.1.0/README.md +201 -0
- evoseq-0.1.0/evoseq/__init__.py +3 -0
- evoseq-0.1.0/evoseq/cli.py +21 -0
- evoseq-0.1.0/evoseq/config.py +107 -0
- evoseq-0.1.0/evoseq/paths.py +43 -0
- evoseq-0.1.0/evoseq/preprocess/__init__.py +13 -0
- evoseq-0.1.0/evoseq/preprocess/discovery.py +108 -0
- evoseq-0.1.0/evoseq/preprocess/export.py +80 -0
- evoseq-0.1.0/evoseq/preprocess/fasta.py +29 -0
- evoseq-0.1.0/evoseq/preprocess/manifest.py +65 -0
- evoseq-0.1.0/evoseq/preprocess/pipeline.py +259 -0
- evoseq-0.1.0/evoseq/preprocess/validation.py +48 -0
- evoseq-0.1.0/evoseq/preprocess/variant.py +73 -0
- evoseq-0.1.0/evoseq/scoring/__init__.py +18 -0
- evoseq-0.1.0/evoseq/scoring/environment.py +58 -0
- evoseq-0.1.0/evoseq/scoring/evo2_model.py +105 -0
- evoseq-0.1.0/evoseq/scoring/export.py +51 -0
- evoseq-0.1.0/evoseq/scoring/perbase.py +75 -0
- evoseq-0.1.0/evoseq/scoring/pipeline.py +456 -0
- evoseq-0.1.0/evoseq.egg-info/PKG-INFO +230 -0
- evoseq-0.1.0/evoseq.egg-info/SOURCES.txt +28 -0
- evoseq-0.1.0/evoseq.egg-info/dependency_links.txt +1 -0
- evoseq-0.1.0/evoseq.egg-info/entry_points.txt +2 -0
- evoseq-0.1.0/evoseq.egg-info/requires.txt +8 -0
- evoseq-0.1.0/evoseq.egg-info/top_level.txt +1 -0
- evoseq-0.1.0/pyproject.toml +48 -0
- evoseq-0.1.0/setup.cfg +4 -0
- evoseq-0.1.0/tests/test_preprocess_and_score.py +112 -0
evoseq-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Hideaki Mizoue
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
evoseq-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evoseq
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Genome analysis toolkit powered by Evo
|
|
5
|
+
Author: Hideaki Mizoue
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mizomizo1/EvoSeq
|
|
8
|
+
Project-URL: Repository, https://github.com/mizomizo1/EvoSeq
|
|
9
|
+
Project-URL: Issues, https://github.com/mizomizo1/EvoSeq/issues
|
|
10
|
+
Keywords: bioinformatics,genomics,evo2,variant-scoring
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pandas>=2.2
|
|
22
|
+
Requires-Dist: numpy>=2.0
|
|
23
|
+
Requires-Dist: biopython>=1.85
|
|
24
|
+
Requires-Dist: tqdm>=4.66
|
|
25
|
+
Provides-Extra: evo2
|
|
26
|
+
Requires-Dist: evo2; extra == "evo2"
|
|
27
|
+
Requires-Dist: torch; extra == "evo2"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# EvoSeq
|
|
31
|
+
|
|
32
|
+
EvoSeq is a small Colab-friendly toolkit for preparing paired reference/mutant
|
|
33
|
+
FASTA files and scoring variants with Evo2.
|
|
34
|
+
|
|
35
|
+
It is designed for the common research workflow where positive datasets have a
|
|
36
|
+
`manifest.tsv`, negative datasets may only have paired FASTA files, and the same
|
|
37
|
+
Evo2 model should stay loaded once per Colab runtime.
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
For local testing from this repository:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -e .
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
For Evo2 scoring dependencies:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e ".[evo2]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
In Google Colab, Evo2 often needs a runtime-specific install. Use this before
|
|
54
|
+
scoring:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip uninstall -y torchvision
|
|
58
|
+
pip install -q torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128
|
|
59
|
+
pip install -q flash-attn==2.8.0.post2 --no-build-isolation
|
|
60
|
+
pip install -q evo2
|
|
61
|
+
pip install -e .
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
After a GitHub Release is tagged, users can install a specific version directly:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install "git+https://github.com/mizomizo1/EvoSeq.git@v0.1.0"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
For Evo2 scoring in Colab, install Evo2 and GPU dependencies in the runtime that
|
|
71
|
+
matches your model. The preprocessing step only needs the base dependencies.
|
|
72
|
+
|
|
73
|
+
## Debug / Test
|
|
74
|
+
|
|
75
|
+
Run the local workflow tests without Evo2, torch, or flash-attn:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
python -m unittest discover -s tests -v
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
These tests cover preprocessing, folder discovery, score-table export with a
|
|
82
|
+
fake scorer, and the missing Evo2 dependency message. Real Evo2 scoring still
|
|
83
|
+
requires a Colab GPU runtime with `torch`, `flash-attn`, and `evo2` installed.
|
|
84
|
+
|
|
85
|
+
## Quick Start: Preprocessing Files
|
|
86
|
+
|
|
87
|
+
Put files anywhere, for example in `test/`, and pass the files directly:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from evoseq.preprocess import preprocess_files
|
|
91
|
+
|
|
92
|
+
evo_df, paths = preprocess_files(
|
|
93
|
+
reference_fasta_path="test/evo2_reference.fasta",
|
|
94
|
+
mutant_fasta_path="test/evo2_mutant.fasta",
|
|
95
|
+
manifest_path="auto",
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
By default, outputs are written next to the input files:
|
|
100
|
+
`test/evoseq_preprocess_output/`.
|
|
101
|
+
|
|
102
|
+
You can also be explicit:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
evo_df, paths = preprocess_files(
|
|
106
|
+
reference_fasta_path="test/evo2_reference.fasta",
|
|
107
|
+
mutant_fasta_path="test/evo2_mutant.fasta",
|
|
108
|
+
output_dir="outputs/preprocessing",
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Outputs include:
|
|
113
|
+
|
|
114
|
+
- `evo2_pairs.tsv`: one row per variant with `ref_seq` and `mut_seq`
|
|
115
|
+
- `evo2_reference.fa`
|
|
116
|
+
- `evo2_mutant.fa`
|
|
117
|
+
- `evo2_all.fa`
|
|
118
|
+
- `preprocessing_report.tsv`
|
|
119
|
+
|
|
120
|
+
`manifest.tsv` is optional. When present, metadata are merged by `record_id`.
|
|
121
|
+
When absent, metadata are inferred from FASTA IDs when possible.
|
|
122
|
+
|
|
123
|
+
## Quick Start: Preprocessing a Folder
|
|
124
|
+
|
|
125
|
+
If your folder contains paired FASTA files, EvoSeq can discover them:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from evoseq.preprocess import preprocess_folder
|
|
129
|
+
|
|
130
|
+
evo_df, paths = preprocess_folder("test")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Quick Start: Evo2 Scoring
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from evoseq.scoring import score_pairs_file
|
|
137
|
+
|
|
138
|
+
result_df, result_paths = score_pairs_file(
|
|
139
|
+
pairs_path="test/evoseq_preprocess_output/evo2_pairs.tsv",
|
|
140
|
+
model_name="evo2_7b",
|
|
141
|
+
batch_size=8,
|
|
142
|
+
)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
By default, outputs are written next to the pair table:
|
|
146
|
+
`test/evoseq_preprocess_output/evoseq_scoring_output/`.
|
|
147
|
+
|
|
148
|
+
Use `output_dir="outputs/scoring"` if you want a project-level result folder. Either way, the scoring step writes:
|
|
149
|
+
|
|
150
|
+
- `evo2_variant_scores_unique.tsv`
|
|
151
|
+
- `evo2_variant_scores_manifest.tsv` when a manifest is available
|
|
152
|
+
- `environment_info.tsv`
|
|
153
|
+
- `scoring_report.tsv`
|
|
154
|
+
|
|
155
|
+
Reference sequences are scored once per unique sequence and reused. This is
|
|
156
|
+
useful when many variants share the same reference window.
|
|
157
|
+
|
|
158
|
+
## Model Handling
|
|
159
|
+
|
|
160
|
+
EvoSeq caches the loaded Evo2 model inside the Python process:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from evoseq.scoring import Evo2Scorer
|
|
164
|
+
|
|
165
|
+
scorer = Evo2Scorer(model_name="evo2_7b", device="cuda:0")
|
|
166
|
+
scores = scorer.score_sequences(["ACGTACGT"])
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Calling another scoring function with the same model reuses it. Attempting to
|
|
170
|
+
load a different Evo2 model in the same runtime raises an explicit error by
|
|
171
|
+
default, because loading multiple large models often exhausts Colab GPU memory.
|
|
172
|
+
Restart the runtime when switching from 7B to 20B.
|
|
173
|
+
|
|
174
|
+
Common model names:
|
|
175
|
+
|
|
176
|
+
- `evo2_7b`
|
|
177
|
+
- `evo2_7b_base`
|
|
178
|
+
- `evo2_20b`
|
|
179
|
+
|
|
180
|
+
For local model weights:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
score_evo2_pairs(
|
|
184
|
+
base_dir=".",
|
|
185
|
+
model_name="evo2_20b",
|
|
186
|
+
local_path="/content/drive/MyDrive/Models/evo2_20b.pt",
|
|
187
|
+
)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## TOML Config
|
|
191
|
+
|
|
192
|
+
Copy `evoseq.example.toml`, edit the input paths/model, and run:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
from evoseq import run_from_config
|
|
196
|
+
|
|
197
|
+
outputs = run_from_config("evoseq.example.toml")
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
or:
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
evoseq-run evoseq.example.toml
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Per-Base Log-Probabilities
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from evoseq.scoring import export_perbase_logprobs
|
|
210
|
+
|
|
211
|
+
path = export_perbase_logprobs(
|
|
212
|
+
fasta_path="test/representative_perbase.fasta",
|
|
213
|
+
model_name="evo2_7b",
|
|
214
|
+
center=4096,
|
|
215
|
+
half_window=320,
|
|
216
|
+
)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
By default, this writes `test/evoseq_perbase_output/perbase_logprobs.tsv`.
|
|
220
|
+
|
|
221
|
+
## Reproducibility
|
|
222
|
+
|
|
223
|
+
EvoSeq writes small TSV reports for methods sections and reruns:
|
|
224
|
+
|
|
225
|
+
- input paths and output paths
|
|
226
|
+
- number of variants and unique reference sequences
|
|
227
|
+
- model name, batch size, device, elapsed time
|
|
228
|
+
- Python, PyTorch, CUDA, GPU, NumPy, pandas, Biopython, and Evo2 versions
|
|
229
|
+
|
|
230
|
+
These files are meant to be saved with each analysis directory.
|
evoseq-0.1.0/README.md
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# EvoSeq
|
|
2
|
+
|
|
3
|
+
EvoSeq is a small Colab-friendly toolkit for preparing paired reference/mutant
|
|
4
|
+
FASTA files and scoring variants with Evo2.
|
|
5
|
+
|
|
6
|
+
It is designed for the common research workflow where positive datasets have a
|
|
7
|
+
`manifest.tsv`, negative datasets may only have paired FASTA files, and the same
|
|
8
|
+
Evo2 model should stay loaded once per Colab runtime.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
For local testing from this repository:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install -e .
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
For Evo2 scoring dependencies:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install -e ".[evo2]"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
In Google Colab, Evo2 often needs a runtime-specific install. Use this before
|
|
25
|
+
scoring:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip uninstall -y torchvision
|
|
29
|
+
pip install -q torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128
|
|
30
|
+
pip install -q flash-attn==2.8.0.post2 --no-build-isolation
|
|
31
|
+
pip install -q evo2
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
After a GitHub Release is tagged, users can install a specific version directly:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install "git+https://github.com/mizomizo1/EvoSeq.git@v0.1.0"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For Evo2 scoring in Colab, install Evo2 and GPU dependencies in the runtime that
|
|
42
|
+
matches your model. The preprocessing step only needs the base dependencies.
|
|
43
|
+
|
|
44
|
+
## Debug / Test
|
|
45
|
+
|
|
46
|
+
Run the local workflow tests without Evo2, torch, or flash-attn:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
python -m unittest discover -s tests -v
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
These tests cover preprocessing, folder discovery, score-table export with a
|
|
53
|
+
fake scorer, and the missing Evo2 dependency message. Real Evo2 scoring still
|
|
54
|
+
requires a Colab GPU runtime with `torch`, `flash-attn`, and `evo2` installed.
|
|
55
|
+
|
|
56
|
+
## Quick Start: Preprocessing Files
|
|
57
|
+
|
|
58
|
+
Put files anywhere, for example in `test/`, and pass the files directly:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from evoseq.preprocess import preprocess_files
|
|
62
|
+
|
|
63
|
+
evo_df, paths = preprocess_files(
|
|
64
|
+
reference_fasta_path="test/evo2_reference.fasta",
|
|
65
|
+
mutant_fasta_path="test/evo2_mutant.fasta",
|
|
66
|
+
manifest_path="auto",
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
By default, outputs are written next to the input files:
|
|
71
|
+
`test/evoseq_preprocess_output/`.
|
|
72
|
+
|
|
73
|
+
You can also be explicit:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
evo_df, paths = preprocess_files(
|
|
77
|
+
reference_fasta_path="test/evo2_reference.fasta",
|
|
78
|
+
mutant_fasta_path="test/evo2_mutant.fasta",
|
|
79
|
+
output_dir="outputs/preprocessing",
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Outputs include:
|
|
84
|
+
|
|
85
|
+
- `evo2_pairs.tsv`: one row per variant with `ref_seq` and `mut_seq`
|
|
86
|
+
- `evo2_reference.fa`
|
|
87
|
+
- `evo2_mutant.fa`
|
|
88
|
+
- `evo2_all.fa`
|
|
89
|
+
- `preprocessing_report.tsv`
|
|
90
|
+
|
|
91
|
+
`manifest.tsv` is optional. When present, metadata are merged by `record_id`.
|
|
92
|
+
When absent, metadata are inferred from FASTA IDs when possible.
|
|
93
|
+
|
|
94
|
+
## Quick Start: Preprocessing a Folder
|
|
95
|
+
|
|
96
|
+
If your folder contains paired FASTA files, EvoSeq can discover them:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from evoseq.preprocess import preprocess_folder
|
|
100
|
+
|
|
101
|
+
evo_df, paths = preprocess_folder("test")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Quick Start: Evo2 Scoring
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from evoseq.scoring import score_pairs_file
|
|
108
|
+
|
|
109
|
+
result_df, result_paths = score_pairs_file(
|
|
110
|
+
pairs_path="test/evoseq_preprocess_output/evo2_pairs.tsv",
|
|
111
|
+
model_name="evo2_7b",
|
|
112
|
+
batch_size=8,
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
By default, outputs are written next to the pair table:
|
|
117
|
+
`test/evoseq_preprocess_output/evoseq_scoring_output/`.
|
|
118
|
+
|
|
119
|
+
Use `output_dir="outputs/scoring"` if you want a project-level result folder. Either way, the scoring step writes:
|
|
120
|
+
|
|
121
|
+
- `evo2_variant_scores_unique.tsv`
|
|
122
|
+
- `evo2_variant_scores_manifest.tsv` when a manifest is available
|
|
123
|
+
- `environment_info.tsv`
|
|
124
|
+
- `scoring_report.tsv`
|
|
125
|
+
|
|
126
|
+
Reference sequences are scored once per unique sequence and reused. This is
|
|
127
|
+
useful when many variants share the same reference window.
|
|
128
|
+
|
|
129
|
+
## Model Handling
|
|
130
|
+
|
|
131
|
+
EvoSeq caches the loaded Evo2 model inside the Python process:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from evoseq.scoring import Evo2Scorer
|
|
135
|
+
|
|
136
|
+
scorer = Evo2Scorer(model_name="evo2_7b", device="cuda:0")
|
|
137
|
+
scores = scorer.score_sequences(["ACGTACGT"])
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Calling another scoring function with the same model reuses it. Attempting to
|
|
141
|
+
load a different Evo2 model in the same runtime raises an explicit error by
|
|
142
|
+
default, because loading multiple large models often exhausts Colab GPU memory.
|
|
143
|
+
Restart the runtime when switching from 7B to 20B.
|
|
144
|
+
|
|
145
|
+
Common model names:
|
|
146
|
+
|
|
147
|
+
- `evo2_7b`
|
|
148
|
+
- `evo2_7b_base`
|
|
149
|
+
- `evo2_20b`
|
|
150
|
+
|
|
151
|
+
For local model weights:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
score_evo2_pairs(
|
|
155
|
+
base_dir=".",
|
|
156
|
+
model_name="evo2_20b",
|
|
157
|
+
local_path="/content/drive/MyDrive/Models/evo2_20b.pt",
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## TOML Config
|
|
162
|
+
|
|
163
|
+
Copy `evoseq.example.toml`, edit the input paths/model, and run:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from evoseq import run_from_config
|
|
167
|
+
|
|
168
|
+
outputs = run_from_config("evoseq.example.toml")
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
or:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
evoseq-run evoseq.example.toml
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Per-Base Log-Probabilities
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from evoseq.scoring import export_perbase_logprobs
|
|
181
|
+
|
|
182
|
+
path = export_perbase_logprobs(
|
|
183
|
+
fasta_path="test/representative_perbase.fasta",
|
|
184
|
+
model_name="evo2_7b",
|
|
185
|
+
center=4096,
|
|
186
|
+
half_window=320,
|
|
187
|
+
)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
By default, this writes `test/evoseq_perbase_output/perbase_logprobs.tsv`.
|
|
191
|
+
|
|
192
|
+
## Reproducibility
|
|
193
|
+
|
|
194
|
+
EvoSeq writes small TSV reports for methods sections and reruns:
|
|
195
|
+
|
|
196
|
+
- input paths and output paths
|
|
197
|
+
- number of variants and unique reference sequences
|
|
198
|
+
- model name, batch size, device, elapsed time
|
|
199
|
+
- Python, PyTorch, CUDA, GPU, NumPy, pandas, Biopython, and Evo2 versions
|
|
200
|
+
|
|
201
|
+
These files are meant to be saved with each analysis directory.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from .config import run_from_config
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main(argv=None):
    """Command-line entry point: run EvoSeq from a TOML config file.

    Args:
        argv: Optional list of argument strings. When ``None``, argparse
            reads the arguments from ``sys.argv``.

    Prints a completion banner and then, for every result key ending in
    ``_paths``, the name/path pairs stored under that key.
    """
    cli = argparse.ArgumentParser(description="Run EvoSeq from a TOML config.")
    cli.add_argument("config", help="Path to an EvoSeq TOML config file.")
    parsed = cli.parse_args(argv)

    results = run_from_config(parsed.config)
    print("EvoSeq run completed.")
    for section, entries in results.items():
        # Only the "*_paths" entries are name -> path mappings worth listing;
        # everything else in the result dict is skipped.
        if not section.endswith("_paths"):
            continue
        print(f"{section}:")
        for label, location in entries.items():
            print(f" {label}: {location}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import tomllib
|
|
3
|
+
|
|
4
|
+
from .preprocess import preprocess_files, preprocess_folder
|
|
5
|
+
from .scoring import export_perbase_logprobs, score_pairs_file
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _none_if_blank(value):
    """Return ``None`` for the empty string; return any other value unchanged."""
    if value == "":
        return None
    return value
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_config(path):
|
|
13
|
+
with open(path, "rb") as fh:
|
|
14
|
+
return tomllib.load(fh)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def run_from_config(path):
    """Run the pipeline stages enabled in a TOML config and collect results.

    The config is read from the ``[project]``, ``[preprocess]``,
    ``[scoring]`` and ``[perbase]`` tables. Preprocessing runs unless
    ``preprocess.enabled`` is false; scoring and per-base export run only
    when their own ``enabled`` key is true.

    Args:
        path: Location of the TOML config file.

    Returns:
        A dict holding ``preprocess_df``/``preprocess_paths``,
        ``scoring_df``/``scoring_paths`` and ``perbase_path`` for the stages
        that ran, plus ``config_path`` (always present).

    Raises:
        ValueError: Scoring is enabled but neither ``scoring.pairs_path`` nor
            the preprocessing outputs supply a pair table.
        KeyError: Per-base export is enabled without ``perbase.fasta_path``.
    """
    settings = load_config(path)

    project_cfg = settings.get("project", {})
    prep_cfg = settings.get("preprocess", {})
    score_cfg = settings.get("scoring", {})
    perbase_cfg = settings.get("perbase", {})

    # ``input_dir`` takes precedence over ``base_dir``; "." is the default.
    folder = project_cfg.get("input_dir", project_cfg.get("base_dir", "."))

    results = {}

    if prep_cfg.get("enabled", True):
        ref_fasta = _none_if_blank(prep_cfg.get("reference_fasta_path"))
        mut_fasta = _none_if_blank(prep_cfg.get("mutant_fasta_path"))

        # Keyword arguments accepted by both preprocessing entry points.
        shared = {
            "reference_fasta_path": ref_fasta,
            "mutant_fasta_path": mut_fasta,
            "manifest_path": prep_cfg.get("manifest_path", "auto"),
            # ``output_dir`` is looked up first, with ``out_dir`` as the
            # alternate key.
            "output_dir": _none_if_blank(
                prep_cfg.get("output_dir", prep_cfg.get("out_dir"))
            ),
            "strict_manifest": prep_cfg.get("strict_manifest", False),
            "progress": prep_cfg.get("progress", True),
        }

        if ref_fasta and mut_fasta:
            # Both FASTA files were named explicitly.
            table, written = preprocess_files(**shared)
        else:
            # At least one FASTA path is missing: run the folder-based entry
            # point on the project input directory instead.
            table, written = preprocess_folder(
                input_dir=folder,
                dataset_type=prep_cfg.get("dataset_type", "auto"),
                window_size=prep_cfg.get("window_size"),
                **shared,
            )
        results["preprocess_df"] = table
        results["preprocess_paths"] = written

    if score_cfg.get("enabled", False):
        # Prefer an explicit pair table; otherwise reuse the one written by
        # the preprocessing stage of this same run, if any.
        pair_table = _none_if_blank(score_cfg.get("pairs_path"))
        if not pair_table:
            pair_table = results.get("preprocess_paths", {}).get("pairs")
        if not pair_table:
            raise ValueError(
                "scoring.enabled is true, but no pairs_path was provided and "
                "preprocessing did not produce a pair table."
            )

        score_kwargs = {
            "pairs_path": pair_table,
            "output_dir": _none_if_blank(
                score_cfg.get("output_dir", score_cfg.get("result_dir"))
            ),
            "manifest_path": score_cfg.get("manifest_path", "auto"),
            "model_name": score_cfg.get("model_name", "evo2_7b"),
            "device": score_cfg.get("device", "cuda:0"),
            "local_path": _none_if_blank(score_cfg.get("local_path")),
            "batch_size": score_cfg.get("batch_size", 8),
            "force_reload": score_cfg.get("force_reload", False),
            "require_recommended_gpu": score_cfg.get("require_recommended_gpu", True),
            "progress": score_cfg.get("progress", True),
        }
        score_table, score_paths = score_pairs_file(**score_kwargs)
        results["scoring_df"] = score_table
        results["scoring_paths"] = score_paths

    if perbase_cfg.get("enabled", False):
        results["perbase_path"] = export_perbase_logprobs(
            # Required key: a missing ``fasta_path`` surfaces as KeyError.
            fasta_path=perbase_cfg["fasta_path"],
            output_path=_none_if_blank(perbase_cfg.get("output_path")),
            output_dir=_none_if_blank(perbase_cfg.get("output_dir")),
            model_name=perbase_cfg.get("model_name", "evo2_7b"),
            device=perbase_cfg.get("device", "cuda:0"),
            center=perbase_cfg.get("center", 4096),
            half_window=perbase_cfg.get("half_window", 320),
            local_path=_none_if_blank(perbase_cfg.get("local_path")),
            progress=perbase_cfg.get("progress", True),
        )

    results["config_path"] = Path(path)
    return results
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def common_parent(paths):
    """Return the deepest directory shared by the parents of *paths*.

    Falsy entries (``None``, ``""``) are ignored. Each remaining path is
    user-expanded and resolved, and its parent directory is taken.

    Returns:
        The current working directory when no usable path is given, the
        single parent when there is exactly one, and otherwise the common
        path of all parents.
    """
    import os

    parents = []
    for entry in paths:
        if entry:
            parents.append(Path(entry).expanduser().resolve().parent)

    if not parents:
        return Path.cwd()
    if len(parents) > 1:
        return Path(os.path.commonpath(list(map(str, parents))))
    return parents[0]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def default_output_dir(kind, *input_paths, base_dir=None):
    """Build the default output directory path for one pipeline stage.

    Args:
        kind: Stage label. ``"preprocess"``, ``"scoring"`` and ``"perbase"``
            have fixed folder names; any other value yields
            ``evoseq_<kind>_output``.
        *input_paths: Input files used to locate the output folder when
            ``base_dir`` is not given.
        base_dir: Optional directory that should contain the output folder.

    Returns:
        ``base_dir / <folder>`` when ``base_dir`` is truthy, otherwise
        ``common_parent(input_paths) / <folder>``. Nothing is created on disk.
    """
    known_folders = {
        "preprocess": "evoseq_preprocess_output",
        "scoring": "evoseq_scoring_output",
        "perbase": "evoseq_perbase_output",
    }
    if kind in known_folders:
        folder_name = known_folders[kind]
    else:
        folder_name = f"evoseq_{kind}_output"

    root = Path(base_dir) if base_dir else common_parent(input_paths)
    return root / folder_name
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def ensure_output_dir(path, fallback="/content/evoseq_output"):
    """Create *path* and confirm it is writable, else fall back.

    The directory is created (with parents) and probed by writing and then
    removing a ``.write_test`` file. If any of those steps raises
    ``OSError``, a warning is printed and the *fallback* directory is
    created and returned instead.

    Args:
        path: Preferred output directory.
        fallback: Directory to use when *path* cannot be created or written.

    Returns:
        ``Path(path)`` on success, otherwise ``Path(fallback)``.
    """
    target = Path(path)
    try:
        target.mkdir(parents=True, exist_ok=True)
        # Creating the directory does not prove it is writable, so write and
        # delete a small probe file.
        probe = target / ".write_test"
        probe.write_text("ok")
        probe.unlink(missing_ok=True)
    except OSError as exc:
        substitute = Path(fallback)
        print(f"Warning: cannot use output directory {target} ({exc}).")
        print(f"Using fallback output directory: {substitute}")
        substitute.mkdir(parents=True, exist_ok=True)
        return substitute
    return target
|