supremo-lite 0.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supremo_lite-0.5.4/LICENSE +22 -0
- supremo_lite-0.5.4/PKG-INFO +216 -0
- supremo_lite-0.5.4/README.md +194 -0
- supremo_lite-0.5.4/pyproject.toml +37 -0
- supremo_lite-0.5.4/src/supremo_lite/__init__.py +59 -0
- supremo_lite-0.5.4/src/supremo_lite/chromosome_utils.py +322 -0
- supremo_lite-0.5.4/src/supremo_lite/core.py +41 -0
- supremo_lite-0.5.4/src/supremo_lite/mock_models/__init__.py +110 -0
- supremo_lite-0.5.4/src/supremo_lite/mock_models/testmodel_1d.py +184 -0
- supremo_lite-0.5.4/src/supremo_lite/mock_models/testmodel_2d.py +203 -0
- supremo_lite-0.5.4/src/supremo_lite/mutagenesis.py +414 -0
- supremo_lite-0.5.4/src/supremo_lite/personalize.py +3098 -0
- supremo_lite-0.5.4/src/supremo_lite/prediction_alignment.py +1014 -0
- supremo_lite-0.5.4/src/supremo_lite/sequence_utils.py +137 -0
- supremo_lite-0.5.4/src/supremo_lite/variant_utils.py +1645 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Gladstone Institutes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: supremo_lite
|
|
3
|
+
Version: 0.5.4
|
|
4
|
+
Summary: A lightweight memory first, model agnostic version of SuPreMo
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Natalie Gill
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Provides-Extra: fast
|
|
18
|
+
Requires-Dist: brisket (>=0.1.2) ; extra == "fast"
|
|
19
|
+
Requires-Dist: pandas (>=1.5.0)
|
|
20
|
+
Requires-Dist: pyfaidx (>=0.7.0)
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# supremo_lite
|
|
24
|
+
|
|
25
|
+
A lightweight memory-first, model-agnostic version of [SuPreMo](https://github.com/ketringjoni/SuPreMo).
|
|
26
|
+
|
|
27
|
+
## Key Features
|
|
28
|
+
|
|
29
|
+
- 🧬 **Personalized Genome Generation**: Apply variants from VCF files to reference genomes
|
|
30
|
+
- 🎯 **Variant-Centered Sequences**: Generate sequence windows around variants
|
|
31
|
+
- ✂️ **PAM Site Analysis**: Identify variants that disrupt CRISPR PAM sites
|
|
32
|
+
- 🧪 **Saturation Mutagenesis**: Systematic single-nucleotide mutations at every position for predictive modeling
|
|
33
|
+
- 🧠 **Memory Efficient**: Chunked processing for large VCF files
|
|
34
|
+
- 🗺️ **Chromosome Matching**: Optional handling of chromosome naming differences (chr1 ↔ 1, chrM ↔ MT) via `auto_map_chromosomes=True`
|
|
35
|
+
- ⚡ **PyTorch Integration**: Automatic tensor support when PyTorch is available
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
### Install from GitHub (Recommended)
|
|
40
|
+
|
|
41
|
+
For the latest features and bug fixes:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Install the latest release from PyPI
|
|
45
|
+
pip install supremo_lite
|
|
46
|
+
|
|
47
|
+
# Or install a specific version/tag
|
|
48
|
+
pip install git+https://github.com/gladstone-institutes/supremo_lite.git@v0.5.0
|
|
49
|
+
|
|
50
|
+
# Or install from a specific branch
|
|
51
|
+
pip install git+https://github.com/gladstone-institutes/supremo_lite.git@main
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Dependencies
|
|
55
|
+
|
|
56
|
+
Required dependencies will be installed automatically:
|
|
57
|
+
- `pandas` - For VCF data handling
|
|
58
|
+
- `numpy` - For numerical operations
|
|
59
|
+
- `pyfaidx` - For FASTA file reading
|
|
60
|
+
|
|
61
|
+
Optional dependencies:
|
|
62
|
+
- `torch` - For PyTorch tensor support (automatically detected)
|
|
63
|
+
- [brisket](https://github.com/gladstone-institutes/brisket) - Cython-powered, faster one-hot encoding for DNA sequences (automatically detected)
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
import supremo_lite as sl
|
|
69
|
+
from pyfaidx import Fasta
|
|
70
|
+
|
|
71
|
+
# Load reference genome and variants
|
|
72
|
+
reference = Fasta('hg38.fa')
|
|
73
|
+
variants = sl.read_vcf('variants.vcf')
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### DNA Sequence Encoding
|
|
77
|
+
|
|
78
|
+
supremo_lite uses **one-hot encoding** by default:
|
|
79
|
+
- `A` = `[1,0,0,0]`, `C` = `[0,1,0,0]`, `G` = `[0,0,1,0]`, `T` = `[0,0,0,1]`
|
|
80
|
+
- Ambiguous bases = `[0,0,0,0]`
|
|
81
|
+
- Returns PyTorch tensors when available, otherwise NumPy arrays
|
|
82
|
+
|
|
83
|
+
### Personalized Genome Generation
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
# Apply variants to create personalized genome
|
|
87
|
+
personal_genome = sl.get_personal_genome(
|
|
88
|
+
reference_fn=reference,
|
|
89
|
+
variants_fn=variants,
|
|
90
|
+
encode=True, # One-hot encoded (or False for strings)
|
|
91
|
+
chunk_size=10000, # Process 10k variants at a time
|
|
92
|
+
verbose=True # Show progress
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# If your VCF uses 'chr1' and reference uses '1', enable chromosome mapping
|
|
96
|
+
personal_genome = sl.get_personal_genome(
|
|
97
|
+
reference_fn=reference,
|
|
98
|
+
variants_fn=variants,
|
|
99
|
+
auto_map_chromosomes=True # Handle chromosome name differences
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**π [Full Guide: Personalized Genomes](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/personalization.md) | [Tutorial Notebook](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/02_personalized_genomes.ipynb)**
|
|
104
|
+
|
|
105
|
+
### Variant-Centered Sequences
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
# Generate reference and alternate sequences around variants
|
|
109
|
+
# Note: get_alt_ref_sequences is a generator that yields chunks
|
|
110
|
+
results = list(sl.get_alt_ref_sequences(
|
|
111
|
+
reference_fn=reference,
|
|
112
|
+
variants_fn=variants,
|
|
113
|
+
seq_len=1000,
|
|
114
|
+
encode=True
|
|
115
|
+
))
|
|
116
|
+
# Unpack from the first chunk
|
|
117
|
+
alt_seqs, ref_seqs, metadata = results[0]
|
|
118
|
+
# Returns: (n_variants, seq_len, 4) shaped arrays
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**π [Full Guide: Variant-Centered Sequences](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/variant_centered_sequences.md) | [Getting Started Notebook](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/01_getting_started.ipynb)**
|
|
122
|
+
|
|
123
|
+
### Prediction Alignment
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
# Align model predictions accounting for variant coordinate changes
|
|
127
|
+
from supremo_lite.mock_models import TestModel
|
|
128
|
+
|
|
129
|
+
model = TestModel(n_targets=2, bin_size=8, crop_length=10)
|
|
130
|
+
ref_preds = model(ref_seqs)
|
|
131
|
+
alt_preds = model(alt_seqs)
|
|
132
|
+
|
|
133
|
+
ref_aligned, alt_aligned = sl.align_predictions_by_coordinate(
|
|
134
|
+
ref_pred=ref_preds[0],
|
|
135
|
+
alt_pred=alt_preds[0],
|
|
136
|
+
metadata=metadata[0],
|
|
137
|
+
prediction_type="1D",
|
|
138
|
+
bin_size=8,
|
|
139
|
+
crop_length=10
|
|
140
|
+
)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**π [Full Guide: Prediction Alignment](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/prediction_alignment.md) | [Tutorial with Visualizations](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/03_prediction_alignment.ipynb)**
|
|
144
|
+
|
|
145
|
+
### Saturation Mutagenesis
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
# Mutate every position in a region
|
|
149
|
+
ref_seq, alt_seqs, metadata = sl.get_sm_sequences(
|
|
150
|
+
chrom='chr1',
|
|
151
|
+
start=1000,
|
|
152
|
+
end=1100, # 100 bp → 300 mutations (3 per position)
|
|
153
|
+
reference_fasta=reference
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
**π [Full Guide: Mutagenesis](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/mutagenesis.md)**
|
|
158
|
+
|
|
159
|
+
## Documentation
|
|
160
|
+
|
|
161
|
+
### π User Guides
|
|
162
|
+
Detailed documentation for each major feature:
|
|
163
|
+
- **[Personalized Genomes](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/personalization.md)** - Apply variants to genomes
|
|
164
|
+
- **[Variant-Centered Sequences](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/variant_centered_sequences.md)** - Extract sequence windows around variants
|
|
165
|
+
- **[Prediction Alignment](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/prediction_alignment.md)** - Align model predictions for variant effect analysis
|
|
166
|
+
- **[Saturation Mutagenesis](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/mutagenesis.md)** - In-silico mutagenesis workflows
|
|
167
|
+
- **[Variant Classification](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/_static/images/variant_classification.png)** - Flow chart showing automatic variant classification logic
|
|
168
|
+
|
|
169
|
+
### π Interactive Tutorials
|
|
170
|
+
Hands-on Jupyter notebooks with visualizations:
|
|
171
|
+
- **[Getting Started](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/01_getting_started.ipynb)** - Installation and basic concepts
|
|
172
|
+
- **[Personalized Genomes](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/02_personalized_genomes.ipynb)** - Genome personalization workflows
|
|
173
|
+
- **[Prediction Alignment](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/03_prediction_alignment.ipynb)** - Complete prediction workflow with visualizations β
|
|
174
|
+
|
|
175
|
+
### π API Reference
|
|
176
|
+
**Core Functions:**
|
|
177
|
+
- `get_personal_genome()` - Generate personalized genomes
|
|
178
|
+
- `get_alt_ref_sequences()` - Generate variant-centered sequences
|
|
179
|
+
- `align_predictions_by_coordinate()` - Align model predictions
|
|
180
|
+
- `get_sm_sequences()` - Saturation mutagenesis
|
|
181
|
+
- `read_vcf()` - Read VCF files
|
|
182
|
+
|
|
183
|
+
For complete API documentation with all parameters, see the [docs/](https://github.com/gladstone-institutes/supremo_lite/tree/main/docs) directory.
|
|
184
|
+
|
|
185
|
+
## Issues and Support
|
|
186
|
+
|
|
187
|
+
We welcome feedback, bug reports, and feature requests! If you encounter any issues or have suggestions for improvements, please:
|
|
188
|
+
|
|
189
|
+
1. **Check existing issues** first to see if your problem has already been reported
|
|
190
|
+
2. **File a new issue** on our [GitHub Issues page](https://github.com/gladstone-institutes/supremo_lite/issues)
|
|
191
|
+
3. **Provide detailed information** including:
|
|
192
|
+
- Python version and operating system
|
|
193
|
+
- Package version (`supremo_lite.__version__`)
|
|
194
|
+
- Complete error messages and stack traces
|
|
195
|
+
- Minimal reproducible example
|
|
196
|
+
- Expected vs. actual behavior
|
|
197
|
+
|
|
198
|
+
### Common Issues to Report
|
|
199
|
+
|
|
200
|
+
- **Performance problems** with large genomes or variant files
|
|
201
|
+
- **Unexpected behavior** with edge cases
|
|
202
|
+
- **Documentation gaps** or unclear examples
|
|
203
|
+
- **Feature requests** for new functionality
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
## Contributing
|
|
207
|
+
|
|
208
|
+
Interested in contributing? Check out the contributing guidelines. Please note that this project is released with a Code of Conduct. By contributing to this project, you agree to abide by its terms.
|
|
209
|
+
|
|
210
|
+
## License
|
|
211
|
+
|
|
212
|
+
`supremo_lite` was created by Natalie Gill and Sean Whalen, based on Sequence Mutator for Predictive Models ([SuPreMo](https://github.com/ketringjoni/SuPreMo)) by Katie Gjoni. It is licensed under the terms of the MIT license.
|
|
213
|
+
|
|
214
|
+
## Credits
|
|
215
|
+
|
|
216
|
+
`supremo_lite` was created with [`cookiecutter`](https://cookiecutter.readthedocs.io/en/latest/) and the `py-pkgs-cookiecutter` [template](https://github.com/py-pkgs/py-pkgs-cookiecutter).
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# supremo_lite
|
|
2
|
+
|
|
3
|
+
A lightweight memory-first, model-agnostic version of [SuPreMo](https://github.com/ketringjoni/SuPreMo).
|
|
4
|
+
|
|
5
|
+
## Key Features
|
|
6
|
+
|
|
7
|
+
- 🧬 **Personalized Genome Generation**: Apply variants from VCF files to reference genomes
|
|
8
|
+
- 🎯 **Variant-Centered Sequences**: Generate sequence windows around variants
|
|
9
|
+
- ✂️ **PAM Site Analysis**: Identify variants that disrupt CRISPR PAM sites
|
|
10
|
+
- 🧪 **Saturation Mutagenesis**: Systematic single-nucleotide mutations at every position for predictive modeling
|
|
11
|
+
- 🧠 **Memory Efficient**: Chunked processing for large VCF files
|
|
12
|
+
- 🗺️ **Chromosome Matching**: Optional handling of chromosome naming differences (chr1 ↔ 1, chrM ↔ MT) via `auto_map_chromosomes=True`
|
|
13
|
+
- ⚡ **PyTorch Integration**: Automatic tensor support when PyTorch is available
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
### Install from GitHub (Recommended)
|
|
18
|
+
|
|
19
|
+
For the latest features and bug fixes:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# Install the latest release from PyPI
|
|
23
|
+
pip install supremo_lite
|
|
24
|
+
|
|
25
|
+
# Or install a specific version/tag
|
|
26
|
+
pip install git+https://github.com/gladstone-institutes/supremo_lite.git@v0.5.0
|
|
27
|
+
|
|
28
|
+
# Or install from a specific branch
|
|
29
|
+
pip install git+https://github.com/gladstone-institutes/supremo_lite.git@main
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Dependencies
|
|
33
|
+
|
|
34
|
+
Required dependencies will be installed automatically:
|
|
35
|
+
- `pandas` - For VCF data handling
|
|
36
|
+
- `numpy` - For numerical operations
|
|
37
|
+
- `pyfaidx` - For FASTA file reading
|
|
38
|
+
|
|
39
|
+
Optional dependencies:
|
|
40
|
+
- `torch` - For PyTorch tensor support (automatically detected)
|
|
41
|
+
- [brisket](https://github.com/gladstone-institutes/brisket) - Cython-powered, faster one-hot encoding for DNA sequences (automatically detected)
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import supremo_lite as sl
|
|
47
|
+
from pyfaidx import Fasta
|
|
48
|
+
|
|
49
|
+
# Load reference genome and variants
|
|
50
|
+
reference = Fasta('hg38.fa')
|
|
51
|
+
variants = sl.read_vcf('variants.vcf')
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### DNA Sequence Encoding
|
|
55
|
+
|
|
56
|
+
supremo_lite uses **one-hot encoding** by default:
|
|
57
|
+
- `A` = `[1,0,0,0]`, `C` = `[0,1,0,0]`, `G` = `[0,0,1,0]`, `T` = `[0,0,0,1]`
|
|
58
|
+
- Ambiguous bases = `[0,0,0,0]`
|
|
59
|
+
- Returns PyTorch tensors when available, otherwise NumPy arrays
|
|
60
|
+
|
|
61
|
+
### Personalized Genome Generation
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
# Apply variants to create personalized genome
|
|
65
|
+
personal_genome = sl.get_personal_genome(
|
|
66
|
+
reference_fn=reference,
|
|
67
|
+
variants_fn=variants,
|
|
68
|
+
encode=True, # One-hot encoded (or False for strings)
|
|
69
|
+
chunk_size=10000, # Process 10k variants at a time
|
|
70
|
+
verbose=True # Show progress
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# If your VCF uses 'chr1' and reference uses '1', enable chromosome mapping
|
|
74
|
+
personal_genome = sl.get_personal_genome(
|
|
75
|
+
reference_fn=reference,
|
|
76
|
+
variants_fn=variants,
|
|
77
|
+
auto_map_chromosomes=True # Handle chromosome name differences
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**π [Full Guide: Personalized Genomes](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/personalization.md) | [Tutorial Notebook](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/02_personalized_genomes.ipynb)**
|
|
82
|
+
|
|
83
|
+
### Variant-Centered Sequences
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
# Generate reference and alternate sequences around variants
|
|
87
|
+
# Note: get_alt_ref_sequences is a generator that yields chunks
|
|
88
|
+
results = list(sl.get_alt_ref_sequences(
|
|
89
|
+
reference_fn=reference,
|
|
90
|
+
variants_fn=variants,
|
|
91
|
+
seq_len=1000,
|
|
92
|
+
encode=True
|
|
93
|
+
))
|
|
94
|
+
# Unpack from the first chunk
|
|
95
|
+
alt_seqs, ref_seqs, metadata = results[0]
|
|
96
|
+
# Returns: (n_variants, seq_len, 4) shaped arrays
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**π [Full Guide: Variant-Centered Sequences](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/variant_centered_sequences.md) | [Getting Started Notebook](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/01_getting_started.ipynb)**
|
|
100
|
+
|
|
101
|
+
### Prediction Alignment
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
# Align model predictions accounting for variant coordinate changes
|
|
105
|
+
from supremo_lite.mock_models import TestModel
|
|
106
|
+
|
|
107
|
+
model = TestModel(n_targets=2, bin_size=8, crop_length=10)
|
|
108
|
+
ref_preds = model(ref_seqs)
|
|
109
|
+
alt_preds = model(alt_seqs)
|
|
110
|
+
|
|
111
|
+
ref_aligned, alt_aligned = sl.align_predictions_by_coordinate(
|
|
112
|
+
ref_pred=ref_preds[0],
|
|
113
|
+
alt_pred=alt_preds[0],
|
|
114
|
+
metadata=metadata[0],
|
|
115
|
+
prediction_type="1D",
|
|
116
|
+
bin_size=8,
|
|
117
|
+
crop_length=10
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**π [Full Guide: Prediction Alignment](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/prediction_alignment.md) | [Tutorial with Visualizations](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/03_prediction_alignment.ipynb)**
|
|
122
|
+
|
|
123
|
+
### Saturation Mutagenesis
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
# Mutate every position in a region
|
|
127
|
+
ref_seq, alt_seqs, metadata = sl.get_sm_sequences(
|
|
128
|
+
chrom='chr1',
|
|
129
|
+
start=1000,
|
|
130
|
+
end=1100, # 100 bp → 300 mutations (3 per position)
|
|
131
|
+
reference_fasta=reference
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**π [Full Guide: Mutagenesis](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/mutagenesis.md)**
|
|
136
|
+
|
|
137
|
+
## Documentation
|
|
138
|
+
|
|
139
|
+
### π User Guides
|
|
140
|
+
Detailed documentation for each major feature:
|
|
141
|
+
- **[Personalized Genomes](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/personalization.md)** - Apply variants to genomes
|
|
142
|
+
- **[Variant-Centered Sequences](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/variant_centered_sequences.md)** - Extract sequence windows around variants
|
|
143
|
+
- **[Prediction Alignment](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/prediction_alignment.md)** - Align model predictions for variant effect analysis
|
|
144
|
+
- **[Saturation Mutagenesis](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/user_guide/mutagenesis.md)** - In-silico mutagenesis workflows
|
|
145
|
+
- **[Variant Classification](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/_static/images/variant_classification.png)** - Flow chart showing automatic variant classification logic
|
|
146
|
+
|
|
147
|
+
### π Interactive Tutorials
|
|
148
|
+
Hands-on Jupyter notebooks with visualizations:
|
|
149
|
+
- **[Getting Started](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/01_getting_started.ipynb)** - Installation and basic concepts
|
|
150
|
+
- **[Personalized Genomes](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/02_personalized_genomes.ipynb)** - Genome personalization workflows
|
|
151
|
+
- **[Prediction Alignment](https://github.com/gladstone-institutes/supremo_lite/blob/main/docs/notebooks/03_prediction_alignment.ipynb)** - Complete prediction workflow with visualizations β
|
|
152
|
+
|
|
153
|
+
### π API Reference
|
|
154
|
+
**Core Functions:**
|
|
155
|
+
- `get_personal_genome()` - Generate personalized genomes
|
|
156
|
+
- `get_alt_ref_sequences()` - Generate variant-centered sequences
|
|
157
|
+
- `align_predictions_by_coordinate()` - Align model predictions
|
|
158
|
+
- `get_sm_sequences()` - Saturation mutagenesis
|
|
159
|
+
- `read_vcf()` - Read VCF files
|
|
160
|
+
|
|
161
|
+
For complete API documentation with all parameters, see the [docs/](https://github.com/gladstone-institutes/supremo_lite/tree/main/docs) directory.
|
|
162
|
+
|
|
163
|
+
## Issues and Support
|
|
164
|
+
|
|
165
|
+
We welcome feedback, bug reports, and feature requests! If you encounter any issues or have suggestions for improvements, please:
|
|
166
|
+
|
|
167
|
+
1. **Check existing issues** first to see if your problem has already been reported
|
|
168
|
+
2. **File a new issue** on our [GitHub Issues page](https://github.com/gladstone-institutes/supremo_lite/issues)
|
|
169
|
+
3. **Provide detailed information** including:
|
|
170
|
+
- Python version and operating system
|
|
171
|
+
- Package version (`supremo_lite.__version__`)
|
|
172
|
+
- Complete error messages and stack traces
|
|
173
|
+
- Minimal reproducible example
|
|
174
|
+
- Expected vs. actual behavior
|
|
175
|
+
|
|
176
|
+
### Common Issues to Report
|
|
177
|
+
|
|
178
|
+
- **Performance problems** with large genomes or variant files
|
|
179
|
+
- **Unexpected behavior** with edge cases
|
|
180
|
+
- **Documentation gaps** or unclear examples
|
|
181
|
+
- **Feature requests** for new functionality
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
## Contributing
|
|
185
|
+
|
|
186
|
+
Interested in contributing? Check out the contributing guidelines. Please note that this project is released with a Code of Conduct. By contributing to this project, you agree to abide by its terms.
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
`supremo_lite` was created by Natalie Gill and Sean Whalen, based on Sequence Mutator for Predictive Models ([SuPreMo](https://github.com/ketringjoni/SuPreMo)) by Katie Gjoni. It is licensed under the terms of the MIT license.
|
|
191
|
+
|
|
192
|
+
## Credits
|
|
193
|
+
|
|
194
|
+
`supremo_lite` was created with [`cookiecutter`](https://cookiecutter.readthedocs.io/en/latest/) and the `py-pkgs-cookiecutter` [template](https://github.com/py-pkgs/py-pkgs-cookiecutter).
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "supremo_lite"
|
|
3
|
+
version = "0.5.4"
|
|
4
|
+
description = "A lightweight memory first, model agnostic version of SuPreMo"
|
|
5
|
+
authors = ["Natalie Gill", "Sean Whalen"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.9"
|
|
11
|
+
pandas = ">=1.5.0"
|
|
12
|
+
pyfaidx = ">=0.7.0"
|
|
13
|
+
torch = {version = ">=1.13.0", optional = true}
|
|
14
|
+
brisket = {version = ">=0.1.2", optional = true}
|
|
15
|
+
|
|
16
|
+
[tool.poetry.extras]
|
|
17
|
+
fast = ["brisket"]
|
|
18
|
+
|
|
19
|
+
[tool.poetry.group.dev.dependencies]
|
|
20
|
+
pytest = ">=6.0.0"
|
|
21
|
+
pytest-cov = ">=3.0.0"
|
|
22
|
+
notebook = ">=6.0.0"
|
|
23
|
+
black = ">=20.0.0"
|
|
24
|
+
ipython = ">=7.0.0"
|
|
25
|
+
tokenize-rt = ">=3.2.0"
|
|
26
|
+
matplotlib = "3.9"
|
|
27
|
+
brisket = "^0.1.2"
|
|
28
|
+
torch = ">=1.13.0"
|
|
29
|
+
seaborn = "^0.13.2"
|
|
30
|
+
sphinx = "<8.0"
|
|
31
|
+
myst-nb = "^1.3.0"
|
|
32
|
+
sphinx-autoapi = "^3.6.1"
|
|
33
|
+
sphinx-rtd-theme = "^3.0.2"
|
|
34
|
+
|
|
35
|
+
[build-system]
|
|
36
|
+
requires = ["poetry-core>=1.0.0"]
|
|
37
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
supremo_lite: A module for generating personalized genome sequences from a reference
|
|
3
|
+
fasta and a variants file, or sequences for in-silico mutagenesis.
|
|
4
|
+
|
|
5
|
+
This package provides functionality for:
|
|
6
|
+
- Sequence encoding and transformation
|
|
7
|
+
- Variant reading and application
|
|
8
|
+
- In-silico mutagenesis
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
# Import core components
|
|
12
|
+
from .core import TORCH_AVAILABLE, BRISKET_AVAILABLE, nt_to_1h, nts
|
|
13
|
+
|
|
14
|
+
# Import sequence transformation utilities
|
|
15
|
+
from .sequence_utils import encode_seq, decode_seq, rc, rc_str
|
|
16
|
+
|
|
17
|
+
# Import variant reading utilities
|
|
18
|
+
from .variant_utils import (
|
|
19
|
+
read_vcf,
|
|
20
|
+
read_vcf_chunked,
|
|
21
|
+
get_vcf_chromosomes,
|
|
22
|
+
read_vcf_chromosome,
|
|
23
|
+
classify_variant_type,
|
|
24
|
+
parse_vcf_info,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Import chromosome matching utilities
|
|
28
|
+
from .chromosome_utils import (
|
|
29
|
+
normalize_chromosome_name,
|
|
30
|
+
create_chromosome_mapping,
|
|
31
|
+
match_chromosomes_with_report,
|
|
32
|
+
ChromosomeMismatchError,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Import personalize functions
|
|
36
|
+
from .personalize import (
|
|
37
|
+
get_personal_genome,
|
|
38
|
+
get_alt_sequences,
|
|
39
|
+
get_ref_sequences,
|
|
40
|
+
get_pam_disrupting_alt_sequences,
|
|
41
|
+
get_alt_ref_sequences,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Import mutagenesis functions
|
|
45
|
+
from .mutagenesis import get_sm_sequences, get_sm_subsequences
|
|
46
|
+
|
|
47
|
+
# Import prediction alignment functions
|
|
48
|
+
from .prediction_alignment import align_predictions_by_coordinate
|
|
49
|
+
|
|
50
|
+
# Mock models are available in a separate submodule
|
|
51
|
+
# Import with: from supremo_lite.mock_models import TestModel, TestModel2D
|
|
52
|
+
# This allows users who don't have PyTorch to still use the main package
|
|
53
|
+
|
|
54
|
+
# Version
|
|
55
|
+
__version__ = "0.5.4"
|
|
56
|
+
# Package metadata
|
|
57
|
+
__description__ = (
|
|
58
|
+
"A module for generating personalized genome sequences and in-silico mutagenesis"
|
|
59
|
+
)
|