supremo-lite 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supremo_lite/__init__.py +59 -0
- supremo_lite/chromosome_utils.py +322 -0
- supremo_lite/core.py +41 -0
- supremo_lite/mock_models/__init__.py +110 -0
- supremo_lite/mock_models/testmodel_1d.py +184 -0
- supremo_lite/mock_models/testmodel_2d.py +203 -0
- supremo_lite/mutagenesis.py +414 -0
- supremo_lite/personalize.py +3098 -0
- supremo_lite/prediction_alignment.py +1014 -0
- supremo_lite/sequence_utils.py +137 -0
- supremo_lite/variant_utils.py +1645 -0
- supremo_lite-0.5.4.dist-info/METADATA +216 -0
- supremo_lite-0.5.4.dist-info/RECORD +15 -0
- supremo_lite-0.5.4.dist-info/WHEEL +4 -0
- supremo_lite-0.5.4.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mock 1D genomic prediction model for testing and demonstrations.
|
|
3
|
+
|
|
4
|
+
This module provides a simple PyTorch model that mimics realistic genomic deep learning
|
|
5
|
+
architectures. It is intended for:
|
|
6
|
+
1. Testing prediction alignment functionality
|
|
7
|
+
2. Providing runnable examples for users without trained models
|
|
8
|
+
|
|
9
|
+
**NOT for actual genomic predictions** - this model returns constant values and has
|
|
10
|
+
no learned parameters.
|
|
11
|
+
|
|
12
|
+
Model Architecture Characteristics:
|
|
13
|
+
- **Binning**: Predictions at lower resolution than input (bin_length parameter)
|
|
14
|
+
- **Cropping**: Edge bins removed to focus on central regions (crop_length parameter)
|
|
15
|
+
- **Output shape**: (batch_size, n_targets, n_final_bins)
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
>>> from supremo_lite.mock_models import TestModel
|
|
19
|
+
>>> import torch
|
|
20
|
+
>>>
|
|
21
|
+
>>> model = TestModel(seq_length=1024, bin_length=32, crop_length=128, n_targets=1)
|
|
22
|
+
>>> x = torch.randn(8, 4, 1024) # (batch, channels, length)
|
|
23
|
+
>>> predictions = model(x)
|
|
24
|
+
>>> predictions.shape
|
|
25
|
+
torch.Size([8, 1, 24]) # (batch, targets, bins after cropping: 1024 // 32 - 2 * (128 // 32))
|
|
26
|
+
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
import torch
|
|
31
|
+
import torch.nn as nn
|
|
32
|
+
|
|
33
|
+
TORCH_AVAILABLE = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
TORCH_AVAILABLE = False
|
|
36
|
+
torch = None
|
|
37
|
+
nn = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
if TORCH_AVAILABLE:
|
|
41
|
+
|
|
42
|
+
class TestModel(nn.Module):
    """
    Mock 1D genomic prediction model.

    This model demonstrates typical genomic deep learning architecture features:
    - Accepts one-hot encoded DNA sequences
    - Outputs binned predictions at lower resolution
    - Applies edge cropping to focus on central regions

    **Warning**: Returns constant values (ones). Not for actual predictions.

    Parameters
    ----------
    seq_length : int
        Length of input sequences in base pairs
    bin_length : int
        Number of base pairs per prediction bin
    crop_length : int, optional
        Number of base pairs to crop from each edge (default: 0)
    n_targets : int, optional
        Number of prediction targets per bin (default: 1)

    Attributes
    ----------
    crop_bins : int
        Bins removed from each edge (``crop_length // bin_length``)
    n_initial_bins : int
        Bins before cropping (``seq_length // bin_length``)
    n_final_bins : int
        Bins after cropping (``n_initial_bins - 2 * crop_bins``)

    Examples
    --------
    Basic usage:

    >>> model = TestModel(seq_length=1024, bin_length=32)
    >>> x = torch.randn(4, 4, 1024)  # (batch, channels, length)
    >>> out = model(x)
    >>> out.shape
    torch.Size([4, 1, 32])

    With cropping:

    >>> model = TestModel(seq_length=2048, bin_length=64, crop_length=256)
    >>> model.n_initial_bins
    32
    >>> model.crop_bins
    4
    >>> model.n_final_bins
    24
    """

    def __init__(self, seq_length, bin_length, crop_length=0, n_targets=1):
        super().__init__()

        self.seq_length = seq_length
        self.bin_length = bin_length
        self.crop_length = crop_length
        self.n_targets = n_targets

        # Derived bin geometry; integer division drops any partial bin.
        self.crop_bins = crop_length // bin_length
        self.n_initial_bins = seq_length // bin_length
        self.n_final_bins = self.n_initial_bins - 2 * self.crop_bins

    def forward(self, x):
        """
        Forward pass returning mock predictions.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, 4, seq_length)
            Channel dimension should be 4 (one-hot encoded A, C, G, T)

        Returns
        -------
        torch.Tensor
            Mock predictions of shape (batch_size, n_targets, n_final_bins)
            Contains all ones (not meaningful predictions), allocated on the
            same device as ``x``

        Raises
        ------
        AssertionError
            If the channel dimension is not 4 or the sequence length does
            not equal ``seq_length``
        """
        assert x.shape[1] == 4, f"Expected 4 channels (one-hot), got {x.shape[1]}"
        assert (
            x.shape[2] == self.seq_length
        ), f"Expected sequence length {self.seq_length}, got {x.shape[2]}"

        # Follow the input's device so the output can be combined with
        # tensors that live next to ``x`` (e.g. targets in training_step).
        return torch.ones(
            [x.shape[0], self.n_targets, self.n_final_bins], device=x.device
        )

    def training_step(self, batch, batch_idx):
        """
        Mock training step for demonstration purposes.

        Shows how cropping would be applied during training.
        This is for educational purposes only - the model has no learnable parameters.

        Parameters
        ----------
        batch : tuple
            Pair ``(x, y)``; ``y`` is indexed along its third axis, so it is
            expected to hold uncropped targets with bins on that axis
        batch_idx : int
            Unused; accepted for signature compatibility

        Returns
        -------
        torch.Tensor
            Mean squared error between the mock predictions and cropped targets
        """
        x, y = batch
        # Crop target predictions to match model output. An explicit stop
        # index is required: with crop_bins == 0 the slice ``0:-0`` would be
        # empty instead of keeping every bin.
        stop = y.shape[2] - self.crop_bins
        y = y[:, :, self.crop_bins : stop]
        y_hat = self(x)
        return nn.functional.mse_loss(y_hat, y)
|
|
132
|
+
|
|
133
|
+
else:
|
|
134
|
+
# Fallback when PyTorch not available
|
|
135
|
+
class TestModel:
    """
    Stand-in for the PyTorch-backed TestModel.

    Instantiating it always raises ``ImportError`` carrying installation
    hints for PyTorch (``pip install torch``).
    """

    def __init__(self, *args, **kwargs):
        # Any call signature is accepted so that every construction attempt
        # reaches the explanatory error below.
        install_hint = "TestModel requires PyTorch. Install with: pip install torch\n"
        docs_hint = "See https://pytorch.org/get-started/locally/ for installation instructions."
        raise ImportError(install_hint + docs_hint)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Names exported by this module
__all__ = ["TestModel", "TORCH_AVAILABLE"]


if __name__ == "__main__":
    if not TORCH_AVAILABLE:
        print("PyTorch not available. Install with: pip install torch")
        # The bare ``exit`` helper is injected by the ``site`` module and is
        # not guaranteed to exist; raising SystemExit always works.
        raise SystemExit(1)

    # Demonstration of model behavior
    batch_size = 8
    seq_length = 524288
    bin_length = 32
    crop_length = 163840
    n_targets = 2

    # Recompute the expected bin geometry independently of the model so the
    # shape assertions below actually check something.
    crop_bins = crop_length // bin_length
    n_initial_bins = seq_length // bin_length
    n_cropped_bins = n_initial_bins - 2 * crop_bins

    print("Model Configuration:")
    print(f" Sequence length: {seq_length:,} bp")
    print(f" Bin length: {bin_length} bp")
    print(f" Crop length: {crop_length:,} bp")
    print(f" Initial bins: {n_initial_bins}")
    print(f" Crop bins per edge: {crop_bins}")
    print(f" Final bins: {n_cropped_bins}")
    print(f" Targets: {n_targets}")

    m = TestModel(seq_length, bin_length, crop_length, n_targets)
    x = torch.ones([batch_size, 4, seq_length])

    y_hat = m(x)
    assert y_hat.shape[0] == batch_size
    assert y_hat.shape[1] == n_targets
    assert y_hat.shape[2] == n_cropped_bins

    print(f"\nInput shape: {x.shape}")
    print(f"Output shape: {y_hat.shape}")
    print("✓ Model test passed!")
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mock 2D genomic prediction model for contact map testing and demonstrations.
|
|
3
|
+
|
|
4
|
+
This module provides a simple PyTorch model that mimics realistic 2D genomic prediction
|
|
5
|
+
architectures (e.g., for chromatin contact maps, Hi-C predictions). It is intended for:
|
|
6
|
+
1. Testing 2D prediction alignment functionality
|
|
7
|
+
2. Providing runnable examples for users without trained models
|
|
8
|
+
|
|
9
|
+
**NOT for actual genomic predictions** - this model returns constant values and has
|
|
10
|
+
no learned parameters.
|
|
11
|
+
|
|
12
|
+
Model Architecture Characteristics:
|
|
13
|
+
- **Binning**: Predictions at 2D grid resolution (bin_length × bin_length)
|
|
14
|
+
- **Cropping**: Edge bins removed from all sides
|
|
15
|
+
- **Output shape**: (batch_size, n_targets, n_final_bins, n_final_bins)
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
>>> from supremo_lite.mock_models import TestModel2D
|
|
19
|
+
>>> import torch
|
|
20
|
+
>>>
|
|
21
|
+
>>> model = TestModel2D(seq_length=2048, bin_length=64, crop_length=256, n_targets=1)
|
|
22
|
+
>>> x = torch.randn(4, 4, 2048) # (batch, channels, length)
|
|
23
|
+
>>> predictions = model(x)
|
|
24
|
+
>>> predictions.shape
|
|
25
|
+
torch.Size([4, 1, 24, 24]) # Full contact matrix after cropping
|
|
26
|
+
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
import torch
|
|
31
|
+
import torch.nn as nn
|
|
32
|
+
|
|
33
|
+
TORCH_AVAILABLE = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
TORCH_AVAILABLE = False
|
|
36
|
+
torch = None
|
|
37
|
+
nn = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
if TORCH_AVAILABLE:
|
|
41
|
+
|
|
42
|
+
class TestModel2D(nn.Module):
    """
    Mock 2D genomic prediction model (e.g., for contact maps).

    This model demonstrates typical 2D genomic deep learning architecture features:
    - Accepts one-hot encoded DNA sequences
    - Outputs 2D contact matrix predictions at binned resolution
    - Applies edge cropping on all sides
    - Returns full symmetric contact matrix

    **Warning**: Returns constant values (ones). Not for actual predictions.

    Parameters
    ----------
    seq_length : int
        Length of input sequences in base pairs
    bin_length : int
        Number of base pairs per prediction bin (in each dimension)
    crop_length : int, optional
        Number of base pairs to crop from each edge (default: 0)
    n_targets : int, optional
        Number of prediction targets per bin pair (default: 1)

    Attributes
    ----------
    crop_bins : int
        Bins removed from each edge (``crop_length // bin_length``)
    n_initial_bins : int
        Bins per axis before cropping (``seq_length // bin_length``)
    n_final_bins : int
        Bins per axis after cropping (``n_initial_bins - 2 * crop_bins``)

    Examples
    --------
    Basic usage:

    >>> model = TestModel2D(seq_length=2048, bin_length=64)
    >>> x = torch.randn(4, 4, 2048)  # (batch, channels, length)
    >>> out = model(x)
    >>> # Output is full contact matrix
    >>> # For n=32 bins: 32×32 matrix
    >>> out.shape
    torch.Size([4, 1, 32, 32])

    With cropping:

    >>> model = TestModel2D(seq_length=4096, bin_length=128, crop_length=512)
    >>> model.n_initial_bins
    32
    >>> model.crop_bins
    4
    >>> model.n_final_bins
    24
    >>> # Contact matrix: 24×24
    """

    def __init__(self, seq_length, bin_length, crop_length=0, n_targets=1):
        super().__init__()

        self.seq_length = seq_length
        self.bin_length = bin_length
        self.crop_length = crop_length
        self.n_targets = n_targets

        # Derived bin geometry; integer division drops any partial bin.
        self.crop_bins = crop_length // bin_length
        self.n_initial_bins = seq_length // bin_length
        self.n_final_bins = self.n_initial_bins - 2 * self.crop_bins

    def forward(self, x):
        """
        Forward pass returning mock 2D contact map predictions.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, 4, seq_length)
            Channel dimension should be 4 (one-hot encoded A, C, G, T)

        Returns
        -------
        torch.Tensor
            Mock predictions of shape (batch_size, n_targets, n_final_bins, n_final_bins)
            Full symmetric contact matrix after cropping
            Contains all ones (not meaningful predictions), allocated on the
            same device as ``x``

        Raises
        ------
        AssertionError
            If the channel dimension is not 4 or the sequence length does
            not equal ``seq_length``
        """
        assert x.shape[1] == 4, f"Expected 4 channels (one-hot), got {x.shape[1]}"
        assert (
            x.shape[2] == self.seq_length
        ), f"Expected sequence length {self.seq_length}, got {x.shape[2]}"

        # Create placeholder (N, N) contact matrix on the input's device
        y_hat = torch.ones(
            [x.shape[0], self.n_targets, self.n_initial_bins, self.n_initial_bins],
            device=x.device,
        )

        # Crop bins from all edges to focus loss function. An explicit stop
        # index is required: with crop_bins == 0 the slice ``0:-0`` would
        # return an empty matrix instead of the full one.
        stop = self.n_initial_bins - self.crop_bins
        y_hat = y_hat[:, :, self.crop_bins : stop, self.crop_bins : stop]

        # Return full contact matrix
        return y_hat

    def training_step(self, batch, batch_idx):
        """
        Mock training step for demonstration purposes.

        Shows how 2D predictions would be trained.
        This is for educational purposes only - the model has no learnable parameters.

        Parameters
        ----------
        batch : tuple
            Pair ``(x, y)``; ``y`` is compared directly with the model output,
            so it must already match the cropped output shape
        batch_idx : int
            Unused; accepted for signature compatibility

        Returns
        -------
        torch.Tensor
            Mean squared error between the mock predictions and ``y``
        """
        x, y = batch
        y_hat = self(x)
        return nn.functional.mse_loss(y_hat, y)
|
|
146
|
+
|
|
147
|
+
else:
|
|
148
|
+
# Fallback when PyTorch not available
|
|
149
|
+
class TestModel2D:
    """
    Stand-in for the PyTorch-backed TestModel2D.

    Instantiating it always raises ``ImportError`` carrying installation
    hints for PyTorch (``pip install torch``).
    """

    def __init__(self, *args, **kwargs):
        # Any call signature is accepted so that every construction attempt
        # reaches the explanatory error below.
        install_hint = "TestModel2D requires PyTorch. Install with: pip install torch\n"
        docs_hint = "See https://pytorch.org/get-started/locally/ for installation instructions."
        raise ImportError(install_hint + docs_hint)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# Names exported by this module
__all__ = ["TestModel2D", "TORCH_AVAILABLE"]


if __name__ == "__main__":
    if not TORCH_AVAILABLE:
        print("PyTorch not available. Install with: pip install torch")
        # The bare ``exit`` helper is injected by the ``site`` module and is
        # not guaranteed to exist; raising SystemExit always works.
        raise SystemExit(1)

    # Demonstration of model behavior
    batch_size = 8
    seq_length = 1048576
    bin_length = 2048
    crop_length = 65536
    n_targets = 2

    # Recompute the expected bin geometry independently of the model so the
    # shape assertions below actually check something.
    crop_bins = crop_length // bin_length
    n_initial_bins = seq_length // bin_length
    n_final_bins = n_initial_bins - 2 * crop_bins

    print("Model Configuration:")
    print(f" Sequence length: {seq_length:,} bp")
    print(f" Bin length: {bin_length:,} bp")
    print(f" Crop length: {crop_length:,} bp")
    print(f" Initial bins (1D): {n_initial_bins}")
    print(f" Crop bins per edge: {crop_bins}")
    print(f" Final bins (1D): {n_final_bins}")
    print(f" Contact matrix size: {n_final_bins} × {n_final_bins}")
    print(f" Targets: {n_targets}")

    m = TestModel2D(seq_length, bin_length, crop_length, n_targets)
    x = torch.ones([batch_size, 4, seq_length])

    y_hat = m(x)
    assert y_hat.shape[0] == batch_size
    assert y_hat.shape[1] == n_targets
    assert y_hat.shape[2] == n_final_bins
    assert y_hat.shape[3] == n_final_bins

    print(f"\nInput shape: {x.shape}")
    print(f"Output shape: {y_hat.shape}")
    print(
        f" (batch_size={batch_size}, n_targets={n_targets}, n_final_bins={n_final_bins})"
    )
    print("✓ Model test passed!")
|