geney 1.3.78__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +55 -57
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/spliceai_utils.py +17 -17
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/METADATA +1 -1
- geney-1.4.0.dist-info/RECORD +51 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/WHEEL +1 -1
- geney-1.3.78.dist-info/RECORD +0 -31
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Load models
#
__all__ = ['pangolin_predict_probs']
# Load models
import torch
from pkg_resources import resource_filename
from pangolin.model import *
import numpy as np
import sys

# Pangolin ships 8 output heads (4 tissues x {usage, splice-site}); one
# 5-model ensemble is loaded per head below, 40 models total.
pang_model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
pang_models = []

# Device selection: Apple-silicon MPS on macOS, CUDA on Linux when
# available, CPU otherwise.
device = torch.device('cpu')
if sys.platform == 'darwin':
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

if sys.platform == 'linux':
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): printed before the weights below are actually loaded.
print(f"Pangolin loaded to {device}.")
23
|
+
# Load the 5-model ensemble for each of the 8 Pangolin heads.
# Pangolin, L, W and AR come from the `pangolin.model` star import above;
# weight files final.<replicate>.<head>.3 ship inside the pangolin package.
for i in pang_model_nums:
    for j in range(1, 6):
        model = Pangolin(L, W, AR).to(device)
        if torch.cuda.is_available():
            model.cuda()
            # weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
            # weights_only=True avoids arbitrary-code-execution on unpickle.
            weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3" % (j, i)), weights_only=True)

        else:
            # map_location retargets CUDA-saved tensors onto the chosen device.
            weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)), weights_only=True,
                                 map_location=device)

        model.load_state_dict(weights)
        model.eval()
        pang_models.append(model)
|
39
|
+
|
|
40
|
+
def pang_one_hot_encode(seq):
    """One-hot encode a nucleotide string for Pangolin.

    Returns a (len(seq), 4) integer array in which A/C/G/T map to the
    four unit rows and N maps to the all-zero row.
    """
    lookup = np.asarray([[0, 0, 0, 0],
                         [1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]])
    # Rewrite each base as its row index in `lookup` (N -> 0).
    digits = (seq.upper()
              .replace('A', '1').replace('C', '2')
              .replace('G', '3').replace('T', '4')
              .replace('N', '0'))
    indices = np.asarray([int(ch) for ch in digits])
    return lookup[indices.astype('int8')]
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def pangolin_predict_probs(true_seq, models, just_ss=False):
    """Predict per-nucleotide donor/acceptor splice probabilities with Pangolin.

    Parameters
    ----------
    true_seq : str
        Input sequence including 5000 nt of flanking context on each side;
        predictions are reported for the middle len(true_seq) - 10000 nt.
    models : list
        Flat list of loaded Pangolin models, 5 consecutive replicates per
        output head (as built in `pang_models`).
    just_ss : bool
        If True, use only the splice-site heads (even head numbers).

    Returns
    -------
    (donor_probs, acceptor_probs) : tuple of lists
        Per-position scores, zeroed at positions lacking the canonical
        GT (donor) / AG (acceptor) dinucleotide.
    """
    if just_ss:
        model_nums = [0, 2, 4, 6]
    else:
        model_nums = [0, 1, 2, 3, 4, 5, 6, 7]

    # Row of each head's score inside the model output tensor.
    INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}

    seq = true_seq
    # Strip the 5 kb context; scores below align with this trimmed view.
    true_seq = true_seq[5000:-5000]
    acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))])
    donor_dinucleotide = np.array([true_seq[i + 1:i + 3] == 'GT' for i in range(len(true_seq))])

    seq = pang_one_hot_encode(seq).T
    seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
    # Move the input to the target device once; the original code also
    # called seq.to(device) redundantly for every model in the loop below.
    seq = seq.to(torch.device(device))

    scores = []
    for j, model_num in enumerate(model_nums):
        score = []
        # Average across the 5 replicate models trained for this head.
        for model in models[5 * j:5 * j + 5]:
            with torch.no_grad():
                score.append(model(seq)[0][INDEX_MAP[model_num], :].cpu().numpy())

        scores.append(np.mean(score, axis=0))

    # Take the max over heads, then mask to canonical splice dinucleotides.
    splicing_pred = np.array(scores).max(axis=0)
    donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
    acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
    return donor_probs, acceptor_probs
|
|
88
|
+
|
|
89
|
+
#
|
|
90
|
+
# import torch
|
|
91
|
+
# from pkg_resources import resource_filename
|
|
92
|
+
# from pangolin.model import *
|
|
93
|
+
# import numpy as np
|
|
94
|
+
# import sys
|
|
95
|
+
#
|
|
96
|
+
# pang_model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
|
|
97
|
+
# pang_models = []
|
|
98
|
+
#
|
|
99
|
+
# device = torch.device('cpu')
|
|
100
|
+
# if sys.platform == 'darwin':
|
|
101
|
+
# device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
|
|
102
|
+
#
|
|
103
|
+
# if sys.platform == 'linux':
|
|
104
|
+
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
105
|
+
#
|
|
106
|
+
# device = 'cpu'
|
|
107
|
+
# print(f"Pangolin loaded to {device}.")
|
|
108
|
+
#
|
|
109
|
+
# for i in pang_model_nums:
|
|
110
|
+
# for j in range(1, 6):
|
|
111
|
+
# model = Pangolin(L, W, AR).to(device)
|
|
112
|
+
# if torch.cuda.is_available():
|
|
113
|
+
# model.cuda()
|
|
114
|
+
# # weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
|
|
115
|
+
# weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3" % (j, i)), weights_only=True)
|
|
116
|
+
#
|
|
117
|
+
# else:
|
|
118
|
+
# weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)), weights_only=True,
|
|
119
|
+
# map_location=device)
|
|
120
|
+
#
|
|
121
|
+
# model.load_state_dict(weights)
|
|
122
|
+
# model.eval()
|
|
123
|
+
# pang_models.append(model)
|
|
124
|
+
#
|
|
125
|
+
#
|
|
126
|
+
# def pang_one_hot_encode(seq):
|
|
127
|
+
# IN_MAP = np.asarray([[0, 0, 0, 0],
|
|
128
|
+
# [1, 0, 0, 0],
|
|
129
|
+
# [0, 1, 0, 0],
|
|
130
|
+
# [0, 0, 1, 0],
|
|
131
|
+
# [0, 0, 0, 1]])
|
|
132
|
+
# seq = seq.upper().replace('A', '1').replace('C', '2')
|
|
133
|
+
# seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')
|
|
134
|
+
# seq = np.asarray(list(map(int, list(seq))))
|
|
135
|
+
# return IN_MAP[seq.astype('int8')]
|
|
136
|
+
#
|
|
137
|
+
#
|
|
138
|
+
#
|
|
139
|
+
# def pangolin_predict_probs(true_seq, models, just_ss=False):
|
|
140
|
+
# # print(f"Running pangolin on: {true_seq}")
|
|
141
|
+
# if just_ss:
|
|
142
|
+
# model_nums = [0, 2, 4, 6]
|
|
143
|
+
# else:
|
|
144
|
+
# model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
|
|
145
|
+
#
|
|
146
|
+
# INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
|
|
147
|
+
#
|
|
148
|
+
# seq = true_seq
|
|
149
|
+
# true_seq = true_seq[5000:-5000]
|
|
150
|
+
# acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))]) # np.ones(len(true_seq)) #
|
|
151
|
+
# donor_dinucleotide = np.array([true_seq[i+1:i+3] == 'GT' for i in range(len(true_seq))]) #np.ones(len(true_seq)) #
|
|
152
|
+
#
|
|
153
|
+
# seq = pang_one_hot_encode(seq).T
|
|
154
|
+
# seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
|
|
155
|
+
#
|
|
156
|
+
# # if torch.cuda.is_available():
|
|
157
|
+
# seq = seq.to(torch.device(device))
|
|
158
|
+
# print(seq)
|
|
159
|
+
# scores = []
|
|
160
|
+
# for j, model_num in enumerate(model_nums):
|
|
161
|
+
# score = []
|
|
162
|
+
# # Average across 5 models
|
|
163
|
+
# for model in models[5 * j:5 * j + 5]:
|
|
164
|
+
# with torch.no_grad():
|
|
165
|
+
# score.append(model(seq)[0][INDEX_MAP[model_num], :].cpu().numpy())
|
|
166
|
+
#
|
|
167
|
+
# scores.append(np.mean(score, axis=0))
|
|
168
|
+
#
|
|
169
|
+
# splicing_pred = np.array(scores).max(axis=0)
|
|
170
|
+
# donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
|
|
171
|
+
# acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
|
|
172
|
+
# return donor_probs, acceptor_probs
|
|
173
|
+
#
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# __all__ = ['sai_predict_probs']
|
|
2
|
+
#
|
|
3
|
+
# #### SpliceAI Modules
|
|
4
|
+
#
|
|
5
|
+
# from keras.models import load_model
|
|
6
|
+
# from importlib import resources
|
|
7
|
+
# import numpy as np
|
|
8
|
+
# import tensorflow as tf
|
|
9
|
+
# import sys
|
|
10
|
+
# import absl.logging
|
|
11
|
+
# absl.logging.set_verbosity(absl.logging.ERROR)
|
|
12
|
+
#
|
|
13
|
+
#
|
|
14
|
+
# # Check if GPU is available
|
|
15
|
+
# # if tf.config.list_physical_devices('GPU'):
|
|
16
|
+
# # print("Running on GPU.")
|
|
17
|
+
# # else:
|
|
18
|
+
# # print("Running on CPU.")
|
|
19
|
+
#
|
|
20
|
+
# # tf.config.threading.set_intra_op_parallelism_threads(1)
|
|
21
|
+
# # tf.config.threading.set_inter_op_parallelism_threads(1)
|
|
22
|
+
#
|
|
23
|
+
# # List the model filenames relative to the spliceai package.
|
|
24
|
+
# model_filenames = [f"models/spliceai{i}.h5" for i in range(1, 6)]
|
|
25
|
+
#
|
|
26
|
+
# # Load each model using the package resources.
|
|
27
|
+
# # sai_models = [load_model(resources.files("spliceai").joinpath(filename))
|
|
28
|
+
# # for filename in model_filenames]
|
|
29
|
+
#
|
|
30
|
+
# if sys.platform == 'darwin':
|
|
31
|
+
# sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
|
|
32
|
+
# # sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
|
|
33
|
+
# sai_models = [load_model(resources.files('spliceai').joinpath(f)) for f in sai_paths]
|
|
34
|
+
# else:
|
|
35
|
+
# sai_paths = ['/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai1.h5',
|
|
36
|
+
# '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai2.h5',
|
|
37
|
+
# '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai3.h5',
|
|
38
|
+
# '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai4.h5',
|
|
39
|
+
# '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai5.h5']
|
|
40
|
+
#
|
|
41
|
+
# sai_models = [load_model(f) for f in sai_paths]
|
|
42
|
+
|
|
43
|
+
__all__ = ['sai_predict_probs']
import os
# Must be set before tensorflow is imported to actually silence TF logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

# NOTE(review): duplicate of the `import os` above; harmless but redundant.
import os
import sys
import tensorflow as tf
import numpy as np
from keras.models import load_model
from importlib import resources


# Force device selection
if tf.config.list_physical_devices('GPU'):
    device = '/GPU:0'
elif sys.platform == 'darwin' and tf.config.list_physical_devices('MPS'):
    device = '/device:GPU:0'  # MPS uses /device:GPU:0 in TF
else:
    device = '/CPU:0'

# Model loading paths
if sys.platform == 'darwin':
    # Resolve the 5 bundled SpliceAI .h5 models from the installed package.
    model_filenames = [f"models/spliceai{i}.h5" for i in range(1, 6)]
    model_paths = [resources.files('spliceai').joinpath(f) for f in model_filenames]
else:
    # NOTE(review): hard-coded cluster-specific absolute path — breaks on any
    # other non-macOS machine; consider resolving via importlib.resources too.
    model_paths = [f"/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai{i}.h5"
                   for i in range(1, 6)]

# Load models onto correct device
with tf.device(device):
    sai_models = [load_model(str(f)) for f in model_paths]


print(f"SpliceAI loaded to {device}.")
|
81
|
+
def one_hot_encode(seq):
    """One-hot encode a nucleotide string for SpliceAI.

    Returns a (len(seq), 4) integer array: A/C/G/T map to the four unit
    rows, N (and any code collapsing to 0 mod 5) to the all-zero row.
    """
    IN_MAP = np.asarray([[0, 0, 0, 0],
                         [1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]])

    seq = seq.upper().replace('A', '\x01').replace('C', '\x02')
    seq = seq.replace('G', '\x03').replace('T', '\x04').replace('N', '\x00')

    # np.fromstring(str, dtype) was deprecated and removed in NumPy 2.0;
    # frombuffer over the encoded bytes is the supported equivalent.
    # (Also avoids shadowing the builtin `map` as the old code did.)
    codes = np.frombuffer(seq.encode('latin-1'), dtype=np.int8) % 5
    return IN_MAP[codes]
|
94
|
+
|
|
95
|
+
def sai_predict_probs(seq: str, models: list) -> list:
    '''
    Predicts the donor and acceptor junction probability of each
    NT in seq using SpliceAI.

    Let m:=2*sai_mrg_context + L be the input seq length. It is assumed
    that the input seq has the following structure:

        seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|

    Returns a pair of 1-D arrays (acceptor row, donor row) covering the
    middle <L NTs> of the input seq.
    '''
    encoded = one_hot_encode(seq)[None, :]
    # Average the ensemble of 5 SpliceAI models.
    predictions = [models[idx].predict(encoded, verbose=0) for idx in range(5)]
    averaged = np.mean(predictions, axis=0)
    # Drop the "neither" class and transpose to (2, L):
    # row 0 = acceptor probability, row 1 = donor probability.
    probs = averaged[0, :, 1:].T
    return probs[0, :], probs[1, :]
|
117
|
+
def run_spliceai_seq(seq, indices, threshold=0):
    """Run SpliceAI over `seq` and report positions passing `threshold`.

    Parameters
    ----------
    seq : str
        Input sequence (caller is expected to supply any flanking context).
    indices : sequence
        Genomic coordinates aligned with the model's per-NT output.
    threshold : float
        Minimum probability for a position to be reported.

    Returns
    -------
    (donor_indices, acceptor_indices) : tuple of dicts
        Mapping position -> probability for each junction type.
    """
    # seq = 'N' * 5000 + seq + 'N' * 5000
    # BUG FIX: sai_predict_probs returns a (acceptor, donor) TUPLE of 1-D
    # arrays; the old code indexed it like a 2-D matrix
    # (ref_seq_probs_temp[0, :]), which raises TypeError. Unpack instead.
    ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(seq, sai_models)
    acceptor_indices = {a: b for a, b in zip(indices, ref_seq_acceptor_probs) if b >= threshold}
    donor_indices = {a: b for a, b in zip(indices, ref_seq_donor_probs) if b >= threshold}
    return donor_indices, acceptor_indices
|