geney 1.2.22__py2.py3-none-any.whl → 1.2.23__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/__init__.py +14 -2
- geney/data_setup.py +1 -1
- geney/graphic_utils.py +270 -0
- geney/mutation_utils.py +56 -0
- geney/oncosplice.py +197 -1543
- geney/pangolin_utils.py +78 -0
- geney/seqmat_utils.py +406 -0
- geney/spliceai_utils.py +52 -0
- geney/splicing_utils.py +372 -0
- geney/utils.py +24 -20
- {geney-1.2.22.dist-info → geney-1.2.23.dist-info}/METADATA +14 -14
- geney-1.2.23.dist-info/RECORD +25 -0
- {geney-1.2.22.dist-info → geney-1.2.23.dist-info}/WHEEL +1 -1
- geney-1.2.22.dist-info/RECORD +0 -19
- {geney-1.2.22.dist-info → geney-1.2.23.dist-info}/top_level.txt +0 -0
geney/pangolin_utils.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Load models
import torch
from pkg_resources import resource_filename
from pangolin.model import *
import numpy as np
import sys

# All seven Pangolin output heads; five replicate checkpoints exist per head.
pang_model_nums = [0, 1, 2, 3, 4, 5, 6]
pang_models = []

# Prefer Apple's MPS backend on macOS, otherwise CPU. CUDA placement is
# handled per-model below.
device = torch.device('cpu')
if sys.platform == 'darwin':
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# With CUDA the checkpoint is loaded as saved (map_location=None is the
# torch.load default); without it, tensors are remapped onto `device`.
# This collapses the previously duplicated torch.load calls.
_map_location = None if torch.cuda.is_available() else device

for i in pang_model_nums:
    for j in range(1, 6):
        model = Pangolin(L, W, AR).to(device)
        if torch.cuda.is_available():
            model.cuda()
        weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3" % (j, i)),
                             weights_only=True, map_location=_map_location)
        model.load_state_dict(weights)
        model.eval()
        pang_models.append(model)
33
|
+
def pang_one_hot_encode(seq):
    """One-hot encode a nucleotide sequence for Pangolin.

    Parameters
    ----------
    seq : str
        Sequence over A/C/G/T/N (case-insensitive); 'N' encodes to all zeros.

    Returns
    -------
    np.ndarray of shape (len(seq), 4)
        One row per base, columns in A, C, G, T order.
    """
    # Row 0 is the all-zero "unknown" row; rows 1-4 one-hot A, C, G, T.
    in_map = np.asarray([[0, 0, 0, 0],
                         [1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]])
    # One C-level translate pass instead of five chained str.replace calls
    # followed by a Python-level int() map.
    table = str.maketrans('ACGTN', '\x01\x02\x03\x04\x00')
    codes = np.frombuffer(seq.upper().translate(table).encode('latin-1'), dtype=np.uint8)
    return in_map[codes]
43
|
+
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def pangolin_predict_probs(true_seq, models):
    """Predict per-base donor/acceptor splice probabilities with Pangolin.

    Parameters
    ----------
    true_seq : str
        Sequence padded with 5000 nt of context on each side; probabilities
        are returned for the unpadded middle portion.
    models : list
        Pangolin models grouped in consecutive runs of five replicates,
        one group per entry of ``model_nums`` below.

    Returns
    -------
    (donor_probs, acceptor_probs)
        Two lists, one value per base of the unpadded sequence; scores are
        zeroed wherever the canonical GT (donor) / AG (acceptor)
        dinucleotide is absent.
    """
    # Output heads to score. NOTE(review): assumed to line up with the
    # 5-per-group ordering of `models` — confirm against the caller.
    model_nums = [0, 2, 4, 6]
    INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}

    padded_seq = true_seq
    true_seq = true_seq[5000:-5000]
    # Canonical splice-site dinucleotide masks over the unpadded sequence.
    acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))])
    donor_dinucleotide = np.array([true_seq[i + 1:i + 3] == 'GT' for i in range(len(true_seq))])

    seq = pang_one_hot_encode(padded_seq).T
    seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()

    # Pick one device and move the input exactly once. (Bug fix: the previous
    # code moved the tensor to CUDA and then back to the module-level
    # `device` on every model call, which both wasted the CUDA transfer and
    # mismatched models placed on the GPU via .cuda().)
    run_device = torch.device("cuda") if torch.cuda.is_available() else device
    seq = seq.to(run_device)

    scores = []
    for j, model_num in enumerate(model_nums):
        # Average the five replicate models of this group.
        score = []
        for model in models[5 * j:5 * j + 5]:
            with torch.no_grad():
                score.append(model(seq)[0][INDEX_MAP[model_num], :].cpu().numpy())
        scores.append(np.mean(score, axis=0))

    # Per-base maximum across groups, masked by the dinucleotide checks
    # (vectorized; previously a per-element Python loop).
    splicing_pred = np.array(scores).max(axis=0)
    donor_probs = list(splicing_pred[:len(true_seq)] * donor_dinucleotide)
    acceptor_probs = list(splicing_pred[:len(true_seq)] * acceptor_dinucleotide)
    return donor_probs, acceptor_probs
|
geney/seqmat_utils.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
from . import unload_pickle, Fasta_segment, config
|
|
2
|
+
import numpy as np
|
|
3
|
+
import copy
|
|
4
|
+
from Bio.Seq import Seq
|
|
5
|
+
|
|
6
|
+
NT_ALPHABET = ['A', 'T', 'G', 'C']
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
'''
|
|
10
|
+
for DNAseq
|
|
11
|
+
character track
|
|
12
|
+
position1 track
|
|
13
|
+
position2 track
|
|
14
|
+
transcript track
|
|
15
|
+
orf track
|
|
16
|
+
|
|
17
|
+
for AAseq
|
|
18
|
+
character track
|
|
19
|
+
position track
|
|
20
|
+
conservation track
|
|
21
|
+
domain track
|
|
22
|
+
'''
|
|
23
|
+
class SeqMat:
    """Integer-matrix representation of a nucleotide sequence.

    The sequence is stored as a 3xN int32 matrix (``self.seqmat``):
    row 0 the encoded characters (0 = unknown/'-', 1..len(alphabet) in
    alphabet order), row 1 the genomic index of each base, and row 2 a
    "super index" used to place inserted bases between genomic positions.
    """

    # Row labels for ``self.seqmat``.
    ROW_SEQ = 0
    ROW_INDS = 1
    ROW_SUPERINDS = 2

    def __init__(self, seq='-', inds=None, superinds=None, alphabet=NT_ALPHABET, ref=False):
        # NOTE(review): the parameter is named ``ref`` but is stored as
        # ``self.rev`` and triggers reverse-complementing — looks like a
        # strand flag; confirm the intended name.
        seq = list(seq)
        if inds is None:
            inds = np.arange(0, len(seq), dtype=np.int32)

        if superinds is None:
            superinds = np.zeros(len(seq), dtype=np.int32)

        else:
            # Lengths/monotonicity are only validated when superinds is
            # supplied explicitly (this else pairs with the superinds check).
            assert len(seq) == len(inds), f'Sequence length {len(seq)} must be equal to indices length {len(inds)}'
            assert self._is_monotonic(inds), f'Sequence indices must be monotonic, got {inds}'

        # 1-based encoding so value 0 can mean "unknown character".
        self.char_to_value = {c: i + 1 for i, c in enumerate(alphabet)}
        self.value_to_char = {i: c for c, i in self.char_to_value.items()}
        # Complement pairs for the default A/T/G/C encoding (A<->T, G<->C).
        self.complement_char = {1: 2, 2: 1, 3: 4, 4: 3}

        self.vectorized_map_c2v = np.vectorize(self.map_char_to_value)
        self.vectorized_map_v2c = np.vectorize(self.map_value_to_char)
        self.vectorized_map_v2v = np.vectorize(self.map_values_to_complement)
        self.seqmat = np.vstack([self.vectorized_map_c2v(seq), inds, superinds], dtype=np.int32)
        self.rev = ref
        if self.rev:
            self.reverse_complement()

    def __repr__(self):
        # NOTE(review): returns the bare sequence rather than a SeqMat(...) form.
        return self.seq

    def __str__(self):
        return self.seq

    def __len__(self):
        # Length of the visible sequence (gap '-' positions excluded by `seq`).
        return len(self.seq)

    def _is_monotonic(self, inds):
        # return all(x <= y for x, y in zip(inds, inds[1:])) or all(x >= y for x, y in zip(inds, inds[1:]))
        # Monotonicity check is currently disabled; always passes.
        return True  # np.all(np.diff(inds) >= 0) or np.all(np.diff(inds) <= 0)

    def map_char_to_value(self, char):
        return self.char_to_value.get(char, 0)  # Return 0 if character not found

    def map_value_to_char(self, val):
        return self.value_to_char.get(val, '-')  # Return '-' if value not found

    def map_values_to_complement(self, val):
        # Unknown values (e.g. 0) complement to 0.
        return self.complement_char.get(val, 0)

    def mutate(self, mut, return_seqmat=False):
        """Apply a mutation SeqMat onto a copy of this matrix.

        Returns the raw matrix when ``return_seqmat=True``, otherwise the
        ``(seq_string, indices, superindices)`` triple consumed by __add__.
        """
        ref_seqmat = self.seqmat.copy()
        mut_seqmat = mut.seqmat
        # assert ref_seqmat[self.ROW_INDS, :].min() <= mut_seqmat[self.ROW_INDS, :].min() and ref_seqmat[self.ROW_INDS,
        #                                                                                    :].max() >= mut_seqmat[
        #     self.ROW_INDS,
        #     :].max(), 'Mutation outside sequence'
        assert np.all(np.isin(mut_seqmat[1, :], ref_seqmat[1, :])), "Mutation not in sequence"

        if np.any(mut_seqmat[self.ROW_SUPERINDS, :] > 0):
            # Positive super-indices mark inserted bases: split them off the
            # mutation and insert them just after their anchor position.
            insertions = np.where(mut_seqmat[self.ROW_SUPERINDS, :] > 0)[0][0]
            mut_seqmat, ins_seqmat = mut_seqmat[:, :insertions], mut_seqmat[:, insertions:]
            ins_loc = np.where(ref_seqmat[1, :] == ins_seqmat[1, 0])[0][0] + 1
            ref_seqmat = np.insert(ref_seqmat, ins_loc, ins_seqmat.T, axis=1)

        # Overwrite the non-inserted reference positions covered by the mutation.
        condition = np.logical_and(np.isin(ref_seqmat[self.ROW_INDS, :], mut_seqmat[self.ROW_INDS, :]),
                                   ref_seqmat[self.ROW_SUPERINDS, :] == 0)
        indices = np.where(condition)[0]
        ref_seqmat[:, indices] = mut_seqmat
        if return_seqmat:
            return ref_seqmat

        return ''.join(self.vectorized_map_v2c(ref_seqmat[self.ROW_SEQ, :])), ref_seqmat[self.ROW_INDS, :], ref_seqmat[
            self.ROW_SUPERINDS,
            :]

    def reverse_complement(self):
        # In place: flip the column order, then complement the base values.
        self.seqmat = self.seqmat[:, ::-1]
        self.seqmat[0, :] = self.vectorized_map_v2v(self.seqmat[0, :])

    def pull_region(self, inds1, inds2=None):
        # inds1 is a (start, end) pair of genomic indices; end is inclusive.
        # NOTE(review): inds2 is accepted but unused.
        start_pos = np.where(self.seqmat[self.ROW_INDS] == inds1[0])[0][0]
        end_pos = np.where(self.seqmat[self.ROW_INDS] == inds1[1])[0][0] + 1
        return self.seqmat[:, start_pos:end_pos]

    def set_seqmat(self, mat):
        # Replace the backing matrix; returns self so calls can be chained.
        self.seqmat = mat
        return self

    def __add__(self, mut):
        # Non-mutating application of a mutation; returns a new SeqMat.
        return SeqMat(*self.mutate(mut))

    def __iadd__(self, mut):
        # In-place application of a mutation.
        self.seqmat = self.mutate(mut, return_seqmat=True)
        return self

    @property
    def seq(self):
        # Decoded sequence with unknown/gap positions ('-') removed.
        return ''.join(self.vectorized_map_v2c(self.seqmat[self.ROW_SEQ, :])).replace('-', '')

    @property
    def indices(self):
        # Genomic indices of non-gap positions; inserted bases get fractional
        # offsets (superindex / 10) so they sort between anchors.
        return self.seqmat[self.ROW_INDS, self.seqmat[self.ROW_SEQ, :] != 0] + (self.seqmat[self.ROW_SUPERINDS, self.seqmat[self.ROW_SEQ, :] != 0] / 10)

    @property
    def rawseq(self):
        # Decoded sequence including '-' gap characters.
        return ''.join(self.vectorized_map_v2c(self.seqmat[self.ROW_SEQ, :]))

    def subseq(self, start, end):
        # Sequence between two genomic indices (inclusive).
        # NOTE(review): slices self.seq (gaps removed) with positions computed
        # on the full matrix — verify behavior when gaps are present.
        start_pos = np.where(self.seqmat[self.ROW_INDS] == start)[0][0]
        end_pos = np.where(self.seqmat[self.ROW_INDS] == end)[0][0] + 1
        return self.seq[start_pos:end_pos]

    def raw_subseq(self, start, end):
        # Matrix slice between two genomic indices (inclusive).
        start_pos = np.where(self.seqmat[self.ROW_INDS] == start)[0][0]
        end_pos = np.where(self.seqmat[self.ROW_INDS] == end)[0][0] + 1
        return self.seqmat[:, start_pos:end_pos]

    def inspect(self, pos, context=500):
        # New SeqMat windowed around genomic position `pos` (clamped to bounds).
        condition = np.where(self.seqmat[1, :] == pos)[0][0]
        return SeqMat().set_seqmat(self.seqmat[:, max(0, condition - context):min(self.seqmat.shape[-1], condition + context + 1)])

    def rel_pos(self, pos):
        # Column offset of genomic position `pos` within the matrix.
        return np.where(self.seqmat[1, :] == pos)[0][0]
+
|
|
150
|
+
|
|
151
|
+
class Gene:
    """A gene loaded from the pickled annotation library for *organism*.

    Every key of the pickled dict becomes an instance attribute;
    ``transcripts`` maps transcript ids to their annotation dicts.
    """

    def __init__(self, gene_name='KRAS', variation=None, organism='hg38'):
        """Load the gene's pickle.

        Raises FileNotFoundError when no pickle matches *gene_name*.
        NOTE(review): ``variation`` is accepted but unused here.
        """
        gene_files = list((config[organism]['MRNA_PATH'] / 'protein_coding').glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            raise FileNotFoundError(f"No files available for gene {gene_name}.")

        data = unload_pickle(gene_files[0])
        for k, v in data.items():
            setattr(self, k, v)

        self.organism = organism
        needed_attributes = ['organism', 'transcripts', 'gene_name']
        # Fix: the message previously said "Transcript is missing ...".
        assert all(hasattr(self, attr) for attr in needed_attributes), \
            f"Gene is missing required attributes: {[attr for attr in needed_attributes if not hasattr(self, attr)]}"

    def __repr__(self):
        return f'Gene({self.gene_name})'

    def __len__(self):
        return len(self.transcripts)

    def __str__(self):
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __copy__(self):
        # Bug fix: delegating back to copy.copy(self) re-dispatched to this
        # method and recursed forever. Perform a real shallow copy instead.
        new = self.__class__.__new__(self.__class__)
        new.__dict__.update(self.__dict__)
        return new

    def __deepcopy__(self, memo):
        # Bug fix: delegating back to copy.deepcopy(self, memo) recursed
        # forever. Deep-copy the attribute dict explicitly.
        new = self.__class__.__new__(self.__class__)
        memo[id(self)] = new
        for k, v in self.__dict__.items():
            setattr(new, k, copy.deepcopy(v, memo))
        return new

    def __getitem__(self, index):
        # Positional access to transcripts (insertion order of the dict).
        key = list(self.transcripts.keys())[index]
        # Consistency fix: propagate organism as transcript()/run_transcripts() do.
        return Transcript(self.transcripts[key], organism=self.organism)

    def transcript(self, tid=None):
        """Return Transcript *tid* (default: the primary transcript).

        Raises AttributeError when the id is unknown.
        """
        if tid is None:
            tid = self.primary_transcript

        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")

        return Transcript(self.transcripts[tid], organism=self.organism)

    def run_transcripts(self, primary_transcript=False, protein_coding=False):
        """Yield (tid, Transcript) pairs, optionally filtered to the primary
        and/or protein-coding transcripts."""
        for tid, annotations in self.transcripts.items():
            if (primary_transcript and not annotations.get('primary_transcript')) or \
                    (protein_coding and annotations.get('transcript_biotype') != 'protein_coding'):
                continue
            yield tid, Transcript(annotations, organism=self.organism)

    @property
    def primary_transcript(self):
        # Cached id of the first transcript flagged primary in the annotations.
        if not hasattr(self, '_primary_transcript'):
            self._primary_transcript = [k for k, v in self.transcripts.items() if v.get('primary_transcript')][0]
        return self._primary_transcript
207
|
+
|
|
208
|
+
|
|
209
|
+
class Transcript:
    """One transcript isoform built from a pickled annotation dict.

    Construction copies every annotation onto the instance and immediately
    pulls the pre-mRNA SeqMat from the chromosome FASTA; the mature mRNA,
    ORF and protein are generated on demand by the ``generate_*`` methods.
    """

    def __init__(self, d, organism='hg38'):
        # Copy annotations onto the instance; splice sites and the
        # conservation vector become numpy arrays.
        for k, v in d.items():
            if k in ['acceptors', 'donors', 'cons_vector']:
                v = np.array(v)
            setattr(self, k, v)

        self.organism = organism
        needed_attributes = ['transcript_start', 'transcript_end', 'rev', 'chrm']
        assert all(hasattr(self, attr) for attr in needed_attributes), \
            f"Transcript is missing required attributes: {[attr for attr in needed_attributes if not hasattr(self, attr)]}"

        # Defaults for optional annotations.
        if not hasattr(self, 'donors'):
            self.donors = []

        if not hasattr(self, 'acceptors'):
            self.acceptors = []

        if not hasattr(self, 'cons_available'):
            self.cons_available = False

        # Protein coding only when both translation initiation (TIS) and
        # termination (TTS) sites are annotated.
        if not (hasattr(self, 'TIS') and hasattr(self, 'TTS')):
            self.protein_coding = False
        else:
            self.protein_coding = True

        # Strand-independent genomic bounds of the transcript.
        self.transcript_upper, self.transcript_lower = max(self.transcript_start, self.transcript_end), min(
            self.transcript_start, self.transcript_end)
        self.generate_pre_mrna()

        # Drop a trailing stop '*' so cons_seq and cons_vector stay aligned.
        if self.cons_available:
            if '*' == self.cons_seq[-1] and len(self.cons_seq) == len(self.cons_vector):
                self.cons_vector = self.cons_vector[:-1]
                self.cons_seq = self.cons_seq[:-1]

    def __repr__(self):
        return 'Transcript({tid})'.format(tid=self.transcript_id)

    def __len__(self):
        # NOTE(review): relies on a transcript_seq attribute coming from the
        # annotation dict — confirm it is always present.
        return len(self.transcript_seq)

    def __str__(self):
        return 'Transcript {tid}, Transcript Type: ' \
               '{protein_coding}, Primary: {primary}'.format(
            tid=self.transcript_id, protein_coding=self.transcript_biotype.replace('_', ' ').title(),
            primary=self.primary_transcript)

    def __eq__(self, other):
        # Transcripts compare equal when their sequences match.
        return self.transcript_seq == other.transcript_seq

    def __contains__(self, subvalue: np.array):
        '''
        :param subvalue: a SeqMat whose genomic indices are looked up in the pre-mRNA
        :return: whether all of subvalue's indices fall inside the pre-mRNA
        '''
        return np.all(np.isin(subvalue.seqmat[1, :], self.pre_mrna.seqmat[1, :]))

    # def __handle_cons(self):
    #     if '*' in self.cons_seq:
    #         self.cons_seq = self.cons_seq.replace('*', '')
    #         self.cons_vector = np.array(self.cons_vector[:-1])

    #     if self.cons_seq == self.protein and len(self.cons_vector) == len(self.cons_seq):
    #         self.cons_available = True

    #     if self.cons_available == False:
    #         self.cons_vector = np.ones(len(self.protein))

    @property
    def exons(self):
        '''
        :return: a list of tuples where the first position is the acceptor and the second position is the donor
        '''
        acceptors = np.concatenate(([self.transcript_start], self.acceptors))
        donors = np.concatenate((self.donors, [self.transcript_end]))
        return list(zip(acceptors, donors))

    @property
    def exons_pos(self):
        # Exons in ascending genomic coordinates regardless of strand.
        temp = self.exons
        if self.rev:
            # Reverse the order of exons and switch positions of the tuples
            temp = [(b, a) for a, b in temp[::-1]]
        return temp

    @property
    def introns(self):
        # (donor, acceptor) pairs; the transcript ends are not splice sites.
        donors = self.donors[self.donors != self.transcript_end]
        acceptors = self.acceptors[self.acceptors != self.transcript_start]
        return list(zip(donors, acceptors))

    @property
    def introns_pos(self):
        # Introns in ascending genomic coordinates regardless of strand.
        temp = self.introns
        if self.rev:
            temp = [(b, a) for a, b in temp[::-1]]
        return temp

    def _fix_and_check_introns(self):
        """Deduplicate and strand-sort splice sites, then sanity-check them.

        Raises ValueError for mismatched counts, mis-ordered exons, or sites
        outside the transcript bounds. Returns self.
        """
        self.acceptors = np.unique(self.acceptors)
        self.donors = np.unique(self.donors)
        # Descending on the minus strand so exon pairing follows transcript order.
        self.acceptors = np.sort(self.acceptors)[::-1] if self.rev else np.sort(self.acceptors)
        self.donors = np.sort(self.donors)[::-1] if self.rev else np.sort(self.donors)

        if self.__exon_intron_matchup_flag():
            raise ValueError(f"Unequal number of acceptors and donors.")
        if self.__exon_intron_order_flag():
            raise ValueError(f"Exons / intron order out of position.")
        if self.__transcript_boundary_flag():
            raise ValueError(f"Transcript boundaries must straddle acceptors and donors.")

        return self

    def __exon_coverage_flag(self):
        # True when summed exon lengths disagree with the transcript length.
        # NOTE(review): not called by _fix_and_check_introns.
        exon_lengths = np.sum(np.abs(self.acceptors - self.donors) + 1)  # Vectorized calculation
        return exon_lengths != len(self)

    def __exon_intron_matchup_flag(self):
        # True when acceptor/donor counts cannot be paired.
        return len(self.acceptors) != len(self.donors)

    def __exon_intron_order_flag(self):
        # True when any positional exon has start > end.
        exons_pos = self.exons_pos  # Precomputed exons with positions
        return np.any([start > end for start, end in exons_pos])

    def __transcript_boundary_flag(self):
        # True when any splice site falls outside the transcript bounds.
        if len(self.acceptors) == 0 and len(self.donors) == 0:
            return False

        min_boundary = np.min(np.concatenate((self.acceptors, self.donors)))
        max_boundary = np.max(np.concatenate((self.acceptors, self.donors)))
        return self.transcript_lower > min_boundary or self.transcript_upper < max_boundary

    @property
    def exonic_indices(self):
        # All genomic positions covered by exons (ascending within each exon).
        return np.concatenate([np.arange(a, b + 1) for a, b in self.exons_pos])

    # Related to transcript seq generation
    def pull_pre_mrna_pos(self):
        # Read the transcript's genomic span from the chromosome FASTA.
        fasta_obj = Fasta_segment()
        return fasta_obj.read_segment_endpoints(config[self.organism]['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
                                                self.transcript_lower,
                                                self.transcript_upper)

    def generate_pre_mrna(self):
        """Build the pre-mRNA SeqMat, reverse-complemented on the minus strand."""
        pre_mrna = SeqMat(*self.pull_pre_mrna_pos())
        if self.rev:
            pre_mrna.reverse_complement()

        self.pre_mrna = pre_mrna
        return self

    def generate_mature_mrna(self, inplace=True):
        """Splice the pre-mRNA by concatenating its exon regions in order.

        Stores the result on self (and returns self) when inplace, otherwise
        returns the raw matrix.
        """
        self._fix_and_check_introns()

        exon_regions = []
        for exon in self.exons:
            exon_regions.append(self.pre_mrna.pull_region(exon))
        mature_mrna = np.concatenate(exon_regions, axis=1)
        if inplace:
            self.mature_mrna = SeqMat().set_seqmat(mature_mrna)
            return self

        return mature_mrna

    # def find_end_codon(self):
    #     first_stop_index = next((i for i in range(0, len(orf) - 2, 3) if orf[i:i + 3] in {"TAG", "TAA", "TGA"}),
    #                             len(orf) - 3)
    #     while first_stop_index % 3 != 0:
    #         first_stop_index -= 1
    #
    #     orf = orf[:first_stop_index + 3]
    #     return None

    @property
    def orf(self):
        # Open-reading-frame slice of the mature mRNA between TIS and TTS.
        # NOTE(review): returns self (not a SeqMat) when TIS/TTS are missing.
        if not (hasattr(self, 'TIS') and hasattr(self, 'TTS')):
            print("Cannot create protein without set TIS and TTS values.")
            return self

        return SeqMat().set_seqmat(self.mature_mrna.raw_subseq(self.TIS, self.TTS))

    def generate_protein(self, inplace=True, domains=None):
        """Translate the ORF; optionally zero conservation outside *domains*.

        Returns self when inplace, otherwise (protein, cons_vector).
        """
        protein = str(Seq(self.orf.seq).translate()).replace('*', '')
        cons_vector = self.cons_vector
        if domains is not None and np.all(np.isin(domains, np.arange(0, len(protein)))):
            # Keep conservation only at the requested residue indices.
            all_indices = np.arange(cons_vector.size)
            mask = ~np.isin(all_indices, domains)
            cons_vector[mask] = 0

        if inplace:
            self.protein = protein
            if domains is not None:
                self.cons_vector = cons_vector
            return self

        return protein, cons_vector
|
406
|
+
|
geney/spliceai_utils.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
|
|
2
|
+
#### SpliceAI Modules
import tensorflow as tf
from keras.models import load_model
from pkg_resources import resource_filename
from spliceai.utils import one_hot_encode
import numpy as np
# (Removed a duplicate bare `import tensorflow`; the module is already
# bound as `tf` above and the bare name is never used.)

# Report the execution device once at import time.
if tf.config.list_physical_devices('GPU'):
    print("Running on GPU.")
else:
    print("Running on CPU.")

# tf.config.threading.set_intra_op_parallelism_threads(1)
# tf.config.threading.set_inter_op_parallelism_threads(1)

# Load the five SpliceAI ensemble models bundled with the spliceai package.
sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def sai_predict_probs(seq: str, models: list) -> tuple:
    '''
    Predicts the donor and acceptor junction probability of each
    NT in seq using SpliceAI.

    Let m := 2*sai_mrg_context + L be the input seq length. It is assumed
    that the input seq has the following structure:

        seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|

    Returns a pair ``(acceptor_probs, donor_probs)`` of length-L vectors
    corresponding to the middle <L NTs> of the input seq.
    (Fix: the annotation previously claimed ``-> list`` and the docstring
    described a single 2xL matrix, but the function returns a 2-tuple.)
    '''
    x = one_hot_encode(seq)[None, :]
    # Ensemble average over the five SpliceAI models.
    y = np.mean([models[m].predict(x, verbose=0) for m in range(5)], axis=0)
    # Drop the "neither" class (column 0); rows become acceptor, donor.
    y = y[0, :, 1:].T
    return y[0, :], y[1, :]
44
|
+
|
|
45
|
+
|
|
46
|
+
def run_spliceai_seq(seq, indices, threshold=0):
    """Run SpliceAI on *seq* and keep per-position probabilities >= threshold.

    Parameters
    ----------
    seq : str
        Input sequence (caller supplies any required flanking context).
    indices : sequence
        Coordinates aligned with SpliceAI's per-base output.
    threshold : float
        Minimum probability for a position to be reported.

    Returns
    -------
    (acceptor_indices, donor_indices)
        Two dicts mapping index -> probability.
    """
    # Bug fix: sai_predict_probs returns an (acceptor, donor) tuple, which
    # the previous code indexed like a 2-D array (ref_seq_probs_temp[0, :]),
    # raising TypeError at runtime. Unpack the tuple instead.
    ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(seq, sai_models)
    acceptor_indices = {a: b for a, b in zip(indices, ref_seq_acceptor_probs) if b >= threshold}
    donor_indices = {a: b for a, b in zip(indices, ref_seq_donor_probs) if b >= threshold}
    return acceptor_indices, donor_indices