geney 1.2.22__py2.py3-none-any.whl → 1.2.24__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

@@ -0,0 +1,81 @@
1
# Load models
import torch
from pkg_resources import resource_filename
from pangolin.model import *
import numpy as np
import sys

# The seven Pangolin output groups; five replicate weight files exist per group.
pang_model_nums = [0, 1, 2, 3, 4, 5, 6]
pang_models = []  # flat list, filled as 5 replicates per group (7 * 5 = 35 models)

# Pick the best available torch device per platform:
# macOS -> MPS when available, Linux -> CUDA when available, otherwise CPU.
device = torch.device('cpu')
if sys.platform == 'darwin':
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

if sys.platform == 'linux':
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Instantiate and load every Pangolin model once, at import time.
# Pangolin, L, W and AR come from the star import of pangolin.model.
for i in pang_model_nums:
    for j in range(1, 6):
        model = Pangolin(L, W, AR).to(device)
        if torch.cuda.is_available():
            model.cuda()
            # weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
            # weights_only=True restricts torch.load to tensor data (safer unpickling).
            weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3" % (j, i)), weights_only=True)

        else:
            # On CPU/MPS the checkpoint tensors must be remapped onto the active device.
            weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)), weights_only=True,
                                 map_location=device)

        model.load_state_dict(weights)
        model.eval()  # inference only; disables dropout/batch-norm updates
        pang_models.append(model)
34
+
35
+
36
def pang_one_hot_encode(seq):
    """One-hot encode a nucleotide string for Pangolin.

    A/C/G/T (case-insensitive) map to the four unit row vectors and N maps
    to the all-zero row. Returns an integer array of shape (len(seq), 4).
    """
    # Row 0 is the all-zero "N" row; rows 1-4 are the A/C/G/T unit vectors.
    one_hot_rows = np.eye(5, 4, k=-1, dtype=int)
    # Translate each base to the digit of its row index in one pass.
    digit_map = str.maketrans('ACGTN', '12340')
    digits = seq.upper().translate(digit_map)
    row_indices = np.asarray([int(d) for d in digits])
    return one_hot_rows[row_indices.astype('int8')]
46
+
47
+
48
+
49
def pangolin_predict_probs(true_seq, models):
    """Predict per-nucleotide donor and acceptor splice probabilities with Pangolin.

    Parameters
    ----------
    true_seq : str
        Input sequence already padded with 5000 flanking nucleotides on each
        side; only the central (unpadded) region is scored.
    models : list
        Flat list of trained Pangolin models grouped in runs of five
        replicates per output head (as built by the module-level loader).

    Returns
    -------
    tuple(list, list)
        (donor_probs, acceptor_probs) for the central region. A position's
        probability is zeroed unless the canonical dinucleotide is present
        (GT just downstream of a donor, AG just upstream of an acceptor).
    """
    model_nums = [0, 2, 4, 6]  # splice-usage heads; INDEX_MAP picks the matching output row
    INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}

    seq = true_seq
    # Strip the 5 kb flanks: probabilities are reported for the central region only.
    true_seq = true_seq[5000:-5000]
    acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))])
    donor_dinucleotide = np.array([true_seq[i + 1:i + 3] == 'GT' for i in range(len(true_seq))])

    # Channels-first one-hot tensor with a leading batch dimension.
    seq = pang_one_hot_encode(seq).T
    seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()

    # BUGFIX: this transfer used to run unconditionally ("seq = seq.to(torch.device('cuda'))"),
    # which raised on CPU/MPS-only hosts. Only move to CUDA when it exists.
    if torch.cuda.is_available():
        seq = seq.to(torch.device("cuda"))

    scores = []
    for j, model_num in enumerate(model_nums):
        score = []
        # Average across the 5 replicate models trained for this head.
        # NOTE: relies on the module-level `device` chosen at import time.
        for model in models[5 * j:5 * j + 5]:
            with torch.no_grad():
                score.append(model(seq.to(device))[0][INDEX_MAP[model_num], :].cpu().numpy())

        scores.append(np.mean(score, axis=0))

    # Take the maximum averaged probability across heads at each position.
    splicing_pred = np.array(scores).max(axis=0)
    donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
    acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
    return donor_probs, acceptor_probs
geney/seqmat_utils.py ADDED
@@ -0,0 +1,406 @@
1
+ from . import unload_pickle, Fasta_segment, config
2
+ import numpy as np
3
+ import copy
4
+ from Bio.Seq import Seq
5
+
6
+ NT_ALPHABET = ['A', 'T', 'G', 'C']
7
+
8
+
9
+ '''
10
+ for DNAseq
11
+ character track
12
+ position1 track
13
+ position2 track
14
+ transcript track
15
+ orf track
16
+
17
+ for AAseq
18
+ character track
19
+ position track
20
+ conservation track
21
+ domain track
22
+ '''
23
class SeqMat:
    """Matrix representation of a (possibly mutated) nucleotide sequence.

    The sequence is stored as a 3 x N int32 matrix:
      row 0 (ROW_SEQ)       - encoded characters (0 = gap/unknown, 1..len(alphabet)),
      row 1 (ROW_INDS)      - genomic indices for each position,
      row 2 (ROW_SUPERINDS) - "super indices" that order insertions sharing a
                              genomic index (0 for reference positions).
    """
    ROW_SEQ = 0
    ROW_INDS = 1
    ROW_SUPERINDS = 2

    def __init__(self, seq='-', inds=None, superinds=None, alphabet=None, ref=False):
        """Build the matrix from a sequence string and optional index tracks.

        `alphabet` defaults to the module-level NT_ALPHABET; it is resolved at
        call time (rather than in the signature) so the default stays in sync
        if the module constant is rebound.  If `ref` is True the matrix is
        reverse-complemented immediately after construction.
        """
        if alphabet is None:
            alphabet = NT_ALPHABET
        seq = list(seq)
        if inds is None:
            inds = np.arange(0, len(seq), dtype=np.int32)
        else:
            # BUGFIX: these sanity checks previously ran only when `superinds`
            # was also supplied; they apply whenever explicit indices are given.
            assert len(seq) == len(inds), f'Sequence length {len(seq)} must be equal to indices length {len(inds)}'
            assert self._is_monotonic(inds), f'Sequence indices must be monotonic, got {inds}'

        if superinds is None:
            superinds = np.zeros(len(seq), dtype=np.int32)

        self.char_to_value = {c: i + 1 for i, c in enumerate(alphabet)}
        self.value_to_char = {i: c for c, i in self.char_to_value.items()}
        # Complement pairs for the default alphabet ['A','T','G','C']:
        # A<->T is 1<->2 and G<->C is 3<->4.
        self.complement_char = {1: 2, 2: 1, 3: 4, 4: 3}

        self.vectorized_map_c2v = np.vectorize(self.map_char_to_value)
        self.vectorized_map_v2c = np.vectorize(self.map_value_to_char)
        self.vectorized_map_v2v = np.vectorize(self.map_values_to_complement)
        self.seqmat = np.vstack([self.vectorized_map_c2v(seq), inds, superinds]).astype(np.int32)
        self.rev = ref
        if self.rev:
            self.reverse_complement()

    def __repr__(self):
        return self.seq

    def __str__(self):
        return self.seq

    def __len__(self):
        return len(self.seq)

    def _is_monotonic(self, inds):
        # NOTE(review): the real monotonicity check is disabled and this always
        # passes; re-enable np.all(np.diff(inds) >= 0) / <= 0 once all callers
        # are known to comply.
        return True

    def map_char_to_value(self, char):
        """Encode one character; unknown characters encode to 0."""
        return self.char_to_value.get(char, 0)

    def map_value_to_char(self, val):
        """Decode one value; unknown values decode to '-'."""
        return self.value_to_char.get(val, '-')

    def map_values_to_complement(self, val):
        """Complement one encoded value; values without a complement map to 0."""
        return self.complement_char.get(val, 0)

    def mutate(self, mut, return_seqmat=False):
        """Apply the mutation encoded in `mut` (another SeqMat) to a copy of this matrix.

        Positions with superinds > 0 in `mut` are treated as insertions and are
        spliced in after their anchor index; remaining positions overwrite the
        reference positions that share their genomic index.

        Returns the mutated matrix when `return_seqmat` is True, otherwise the
        (sequence string, indices, superinds) triple.
        """
        ref_seqmat = self.seqmat.copy()
        mut_seqmat = mut.seqmat
        assert np.all(np.isin(mut_seqmat[1, :], ref_seqmat[1, :])), "Mutation not in sequence"

        if np.any(mut_seqmat[self.ROW_SUPERINDS, :] > 0):
            # Split substitutions (superind == 0) from the insertion tail and
            # splice the insertion in right after its anchor position.
            insertions = np.where(mut_seqmat[self.ROW_SUPERINDS, :] > 0)[0][0]
            mut_seqmat, ins_seqmat = mut_seqmat[:, :insertions], mut_seqmat[:, insertions:]
            ins_loc = np.where(ref_seqmat[1, :] == ins_seqmat[1, 0])[0][0] + 1
            ref_seqmat = np.insert(ref_seqmat, ins_loc, ins_seqmat.T, axis=1)

        # Overwrite the reference columns (superind == 0) that the mutation covers.
        condition = np.logical_and(np.isin(ref_seqmat[self.ROW_INDS, :], mut_seqmat[self.ROW_INDS, :]),
                                   ref_seqmat[self.ROW_SUPERINDS, :] == 0)
        indices = np.where(condition)[0]
        ref_seqmat[:, indices] = mut_seqmat
        if return_seqmat:
            return ref_seqmat

        return ''.join(self.vectorized_map_v2c(ref_seqmat[self.ROW_SEQ, :])), ref_seqmat[self.ROW_INDS, :], ref_seqmat[
            self.ROW_SUPERINDS, :]

    def reverse_complement(self):
        """Reverse the matrix columns and complement the character row in place."""
        self.seqmat = self.seqmat[:, ::-1]
        self.seqmat[0, :] = self.vectorized_map_v2v(self.seqmat[0, :])

    def pull_region(self, inds1, inds2=None):
        """Return the sub-matrix between the genomic indices inds1[0] and inds1[1] (inclusive).

        NOTE(review): `inds2` is currently unused - kept for interface compatibility.
        """
        start_pos = np.where(self.seqmat[self.ROW_INDS] == inds1[0])[0][0]
        end_pos = np.where(self.seqmat[self.ROW_INDS] == inds1[1])[0][0] + 1
        return self.seqmat[:, start_pos:end_pos]

    def set_seqmat(self, mat):
        """Replace the underlying matrix and return self (fluent helper)."""
        self.seqmat = mat
        return self

    def __add__(self, mut):
        """Return a new SeqMat with the mutation `mut` applied."""
        return SeqMat(*self.mutate(mut))

    def __iadd__(self, mut):
        """Apply the mutation `mut` in place."""
        self.seqmat = self.mutate(mut, return_seqmat=True)
        return self

    @property
    def seq(self):
        """The decoded sequence with gap characters ('-') removed."""
        return ''.join(self.vectorized_map_v2c(self.seqmat[self.ROW_SEQ, :])).replace('-', '')

    @property
    def indices(self):
        """Genomic indices of non-gap positions, with superinds folded in as a decimal fraction."""
        return self.seqmat[self.ROW_INDS, self.seqmat[self.ROW_SEQ, :] != 0] + (self.seqmat[self.ROW_SUPERINDS, self.seqmat[self.ROW_SEQ, :] != 0] / 10)

    @property
    def rawseq(self):
        """The decoded sequence including gap characters."""
        return ''.join(self.vectorized_map_v2c(self.seqmat[self.ROW_SEQ, :]))

    def subseq(self, start, end):
        """Gap-stripped sequence between the genomic indices `start` and `end` (inclusive)."""
        start_pos = np.where(self.seqmat[self.ROW_INDS] == start)[0][0]
        end_pos = np.where(self.seqmat[self.ROW_INDS] == end)[0][0] + 1
        return self.seq[start_pos:end_pos]

    def raw_subseq(self, start, end):
        """Sub-matrix between the genomic indices `start` and `end` (inclusive)."""
        start_pos = np.where(self.seqmat[self.ROW_INDS] == start)[0][0]
        end_pos = np.where(self.seqmat[self.ROW_INDS] == end)[0][0] + 1
        return self.seqmat[:, start_pos:end_pos]

    def inspect(self, pos, context=500):
        """Return a new SeqMat window of +/- `context` columns around genomic index `pos`."""
        condition = np.where(self.seqmat[1, :] == pos)[0][0]
        return SeqMat().set_seqmat(self.seqmat[:, max(0, condition - context):min(self.seqmat.shape[-1], condition + context + 1)])

    def rel_pos(self, pos):
        """Column offset of genomic index `pos` within the matrix."""
        return np.where(self.seqmat[1, :] == pos)[0][0]
149
+
150
+
151
class Gene:
    """Container for one gene's annotation record loaded from a pickle on disk.

    Every key/value pair of the pickled record becomes an instance attribute;
    `transcripts` maps transcript ids to their annotation dicts.
    """
    def __init__(self, gene_name='KRAS', variation=None, organism='hg38'):
        """Load the pickled annotation for `gene_name` from the organism's
        protein-coding mRNA directory.

        Raises FileNotFoundError when no matching pickle exists.
        NOTE(review): `variation` is currently unused - confirm intent.
        """
        gene_files = list((config[organism]['MRNA_PATH'] / 'protein_coding').glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            raise FileNotFoundError(f"No files available for gene {gene_name}.")

        data = unload_pickle(gene_files[0])
        for k, v in data.items():
            setattr(self, k, v)

        self.organism = organism
        needed_attributes = ['organism', 'transcripts', 'gene_name']
        # Fixed message: this assertion guards Gene, not Transcript.
        assert all(hasattr(self, attr) for attr in needed_attributes), \
            f"Gene is missing required attributes: {[attr for attr in needed_attributes if not hasattr(self, attr)]}"

    def __repr__(self):
        return f'Gene({self.gene_name})'

    def __len__(self):
        return len(self.transcripts)

    def __str__(self):
        # NOTE(review): assumes the pickle provided gene_id and chrm attributes.
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __copy__(self):
        # BUGFIX: the previous implementation returned copy.copy(self), which
        # re-invokes __copy__ and recurses forever. Build the shallow copy by hand.
        clone = self.__class__.__new__(self.__class__)
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo):
        # BUGFIX: likewise, copy.deepcopy(self, memo) recursed forever.
        clone = self.__class__.__new__(self.__class__)
        memo[id(self)] = clone
        for k, v in self.__dict__.items():
            setattr(clone, k, copy.deepcopy(v, memo))
        return clone

    def __getitem__(self, index):
        key = list(self.transcripts.keys())[index]
        # Pass the organism through so positional access matches .transcript().
        return Transcript(self.transcripts[key], organism=self.organism)

    def transcript(self, tid=None):
        """Return the Transcript for `tid` (defaults to the primary transcript).

        Raises AttributeError when `tid` is not one of this gene's transcripts.
        """
        if tid is None:
            tid = self.primary_transcript

        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")

        return Transcript(self.transcripts[tid], organism=self.organism)

    def run_transcripts(self, primary_transcript=False, protein_coding=False):
        """Yield (tid, Transcript) pairs, optionally restricted to the primary
        and/or protein-coding transcripts."""
        for tid, annotations in self.transcripts.items():
            if (primary_transcript and not annotations.get('primary_transcript')) or \
                    (protein_coding and annotations.get('transcript_biotype') != 'protein_coding'):
                continue
            yield tid, Transcript(annotations, organism=self.organism)

    @property
    def primary_transcript(self):
        """Id of the annotated primary transcript (cached after first access).

        NOTE(review): raises IndexError when no transcript is flagged primary.
        """
        if not hasattr(self, '_primary_transcript'):
            self._primary_transcript = [k for k, v in self.transcripts.items() if v.get('primary_transcript')][0]
        return self._primary_transcript
207
+
208
+
209
class Transcript:
    """A single transcript's annotation plus its genomic pre-mRNA sequence.

    Built from an annotation dict (as stored in Gene.transcripts): every key
    becomes an attribute, list-like splice/conservation tracks are promoted to
    numpy arrays, and the pre-mRNA sequence is fetched from the chromosome
    FASTA during construction.
    """
    def __init__(self, d, organism='hg38'):
        """Adopt all annotation key/values, validate required fields, and pull
        the pre-mRNA for [transcript_lower, transcript_upper].

        Raises AssertionError when transcript_start/end, rev or chrm are missing.
        """
        for k, v in d.items():
            # These tracks are used with numpy operations downstream.
            if k in ['acceptors', 'donors', 'cons_vector']:
                v = np.array(v)
            setattr(self, k, v)

        self.organism = organism
        needed_attributes = ['transcript_start', 'transcript_end', 'rev', 'chrm']
        assert all(hasattr(self, attr) for attr in needed_attributes), \
            f"Transcript is missing required attributes: {[attr for attr in needed_attributes if not hasattr(self, attr)]}"

        # Default the splice-site tracks when the annotation omits them.
        # NOTE(review): these defaults are plain lists while annotated values
        # become numpy arrays - the boolean masking in `introns` assumes arrays.
        if not hasattr(self, 'donors'):
            self.donors = []

        if not hasattr(self, 'acceptors'):
            self.acceptors = []

        if not hasattr(self, 'cons_available'):
            self.cons_available = False

        # Protein coding only when both the translation initiation (TIS) and
        # termination (TTS) sites are annotated.
        if not (hasattr(self, 'TIS') and hasattr(self, 'TTS')):
            self.protein_coding = False
        else:
            self.protein_coding = True

        # Strand-independent genomic bounds of the transcript.
        self.transcript_upper, self.transcript_lower = max(self.transcript_start, self.transcript_end), min(
            self.transcript_start, self.transcript_end)
        self.generate_pre_mrna()

        # Trim a trailing stop marker ('*') from the conservation sequence,
        # keeping the conservation vector aligned with it.
        if self.cons_available:
            if '*' == self.cons_seq[-1] and len(self.cons_seq) == len(self.cons_vector):
                self.cons_vector = self.cons_vector[:-1]
                self.cons_seq = self.cons_seq[:-1]

    def __repr__(self):
        return 'Transcript({tid})'.format(tid=self.transcript_id)

    def __len__(self):
        # NOTE(review): transcript_seq is not assigned anywhere in this module -
        # confirm it is provided by the annotation dict.
        return len(self.transcript_seq)

    def __str__(self):
        # NOTE(review): assumes transcript_biotype and primary_transcript were
        # present in the annotation dict.
        return 'Transcript {tid}, Transcript Type: ' \
               '{protein_coding}, Primary: {primary}'.format(
            tid=self.transcript_id, protein_coding=self.transcript_biotype.replace('_', ' ').title(),
            primary=self.primary_transcript)

    def __eq__(self, other):
        return self.transcript_seq == other.transcript_seq

    def __contains__(self, subvalue: 'SeqMat'):
        '''
        :param subvalue: the SeqMat whose positions to search for in the pre-mRNA
        :return: whether or not every genomic index of `subvalue` falls inside this transcript's pre-mRNA
        '''
        return np.all(np.isin(subvalue.seqmat[1, :], self.pre_mrna.seqmat[1, :]))


    # def __handle_cons(self):
    #     if '*' in self.cons_seq:
    #         self.cons_seq = self.cons_seq.replace('*', '')
    #         self.cons_vector = np.array(self.cons_vector[:-1])

    #     if self.cons_seq == self.protein and len(self.cons_vector) == len(self.cons_seq):
    #         self.cons_available = True

    #     if self.cons_available == False:
    #         self.cons_vector = np.ones(len(self.protein))

    @property
    def exons(self):
        '''
        :return: a list of tuples where the first position is the acceptor and the second position is the donor
        '''
        # The transcript boundaries act as the first acceptor / last donor.
        acceptors = np.concatenate(([self.transcript_start], self.acceptors))
        donors = np.concatenate((self.donors, [self.transcript_end]))
        return list(zip(acceptors, donors))

    @property
    def exons_pos(self):
        """Exons in ascending genomic order (tuple members swapped on the reverse strand)."""
        temp = self.exons
        if self.rev:
            # Reverse the order of exons and switch positions of the tuples
            temp = [(b, a) for a, b in temp[::-1]]
        return temp

    @property
    def introns(self):
        """(donor, acceptor) pairs for each intron, excluding the transcript boundaries."""
        donors = self.donors[self.donors != self.transcript_end]
        acceptors = self.acceptors[self.acceptors != self.transcript_start]
        return list(zip(donors, acceptors))

    @property
    def introns_pos(self):
        """Introns in ascending genomic order (tuple members swapped on the reverse strand)."""
        temp = self.introns
        if self.rev:
            temp = [(b, a) for a, b in temp[::-1]]
        return temp

    def _fix_and_check_introns(self):
        """Deduplicate and strand-sort the splice sites, then validate their structure.

        Raises ValueError when acceptors/donors are unbalanced, out of order,
        or fall outside the transcript boundaries. Returns self.
        """
        self.acceptors = np.unique(self.acceptors)
        self.donors = np.unique(self.donors)
        # Sort descending on the reverse strand so sites follow transcription order.
        self.acceptors = np.sort(self.acceptors)[::-1] if self.rev else np.sort(self.acceptors)
        self.donors = np.sort(self.donors)[::-1] if self.rev else np.sort(self.donors)

        if self.__exon_intron_matchup_flag():
            raise ValueError(f"Unequal number of acceptors and donors.")
        if self.__exon_intron_order_flag():
            raise ValueError(f"Exons / intron order out of position.")
        if self.__transcript_boundary_flag():
            raise ValueError(f"Transcript boundaries must straddle acceptors and donors.")

        return self

    def __exon_coverage_flag(self):
        """True when the summed exon lengths disagree with the transcript length."""
        exon_lengths = np.sum(np.abs(self.acceptors - self.donors) + 1)  # Vectorized calculation
        return exon_lengths != len(self)

    def __exon_intron_matchup_flag(self):
        """True when the acceptor and donor counts differ."""
        return len(self.acceptors) != len(self.donors)

    def __exon_intron_order_flag(self):
        """True when any position-ordered exon has start > end."""
        exons_pos = self.exons_pos  # Precomputed exons with positions
        return np.any([start > end for start, end in exons_pos])

    def __transcript_boundary_flag(self):
        """True when any splice site falls outside the transcript's genomic bounds."""
        if len(self.acceptors) == 0 and len(self.donors) == 0:
            return False

        min_boundary = np.min(np.concatenate((self.acceptors, self.donors)))
        max_boundary = np.max(np.concatenate((self.acceptors, self.donors)))
        return self.transcript_lower > min_boundary or self.transcript_upper < max_boundary

    @property
    def exonic_indices(self):
        """All genomic indices covered by the exons, in ascending order."""
        return np.concatenate([np.arange(a, b + 1) for a, b in self.exons_pos])

    # Related to transcript seq generation
    def pull_pre_mrna_pos(self):
        """Read the transcript's genomic span from the chromosome FASTA.

        Returns whatever Fasta_segment.read_segment_endpoints yields
        (consumed as SeqMat constructor arguments by generate_pre_mrna).
        """
        fasta_obj = Fasta_segment()
        return fasta_obj.read_segment_endpoints(config[self.organism]['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
                                                self.transcript_lower,
                                                self.transcript_upper)

    def generate_pre_mrna(self):
        """Build self.pre_mrna as a SeqMat, reverse-complemented on the reverse strand.

        Returns self.
        """
        pre_mrna = SeqMat(*self.pull_pre_mrna_pos())
        if self.rev:
            pre_mrna.reverse_complement()

        self.pre_mrna = pre_mrna
        return self

    def generate_mature_mrna(self, inplace=True):
        """Splice the pre-mRNA: validate the splice sites, then concatenate the exon regions.

        With inplace=True, stores the result as self.mature_mrna and returns
        self; otherwise returns the raw spliced matrix.
        """
        self._fix_and_check_introns()

        exon_regions = []
        for exon in self.exons:
            exon_regions.append(self.pre_mrna.pull_region(exon))
        mature_mrna = np.concatenate(exon_regions, axis=1)
        if inplace:
            self.mature_mrna = SeqMat().set_seqmat(mature_mrna)
            return self

        return mature_mrna

    # def find_end_codon(self):
    #     first_stop_index = next((i for i in range(0, len(orf) - 2, 3) if orf[i:i + 3] in {"TAG", "TAA", "TGA"}),
    #                             len(orf) - 3)
    #     while first_stop_index % 3 != 0:
    #         first_stop_index -= 1
    #
    #     orf = orf[:first_stop_index + 3]
    #     return None

    @property
    def orf(self):
        """Open reading frame of the mature mRNA between TIS and TTS, as a SeqMat.

        NOTE(review): returns `self` (not a SeqMat) when TIS/TTS are missing -
        callers must tolerate both types.
        """
        if not (hasattr(self, 'TIS') and hasattr(self, 'TTS')):
            print("Cannot create protein without set TIS and TTS values.")
            return self

        return SeqMat().set_seqmat(self.mature_mrna.raw_subseq(self.TIS, self.TTS))

    def generate_protein(self, inplace=True, domains=None):
        """Translate the ORF and optionally zero conservation outside `domains`.

        With inplace=True, stores self.protein (and self.cons_vector when
        domains were given) and returns self; otherwise returns the
        (protein, cons_vector) pair.

        NOTE(review): `cons_vector` aliases self.cons_vector, so the masking
        below mutates the stored vector even when inplace=False.
        """
        protein = str(Seq(self.orf.seq).translate()).replace('*', '')
        cons_vector = self.cons_vector
        # Only apply the mask when every domain index is a valid protein position.
        if domains is not None and np.all(np.isin(domains, np.arange(0, len(protein)))):
            all_indices = np.arange(cons_vector.size)
            mask = ~np.isin(all_indices, domains)
            cons_vector[mask] = 0

        if inplace:
            self.protein = protein
            if domains is not None:
                self.cons_vector = cons_vector
            return self

        return protein, cons_vector
406
+
@@ -0,0 +1,52 @@
1
+
2
#### SpliceAI Modules
import tensorflow as tf
from keras.models import load_model
from pkg_resources import resource_filename
from spliceai.utils import one_hot_encode
import numpy as np
# NOTE(review): duplicate of the aliased tensorflow import above - harmless but redundant.
import tensorflow

# Check if GPU is available (informational only; inference works either way).
if tf.config.list_physical_devices('GPU'):
    print("Running on GPU.")
else:
    print("Running on CPU.")

# tf.config.threading.set_intra_op_parallelism_threads(1)
# tf.config.threading.set_inter_op_parallelism_threads(1)

# Load the five SpliceAI replicate models shipped with the spliceai package;
# downstream predictions are averaged across all five.
sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
21
+
22
+
23
+
24
def sai_predict_probs(seq: str, models: list) -> list:
    '''
    Predicts the donor and acceptor junction probability of each
    NT in seq using SpliceAI.

    Let m:=2*sai_mrg_context + L be the input seq length. It is assumed
    that the input seq has the following structure:

    seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|

    Returns a pair (acceptor_probs, donor_probs), each of length L,
    covering the middle <L NTs> of the input seq.
    '''
    encoded = one_hot_encode(seq)[None, :]
    # Average the predictions of the five replicate models.
    per_model = [models[m].predict(encoded, verbose=0) for m in range(5)]
    averaged = np.mean(per_model, axis=0)
    # Drop the "no splice site" channel and transpose to (2, L):
    # row 0 holds acceptor probabilities, row 1 holds donor probabilities.
    prob_matrix = averaged[0, :, 1:].T
    return prob_matrix[0, :], prob_matrix[1, :]
44
+
45
+
46
def run_spliceai_seq(seq, indices, threshold=0):
    """Score `seq` with SpliceAI and keep per-position probabilities >= `threshold`.

    Parameters
    ----------
    seq : str
        Nucleotide sequence (caller is responsible for any N-padding).
    indices : iterable
        Genomic coordinates aligned with the scored positions of `seq`.
    threshold : float
        Minimum probability to keep (default 0 keeps every position).

    Returns
    -------
    (dict, dict)
        Position -> probability maps for acceptors and donors, respectively.
    """
    # BUGFIX: sai_predict_probs returns an (acceptor, donor) tuple of 1-D
    # arrays, so the old `ref_seq_probs_temp[0, :]` indexing raised TypeError.
    # Unpack the tuple instead.
    ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(seq, sai_models)
    acceptor_indices = {pos: prob for pos, prob in zip(indices, ref_seq_acceptor_probs) if prob >= threshold}
    donor_indices = {pos: prob for pos, prob in zip(indices, ref_seq_donor_probs) if prob >= threshold}
    return acceptor_indices, donor_indices