geney 1.2.22__py2.py3-none-any.whl → 1.2.24__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/__init__.py +14 -2
- geney/data_setup.py +1 -1
- geney/graphic_utils.py +270 -0
- geney/mutation_utils.py +56 -0
- geney/oncosplice.py +197 -1543
- geney/pangolin_utils.py +81 -0
- geney/seqmat_utils.py +406 -0
- geney/spliceai_utils.py +52 -0
- geney/splicing_utils.py +372 -0
- geney/utils.py +24 -20
- {geney-1.2.22.dist-info → geney-1.2.24.dist-info}/METADATA +13 -16
- geney-1.2.24.dist-info/RECORD +25 -0
- {geney-1.2.22.dist-info → geney-1.2.24.dist-info}/WHEEL +1 -1
- geney-1.2.22.dist-info/RECORD +0 -19
- {geney-1.2.22.dist-info → geney-1.2.24.dist-info}/top_level.txt +0 -0
geney/splicing_utils.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
import numpy as np
|
|
3
|
+
from .mutation_utils import get_mutation
|
|
4
|
+
from .seqmat_utils import Gene
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SpliceSite:
    """A single splice site: genomic position, site type, and predicted probability."""

    def __init__(self, pos, ss_type, prob):
        # ss_type encodes the site kind: 0 -> donor, 1 -> acceptor
        self.pos = pos
        self.ss_type = ss_type
        self.prob = prob
|
|
12
|
+
|
|
13
|
+
class SpliceSiteFactory:
    """Factory wrapper around SpliceSite construction, kept for API compatibility."""

    @staticmethod
    def create_splice_site(pos, ss_type, prob):
        # Thin indirection: simply delegates to the SpliceSite constructor.
        site = SpliceSite(pos, ss_type, prob)
        return site
|
|
17
|
+
|
|
18
|
+
def compute_paths_sequential(G, transcript, exon_starts, exon_ends):
    """
    Enumerate all simple paths between the transcript boundaries, in both
    directions, and return them keyed by index together with the sum of
    their multiplicative edge-weight probabilities.

    :param G: directed splice graph whose edges carry a 'weight' attribute
    :param transcript: object exposing transcript_start, transcript_end, rev
    :param exon_starts: {position: prob} map of candidate acceptor sites
    :param exon_ends: {position: prob} map of candidate donor sites
    :return: ({index: {'acceptors', 'donors', 'path_weight'}}, total weight)
    """
    start = transcript.transcript_start
    end = transcript.transcript_end

    # Paths are collected start->end and end->start so both strand
    # orientations of the graph are covered.
    forward = list(nx.all_simple_paths(G, start, end))
    backward = list(nx.all_simple_paths(G, end, start))

    new_paths = {}
    prob_sum = 0
    for idx, path in enumerate(forward + backward):
        weight = path_weight_mult(G, path, 'weight')
        prob_sum += weight
        acceptors = [p for p in path if p in exon_starts and p != start]
        donors = [p for p in path if p in exon_ends and p != end]
        new_paths[idx] = {
            'acceptors': sorted(acceptors, reverse=transcript.rev),
            'donors': sorted(donors, reverse=transcript.rev),
            'path_weight': weight,
        }
    return new_paths, prob_sum
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def develop_aberrant_splicing(transcript, aberrant_splicing):
    """
    Build the set of plausible alternative splice paths for *transcript*
    given aberrant-splicing predictions.

    :param transcript: object exposing acceptors, donors, transcript_start,
        transcript_end and rev
    :param aberrant_splicing: dict with missed_/discovered_ acceptor/donor
        predictions, as produced by find_transcript_missplicing
    :return: list of path dicts ('acceptors', 'donors', normalized 'path_weight')
    """
    # Merge annotated splice sites, transcript boundaries, and predicted events.
    exon_starts = prepare_splice_sites(
        transcript.acceptors, transcript.transcript_start, aberrant_splicing, 'acceptors')
    exon_ends = prepare_splice_sites(
        transcript.donors, transcript.transcript_end, aberrant_splicing, 'donors')

    # Donors carry ss_type 0, acceptors ss_type 1; zero-probability sites are dropped.
    candidate_sites = [SpliceSiteFactory.create_splice_site(pos, 0, prob)
                       for pos, prob in exon_ends.items()]
    candidate_sites += [SpliceSiteFactory.create_splice_site(pos, 1, prob)
                        for pos, prob in exon_starts.items()]
    candidate_sites = [site for site in candidate_sites if site.prob > 0]

    # Order sites along the transcript (descending positions for minus-strand genes).
    candidate_sites.sort(key=lambda site: site.pos, reverse=transcript.rev)

    graph = create_splice_graph(candidate_sites, transcript.rev)
    paths, total_prob = compute_paths_sequential(graph, transcript, exon_starts, exon_ends)
    return list(normalize_and_filter_paths(paths, total_prob).values())
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def prepare_splice_sites(transcript_sites, transcript_boundary, aberrant_splicing, site_type):
    """
    Build a {position: probability} map for one class of splice sites
    ('acceptors' or 'donors').

    Annotated sites and the transcript boundary start at probability 1;
    missed/discovered events from *aberrant_splicing* then override or add
    entries with their predicted absolute probabilities.
    """
    sites = dict.fromkeys(transcript_sites, 1)
    sites[transcript_boundary] = 1
    # Aberrant events win over the annotation: they carry the model's
    # absolute probability for the (possibly weakened or novel) site.
    for event_key in (f'missed_{site_type}', f'discovered_{site_type}'):
        for pos, info in aberrant_splicing[event_key].items():
            sites[pos] = info['absolute']
    return sites
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def create_splice_graph(nodes, reverse_direction):
    """
    Build a DiGraph over splice-site positions.

    An edge is drawn from each site to every later site of the opposite
    type whose probability has not already been consumed by closer
    opposite-type sites in between.

    Note: *reverse_direction* is accepted for interface compatibility but
    is not used here; *nodes* are expected to arrive pre-sorted in
    transcript order.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(site.pos for site in nodes)

    for i, source in enumerate(nodes):
        # Probability mass already routed to closer opposite-type sites.
        consumed = 0
        for target in nodes[i + 1:]:
            if source.ss_type == target.ss_type:
                continue
            residual = target.prob - consumed
            if residual > 0:
                graph.add_edge(source.pos, target.pos, weight=residual)
            consumed += target.prob
    return graph
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def normalize_and_filter_paths(new_paths, prob_sum):
    """
    Normalize each path's weight by *prob_sum* (rounded to 3 decimals) and
    drop paths whose normalized weight is not above 1e-5.

    (The previous docstring claimed a 0.01 cutoff; the implemented cutoff
    is and remains 1e-5.)

    :param new_paths: {index: {'path_weight': float, ...}} as produced by
        compute_paths_sequential; mutated in place for surviving entries
    :param prob_sum: total unnormalized weight across all paths; must be non-zero
    :return: the filtered dictionary
    """
    min_weight = 0.00001  # paths below this share of total weight are noise
    for details in new_paths.values():
        details['path_weight'] = round(details['path_weight'] / prob_sum, 3)
    return {k: v for k, v in new_paths.items() if v['path_weight'] > min_weight}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def path_weight_mult(G, path, weight):
    """Return the product of the *weight* edge attribute along consecutive nodes of *path*."""
    total = 1
    for idx in range(len(path) - 1):
        total *= G[path[idx]][path[idx + 1]][weight]
    return total
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Missplicing Detection
|
|
129
|
+
def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
|
|
130
|
+
'''
|
|
131
|
+
:param ref_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the reference sequence
|
|
132
|
+
:param mut_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the mutated sequence
|
|
133
|
+
:param known_splice_sites: the indices (by genomic position) that serve as known splice sites
|
|
134
|
+
:param threshold: the threshold for detection (difference between reference and mutated probabilities)
|
|
135
|
+
:return: two dictionaries; discovered_pos is a dictionary containing all the positions that meat the threshold for discovery
|
|
136
|
+
and deleted_pos containing all the positions that meet the threshold for missing and the condition for missing
|
|
137
|
+
'''
|
|
138
|
+
|
|
139
|
+
new_dict = {v: mut_dct.get(v, 0) - ref_dct.get(v, 0) for v in
|
|
140
|
+
list(set(list(ref_dct.keys()) + list(mut_dct.keys())))}
|
|
141
|
+
|
|
142
|
+
discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3)} for k, v in
|
|
143
|
+
new_dict.items() if v >= threshold and k not in known_splice_sites} # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
|
|
144
|
+
|
|
145
|
+
deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
|
|
146
|
+
new_dict.items() if -v >= threshold and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
|
|
147
|
+
|
|
148
|
+
return discovered_pos, deleted_pos
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def find_transcript_missplicing(transcript, mutation, context=5000, window=2500, threshold=0.5, engine='spliceai'):
    """Predict splice-site changes caused by *mutation* on *transcript*.

    The reference pre-mRNA and the mutated sequence are each windowed around
    the mutation position, padded with 'N' so the prediction engine always
    sees ``context + window`` bases on both sides, scored with the chosen
    engine, and compared position-by-position via ``find_ss_changes``.

    :param transcript: transcript object providing ``pre_mrna``, ``donors``
        and ``acceptors`` (project type; see seqmat_utils)
    :param mutation: mutation object with a ``position`` attribute;
        ``ref + mutation`` yields the mutated sequence
    :param context: flanking bases consumed by the model and trimmed from
        the output index arrays
    :param window: additional bases around the mutation that remain scored
    :param threshold: minimum |delta| forwarded to ``find_ss_changes``
    :param engine: 'spliceai' or 'pangolin'
    :raises ValueError: for an unknown *engine*
    :return: dict with keys 'missed_acceptors', 'missed_donors',
        'discovered_acceptors', 'discovered_donors'; each maps genomic
        position -> {'delta', 'absolute'}
    """
    ref = transcript.pre_mrna
    # NOTE(review): assumes seqmat objects support '+' to apply a mutation —
    # see seqmat_utils for the definition.
    var = ref + mutation

    # print(len(ref.indices))
    # print(len(var.indices))


    center = mutation.position
    total_context = context + window

    # Pad amounts keep the scored window centered even when the mutation
    # sits near a transcript end (shape[-1] is the sequence length).
    length = ref.seqmat.shape[-1]
    center_index = ref.rel_pos(center)
    ref_start_pad = max(0, total_context - center_index)
    ref_end_pad = max(0, total_context - (length - center_index))

    # Same computation for the mutated sequence, whose length may differ
    # (insertions/deletions shift the relative position).
    length = var.seqmat.shape[-1]
    center_index = var.rel_pos(center)
    var_start_pad = max(0, total_context - center_index)
    var_end_pad = max(0, total_context - (length - center_index))

    # Restrict both sequences to the window around the mutation.
    ref = ref.inspect(center, context=total_context)
    var = var.inspect(center, context=total_context)
    #
    # ref_indices = np.concatenate([np.zeros(ref_start_pad), ref.inspect(center, context = window).indices, np.zeros(ref_end_pad)])
    # mut_indices = np.concatenate([np.zeros(var_start_pad), var.inspect(center, context = window).indices, np.zeros(var_end_pad)])

    # Zero-padding the index arrays mirrors the 'N' padding of the sequences
    # so indices and probabilities stay aligned one-to-one.
    ref_indices = np.concatenate([np.zeros(ref_start_pad), ref.indices, np.zeros(ref_end_pad)])
    mut_indices = np.concatenate([np.zeros(var_start_pad), var.indices, np.zeros(var_end_pad)])

    # The engine consumes `context` bases on each side; trim them so the
    # remaining indices correspond to scored positions only.
    ref_indices = ref_indices[context:-context]
    mut_indices = mut_indices[context:-context]

    ref_seq = 'N'*ref_start_pad + ref.seq + 'N'*ref_end_pad
    var_seq = 'N'*var_start_pad + var.seq + 'N'*var_end_pad

    # print(f"PAdding: {ref_start_pad}, {ref_end_pad}")
    if engine == 'spliceai':
        from .spliceai_utils import sai_predict_probs, sai_models
        # SpliceAI returns (acceptor, donor) probability tracks.
        ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, models=sai_models)
        mut_seq_acceptor_probs, mut_seq_donor_probs = sai_predict_probs(var_seq, models=sai_models)
        # ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
        # mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]

    elif engine == 'pangolin':
        from .pangolin_utils import pangolin_predict_probs, pang_models
        # Pangolin returns the tracks in (donor, acceptor) order.
        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models)

    else:
        raise ValueError(f"{engine} not implemented")

    # Annotated sites that fall inside the scored window.
    visible_donors = np.intersect1d(transcript.donors, ref_indices)
    visible_acceptors = np.intersect1d(transcript.acceptors, ref_indices)

    assert len(ref_indices) == len(ref_seq_acceptor_probs), f'Reference pos ({len(ref_indices)}) not the same as probs ({len(ref_seq_acceptor_probs)})'
    assert len(mut_indices) == len(mut_seq_acceptor_probs), f'Mut pos ({len(mut_indices)}) not the same as probs ({len(mut_seq_acceptor_probs)})'

    # iap/dap: discovered and missed acceptors (increase / decrease past threshold).
    iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
                               {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
                               visible_acceptors,
                               threshold=threshold)

    assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
    assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'

    # idp/ddp: discovered and missed donors.
    idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
                               {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
                               visible_donors,
                               threshold=threshold)

    ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
    ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}

    # Sites deleted outright by the mutation (position no longer exists in
    # the mutated index set) are reported as fully lost (absolute 0).
    lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
                      visible_acceptors if p not in mut_indices and p not in dap}
    lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors
                   if p not in mut_indices and p not in ddp}
    dap.update(lost_acceptors)
    ddp.update(lost_donors)

    missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
    # Keys arrive as numpy scalars; normalize to float, then to int where
    # the value is integral, so downstream JSON/dict handling is clean.
    missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
    temp = {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
    return temp
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class Missplicing:
    """Wrapper around a missplicing-prediction dictionary.

    The wrapped dict maps event categories ('missed_acceptors',
    'missed_donors', 'discovered_acceptors', 'discovered_donors') to
    {position: {'delta': float, 'absolute': float}} entries, as produced
    by find_transcript_missplicing.
    """

    def __init__(self, splicing_dict, threshold=0.5):
        self.missplicing = splicing_dict
        # Minimum |delta| for an event to count as significant.
        self.threshold = threshold

    def __repr__(self):
        return f'Missplicing({self.missplicing})'

    def __str__(self):
        # Bug fix: __str__ must return a str; previously it returned the
        # thresholded dict itself, raising TypeError when printed.
        return str(self.aberrant_splicing)

    def __bool__(self):
        # True when at least one event meets the significance threshold.
        return self.apply_sai_threshold_alt() is not None

    def __iter__(self):
        # Yield 0 followed by every event delta; the leading 0 keeps
        # aggregations (max, comparisons) well-defined for empty predictions.
        vals = [0]
        for details in self.missplicing.values():
            for d in details.values():
                vals.append(d['delta'])
        return iter(vals)

    @property
    def aberrant_splicing(self):
        """The missplicing dict filtered at the instance threshold."""
        return self.apply_sai_threshold(self.threshold)

    def apply_sai_threshold(self, threshold=None):
        """Return a copy of the missplicing dict keeping only events with
        |delta| >= *threshold*.

        Uses an ``is None`` check so an explicit threshold of 0 is honored
        (the previous truthiness test silently replaced 0 with the
        instance threshold).
        """
        if threshold is None:
            threshold = self.threshold
        return {
            event: {pos: d for pos, d in details.items() if abs(d['delta']) >= threshold}
            for event, details in self.missplicing.items()
        }

    def apply_sai_threshold_alt(self, splicing_dict=None, threshold=None):
        """Return *splicing_dict* unchanged if any event meets *threshold*,
        else None.

        Defaults to the instance's dict and threshold when the arguments
        are None.
        """
        if splicing_dict is None:
            splicing_dict = self.missplicing
        if threshold is None:
            threshold = self.threshold
        for details in splicing_dict.values():
            for d in details.values():
                if abs(d['delta']) >= threshold:
                    return splicing_dict
        return None

    def get_max_missplicing_delta(self):
        """Largest |delta| across all predicted events, or 0 if there are none."""
        return max((abs(d['delta'])
                    for details in self.missplicing.values()
                    for d in details.values()), default=0)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def find_transcript_splicing(transcript, engine='spliceai'):
    """
    Score every position of a transcript's pre-mRNA with the chosen splice
    prediction engine.

    :param transcript: transcript object exposing ``pre_mrna``
    :param engine: 'spliceai' or 'pangolin'
    :raises ValueError: for an unknown *engine*
    :return: (donor_probs, acceptor_probs) — each a {position: probability}
        dict ordered by descending probability
    """
    pad = 5000  # flanking 'N' context consumed by the prediction models
    pre_mrna = transcript.pre_mrna
    positions = pre_mrna.indices
    padded_seq = 'N' * pad + pre_mrna.seq + 'N' * pad

    if engine == 'spliceai':
        from .spliceai_utils import sai_predict_probs, sai_models
        acceptor_scores, donor_scores = sai_predict_probs(padded_seq, sai_models)
    elif engine == 'pangolin':
        from .pangolin_utils import pangolin_predict_probs, pang_models
        # Pangolin returns the tracks in (donor, acceptor) order.
        donor_scores, acceptor_scores = pangolin_predict_probs(padded_seq, models=pang_models)
    else:
        raise ValueError(f"{engine} not implemented")

    assert len(donor_scores) == len(positions), f'{len(donor_scores)} vs. {len(positions)}'

    def _ranked(scores):
        # Map position -> score, highest probability first.
        by_pos = {pos: score for pos, score in zip(positions, scores)}
        return dict(sorted(by_pos.items(), key=lambda item: item[1], reverse=True))

    return _ranked(donor_scores), _ranked(acceptor_scores)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
    """Measure how well the engine recovers a gene's annotated splice sites.

    Takes the top-``num_introns`` predicted donors/acceptors and reports the
    fraction that match the transcript annotation.

    :param gene: gene name (resolved through the Gene project type)
    :param organism: genome build
    :param engine: prediction engine forwarded to find_transcript_splicing
    :return: (donor recall, acceptor recall, intron count) for genes with
        introns; ``(None, None)`` for intron-less genes.

    NOTE(review): the intron-less early exit returns a 2-tuple while the
    normal path returns a 3-tuple — callers must handle both arities;
    consider unifying to ``(None, None, 0)``.
    """
    gene = Gene(gene, organism=organism)
    transcript = gene.transcript()
    # Single-exon transcripts have no splice sites to benchmark.
    if len(transcript.introns) == 0:
        return None, None

    transcript.generate_pre_mrna()
    predicted_donor_sites, predicted_acceptor_sites = find_transcript_splicing(transcript, engine=engine)
    num_introns = len(transcript.introns)
    # Predictions are sorted by descending probability, so slicing takes
    # the num_introns highest-confidence sites of each kind.
    predicted_donors = list(predicted_donor_sites.keys())[:num_introns]
    predicted_acceptors = list(predicted_acceptor_sites.keys())[:num_introns]
    correct_donor_preds = [v for v in predicted_donors if v in transcript.donors]
    correct_acceptor_preds = [v for v in predicted_acceptors if v in transcript.acceptors]
    return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def missplicing(mut_id, splicing_threshold=0.5, primary_transcript=True, organism='hg38', engine='spliceai'):
    """
    Predict missplicing for every transcript of the mutated gene.

    :param mut_id: mutation identifier; the gene name is the first
        ':'-separated field
    :param splicing_threshold: minimum |delta| for an event to count as
        missplicing
    :param primary_transcript: accepted for interface compatibility;
        currently unused (the primary-transcript filter is disabled)
    :param organism: genome build
    :param engine: 'spliceai' or 'pangolin'
    :return: {transcript_id: Missplicing} for every transcript containing
        the mutation (empty dict if none do)
    """
    gene = Gene(mut_id.split(':')[0], organism=organism)
    mutation = get_mutation(mut_id, rev=gene.rev)

    # Cleanup: the previous version kept an unused `good_tid` local and a
    # block of dead commented-out filtering code; both removed.
    results = {}
    for tid, transcript in gene.run_transcripts():
        # Only transcripts whose span contains the mutation are scored.
        if mutation not in transcript:
            continue
        transcript.generate_pre_mrna()
        results[tid] = Missplicing(
            find_transcript_missplicing(transcript, mutation, engine=engine),
            threshold=splicing_threshold)

    return results
|
|
372
|
+
|
geney/utils.py
CHANGED
|
@@ -15,7 +15,6 @@ def is_monotonic(A):
|
|
|
15
15
|
return False
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
|
|
19
18
|
def available_genes(organism='hg38'):
|
|
20
19
|
from geney import config_setup
|
|
21
20
|
annotation_path = config_setup[organism]['MRNA_PATH'] / 'protein_coding'
|
|
@@ -52,24 +51,29 @@ def dump_pickle(file_path, payload):
|
|
|
52
51
|
return None
|
|
53
52
|
|
|
54
53
|
|
|
55
|
-
def find_files_by_gene_name(gene_name, organism='hg38'):
|
|
56
|
-
from geney import config_setup
|
|
57
|
-
mrna_path = config_setup[organism]['MRNA_PATH'] / 'protein_coding'
|
|
58
|
-
matching_files = [f for f in mrna_path.glob(f'*_{gene_name}.pkl')]
|
|
59
|
-
if len(matching_files) > 1:
|
|
60
|
-
print(f"Multiple files available ({[f.name for f in matching_files]}).")
|
|
61
|
-
elif len(matching_files) == 0:
|
|
62
|
-
raise FileNotFoundError(f"No files available for gene {gene_name}.")
|
|
63
|
-
|
|
64
|
-
return matching_files[0]
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def reverse_complement(s: str, complement: dict = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} ) -> str:
|
|
68
|
-
'''Performs reverse-complement of a sequence. Default is a DNA sequence.'''
|
|
69
|
-
s_rev = s[::-1]
|
|
70
|
-
lower = [b.islower() for b in list(s_rev)]
|
|
71
|
-
bases = [complement.get(base, base) for base in list(s_rev.upper())]
|
|
72
|
-
rev_compl = ''.join([b.lower() if l else b for l, b in zip(lower, bases)])
|
|
73
|
-
return rev_compl
|
|
74
54
|
|
|
55
|
+
def is_monotonic(A):
    """Return True if *A* is entirely non-decreasing or entirely non-increasing."""
    pairs = list(zip(A, A[1:]))
    non_decreasing = all(a <= b for a, b in pairs)
    non_increasing = all(a >= b for a, b in pairs)
    return non_decreasing or non_increasing
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# def find_files_by_gene_name(gene_name, organism='hg38'):
|
|
60
|
+
# from geney import config_setup
|
|
61
|
+
# mrna_path = config_setup[organism]['MRNA_PATH'] / 'protein_coding'
|
|
62
|
+
# matching_files = [f for f in mrna_path.glob(f'*_{gene_name}.pkl')]
|
|
63
|
+
# if len(matching_files) > 1:
|
|
64
|
+
# print(f"Multiple files available ({[f.name for f in matching_files]}).")
|
|
65
|
+
# elif len(matching_files) == 0:
|
|
66
|
+
# raise FileNotFoundError(f"No files available for gene {gene_name}.")
|
|
67
|
+
#
|
|
68
|
+
# return matching_files[0]
|
|
69
|
+
#
|
|
70
|
+
#
|
|
71
|
+
# def reverse_complement(s: str, complement: dict = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} ) -> str:
|
|
72
|
+
# '''Performs reverse-complement of a sequence. Default is a DNA sequence.'''
|
|
73
|
+
# s_rev = s[::-1]
|
|
74
|
+
# lower = [b.islower() for b in list(s_rev)]
|
|
75
|
+
# bases = [complement.get(base, base) for base in list(s_rev.upper())]
|
|
76
|
+
# rev_compl = ''.join([b.lower() if l else b for l, b in zip(lower, bases)])
|
|
77
|
+
# return rev_compl
|
|
78
|
+
#
|
|
75
79
|
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: geney
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.24
|
|
4
4
|
Summary: A Python package for gene expression modeling.
|
|
5
5
|
Home-page: https://github.com/nicolaslynn/geney
|
|
6
6
|
Author: Nicolas Lynn
|
|
7
7
|
Author-email: nicolasalynn@gmail.com
|
|
8
8
|
License: Free for non-commercial use
|
|
9
|
+
Platform: UNKNOWN
|
|
9
10
|
Classifier: Development Status :: 1 - Planning
|
|
10
11
|
Classifier: Intended Audience :: Science/Research
|
|
11
12
|
Classifier: License :: Free for non-commercial use
|
|
@@ -13,25 +14,21 @@ Classifier: Operating System :: POSIX :: Linux
|
|
|
13
14
|
Classifier: Operating System :: MacOS
|
|
14
15
|
Classifier: Programming Language :: Python :: 3.9
|
|
15
16
|
Requires-Python: >3.9
|
|
16
|
-
Requires-Dist: numpy
|
|
17
|
-
Requires-Dist: pandas
|
|
18
|
-
Requires-Dist: networkx
|
|
19
|
-
Requires-Dist: viennarna
|
|
20
|
-
Requires-Dist: tqdm
|
|
21
|
-
Requires-Dist: spliceai
|
|
22
|
-
Requires-Dist: biopython
|
|
23
|
-
Requires-Dist:
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: joblib ==1.3.2
|
|
26
|
-
Requires-Dist: gtfparse ==1.3.0
|
|
27
|
-
Requires-Dist: sh ==2.0.6
|
|
28
|
-
Requires-Dist: termplotlib ==0.3.9
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: pandas
|
|
19
|
+
Requires-Dist: networkx
|
|
20
|
+
Requires-Dist: viennarna
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: spliceai
|
|
23
|
+
Requires-Dist: biopython==1.81
|
|
24
|
+
Requires-Dist: gtfparse==1.3.0
|
|
25
|
+
Requires-Dist: sh==2.0.6
|
|
29
26
|
Requires-Dist: torch
|
|
30
27
|
Requires-Dist: lifelines
|
|
31
28
|
Requires-Dist: notebook
|
|
32
29
|
Requires-Dist: matplotlib
|
|
33
|
-
Requires-Dist: dask[complete]
|
|
34
|
-
Requires-Dist: dask-jobqueue
|
|
35
30
|
Requires-Dist: gffutils
|
|
36
31
|
Requires-Dist: pyfastx
|
|
37
32
|
|
|
33
|
+
UNKNOWN
|
|
34
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
|
|
2
|
+
geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
|
|
3
|
+
geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
|
|
4
|
+
geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
|
|
5
|
+
geney/graphic_utils.py,sha256=tjm6IDQ1BdfSeuPYzjlqAUHFQoDYH9jXTzJjKFS4Hh4,11078
|
|
6
|
+
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
7
|
+
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
8
|
+
geney/mutation_utils.py,sha256=C-K8F8wyN5joI3ZuP-d7IMYTI43YPDXUc3IgAJ07o8Q,1546
|
|
9
|
+
geney/oncosplice.py,sha256=3jJc1-CWubH2ElHEjyQtsr9JYVmfPQEpq7EX-IfY-t8,20806
|
|
10
|
+
geney/pangolin_utils.py,sha256=S2uMjQnnxqWSnfuMaEjo-wq52DVKFiXt__L5VPdtzyU,2939
|
|
11
|
+
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
12
|
+
geney/seqmat_utils.py,sha256=4MiN6rGeQMfWK6bXOHGddxBffx8v4sT1THkZe-AceXE,15611
|
|
13
|
+
geney/spliceai_utils.py,sha256=BiTRIfrovX9qo9xup6bFWp0qkvmW9NVPY98Zw8-OaL0,1891
|
|
14
|
+
geney/splicing_utils.py,sha256=pS3jZEpmnDkbT1jjaJh-O6I--Xm22e5dj-GQu7IAZSQ,15943
|
|
15
|
+
geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
|
|
16
|
+
geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
|
|
17
|
+
geney/utils.py,sha256=WbV1DBllQyvzoDiYkidRiTX5MBpQGr99M4hTUQ0BKo8,2185
|
|
18
|
+
geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
|
|
20
|
+
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
21
|
+
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
22
|
+
geney-1.2.24.dist-info/METADATA,sha256=sJGrKawFcaFyF1QwLOHn7dNYeznt5f2fKL7NDZvqqq8,948
|
|
23
|
+
geney-1.2.24.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
|
|
24
|
+
geney-1.2.24.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
25
|
+
geney-1.2.24.dist-info/RECORD,,
|
geney-1.2.22.dist-info/RECORD
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
|
|
2
|
-
geney/__init__.py,sha256=knezxgbV2c2gcO2ek2-xxEC15HL4aO1WuoMiYOOvKf8,428
|
|
3
|
-
geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
|
|
4
|
-
geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
|
|
5
|
-
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
6
|
-
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
7
|
-
geney/oncosplice.py,sha256=AZm8Vj7z65DokPmeflwoqs2BM11neV9hQLA_Ao4ysnM,78242
|
|
8
|
-
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
9
|
-
geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
|
|
10
|
-
geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
|
|
11
|
-
geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
|
|
12
|
-
geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
-
geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
|
|
14
|
-
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
15
|
-
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
16
|
-
geney-1.2.22.dist-info/METADATA,sha256=eTTiyuGPZ5lD7jV8YZXSocPyewD3OPwvgeaqiXxuVfo,1163
|
|
17
|
-
geney-1.2.22.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
18
|
-
geney-1.2.22.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
19
|
-
geney-1.2.22.dist-info/RECORD,,
|
|
File without changes
|