geney 1.3.79-py2.py3-none-any.whl → 1.4.1-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +54 -56
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.79.dist-info → geney-1.4.1.dist-info}/METADATA +1 -1
- geney-1.4.1.dist-info/RECORD +51 -0
- {geney-1.3.79.dist-info → geney-1.4.1.dist-info}/WHEEL +1 -1
- geney-1.3.79.dist-info/RECORD +0 -31
- {geney-1.3.79.dist-info → geney-1.4.1.dist-info}/top_level.txt +0 -0
geney/utils/splicing_utils.py
ADDED
@@ -0,0 +1,525 @@
+__all__ = ['run_splicing_engine', 'adjoin_splicing_outcomes', 'process_epistasis']
+
+import pandas as pd
+from typing import List, Tuple
+
+def run_splicing_engine(seq: str, engine: str = 'spliceai') -> Tuple[List[float], List[float]]:
+    """
+    Run the specified splicing engine to predict splice site probabilities on a sequence.
+
+    Args:
+        seq: Nucleotide sequence.
+        engine: Engine name ('spliceai' or 'pangolin').
+
+    Returns:
+        Tuple (donor_probs, acceptor_probs) as lists of probability values.
+
+    Raises:
+        ValueError: If the engine is not implemented.
+    """
+    match engine:
+        case 'spliceai':
+            from geney.utils.spliceai_utils import sai_predict_probs, sai_models
+            # print(seq)
+            acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
+        case 'pangolin':
+            from geney.utils.pangolin_utils import pangolin_predict_probs, pang_models
+            # print(seq)
+            donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
+        case _:
+            raise ValueError(f"Engine '{engine}' not implemented")
+
+    return donor_probs, acceptor_probs
+
+
+
+def adjoin_splicing_outcomes(splicing_predictions, transcript=None):
+    """
+    Predicts splicing effect for multiple mutations and organizes the output as a multi-index DataFrame.
+
+    Args:
+        mut_ids (dict): Dictionary where keys are mutation labels (e.g. 'mut1', 'mut2', 'epistasis') and
+            values are mutation strings in format 'GENE:CHR:POS:REF:ALT'.
+        transcript (str): Transcript ID to target (optional).
+        engine (str): Splicing engine (default: 'spliceai').
+
+    Returns:
+        pd.DataFrame: Multi-index column DataFrame with wild-type, canonical, and mutation-specific predictions.
+    """
+    dfs = []
+    for label, splicing_df in splicing_predictions.items():
+        var_df = splicing_df.rename(columns={
+            'donor_prob': ('donors', f'{label}_prob'),
+            'acceptor_prob': ('acceptors', f'{label}_prob'),
+            'nucleotides': ('nts', f'{label}')
+        })
+        dfs.append(var_df)
+
+    # Concatenate all DataFrames and unify columns
+    full_df = pd.concat(dfs, axis=1)
+
+    # Ensure MultiIndex columns
+    if not isinstance(full_df.columns, pd.MultiIndex):
+        full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)
+
+    if transcript is not None:
+        full_df[('acceptors', 'annotated')] = full_df.apply(
+            lambda row: row.name in transcript.acceptors,
+            axis=1
+        )
+
+        full_df[('donors', 'annotated')] = full_df.apply(
+            lambda row: row.name in transcript.donors,
+            axis=1
+        )
+
+        full_df.sort_index(axis=1, level=0, inplace=True)
+        full_df.sort_index(ascending=not transcript.rev, inplace=True)
+    else:
+        full_df.sort_index(axis=1, level=0, inplace=True)
+
+    return full_df
+
+
+def process_epistasis(df: pd.DataFrame, threshold=0.25) -> pd.DataFrame:
+    """
+    Computes the expected epistasis effect (additive) and residual epistasis
+    for both donor and acceptor probabilities.
+
+    Adds new columns under donors and acceptors:
+        - expected_epistasis
+        - residual_epistasis
+
+    Args:
+        df (pd.DataFrame): MultiIndex column DataFrame with keys:
+            'wt_prob', 'mut1_prob', 'mut2_prob', 'epistasis_prob'
+
+    Returns:
+        pd.DataFrame: Modified DataFrame with expected and residual epistasis columns added.
+    """
+    for feature in ['donors', 'acceptors']:
+        wt = df[feature]['wt_prob']
+        mut1 = df[feature]['mut1_prob']
+        mut2 = df[feature]['mut2_prob']
+        true_epi = df[feature]['epistasis_prob']
+
+        expected = mut1 + mut2 - wt
+        residual = true_epi - expected
+
+        df[(feature, 'expected_epistasis')] = expected
+        df[(feature, 'residual_epistasis')] = residual
+
+    df = df.sort_index(axis=1, level=0)
+    mask = (
+        df['donors']['residual_epistasis'].abs() > threshold
+    ) | (
+        df['acceptors']['residual_epistasis'].abs() > threshold
+    )
+
+    return df[mask]
+
+
+# def predict_splicing(mut_id=None, transcript=None, engine='spliceai'):
+#     gene = Gene.from_file(mut_id.split(':')[0]).transcript(transcript).generate_pre_mrna()
+#     if mut_id is None:
+#         pass
+#     else:
+#         for m in mut_id.split('|'):
+#             gene.pre_mrna.apply_mutation(m)
+#         gene.pre_mrna.set_name(mut_id)
+#     return gene.pre_mrna.predict_missplicing(engine=engine, fmt='df')
+#
+#
+#
+#
+#
+#
+# def find_event_splicing(mutations, engine='spliceai'):
+#     data = epistasis_id.split('|')
+#     gene = data[0].split(':')[0]
+#     pos = int(sum([int(p.split(':')[2]) for p in data]) / 2)
+#     g = Gene.from_file(gene).transcript().generate_pre_mrna()
+#     transcript = g.clone().pre_mrna
+#
+#     muts = [MutSeqMat.from_mutid(m, g.rev) for m in data]
+#     # if g.rev:
+#     #     muts = [m.reverse_complement() for m in muts]
+#
+#     mut1 = transcript.clone().mutate(muts[0])
+#     mut2 = transcript.clone().mutate(muts[1])
+#     epistasis = transcript.clone()
+#     for m in muts:
+#         epistasis.mutate(m, inplace=True)
+#
+#     wild_type = transcript.predict_splicing(pos, engine=engine)
+#     mut1 = mut1.predict_splicing(pos, engine=engine)
+#     mut2 = mut2.predict_splicing(pos, engine=engine)
+#     epistasis = epistasis.predict_splicing(pos, engine=engine)
+#
+#     combined = pd.concat([wild_type, mut1, mut2, epistasis], axis=1, keys=['wild_type', 'mut1', 'mut2', 'epistasis'], join='outer')
+#     return combined
+#
+# # def extract_epistatic_sites(df, site_type_col='site_type', threshold=0.25):
+# #     """
+# #     From a multi-index DataFrame with columns like ('wild_type', 'donor_prob'), etc.,
+# #     compute expected additive effect and epistatic residual for donor and acceptor probabilities.
+# #     Return only rows where:
+# #     1. |residual| > threshold
+# #     2. donor sites have site_type == 1, acceptor sites have site_type == 0
+# #     """
+# #     features = ['donor_prob', 'acceptor_prob']
+# #     expected = {}
+# #     residual = {}
+# #
+# #     for feature in features:
+# #         wt = df[('wild_type', feature)]
+# #         mut1 = df[('mut1', feature)]
+# #         mut2 = df[('mut2', feature)]
+# #         epi = df[('epistasis', feature)]
+# #
+# #         expected_feature = 3 * wt - mut1 - mut2
+# #         residual_feature = expected_feature - epi
+# #
+# #         expected[('expected', feature)] = expected_feature
+# #         residual[('residual', feature)] = residual_feature
+# #
+# #     # Combine new columns
+# #     expected_df = pd.DataFrame(expected)
+# #     residual_df = pd.DataFrame(residual)
+# #
+# #     # Join to original
+# #     df_combined = pd.concat([df, expected_df, residual_df], axis=1)
+# #
+# #     # Create mask based on residual threshold
+# #     mask = (
+# #         (residual_df.abs() > threshold)
+# #         .any(axis=1) # at least one feature has large residual
+# #     )
+# #
+# #     # Site type condition: donor=1, acceptor=0
+# #     donor_mask = df[('wild_type', 'donor_prob')].notna() & (df[site_type_col] == 1)
+# #     acceptor_mask = df[('wild_type', 'acceptor_prob')].notna() & (df[site_type_col] == 0)
+# #
+# #     # Combine all masks
+# #     final_mask = mask & (donor_mask | acceptor_mask)
+# #
+# #     return df_combined[final_mask]
+# #
+# #
+# # # variability = df.groupby(level=1, axis=1).apply(lambda subdf: subdf.max(axis=1) - subdf.min(axis=1))
+# #
+#
+# """
+# splicing_module.py
+#
+# A modular and comprehensive implementation for splicing, missplicing, and pairwise epistasis analysis.
+# This module has been refactored with advanced Python practices:
+#     • Extensive type annotations and detailed docstrings.
+#     • Decomposition into small, testable functions and classes.
+#     • Explicit encapsulation of the pairwise epistasis analysis.
+#     • Usage of Python 3.10+ pattern matching for engine dispatch.
+#
+# Dependencies:
+#     numpy, pandas, sqlite3, json, os, redis, and internal modules: Gene, SeqMats, config, spliceai_utils, and pangolin_utils.
+# """
+#
+# import os
+# import json
+# import sqlite3
+# from collections import defaultdict
+# from dataclasses import dataclass, field
+# from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
+#
+# import numpy as np
+# import pandas as pd
+# from redis import Redis
+#
+# # Internal module imports (assumed to be in the same package)
+# from .Gene import Gene
+# from .SeqMats import MutSeqMat
+# from . import config
+#
+# # # Type aliases for clarity
+# # SpliceProbs = Dict[int, float]
+# # AdjacencyKey = Tuple[int, str]
+# # AdjacencyValue = Tuple[int, str, float]
+# # AdjacencyList = Dict[AdjacencyKey, List[AdjacencyValue]]
+#
+# def run_splicing_engine(seq: str, engine: str = 'spliceai') -> Tuple[List[float], List[float]]:
+#     """
+#     Run the specified splicing engine to predict splice site probabilities on a sequence.
+#
+#     Args:
+#         seq: Nucleotide sequence.
+#         engine: Engine name ('spliceai' or 'pangolin').
+#
+#     Returns:
+#         Tuple (donor_probs, acceptor_probs) as lists of probability values.
+#
+#     Raises:
+#         ValueError: If the engine is not implemented.
+#     """
+#     match engine:
+#         case 'spliceai':
+#             from .spliceai_utils import sai_predict_probs, sai_models
+#             acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
+#         case 'pangolin':
+#             from .pangolin_utils import pangolin_predict_probs, pang_models
+#             donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
+#         case _:
+#             raise ValueError(f"Engine '{engine}' not implemented")
+#
+#     return donor_probs, acceptor_probs
+#
+#
+# # =============================================================================
+# # Helper Functions
+# # =============================================================================
+#
+# def generate_adjacency_list(
+#         acceptors: List[Tuple[int, float]],
+#         donors: List[Tuple[int, float]],
+#         transcript_start: int,
+#         transcript_end: int,
+#         max_distance: int = 50,
+#         rev: bool = False
+# ) -> AdjacencyList:
+#     """
+#     Build an adjacency list from donors to acceptors (and vice versa) based on distance and orientation.
+#
+#     Args:
+#         acceptors: List of tuples (position, probability) for acceptor sites.
+#         donors: List of tuples (position, probability) for donor sites.
+#         transcript_start: Start coordinate of the transcript.
+#         transcript_end: End coordinate of the transcript.
+#         max_distance: Maximum allowed distance to connect sites.
+#         rev: If True, consider reverse orientation.
+#
+#     Returns:
+#         A dictionary mapping (position, type) to a list of (neighbor_position, neighbor_type, normalized_probability).
+#     """
+#     # Append transcript end as an extra donor node
+#     donors = donors + [(transcript_end, 1)]
+#     # Sort acceptors and donors; use reversed ordering if needed
+#     acceptors = sorted(acceptors, key=lambda x: (x[0], x[1] if not rev else -x[1]), reverse=rev)
+#     donors = sorted(donors, key=lambda x: (x[0], x[1] if not rev else -x[1]), reverse=rev)
+#
+#     adjacency_list: AdjacencyList = defaultdict(list)
+#
+#     # Connect donors to acceptors
+#     for d_pos, d_prob in donors:
+#         running_prob = 1.0
+#         for a_pos, a_prob in acceptors:
+#             # Check orientation and max distance
+#             correct_orientation = (a_pos > d_pos and not rev) or (a_pos < d_pos and rev)
+#             distance_valid = abs(a_pos - d_pos) <= max_distance
+#             if correct_orientation and distance_valid:
+#                 # Count intervening sites as a simplified penalty
+#                 in_between_acceptors = sum(1 for a, _ in acceptors if (d_pos < a < a_pos) if not rev else (
+#                     a_pos < a < d_pos))
+#                 in_between_donors = sum(1 for d, _ in donors if (d_pos < d < a_pos) if not rev else (a_pos < d < d_pos))
+#                 # If one set is empty, use raw probability; otherwise use a running product
+#                 if in_between_donors == 0 or in_between_acceptors == 0:
+#                     adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob))
+#                     running_prob -= a_prob
+#                 elif running_prob > 0:
+#                     adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob * running_prob))
+#                     running_prob -= a_prob
+#                 else:
+#                     break
+#
+#     # Connect acceptors to donors
+#     for a_pos, a_prob in acceptors:
+#         running_prob = 1.0
+#         for d_pos, d_prob in donors:
+#             correct_orientation = (d_pos > a_pos and not rev) or (d_pos < a_pos and rev)
+#             distance_valid = abs(d_pos - a_pos) <= max_distance
+#             if correct_orientation and distance_valid:
+#                 in_between_acceptors = sum(1 for a, _ in acceptors if (a_pos < a < d_pos) if not rev else (
+#                     d_pos < a < a_pos))
+#                 in_between_donors = sum(1 for d, _ in donors if (a_pos < d < d_pos) if not rev else (d_pos < d < a_pos))
+#                 # Tag the donor as transcript_end if appropriate
+#                 tag = 'donor' if d_pos != transcript_end else 'transcript_end'
+#                 if in_between_acceptors == 0:
+#                     adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob))
+#                     running_prob -= d_prob
+#                 elif running_prob > 0:
+#                     adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob * running_prob))
+#                     running_prob -= d_prob
+#                 else:
+#                     break
+#
+#     # Connect transcript start to donors
+#     running_prob = 1.0
+#     for d_pos, d_prob in donors:
+#         if ((d_pos > transcript_start and not rev) or (d_pos < transcript_start and rev)) and abs(
+#                 d_pos - transcript_start) <= max_distance:
+#             adjacency_list[(transcript_start, 'transcript_start')].append((d_pos, 'donor', d_prob))
+#             running_prob -= d_prob
+#             if running_prob <= 0:
+#                 break
+#
+#     # Normalize probabilities in each adjacency list
+#     for key, next_nodes in adjacency_list.items():
+#         total = sum(prob for _, _, prob in next_nodes)
+#         if total > 0:
+#             adjacency_list[key] = [(pos, typ, round(prob / total, 3)) for pos, typ, prob in next_nodes]
+#
+#     return dict(adjacency_list)
+#
+#
+# def find_all_paths(
+#         graph: AdjacencyList,
+#         start: Tuple[int, str],
+#         end: Tuple[int, str],
+#         path: List[Tuple[int, str]] = [],
+#         probability: float = 1.0
+# ) -> Generator[Tuple[List[Tuple[int, str]], float], None, None]:
+#     """
+#     Recursively generate all paths from start node to end node in the graph.
+#
+#     Args:
+#         graph: Adjacency list mapping nodes to neighbor nodes and probabilities.
+#         start: The starting node (position, type).
+#         end: The target node (position, type).
+#         path: The path traversed so far.
+#         probability: The cumulative probability along the path.
+#
+#     Yields:
+#         A tuple of the complete path and its cumulative probability.
+#     """
+#     path = path + [start]
+#     if start == end:
+#         yield path, probability
+#         return
+#
+#     if start not in graph:
+#         return
+#
+#     for next_node, node_type, prob in graph[start]:
+#         yield from find_all_paths(graph, (next_node, node_type), end, path, probability * prob)
+#
+#
+# def prepare_splice_sites(
+#         acceptors: List[int],
+#         donors: List[int],
+#         aberrant_splicing: Dict[str, Any]
+# ) -> Tuple[List[Tuple[int, float]], List[Tuple[int, float]]]:
+#     """
+#     Prepare splice sites by merging reference sites with aberrant events.
+#
+#     Args:
+#         acceptors: List of acceptor positions.
+#         donors: List of donor positions.
+#         aberrant_splicing: Dictionary containing aberrant splicing events.
+#
+#     Returns:
+#         Tuple of lists:
+#             - List of tuples (acceptor_position, probability)
+#             - List of tuples (donor_position, probability)
+#     """
+#     acceptor_dict = {p: 1 for p in acceptors}
+#     donor_dict = {p: 1 for p in donors}
+#
+#     for p, v in aberrant_splicing.get('missed_donors', {}).items():
+#         donor_dict[p] = v['absolute']
+#     for p, v in aberrant_splicing.get('discovered_donors', {}).items():
+#         donor_dict[p] = v['absolute']
+#     for p, v in aberrant_splicing.get('missed_acceptors', {}).items():
+#         acceptor_dict[p] = v['absolute']
+#     for p, v in aberrant_splicing.get('discovered_acceptors', {}).items():
+#         acceptor_dict[p] = v['absolute']
+#
+#     # Ensure keys are integers
+#     acceptors_list = [(int(k), float(v)) for k, v in acceptor_dict.items()]
+#     donors_list = [(int(k), float(v)) for k, v in donor_dict.items()]
+#     return acceptors_list, donors_list
+#
+#
+# def develop_aberrant_splicing(
+#         transcript: Any,
+#         aberrant_splicing: Any
+# ) -> Generator[Dict[str, Any], None, None]:
+#     """
+#     Generator of potential aberrant splicing paths based on the transcript and missplicing events.
+#
+#     If no aberrant events are provided, returns the original splice sites.
+#
+#     Args:
+#         transcript: Transcript object containing splice site positions.
+#         aberrant_splicing: Object with missplicing events.
+#
+#     Yields:
+#         Dictionary with keys 'acceptors', 'donors', and 'path_weight'.
+#     """
+#     if not aberrant_splicing:
+#         yield {
+#             'acceptors': transcript.acceptors,
+#             'donors': transcript.donors,
+#             'path_weight': 1
+#         }
+#     else:
+#         all_acceptors, all_donors = prepare_splice_sites(
+#             transcript.acceptors, transcript.donors, aberrant_splicing.missplicing
+#         )
+#         adj_list = generate_adjacency_list(
+#             all_acceptors,
+#             all_donors,
+#             transcript_start=transcript.transcript_start,
+#             transcript_end=transcript.transcript_end,
+#             max_distance=100000,
+#             rev=transcript.rev
+#         )
+#         start_node = (transcript.transcript_start, 'transcript_start')
+#         end_node = (transcript.transcript_end, 'transcript_end')
+#         for path, prob in find_all_paths(adj_list, start_node, end_node):
+#             yield {
+#                 'acceptors': [node[0] for node in path if node[1] == 'acceptor'],
+#                 'donors': [node[0] for node in path if node[1] == 'donor'],
+#                 'path_weight': prob
+#             }
+#
+#
+# def find_ss_changes(
+#         ref_dct: Dict[int, float],
+#         mut_dct: Dict[int, float],
+#         known_splice_sites: Union[List[int], np.ndarray],
+#         threshold: float = 0.5
+# ) -> Tuple[Dict[float, Dict[str, float]], Dict[float, Dict[str, float]]]:
+#     """
+#     Compare reference and mutant splice probabilities to detect significant site changes.
+#
+#     Args:
+#         ref_dct: Dictionary of splice site probabilities for the reference sequence.
+#         mut_dct: Dictionary of splice site probabilities for the mutant sequence.
+#         known_splice_sites: List/array of positions that are known splice sites.
+#         threshold: Minimum difference required to flag a significant change.
+#
+#     Returns:
+#         A tuple (discovered, deleted) where:
+#             - discovered: Positions with a positive delta and not known splice sites.
+#             - deleted: Positions with a negative delta among known splice sites.
+#     """
+#     all_positions = set(ref_dct.keys()).union(mut_dct.keys())
+#     delta_dict = {pos: mut_dct.get(pos, 0) - ref_dct.get(pos, 0) for pos in all_positions}
+#
+#     discovered = {
+#         float(k): {
+#             'delta': round(float(delta), 3),
+#             'absolute': round(float(mut_dct.get(k, 0)), 3),
+#             'reference': round(ref_dct.get(k, 0), 3)
+#         }
+#         for k, delta in delta_dict.items() if delta >= threshold and k not in known_splice_sites
+#     }
+#     deleted = {
+#         float(k): {
+#             'delta': round(float(delta), 3),
+#             'absolute': round(float(mut_dct.get(k, 0)), 3),
+#             'reference': round(ref_dct.get(k, 0), 3)
+#         }
+#         for k, delta in delta_dict.items() if -delta >= threshold and k in known_splice_sites
+#     }
+#     return discovered, deleted
+#
+#
+#
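Editor's note: the three exported functions above form a small pipeline. run_splicing_engine scores a sequence with SpliceAI or Pangolin, adjoin_splicing_outcomes merges per-mutation prediction frames into a MultiIndex DataFrame, and process_epistasis flags positions whose observed double-mutant probability deviates from the additive expectation (mut1 + mut2 - wt) by more than the threshold. The sketch below is not part of the package diff: it assumes per-label frames indexed by position with 'donor_prob', 'acceptor_prob', and 'nucleotides' columns (the shape implied by the rename mapping), and assumes the labels 'wt', 'mut1', 'mut2', and 'epistasis' so that the resulting columns match the names process_epistasis reads. The numbers are invented, and run_splicing_engine is skipped because it requires the bundled SpliceAI/Pangolin models.

    import pandas as pd
    from geney.utils.splicing_utils import adjoin_splicing_outcomes, process_epistasis

    positions = [1000, 1050, 1100]  # toy genomic positions used as the shared index

    def toy_frame(donor_probs, acceptor_probs):
        # Per-label prediction frame in the column layout assumed above.
        return pd.DataFrame(
            {'donor_prob': donor_probs, 'acceptor_prob': acceptor_probs, 'nucleotides': list('GAT')},
            index=positions,
        )

    predictions = {
        'wt':        toy_frame([0.90, 0.05, 0.01], [0.02, 0.80, 0.01]),
        'mut1':      toy_frame([0.40, 0.05, 0.01], [0.02, 0.75, 0.01]),
        'mut2':      toy_frame([0.85, 0.05, 0.01], [0.02, 0.30, 0.01]),
        'epistasis': toy_frame([0.05, 0.05, 0.01], [0.02, 0.10, 0.01]),
    }

    combined = adjoin_splicing_outcomes(predictions)       # MultiIndex columns: acceptors / donors / nts
    flagged = process_epistasis(combined, threshold=0.25)  # keeps rows with |residual_epistasis| > 0.25
    print(flagged[['donors', 'acceptors']])

With these toy values only position 1000 is retained (donor expected = 0.40 + 0.85 - 0.90 = 0.35, observed 0.05, residual -0.30).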
geney/utils/utils.py
ADDED
@@ -0,0 +1,89 @@
+__all__ = ['is_monotonic', 'contains', 'unload_json', 'unload_pickle', 'dump_json', 'dump_pickle', 'generate_random_nucleotide_sequences']
+
+import pickle
+import json
+# import re
+# from pathlib import Path
+from bisect import bisect_left
+import hashlib
+
+# def is_monotonic(A):
+#     x, y = [], []
+#     x.extend(A)
+#     y.extend(A)
+#     x.sort()
+#     y.sort(reverse=True)
+#     if (x == A or y == A):
+#         return True
+#     return False
+
+
+# def available_genes(organism='hg38'):
+#     from geney import config
+#     annotation_path = config[organism]['MRNA_PATH'] / 'protein_coding'
+#     return sorted(list(set([m.stem.split('_')[-1] for m in annotation_path.glob('*')])))
+
+
+def contains(a, x):
+    """returns true if sorted sequence `a` contains `x`"""
+    i = bisect_left(a, x)
+    return i != len(a) and a[i] == x
+
+
+def unload_json(file_path):
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+    return data
+
+
+def dump_json(file_path, payload):
+    with open(file_path, 'w') as f:
+        json.dump(payload, f)
+    return None
+
+
+def unload_pickle(file_path):
+
+    with open(file_path, 'rb') as f:
+        data = pickle.load(f)
+    return data
+
+
+def dump_pickle(file_path, payload):
+    with open(file_path, 'wb') as f:
+        pickle.dump(payload, f)
+    return None
+
+
+
+def is_monotonic(A):
+    return all(x <= y for x, y in zip(A, A[1:])) or all(x >= y for x, y in zip(A, A[1:]))
+
+
+
+def generate_random_nucleotide_sequences(num_sequences, min_len=3, max_len=10):
+    """
+    Generate random sequences of nucleotides.
+
+    Parameters:
+        num_sequences (int): Number of sequences to generate.
+        sequence_length (int): Length of each sequence.
+
+    Returns:
+        list: A list of random nucleotide sequences.
+    """
+    import random
+    nucleotides = ['A', 'C', 'G', 'T']
+    lengths = list(range(min_len, max_len))
+    sequences = [
+        ''.join(random.choices(nucleotides, k=random.choice(lengths)))
+        for _ in range(num_sequences)
+    ]
+    return sequences
+
+
+
+def short_hash_of_list(numbers, length=5):
+    encoded = repr(numbers).encode('utf-8')
+    full_hash = hashlib.sha256(encoded).hexdigest()
+    return full_hash[:length]
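Editor's note: a brief usage sketch of the helpers above, not part of the package diff. It assumes the import path geney.utils.utils and uses toy inputs with a temporary scratch file. Note that contains expects an already-sorted sequence, and generate_random_nucleotide_sequences draws lengths from range(min_len, max_len), so max_len itself is never produced.

    import os
    import tempfile
    from geney.utils.utils import (
        contains, is_monotonic, dump_json, unload_json, generate_random_nucleotide_sequences,
    )

    sites = [102, 250, 781, 1024]                  # must already be sorted for contains()
    assert contains(sites, 781) and not contains(sites, 500)
    assert is_monotonic(sites)                     # non-decreasing or non-increasing

    path = os.path.join(tempfile.gettempdir(), 'geney_demo.json')  # hypothetical scratch file
    dump_json(path, {'acceptors': sites})
    assert unload_json(path)['acceptors'] == sites

    print(generate_random_nucleotide_sequences(3, min_len=5, max_len=8))  # lengths 5..7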
geney-1.4.1.dist-info/RECORD
ADDED
@@ -0,0 +1,51 @@
+geney/Fasta_segment.py,sha256=99HxNGNh_MfdVW6hhtlb1vOn7eSmT7oFoEfHDFMxG8w,11275
+geney/Gene.py,sha256=G-5ROebtvbVazzPlsBJ1r2DEduCwsIA5S8_TmBuoyjw,7030
+geney/Oncosplice.py,sha256=ETAvMl_Oq6mEJQHPNwdDO5csX6Ahuped_om10KifCyM,17739
+geney/SeqMats.py,sha256=9-eJnfU2w3LGc0XvVvFEO_QrBneTkC6xkZKDfTcEw5o,19282
+geney/SpliceSimulator.py,sha256=eVXEpczq3fqQpfmgn-xCnJdCiNYG9TwW3LkSPdyeFpI,18376
+geney/Transcript.py,sha256=Wu0UiubFOdasfPCpe9uGfhPDG4MNks5LzUqGzo85ong,14458
+geney/__init__.py,sha256=YLWXJS53yeryp6nVhCgFg3_Du9Guj9y3iSrdfx61q5Y,3017
+geney/_config_setup.py,sha256=nblcGU3HIt8YjdrAoGfbEVKRxwJKv0PikJ5-7AL6axQ,723
+geney/_graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
+geney/_gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
+geney/_immune_utils.py,sha256=b-8dRcCti7xsU7RG3op18lkSnAD8dp_BymGaR-hbNcI,5272
+geney/_mutation_utils.py,sha256=dHssUsnii_mf-wuRoMmF13UlD7k3ml_VwQMItTYnXpU,1132
+geney/_oncosplice.py,sha256=UkGPJqHSKK_XVsDp-03Baa3ks5ePb_1f1EB0wbkKrDo,35527
+geney/_splicing_utils.py,sha256=Zda6MD0e81p46_y6A240W97d1TP4dakLhG2WT0kSN5U,31473
+geney/_survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
+geney/_tcga_utils.py,sha256=uJhVnTbTysj0XrEw_YeDKRSLexsqgBLYQdhl7_hnr64,17611
+geney/_tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
+geney/allele_linkage.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+geney/config_setup.py,sha256=nblcGU3HIt8YjdrAoGfbEVKRxwJKv0PikJ5-7AL6axQ,723
+geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
+geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
+geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
+geney/immune_utils.py,sha256=b-8dRcCti7xsU7RG3op18lkSnAD8dp_BymGaR-hbNcI,5272
+geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
+geney/pangolin_utils.py,sha256=9jdBXlOcRaUdfi-UpUxHA0AkTMZkUF-Lt7HVZ1nEm3s,2973
+geney/pipelines.py,sha256=XeC4NTqxzHzrGX6HIBXe8pe4pELpjYLgTmcOt7ESN0g,2916
+geney/power_utils.py,sha256=orOhsr9vkQ-Y4nD1zHj_MmR2J3uYiUsiklqVy-5T-2M,7331
+geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
+geney/spliceai_utils.py,sha256=nyBnLdYs1rB-duA9lfJYM9Q2xNlvZA3I_sCJ1z5WjFw,3294
+geney/splicing_utils.py,sha256=UkG2YphjLNUYsv3o3RGUTW1ScHbEMOLL2M_7WbgDVME,47466
+geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
+geney/tcga_utils.py,sha256=uJhVnTbTysj0XrEw_YeDKRSLexsqgBLYQdhl7_hnr64,17611
+geney/tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
+geney/utils.py,sha256=KBdwNIywo7INVEQEsuIXauEJobvReE9TXAi5qqXanSI,2775
+geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
+geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
+geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
+geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
+geney/utils/SeqMats.py,sha256=vjU0lTkB0s0RoLjNXLqt0kJQDni-it09-iAOv5QAYFs,17686
+geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
+geney/utils/__init__.py,sha256=jCoB0doidTbCFT34Yx8gQROcZOsw4LnqhgkwRgGQWt0,693
+geney/utils/mutation_utils.py,sha256=r-pHr56gEa5kh_DPX8MjFY3ZfYaOtyo4CUfJ5ZHlXPw,3243
+geney/utils/pangolin_utils.py,sha256=EUadXPxY7QUnsQrlyO7K5cg9mi5ssZjSDvNa_SzoBQg,6160
+geney/utils/spliceai_utils.py,sha256=oRrGJqjWirzYmiBmUR9hGr4B7V_7Y1uMyRTmbFKc_t0,4539
+geney/utils/splicing_utils.py,sha256=_Df3SakZrDjs2yKLG05TtfwuoXDLLrZWc9Y8i79rFDM,20633
+geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
+geney-1.4.1.dist-info/METADATA,sha256=HyC14VKqeu1HExC-vasEc4yhH73fN1hI_L5fAE1UXgo,989
+geney-1.4.1.dist-info/WHEEL,sha256=Kh9pAotZVRFj97E15yTA4iADqXdQfIVTHcNaZTjxeGM,110
+geney-1.4.1.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.4.1.dist-info/RECORD,,