geney 1.2.21__py2.py3-none-any.whl → 1.2.23__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/__init__.py +14 -2
- geney/data_setup.py +1 -1
- geney/graphic_utils.py +270 -0
- geney/mutation_utils.py +56 -0
- geney/oncosplice.py +197 -1543
- geney/pangolin_utils.py +78 -0
- geney/seqmat_utils.py +406 -0
- geney/spliceai_utils.py +52 -0
- geney/splicing_utils.py +372 -0
- geney/utils.py +24 -20
- {geney-1.2.21.dist-info → geney-1.2.23.dist-info}/METADATA +14 -14
- geney-1.2.23.dist-info/RECORD +25 -0
- {geney-1.2.21.dist-info → geney-1.2.23.dist-info}/WHEEL +1 -1
- geney-1.2.21.dist-info/RECORD +0 -19
- {geney-1.2.21.dist-info → geney-1.2.23.dist-info}/top_level.txt +0 -0
geney/__init__.py
CHANGED
|
@@ -1,6 +1,18 @@
|
|
|
1
1
|
from .config_setup import get_config
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
config = get_config()
|
|
3
|
+
from .Fasta_segment import Fasta_segment
|
|
4
|
+
from .utils import *
|
|
5
|
+
|
|
6
|
+
mut_id = 'KRAS:12:25227343:G:T'
|
|
7
|
+
epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'
|
|
8
|
+
|
|
9
|
+
def available_genes(organism='hg38'):
    """Yield the gene name encoded in each file name of the protein-coding mRNA directory.

    Parameters:
        organism (str): Key into the module-level ``config`` mapping; its
            ``'MRNA_PATH'`` entry is joined with ``'protein_coding'`` and listed.

    Yields:
        str: The text after the last underscore of each file name, with a
            trailing ``.pkl`` extension removed when present.
    """
    import os

    suffix = '.pkl'
    for file in os.listdir(config[organism]['MRNA_PATH'] / 'protein_coding'):
        gene = file.split('_')[-1]
        # str.strip('.pkl') would remove any of the characters '.', 'p', 'k', 'l'
        # from BOTH ends of the name; only the literal extension must go.
        if gene.endswith(suffix):
            gene = gene[:-len(suffix)]
        yield gene
|
|
14
|
+
|
|
15
|
+
|
|
4
16
|
# import os
|
|
5
17
|
# import json
|
|
6
18
|
# from pathlib import Path
|
geney/data_setup.py
CHANGED
|
@@ -234,7 +234,7 @@ def main():
|
|
|
234
234
|
config_data = unload_json(config_file)
|
|
235
235
|
overwrite = 'y'
|
|
236
236
|
if args.organism in config_data:
|
|
237
|
-
overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
|
|
237
|
+
overwrite = input(f"Organism {args.organism} already configured... Overwrite? (y/n)")
|
|
238
238
|
|
|
239
239
|
if overwrite == 'y':
|
|
240
240
|
config_data[args.organism] = config_paths
|
geney/graphic_utils.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
|
|
2
|
+
import matplotlib.pyplot as plt
|
|
3
|
+
from matplotlib.patches import Rectangle
|
|
4
|
+
import seaborn as sns
|
|
5
|
+
from collections import namedtuple
|
|
6
|
+
from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle, contains, unload_json, dump_json #, is_monotonic
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
### Graphical Stuff
|
|
10
|
+
def create_figure_story(epistasis, to_file=None):
    """Plot a conservation figure for every conserved isoform returned for ``epistasis``.

    Parameters:
        epistasis (str): Colon-separated identifier; the text before the first
            ``:`` is used as the gene label in the plot title.
        to_file: Forwarded unchanged to ``plot_conservation``.
    """
    gene_symbol = epistasis.split(':')[0]

    # NOTE(review): `oncosplice` is not among the names this module imports
    # in its visible import block -- confirm where it is expected to come from.
    results = oncosplice(epistasis, annotate=True)
    results = results[results.cons_available == 1]

    for _, record in results.iterrows():
        # Track the first strictly-longest deletion; stays (0, 0) when every
        # entry is empty or there are no deletions at all.
        longest_start, longest_len = 0, 0
        for start, deleted in record.deletions.items():
            size = len(deleted)
            if size > longest_len:
                longest_start, longest_len = start, size

        # Only deletions longer than five residues are shaded as a region.
        deleted_region = [longest_start, longest_start + longest_len] if longest_len > 5 else None
        marker = None if record.oncosplice_score == 0 else longest_start

        # NOTE(review): `plot_conservation` declares a required leading
        # `gene_name` parameter that this call does not supply -- confirm.
        plot_conservation(tid=record.transcript_id,
                          gene=f'{gene_symbol}, {record.transcript_id}.{record.isoform}',
                          mutation_loc=marker,
                          target_region=deleted_region, mut_name='Epistasis',
                          domain_annotations=get_annotations(record.transcript_id, 300),
                          to_file=to_file)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def plot_conservation(gene_name, tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=None, to_file=None):
    """
    Plots conservation vectors with protein domain visualization and Rate4Site scores.

    Parameters:
    gene_name: Argument handed to ``unload_pickle`` to load the conservation data.
    tid (str): Transcript identifier.
    gene (str): Gene label shown on the x-axis.
    mutation_loc (int): Position of the mutation.
    target_region (tuple): Start and end positions of the target region.
    mut_name (str): Name of the mutation.
    domain_annotations (list): List of tuples for domain annotations (start, end, label).
        Defaults to no annotations.
    to_file: Optional destination; when given, the figure is also saved there
        before being shown.

    Raises:
    ValueError: If the loaded conservation vector is missing or empty.
    """
    # A fresh list per call instead of a shared mutable default.
    if domain_annotations is None:
        domain_annotations = []

    # Access conservation data
    # NOTE(review): the lookup uses the literal key 'tid', not the `tid`
    # argument (which is otherwise unused here) -- confirm which is intended.
    _, cons_vec = unload_pickle(gene_name)['tid']['cons_vector']

    # Explicit length check: `not cons_vec` is ambiguous for multi-element numpy arrays.
    if cons_vec is None or len(cons_vec) == 0:
        raise ValueError("The conservation vector is empty.")

    sns.set_theme(style="white")
    fig, ax = plt.subplots(figsize=(max(15, len(cons_vec)/10), 3))  # Dynamic figure size

    # Plotting the conservation vectors in the main plot
    plot_conservation_vectors(ax, cons_vec)

    # Setting up primary axis for the main plot
    setup_primary_axis(ax, gene, len(cons_vec))

    # Create a separate axes for protein domain visualization
    domain_ax = create_domain_axes(fig, len(cons_vec))

    # Draw protein domains
    plot_protein_domains(domain_ax, domain_annotations, len(cons_vec))

    # Plotting Rate4Site scores on secondary y-axis
    plot_rate4site_scores(ax, cons_vec)

    # Plotting mutation location and target region, if provided
    plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name)

    if to_file is not None:
        # 'tight' keeps the domain strip and labels drawn above the main axes.
        fig.savefig(to_file, bbox_inches='tight')

    plt.show()
|
|
82
|
+
|
|
83
|
+
def plot_conservation_vectors(ax, cons_vec):
    """Draw two transformed conservation curves on ``ax``, each scaled by its own maximum."""
    curves = (
        (76, 'b', 'Estimated Functional Residues'),  # Larger window
        (6, 'k', 'Estimated Functional Domains'),    # Smaller window
    )
    for window, colour, legend in curves:
        smoothed = transform_conservation_vector(cons_vec, window)
        smoothed /= max(smoothed)
        ax.plot(list(range(len(smoothed))), smoothed, c=colour, label=legend)
|
|
92
|
+
|
|
93
|
+
def setup_primary_axis(ax, gene, length):
    """Apply limits, labels and spine visibility to the main plot axis.

    The x-range is ``0..length`` and the y-range is fixed to ``0..1``; the
    right and top spines are hidden.
    """
    ax.set_xlim(0, length)
    ax.set_ylim(0, 1)
    ax.set_xlabel(f'AA Position - {gene}', weight='bold')
    ax.set_ylabel('Relative Importance', weight='bold')
    ax.tick_params(axis='y')
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
|
|
102
|
+
|
|
103
|
+
def create_domain_axes(fig, length):
    """Add and return a thin, tick-less, spine-less axes for the protein domain strip.

    The new axes is placed at figure coordinates ``[0.125, 0.95, 0.775, 0.06]``
    and spans ``0..length`` on x.
    """
    left, bottom, width, height = 0.125, 0.95, 0.775, 0.06
    strip = fig.add_axes([left, bottom, width, height])
    strip.set_xlim(0, length)
    strip.set_xticks([])
    strip.set_yticks([])
    for border in strip.spines.values():
        border.set_visible(False)
    return strip
|
|
113
|
+
|
|
114
|
+
def plot_protein_domains(ax, domain_annotations, length):
    """Draw a gray full-length bar on ``ax`` and overlay each annotated domain in orange.

    Each entry of ``domain_annotations`` must unpack into ``(start, end, label)``;
    the label is written at y=2.1, centred over the domain.
    """
    bar_height = 0.9
    backbone = Rectangle((0, 0), length, bar_height, facecolor='lightgray', edgecolor='none')
    ax.add_patch(backbone)

    for start, end, label in domain_annotations:
        box = Rectangle((start, 0), end - start, bar_height, facecolor='orange', edgecolor='none', alpha=0.5)
        ax.add_patch(box)
        ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
121
|
+
|
|
122
|
+
def plot_rate4site_scores(ax, cons_vec):
    """Plots Rate4Site scores on a secondary y-axis.

    Parameters:
        ax: Primary axes; a twin sharing its x-axis is created for the scatter.
        cons_vec: Sequence of scores. It is shifted by the absolute value of
            its minimum and then divided by its maximum before plotting.
    """
    # Imported locally: this module's import block does not bring numpy into scope.
    import numpy as np

    ax2 = ax.twinx()
    c = np.array(cons_vec)
    c = c + abs(c.min())
    peak = c.max()
    # Skip the rescale when every shifted value is zero (avoids 0/0 -> NaN limits).
    if peak != 0:
        c = c / peak
    ax2.set_ylim(c.min(), c.max() * 1.1)
    ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
    ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.spines['right'].set_visible(True)
    ax2.spines['top'].set_visible(False)
|
|
134
|
+
|
|
135
|
+
def plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name):
    """Mark the mutation with a dashed red line and shade the target region, when given.

    Either argument may be ``None``, in which case that element is not drawn.
    Both text labels are placed at y=1.04.
    """
    label_y = 1.04

    if mutation_loc is not None:
        ax.axvline(x=mutation_loc, ymax=1, color='r', linestyle='--', alpha=0.7)
        ax.text(mutation_loc, label_y, mut_name, color='r', weight='bold', ha='center')

    if target_region is None:
        return

    region_start = target_region[0]
    region_width = target_region[1] - target_region[0]
    ax.add_patch(Rectangle((region_start, 0), region_width, 1, alpha=0.25, facecolor='gray'))
    midpoint = region_start + 0.5 * region_width
    ax.text(midpoint, label_y, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def merge_overlapping_regions(df):
    """
    Collapse overlapping or touching intervals of a DataFrame into single regions.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'start', 'end', 'name'

    Returns:
    List: ``Region(start, end, combined_name)`` namedtuples ordered by start.
        ``combined_name`` is the comma-joined names of the merged rows
        (underscores replaced by spaces), passed through ``split_text`` with
        a width of 35.
    """
    if df.empty:
        return []

    Region = namedtuple('Region', ['start', 'end', 'combined_name'])

    # Each span is a mutable [start, end, [names]] triple; the last one is
    # the region currently being extended.
    spans = []
    for _, record in df.sort_values(by='start').iterrows():
        lo, hi = record['start'], record['end']
        label = record['name'].replace('_', ' ')
        if spans and lo <= spans[-1][1]:
            spans[-1][1] = max(spans[-1][1], hi)
            spans[-1][2].append(label)
        else:
            spans.append([lo, hi, [label]])

    return [Region(lo, hi, split_text(', '.join(labels), 35)) for lo, hi, labels in spans]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def split_text(text, width):
    """
    Splits a text into lines with a maximum specified width.

    Parameters:
    text (str): The text to be split.
    width (int): Maximum width of each line.

    Returns:
    str: The text split into lines of specified width, joined with newlines.
        Newline characters already in ``text`` are not matched by the pattern
        and therefore act as additional break points.
    """
    # Imported locally: this module's import block does not bring `re` into scope.
    import re

    lines = re.findall('.{1,' + str(width) + '}', text)
    return '\n'.join(lines)
|
|
196
|
+
|
|
197
|
+
def get_annotations(target_gene, w=500):
    """Return merged annotation regions for ``target_gene`` via ``merge_overlapping_regions``.

    Parameters:
        target_gene: Key used to look up the value compared against the
            'Transcript stable ID' column.
        w (int): Upper bound (exclusive) applied to the ``length`` attribute.
    """
    # NOTE(review): PROTEIN_ANNOTATIONS is bound to an empty dict here, yet the
    # next statement indexes it by column name, applies a boolean mask and reads
    # `.length`. As written, the first lookup raises KeyError. Presumably this is
    # a placeholder for an annotation table loaded elsewhere -- confirm the source.
    PROTEIN_ANNOTATIONS = {}
    temp = PROTEIN_ANNOTATIONS[(PROTEIN_ANNOTATIONS['Transcript stable ID'] == PROTEIN_ANNOTATIONS[target_gene]) & (PROTEIN_ANNOTATIONS.length < w)].drop_duplicates(subset=['Interpro Short Description'], keep='first')
    return merge_overlapping_regions(temp)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[], to_file=None):
|
|
204
|
+
# _, cons_vec = access_conservation_data(tid)
|
|
205
|
+
#
|
|
206
|
+
# sns.set_theme(style="white")
|
|
207
|
+
# fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
|
|
208
|
+
#
|
|
209
|
+
# # Plotting the conservation vectors in the main plot
|
|
210
|
+
# temp = transform_conservation_vector(cons_vec, 76)
|
|
211
|
+
# temp /= max(temp)
|
|
212
|
+
# ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
|
|
213
|
+
# temp = transform_conservation_vector(cons_vec, 6)
|
|
214
|
+
# temp /= max(temp)
|
|
215
|
+
# ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
|
|
216
|
+
#
|
|
217
|
+
# # Setting up primary axis for the main plot
|
|
218
|
+
# ax.set_xlabel(f'AA Position - {gene}', weight='bold')
|
|
219
|
+
# ax.set_xlim(0, len(cons_vec))
|
|
220
|
+
# ax.set_ylim(0, 1) # Set y-limit to end at 1
|
|
221
|
+
# ax.set_ylabel('Relative Importance', weight='bold')
|
|
222
|
+
# ax.tick_params(axis='y')
|
|
223
|
+
# ax.spines['right'].set_visible(False)
|
|
224
|
+
# ax.spines['top'].set_visible(False)
|
|
225
|
+
#
|
|
226
|
+
# # Create a separate axes for protein domain visualization above the main plot
|
|
227
|
+
# domain_ax_height = 0.06 # Adjust for thinner protein diagram
|
|
228
|
+
# domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
|
|
229
|
+
# domain_ax.set_xlim(0, len(cons_vec))
|
|
230
|
+
# domain_ax.set_xticks([])
|
|
231
|
+
# domain_ax.set_yticks([])
|
|
232
|
+
# domain_ax.spines['top'].set_visible(False)
|
|
233
|
+
# domain_ax.spines['right'].set_visible(False)
|
|
234
|
+
# domain_ax.spines['left'].set_visible(False)
|
|
235
|
+
# domain_ax.spines['bottom'].set_visible(False)
|
|
236
|
+
#
|
|
237
|
+
# # Draw the full-length protein as a base rectangle
|
|
238
|
+
# domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
|
|
239
|
+
#
|
|
240
|
+
# # Overlay domain annotations
|
|
241
|
+
# for domain in domain_annotations:
|
|
242
|
+
# start, end, label = domain
|
|
243
|
+
# domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
|
|
244
|
+
# domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
245
|
+
#
|
|
246
|
+
# # Plotting Rate4Site scores on secondary y-axis
|
|
247
|
+
# ax2 = ax.twinx()
|
|
248
|
+
# c = np.array(cons_vec)
|
|
249
|
+
# c = c + abs(min(c))
|
|
250
|
+
# c = c/max(c)
|
|
251
|
+
# ax2.set_ylim(min(c), max(c)*1.1)
|
|
252
|
+
# ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
|
|
253
|
+
# ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
|
|
254
|
+
# ax2.tick_params(axis='y', labelcolor='green')
|
|
255
|
+
# ax2.spines['right'].set_visible(True)
|
|
256
|
+
# ax2.spines['top'].set_visible(False)
|
|
257
|
+
#
|
|
258
|
+
# # Plotting mutation location and target region
|
|
259
|
+
# if mutation_loc is not None:
|
|
260
|
+
# ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
|
|
261
|
+
# ax.text(mutation_loc, 1.04, mut_name, color='r', weight='bold', ha='center')
|
|
262
|
+
#
|
|
263
|
+
# if target_region is not None:
|
|
264
|
+
# ax.add_patch(Rectangle((target_region[0], 0), target_region[1] - target_region[0], 1, alpha=0.25, facecolor='gray'))
|
|
265
|
+
# center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
|
|
266
|
+
# ax.text(center_loc, 1.04, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
|
|
267
|
+
#
|
|
268
|
+
# plt.show()
|
|
269
|
+
#
|
|
270
|
+
|
geney/mutation_utils.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from .seqmat_utils import *
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
class Allele(SeqMat):
    """A ``SeqMat`` built from an alternate sequence and two index arrays.

    ``position`` is the smallest value in ``pos1``. When ``rev`` is truthy the
    instance's ``reverse_complement()`` is called right after construction.
    """

    def __init__(self, alt, pos1, pos2, rev=False):
        # `rev` defaults to False (same default as `get_mutation`) so that a
        # three-argument construction works.
        super().__init__(alt, pos1, pos2)
        self.position = min(pos1)
        if rev:
            self.reverse_complement()
|
|
10
|
+
|
|
11
|
+
# def _continuous(self, ind):
|
|
12
|
+
# return True
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SNP(Allele):
    """Allele subclass taking ``(alt, pos1, pos2)``; never reverse-complemented on construction."""

    def __init__(self, alt, pos1, pos2):
        # Allele.__init__ takes a fourth `rev` argument; pass it explicitly so
        # the call does not fail with a missing-argument TypeError.
        super().__init__(alt, pos1, pos2, False)
|
|
19
|
+
|
|
20
|
+
class INDEL(Allele):
    # Stub subclass of Allele taking (alt, pos).
    # NOTE(review): the super().__init__ call below passes fewer positional
    # arguments than Allele.__init__ declares, so instantiation raises
    # TypeError as written -- the intended index arrays need to be confirmed.
    def __init__(self, alt, pos):
        super().__init__(alt, pos)
        pass
|
|
24
|
+
|
|
25
|
+
class INS(Allele):
    # Stub subclass of Allele taking (alt, pos).
    # NOTE(review): `alt` and `pos` are accepted but never used, and
    # super().__init__() is called with no arguments although Allele.__init__
    # declares required parameters -- instantiation raises TypeError as written.
    def __init__(self, alt, pos):
        super().__init__()
        pass
|
|
29
|
+
|
|
30
|
+
class DEL(Allele):
    # Stub subclass of Allele taking (alt, pos).
    # NOTE(review): `alt` and `pos` are accepted but never used, and
    # super().__init__() is called with no arguments although Allele.__init__
    # declares required parameters -- instantiation raises TypeError as written.
    def __init__(self, alt, pos):
        super().__init__()
        pass
|
|
34
|
+
|
|
35
|
+
def get_mutation(mut_id, rev=False):
    """Build an ``Allele`` from a five-field, colon-separated mutation id.

    The last three fields are read as position, reference and alternate; the
    first two are ignored. ``'-'`` marks an empty reference or alternate.

    Parameters:
        mut_id (str): Identifier such as ``'KRAS:12:25227343:G:T'``.
        rev (bool): Passed through to ``Allele``.

    Returns:
        Allele, or ``None`` when both reference and alternate are ``'-'``.
    """
    _, _, pos, ref, alt = mut_id.split(':')
    pos = int(pos)
    ref_missing = ref == '-'
    alt_missing = alt == '-'

    # Single-base substitution.
    if len(alt) == len(ref) == 1 and not alt_missing and not ref_missing:
        return Allele(alt, [pos], [0], rev)

    # Deletion: one gap character per reference base.
    if alt_missing and not ref_missing:
        n_ref = len(ref)
        return Allele('-' * n_ref, np.arange(pos, pos + n_ref, dtype=np.int32), [0] * n_ref, rev)

    # Insertion: every inserted base shares `pos`, with sub-indices 1..len(alt).
    if ref_missing and not alt_missing:
        n_alt = len(alt)
        return Allele(alt, np.full(n_alt, pos), np.arange(1, n_alt + 1), rev)

    # Both present (and not a single-base swap): gaps over the reference span,
    # then the alternate bases attached to the last reference position.
    if not alt_missing and not ref_missing:
        n_ref, n_alt = len(ref), len(alt)
        primary = np.concatenate(
            [np.arange(pos, pos + n_ref, dtype=np.int32), np.full(n_alt, n_ref + pos - 1, dtype=np.int32)])
        secondary = np.concatenate([np.zeros(n_ref, dtype=np.int32), np.arange(1, n_alt + 1, dtype=np.int32)])
        return Allele('-' * n_ref + alt, primary, secondary, rev)

    return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|