geney 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/METADATA +1 -1
- geney-1.2.21.dist-info/RECORD +19 -0
- geney/Gene.py +0 -258
- geney/analyzers/__init__.py +0 -0
- geney/analyzers/benchmark_clinvar.py +0 -158
- geney/analyzers/characterize_epistasis.py +0 -15
- geney/analyzers/compare_sets.py +0 -91
- geney/analyzers/group_comparison.py +0 -81
- geney/analyzers/survival.py +0 -144
- geney/analyzers/tcga_annotations.py +0 -194
- geney/analyzers/visualize_protein_conservation.py +0 -398
- geney/benchmark_clinvar.py +0 -158
- geney/compare_sets.py +0 -91
- geney/data_parsers/__init__.py +0 -0
- geney/data_parsers/gtex.py +0 -68
- geney/gtex.py +0 -68
- geney/immunotherapy/__init__.py +0 -0
- geney/immunotherapy/netchop.py +0 -78
- geney/mutations/__init__.py +0 -0
- geney/mutations/variant_utils.py +0 -125
- geney/netchop.py +0 -79
- geney/oncosplice/__init__.py +0 -0
- geney/oncosplice_mouse.py +0 -277
- geney/oncosplice_pipeline.py +0 -1588
- geney/performance_utils.py +0 -138
- geney/pipelines/__init__.py +0 -0
- geney/pipelines/dask_utils.py +0 -153
- geney/splicing/__init__.py +0 -2
- geney/splicing/spliceai_utils.py +0 -253
- geney/splicing/splicing_isoform_utils.py +0 -0
- geney/splicing/splicing_utils.py +0 -366
- geney/survival.py +0 -124
- geney/tcga_annotations.py +0 -352
- geney/translation_termination/__init__.py +0 -0
- geney/translation_termination/tts_utils.py +0 -0
- geney-1.2.20.dist-info/RECORD +0 -52
- {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/WHEEL +0 -0
- {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/top_level.txt +0 -0
|
@@ -1,398 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from collections import namedtuple
|
|
4
|
-
import re
|
|
5
|
-
from geney.oncosplice import oncosplice
|
|
6
|
-
from geney.utils import unload_pickle
|
|
7
|
-
|
|
8
|
-
def create_figure_story(epistasis, to_file=None):
    """
    Plots one conservation figure per oncosplice result row of an epistasis id.

    Parameters:
    epistasis (str): Identifier whose first ':'-separated field is used as the gene name.
    to_file (str): Optional output path, forwarded to plot_conservation when given.
    """
    g = epistasis.split(':')[0]
    out = oncosplice(epistasis, annotate=True)
    # Keep only the rows flagged as having conservation data.
    out = out[out.cons_available == 1]

    # Forward to_file only when requested, so the default call stays valid
    # against a plot_conservation signature without that parameter.
    save_kwargs = {} if to_file is None else {'to_file': to_file}

    for _, row in out.iterrows():
        # Locate the longest deletion; on ties the first one seen wins.
        max_length = 0
        pos = 0
        for i, k in row.deletions.items():
            if len(k) > max_length:
                pos = i
                max_length = len(k)

        # Only deletions longer than 5 are highlighted as a region.
        del_reg = [pos, pos + max_length] if max_length > 5 else None
        mutation_loc = None if row.oncosplice_score == 0 else pos

        # NOTE(review): get_annotations only exists as commented-out code in
        # this file -- confirm it is provided elsewhere before relying on it.
        # NOTE(review): assumes the gene name is the key unload_pickle expects
        # for gene_name -- TODO confirm.
        plot_conservation(gene_name=g,
                          tid=row.transcript_id,
                          gene=f'{g}, {row.transcript_id}.{row.isoform}',
                          mutation_loc=mutation_loc,
                          target_region=del_reg, mut_name='Epistasis',
                          domain_annotations=get_annotations(row.transcript_id, 300),
                          **save_kwargs)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
import numpy as np
|
|
40
|
-
import matplotlib.pyplot as plt
|
|
41
|
-
from matplotlib.patches import Rectangle
|
|
42
|
-
import seaborn as sns
|
|
43
|
-
|
|
44
|
-
def plot_conservation(gene_name, tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=None, to_file=None):
    """
    Plots conservation vectors with protein domain visualization and Rate4Site scores.

    Parameters:
    gene_name (str): Argument passed to unload_pickle to load the gene data.
    tid (str): Transcript identifier used to select the conservation vector.
    gene (str): Gene label shown on the x-axis.
    mutation_loc (int): Position of the mutation.
    target_region (tuple): Start and end positions of the target region.
    mut_name (str): Name of the mutation.
    domain_annotations (list): List of tuples for domain annotations (start, end, label).
    to_file (str): Optional path; when given the figure is saved there before being shown.

    Raises:
    ValueError: If the conservation vector is missing or empty.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if domain_annotations is None:
        domain_annotations = []

    # Access conservation data for the requested transcript. The original
    # indexed the literal key 'tid', which ignored the tid argument.
    _, cons_vec = unload_pickle(gene_name)[tid]['cons_vector']

    # len() works for lists and arrays alike; truth-testing an array raises.
    if cons_vec is None or len(cons_vec) == 0:
        raise ValueError("The conservation vector is empty.")

    sns.set_theme(style="white")
    fig, ax = plt.subplots(figsize=(max(15, len(cons_vec)/10), 3))  # Dynamic figure size

    # Plotting the conservation vectors in the main plot
    plot_conservation_vectors(ax, cons_vec)

    # Setting up primary axis for the main plot
    setup_primary_axis(ax, gene, len(cons_vec))

    # Create a separate axes for protein domain visualization
    domain_ax = create_domain_axes(fig, len(cons_vec))

    # Draw protein domains
    plot_protein_domains(domain_ax, domain_annotations, len(cons_vec))

    # Plotting Rate4Site scores on secondary y-axis
    plot_rate4site_scores(ax, cons_vec)

    # Plotting mutation location and target region, if provided
    plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name)

    # Save before show so the saved image is taken from the populated figure.
    if to_file is not None:
        fig.savefig(to_file, bbox_inches='tight')

    plt.show()
|
|
84
|
-
|
|
85
|
-
def plot_conservation_vectors(ax, cons_vec):
    """Plots the conservation vector transformed with two window sizes, each scaled to a peak of 1."""
    # NOTE(review): transform_conservation_vector is neither defined nor
    # imported in the visible part of this file -- confirm where it comes from.
    # NOTE(review): the larger window carries the 'Residues' label and the
    # smaller one the 'Domains' label -- confirm the labels are not swapped.
    curves = (
        (76, 'b', 'Estimated Functional Residues'),  # Larger window
        (6, 'k', 'Estimated Functional Domains'),    # Smaller window
    )
    for window, colour, label in curves:
        temp = np.asarray(transform_conservation_vector(cons_vec, window), dtype=float)
        peak = temp.max()
        # Skip the rescale for an all-zero curve instead of dividing by zero.
        if peak != 0:
            temp = temp / peak
        ax.plot(list(range(len(temp))), temp, c=colour, label=label)
|
|
94
|
-
|
|
95
|
-
def setup_primary_axis(ax, gene, length):
    """Labels and scales the main axis and hides its top and right spines."""
    # Frame: only the left and bottom spines stay visible.
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)

    # Ranges: x spans the given length, y is fixed to [0, 1].
    ax.set_xlim(0, length)
    ax.set_ylim(0, 1)

    # Labels.
    x_label = f'AA Position - {gene}'
    ax.set_xlabel(x_label, weight='bold')
    ax.set_ylabel('Relative Importance', weight='bold')
    ax.tick_params(axis='y')
|
|
104
|
-
|
|
105
|
-
def create_domain_axes(fig, length):
    """Adds a thin axis with no ticks or spines to the figure and returns it."""
    # Figure-fraction rectangle: [left, bottom, width, height].
    strip_rect = [0.125, 0.95, 0.775, 0.06]
    strip = fig.add_axes(strip_rect)
    strip.set_xlim(0, length)

    # Remove ticks on both axes.
    for clear_ticks in (strip.set_xticks, strip.set_yticks):
        clear_ticks([])

    # Hide every spine.
    for edge in strip.spines.values():
        edge.set_visible(False)
    return strip
|
|
115
|
-
|
|
116
|
-
def plot_protein_domains(ax, domain_annotations, length):
    """Draws a gray full-length bar and an orange, labelled box per domain annotation."""
    bar_height = 0.9

    # Full-length bar from 0 to length.
    backbone = Rectangle((0, 0), length, bar_height, facecolor='lightgray', edgecolor='none')
    ax.add_patch(backbone)

    # Each annotation is (start, end, label).
    for start, end, label in domain_annotations:
        box = Rectangle((start, 0), end - start, bar_height,
                        facecolor='orange', edgecolor='none', alpha=0.5)
        ax.add_patch(box)
        midpoint = (start + end) / 2
        ax.text(midpoint, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
123
|
-
|
|
124
|
-
def plot_rate4site_scores(ax, cons_vec):
    """
    Plots min-max normalized Rate4Site scores on a secondary y-axis.

    Parameters:
    ax: Primary axis; the scores are drawn on a twin axis sharing its x-axis.
    cons_vec (sequence of float): Raw scores, one per position.
    """
    ax2 = ax.twinx()
    c = np.asarray(cons_vec, dtype=float)

    # Min-max normalization. Subtracting the minimum maps it to 0 for any
    # sign; adding abs(min) only did so when the minimum was not positive.
    lowest = c.min()
    span = c.max() - lowest
    # A constant vector has no spread: plot zeros rather than divide by zero.
    c = (c - lowest) / span if span > 0 else np.zeros_like(c)

    # Normalized values lie in [0, 1]; keep 10% headroom above the top.
    ax2.set_ylim(0, 1.1)
    ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
    ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.spines['right'].set_visible(True)
    ax2.spines['top'].set_visible(False)
|
|
136
|
-
|
|
137
|
-
def plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name):
    """Marks the mutation position and shades the target region; either is skipped when None."""
    label_y = 1.04  # labels are placed just above the y-range top of 1

    has_mutation = mutation_loc is not None
    if has_mutation:
        # Dashed red vertical line with the mutation name above it.
        ax.axvline(x=mutation_loc, ymax=1, color='r', linestyle='--', alpha=0.7)
        ax.text(mutation_loc, label_y, mut_name, color='r', weight='bold', ha='center')

    has_region = target_region is not None
    if has_region:
        # Translucent gray box of height 1 with a centred caption.
        region_start = target_region[0]
        region_width = target_region[1] - target_region[0]
        shade = Rectangle((region_start, 0), region_width, 1, alpha=0.25, facecolor='gray')
        ax.add_patch(shade)
        center_loc = region_start + 0.5 * region_width
        ax.text(center_loc, label_y, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
# def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[], to_file=None):
|
|
150
|
-
# _, cons_vec = access_conservation_data(tid)
|
|
151
|
-
#
|
|
152
|
-
# sns.set_theme(style="white")
|
|
153
|
-
# fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
|
|
154
|
-
#
|
|
155
|
-
# # Plotting the conservation vectors in the main plot
|
|
156
|
-
# temp = transform_conservation_vector(cons_vec, 76)
|
|
157
|
-
# temp /= max(temp)
|
|
158
|
-
# ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
|
|
159
|
-
# temp = transform_conservation_vector(cons_vec, 6)
|
|
160
|
-
# temp /= max(temp)
|
|
161
|
-
# ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
|
|
162
|
-
#
|
|
163
|
-
# # Setting up primary axis for the main plot
|
|
164
|
-
# ax.set_xlabel(f'AA Position - {gene}', weight='bold')
|
|
165
|
-
# ax.set_xlim(0, len(cons_vec))
|
|
166
|
-
# ax.set_ylim(0, 1) # Set y-limit to end at 1
|
|
167
|
-
# ax.set_ylabel('Relative Importance', weight='bold')
|
|
168
|
-
# ax.tick_params(axis='y')
|
|
169
|
-
# ax.spines['right'].set_visible(False)
|
|
170
|
-
# ax.spines['top'].set_visible(False)
|
|
171
|
-
#
|
|
172
|
-
# # Create a separate axes for protein domain visualization above the main plot
|
|
173
|
-
# domain_ax_height = 0.06 # Adjust for thinner protein diagram
|
|
174
|
-
# domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
|
|
175
|
-
# domain_ax.set_xlim(0, len(cons_vec))
|
|
176
|
-
# domain_ax.set_xticks([])
|
|
177
|
-
# domain_ax.set_yticks([])
|
|
178
|
-
# domain_ax.spines['top'].set_visible(False)
|
|
179
|
-
# domain_ax.spines['right'].set_visible(False)
|
|
180
|
-
# domain_ax.spines['left'].set_visible(False)
|
|
181
|
-
# domain_ax.spines['bottom'].set_visible(False)
|
|
182
|
-
#
|
|
183
|
-
# # Draw the full-length protein as a base rectangle
|
|
184
|
-
# domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
|
|
185
|
-
#
|
|
186
|
-
# # Overlay domain annotations
|
|
187
|
-
# for domain in domain_annotations:
|
|
188
|
-
# start, end, label = domain
|
|
189
|
-
# domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
|
|
190
|
-
# domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
191
|
-
#
|
|
192
|
-
# # Plotting Rate4Site scores on secondary y-axis
|
|
193
|
-
# ax2 = ax.twinx()
|
|
194
|
-
# c = np.array(cons_vec)
|
|
195
|
-
# c = c + abs(min(c))
|
|
196
|
-
# c = c/max(c)
|
|
197
|
-
# ax2.set_ylim(min(c), max(c)*1.1)
|
|
198
|
-
# ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
|
|
199
|
-
# ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
|
|
200
|
-
# ax2.tick_params(axis='y', labelcolor='green')
|
|
201
|
-
# ax2.spines['right'].set_visible(True)
|
|
202
|
-
# ax2.spines['top'].set_visible(False)
|
|
203
|
-
#
|
|
204
|
-
# # Plotting mutation location and target region
|
|
205
|
-
# if mutation_loc is not None:
|
|
206
|
-
# ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
|
|
207
|
-
# ax.text(mutation_loc, 1.04, mut_name, color='r', weight='bold', ha='center')
|
|
208
|
-
#
|
|
209
|
-
# if target_region is not None:
|
|
210
|
-
# ax.add_patch(Rectangle((target_region[0], 0), target_region[1] - target_region[0], 1, alpha=0.25, facecolor='gray'))
|
|
211
|
-
# center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
|
|
212
|
-
# ax.text(center_loc, 1.04, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
|
|
213
|
-
#
|
|
214
|
-
# plt.show()
|
|
215
|
-
#
|
|
216
|
-
|
|
217
|
-
def merge_overlapping_regions(df):
    """
    Collapses overlapping (or touching) regions of a DataFrame into single regions.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'start', 'end', 'name'

    Returns:
    List: Merged regions as namedtuples (start, end, combined_name), where
    combined_name is the comma-joined names (underscores replaced by spaces)
    passed through split_text with a width of 35.
    """
    Region = namedtuple('Region', ['start', 'end', 'combined_name'])
    if df.empty:
        return []

    # Each span is a mutable [start, end, names] triple; rows are visited in
    # ascending 'start' order, so a row can only extend the latest span.
    spans = []
    for _, record in df.sort_values(by='start').iterrows():
        begin, finish = record['start'], record['end']
        label = record['name'].replace('_', ' ')
        if spans and begin <= spans[-1][1]:
            latest = spans[-1]
            latest[1] = max(latest[1], finish)
            latest[2].append(label)
        else:
            spans.append([begin, finish, [label]])

    return [Region(lo, hi, split_text(', '.join(names), 35)) for lo, hi, names in spans]
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def split_text(text, width):
    """
    Breaks a text into newline-separated chunks of at most `width` characters.

    Parameters:
    text (str): The text to be split.
    width (int): Maximum number of characters per chunk.

    Returns:
    str: The chunks joined with newline characters.
    """
    # '.' does not match a newline, so existing newlines act as chunk
    # boundaries and are not copied into the chunks.
    chunker = re.compile('.{1,' + str(width) + '}')
    pieces = chunker.findall(text)
    return '\n'.join(pieces)
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
###
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
# def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[]):
|
|
274
|
-
# _, cons_vec = access_conservation_data(tid)
|
|
275
|
-
#
|
|
276
|
-
# sns.set_theme(style="white")
|
|
277
|
-
# fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
|
|
278
|
-
#
|
|
279
|
-
# # Plotting the conservation vectors in the main plot
|
|
280
|
-
# temp = transform_conservation_vector(cons_vec, 76)
|
|
281
|
-
# temp /= max(temp)
|
|
282
|
-
# ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
|
|
283
|
-
# temp = transform_conservation_vector(cons_vec, 7)
|
|
284
|
-
# temp /= max(temp)
|
|
285
|
-
# ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
|
|
286
|
-
#
|
|
287
|
-
# # Setting up primary axis for the main plot
|
|
288
|
-
# ax.set_xlabel(f'AA Position - {gene}', weight='bold')
|
|
289
|
-
# ax.set_xlim(0, len(cons_vec))
|
|
290
|
-
# ax.set_ylim(0, 1) # Set y-limit to end at 1
|
|
291
|
-
# ax.set_ylabel('Relative Importance', weight='bold')
|
|
292
|
-
# ax.tick_params(axis='y')
|
|
293
|
-
# ax.spines['right'].set_visible(False)
|
|
294
|
-
# ax.spines['top'].set_visible(False)
|
|
295
|
-
#
|
|
296
|
-
# # Create a separate axes for protein domain visualization above the main plot
|
|
297
|
-
# domain_ax_height = 0.06 # Adjust for thinner protein diagram
|
|
298
|
-
# domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
|
|
299
|
-
# domain_ax.set_xlim(0, len(cons_vec))
|
|
300
|
-
# domain_ax.set_xticks([])
|
|
301
|
-
# domain_ax.set_yticks([])
|
|
302
|
-
# domain_ax.spines['top'].set_visible(False)
|
|
303
|
-
# domain_ax.spines['right'].set_visible(False)
|
|
304
|
-
# domain_ax.spines['left'].set_visible(False)
|
|
305
|
-
# domain_ax.spines['bottom'].set_visible(False)
|
|
306
|
-
#
|
|
307
|
-
# # Draw the full-length protein as a base rectangle
|
|
308
|
-
# domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
|
|
309
|
-
#
|
|
310
|
-
# # Overlay domain annotations
|
|
311
|
-
# for domain in domain_annotations:
|
|
312
|
-
# start, end, label = domain
|
|
313
|
-
# domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
|
|
314
|
-
# domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
315
|
-
#
|
|
316
|
-
# # Plotting Rate4Site scores on secondary y-axis
|
|
317
|
-
# ax2 = ax.twinx()
|
|
318
|
-
# c = np.array(cons_vec)
|
|
319
|
-
# c = c + abs(min(c))
|
|
320
|
-
# c = c/max(c)
|
|
321
|
-
# ax2.set_ylim(min(c), max(c)*1.1)
|
|
322
|
-
# ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
|
|
323
|
-
# ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
|
|
324
|
-
# ax2.tick_params(axis='y', labelcolor='green')
|
|
325
|
-
# ax2.spines['right'].set_visible(True)
|
|
326
|
-
# ax2.spines['top'].set_visible(False)
|
|
327
|
-
#
|
|
328
|
-
# # Plotting mutation location and target region
|
|
329
|
-
# if mutation_loc is not None:
|
|
330
|
-
# ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
|
|
331
|
-
# ax.text(mutation_loc, 1.01, mut_name, color='r', weight='bold', ha='center')
|
|
332
|
-
#
|
|
333
|
-
# if target_region is not None:
|
|
334
|
-
# ax.add_patch(Rectangle((target_region[0], 0.85), target_region[1] - target_region[0], 0.05, alpha=0.3, facecolor='blue'))
|
|
335
|
-
# center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
|
|
336
|
-
# ax.text(center_loc, 0.875, 'Target Region', ha='center', va='center', color='blue', weight='bold')
|
|
337
|
-
#
|
|
338
|
-
# plt.show()
|
|
339
|
-
#
|
|
340
|
-
#
|
|
341
|
-
# def merge_overlapping_regions(df):
|
|
342
|
-
# # Sort the DataFrame by the 'start' column
|
|
343
|
-
# df = df.sort_values(by='start')
|
|
344
|
-
#
|
|
345
|
-
# merged_regions = [] # List to store merged regions as tuples (start, end, combined_name)
|
|
346
|
-
#
|
|
347
|
-
# current_start = None
|
|
348
|
-
# current_end = None
|
|
349
|
-
# combined_names = [] # List to store names of overlapping regions
|
|
350
|
-
#
|
|
351
|
-
# for index, row in df.iterrows():
|
|
352
|
-
# start = row['start']
|
|
353
|
-
# end = row['end']
|
|
354
|
-
# name = row['name'].replace('_', ' ')
|
|
355
|
-
#
|
|
356
|
-
# if current_start is None:
|
|
357
|
-
# # Initialize the current region
|
|
358
|
-
# current_start = start
|
|
359
|
-
# current_end = end
|
|
360
|
-
# combined_names.append(name)
|
|
361
|
-
# else:
|
|
362
|
-
# if start <= current_end:
|
|
363
|
-
# # Regions overlap, update the current region and add the name to combined_names
|
|
364
|
-
# current_end = max(current_end, end)
|
|
365
|
-
# combined_names.append(name)
|
|
366
|
-
# else:
|
|
367
|
-
# # Regions don't overlap, add the current region to the result with combined names
|
|
368
|
-
# combined_name = ', '.join(combined_names)
|
|
369
|
-
# merged_regions.append((current_start, current_end, combined_name))
|
|
370
|
-
# # Start a new current region with the current row
|
|
371
|
-
# current_start = start
|
|
372
|
-
# current_end = end
|
|
373
|
-
# combined_names = [name]
|
|
374
|
-
#
|
|
375
|
-
# # Add the last current region to the result
|
|
376
|
-
# if current_start is not None:
|
|
377
|
-
# combined_name = ', '.join(combined_names)
|
|
378
|
-
# merged_regions.append((current_start, current_end, combined_name))
|
|
379
|
-
#
|
|
380
|
-
# merged_regions = [(a, b, split_text(c, 35)) for a, b, c in merged_regions]
|
|
381
|
-
# return merged_regions
|
|
382
|
-
#
|
|
383
|
-
# def split_text(text, width):
|
|
384
|
-
# lines = []
|
|
385
|
-
# while text:
|
|
386
|
-
# # Find the index to split at or take the whole text if it's shorter than the width
|
|
387
|
-
# split_index = min(len(text), width)
|
|
388
|
-
# # Append the substring up to the split index to the lines list
|
|
389
|
-
# lines.append(text[:split_index])
|
|
390
|
-
# # Remove the processed substring from the original text
|
|
391
|
-
# text = text[split_index:]
|
|
392
|
-
# return '\n'.join(lines)
|
|
393
|
-
#
|
|
394
|
-
# def get_annotations(target_gene, w=500):
|
|
395
|
-
# temp = PROTEIN_ANNOTATIONS[(PROTEIN_ANNOTATIONS['Transcript stable ID'] == PROTEIN_ANNOTATIONS[target_gene]) & (PROTEIN_ANNOTATIONS.length < w)].drop_duplicates(subset=['Interpro Short Description'], keep='first')
|
|
396
|
-
# return merge_overlapping_regions(temp)
|
|
397
|
-
#
|
|
398
|
-
#
|
geney/benchmark_clinvar.py
DELETED
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from sklearn.metrics import roc_curve, precision_recall_curve
|
|
3
|
-
import matplotlib.pyplot as plt
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import subprocess
|
|
7
|
-
|
|
8
|
-
from geney import config_setup
|
|
9
|
-
from geney.utils import download_and_gunzip
|
|
10
|
-
from geney.oncosplice import oncosplice_reduced
|
|
11
|
-
|
|
12
|
-
def download_and_parse_clinvar(target_path=None):
    """
    Downloads the GRCh38 ClinVar VCF via download_and_gunzip.

    Parameters:
    target_path: Destination handed to download_and_gunzip. It was an
        undefined name before, so it is now an explicit argument.

    Returns:
    The value returned by download_and_gunzip.

    Raises:
    ValueError: If no target_path is supplied.
    """
    if target_path is None:
        # NOTE(review): no default destination can be inferred from this
        # file -- confirm whether one should come from config_setup.
        raise ValueError('target_path is required: pass the destination for the ClinVar download.')
    url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
    local_file = download_and_gunzip(url, target_path)
    return local_file
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def aggregate_clinvar_results(benchmark_path, aggregate_mode=False, benchmark_feature=None, local_clinvar_df='/tamir2/nicolaslynn/data/ClinVar/clinvar_compact.csv'):
    """
    Combines benchmark CSVs, joins ClinVar labels and prints correlations.

    Parameters:
    benchmark_path (str | Path): Directory whose '*.csv' files are concatenated.
    aggregate_mode (bool): When False, rows are first filtered on the
        'cons_available' and 'primary_transcript' columns.
    benchmark_feature: Currently unused.
    local_clinvar_df (str): CSV merged onto the results on 'mut_id'; it must
        provide a 'clinsig' column.

    Returns:
    pd.DataFrame: Merged data with a 'clinsig_val' column (Benign=0,
    Pathogenic=1) and a '<col>_abs' column for every numeric column that
    contains negative values.

    Raises:
    ValueError: If the directory holds no CSV files.
    KeyError: If 'clinsig' holds a label other than 'Benign'/'Pathogenic'.
    """
    # Sorted for a deterministic row order; fail with a readable message
    # instead of pandas' bare "No objects to concatenate".
    csv_files = sorted(Path(benchmark_path).glob('*.csv'))
    if not csv_files:
        raise ValueError(f'No benchmark CSV files found in {benchmark_path}')
    data = pd.concat([pd.read_csv(file) for file in csv_files])
    if not aggregate_mode:
        data = data[(data.cons_available) & (data.primary_transcript)]

    data = oncosplice_reduced(data)
    data = data.loc[:, ~data.columns.duplicated()]
    data = pd.merge(data, pd.read_csv(local_clinvar_df), on='mut_id')

    # Vectorized label mapping; unknown labels still raise KeyError, as the
    # per-row dictionary lookup did.
    clinsig_map = {'Benign': 0, 'Pathogenic': 1}
    unknown = [label for label in data['clinsig'].unique() if label not in clinsig_map]
    if unknown:
        raise KeyError(f'Unsupported clinsig labels: {unknown}')
    data['clinsig_val'] = data['clinsig'].map(clinsig_map)

    # Add absolute-value companions for signed numeric features only.
    for c in data.select_dtypes(include='number').columns:
        if data[c].min() < 0:
            data[f'{c}_abs'] = data[c].abs()

    print(data.corr(numeric_only=True))
    # numeric_only keeps text columns out of corrwith, matching corr above.
    print(data.corrwith(data['clinsig_val'], method='spearman', numeric_only=True))
    print(data.corrwith(data['clinsig_val'], method='pearson', numeric_only=True))
    return data
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def plot_performance(true_values, predictions):
    """
    Plots ROC, precision-recall and threshold diagnostics for a score.

    Parameters:
    true_values (iterable of str): Labels, each 'Benign' or 'Pathogenic'.
    predictions (iterable of float): Scores; min-max scaled before use.
    """
    clinsig_map = {'Benign': 0, 'Pathogenic': 1}
    true_values = [clinsig_map[t] for t in true_values]

    # Min-max scale inline: scale_predictions only exists as a method of
    # ClinVarBenchmark, so the bare call raised NameError.
    scores = pd.Series(list(predictions), dtype=float)
    lowest = scores.min()
    span = scores.max() - lowest
    # A constant score has no spread: use zeros rather than divide by zero.
    scaled = (scores - lowest) / span if span > 0 else scores * 0.0
    predictions = scaled.to_numpy()

    fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)

    # Calculate Precision-Recall curve
    precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)

    # Plotting ROC curve
    plt.figure(figsize=(20, 5))

    plt.subplot(1, 4, 1)
    plt.plot(fpr, tpr)
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # Plotting Precision-Recall curve
    plt.subplot(1, 4, 2)
    plt.plot(recall, precision)
    plt.title('Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    # Plotting Precision vs. Thresholds
    plt.subplot(1, 4, 3)
    plt.plot(thresholds_pr, precision[:-1])  # Precision and thresholds have off-by-one lengths
    plt.title('Precision vs. Threshold')
    plt.xlabel('Threshold')
    plt.ylabel('Precision')

    # Plotting Sample Percentage Captured vs. Thresholds
    plt.subplot(1, 4, 4)
    # roc_curve's first threshold is a sentinel above every score (infinite
    # in recent scikit-learn); drop it so it does not distort the x-axis.
    # NOTE(review): 'tpr' stands in for the cumulative percentage here --
    # confirm it is the intended metric.
    plt.plot(thresholds_roc[1:], tpr[1:])
    plt.title('Cumulative Percentage vs. Threshold')
    plt.xlabel('Threshold')
    plt.ylabel('Cumulative Percentage of Population')

    plt.tight_layout()
    plt.show()
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
class ClinVarBenchmark:
    """Wraps a dataframe with a 'clinsig' column and plots score performance."""

    def __init__(self, df):
        """Stores df after checking that it has a 'clinsig' column."""
        assert 'clinsig' in df.columns, 'No clinsig column found in dataframe.'
        self.df = df

    def scale_predictions(self, p):
        """
        Min-max scales p to the range [0, 1].

        Accepts any iterable of numbers; plain lists do not support the
        arithmetic the original applied, so they are converted to an array.
        A pandas Series is returned as a Series, anything else as an array.
        A constant input scales to all zeros instead of dividing by zero.
        """
        values = p if isinstance(p, pd.Series) else pd.Series(list(p), dtype=float).to_numpy()
        min_val = values.min()
        span = values.max() - min_val
        if span == 0:
            return values * 0.0
        return (values - min_val) / span

    def plot_performance(self, true_values, predictions):
        """
        Plots ROC, precision-recall and threshold diagnostics for a score.

        Parameters:
        true_values (iterable of str): Labels, each 'Benign' or 'Pathogenic'.
        predictions (iterable of float): Scores; min-max scaled before use.
        """
        clinsig_map = {'Benign': 0, 'Pathogenic': 1}
        # Map the labels into true_values. The original assigned this to
        # `predictions`, discarding the scores and scoring labels on themselves.
        true_values = [clinsig_map[t] for t in true_values]
        predictions = self.scale_predictions(predictions)

        fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)

        # Calculate Precision-Recall curve
        precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)

        # Plotting ROC curve
        plt.figure(figsize=(20, 5))

        plt.subplot(1, 4, 1)
        plt.plot(fpr, tpr)
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        # Plotting Precision-Recall curve
        plt.subplot(1, 4, 2)
        plt.plot(recall, precision)
        plt.title('Precision-Recall Curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')

        # Plotting Precision vs. Thresholds
        plt.subplot(1, 4, 3)
        plt.plot(thresholds_pr, precision[:-1])  # Precision and thresholds have off-by-one lengths
        plt.title('Precision vs. Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Precision')

        # Plotting Sample Percentage Captured vs. Thresholds
        plt.subplot(1, 4, 4)
        # NOTE(review): 'tpr' stands in for the cumulative percentage here --
        # confirm it is the intended metric.
        plt.plot(thresholds_roc, tpr)
        plt.title('Cumulative Percentage vs. Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Cumulative Percentage of Population')

        plt.tight_layout()
        plt.show()
        return None

    def report(self, feature):
        """Placeholder; not implemented."""
        pass

    def find_ppv_threshold(self, feature, ppv_threshold=0.95):
        """Placeholder; not implemented."""
        pass
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if __name__ == '__main__':
    import sys

    # Results go to a dated folder under the configured ONCOSPLICE directory.
    now = datetime.now()
    benchmark_path = config_setup['ONCOSPLICE'] / f'clinvar_benchmark_{now.strftime("%m_%d_%Y")}'
    print(f"Saving benchmark results to {benchmark_path}")
    benchmark_path.mkdir(parents=True, exist_ok=True)
    # sys.executable runs the job with this interpreter rather than whatever
    # 'python' is first on PATH; check=True surfaces a failed run instead of
    # silently ignoring its exit status.
    subprocess.run([sys.executable, '-m', 'geney.pipelines.power_utils', '-i',
                    '/tamir2/nicolaslynn/data/ClinVar/clinvar_oncosplice_input.txt', '-r', str(benchmark_path),
                    '-n', '10', '-m', '5GB'], check=True)
|
|
158
|
-
|
geney/compare_sets.py
DELETED
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import numpy as np
|
|
3
|
-
from sklearn.metrics import precision_score, recall_score, accuracy_score
|
|
4
|
-
from sklearn.metrics import roc_auc_score, roc_curve
|
|
5
|
-
import matplotlib.pyplot as plt
|
|
6
|
-
|
|
7
|
-
def plot_auc_curve(y_true, y_pred_proba):
    """
    Draws the ROC curve with a chance diagonal and reports the AUC.

    Args:
        y_true (array-like): True labels (0 or 1).
        y_pred_proba (array-like): Predicted probabilities for positive class.

    Returns:
        The value computed by roc_auc_score, also shown in the legend.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    auc_value = roc_auc_score(y_true, y_pred_proba)
    curve_label = f"AUC = {auc_value:.2f}"

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()
    return auc_value
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def optimal_ppv(dataframe, feature_name, plot=False):
    """
    Finds the feature threshold with the highest positive predictive value (PPV).

    Candidate thresholds are the quantile bin edges of the feature (up to
    100 bins, duplicate edges dropped). A row is predicted positive when its
    feature value is >= the threshold.

    Args:
        dataframe (pd.DataFrame): Input dataframe with a binary 'target' column.
        feature_name (str): Name of the feature column.
        plot (bool): When True, scatter PPV against threshold.

    Returns:
        tuple: (best PPV, threshold at which it occurs).
    """
    feature = dataframe[feature_name]
    # retbins=True returns the numeric bin edges. Without it qcut returns a
    # per-row Interval category, which cannot be compared with `>=`.
    _, threshold_values = pd.qcut(feature, 100, retbins=True, duplicates='drop')

    # zero_division=0 scores a threshold with no predicted positives as 0
    # rather than emitting an undefined-metric warning.
    ppv_values = [
        precision_score(dataframe['target'], (feature >= threshold).astype(int), zero_division=0)
        for threshold in threshold_values
    ]

    best = int(np.argmax(ppv_values))
    optimal_threshold = threshold_values[best]
    best_ppv = ppv_values[best]  # renamed: the original local shadowed the function name
    if plot:
        plt.figure(figsize=(8, 6))
        plt.scatter(threshold_values, ppv_values)
        plt.xlabel("Threshold")
        plt.ylabel("Positive Predictive Value (PPV)")
        plt.title("Optimal Positive Predictive Value (PPV)")
        plt.show()

    return best_ppv, optimal_threshold
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def measure_prediction_quality(prediction_vector, quality_vector):
    """
    Placeholder for scoring predictions against quality_vector.

    Not implemented: both arguments are ignored and None is returned.
    """
    return None
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def create_ppv_vector(prediction_vector, true_value_vector):
    """
    Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.

    Predictions are grouped into up to 100 quantile bins. For each occupied
    bin, the PPV is the mean of the true values over all rows in that bin or
    a higher one, i.e. the rows kept when thresholding at that bin.

    NOTE(review): the original body stopped mid-statement; the per-bin
    computation and the return value were completed from the docstring --
    confirm they match the intended design.

    Returns:
        pd.Series: PPV per occupied bin, indexed by bin number in ascending order.
    """
    df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
    if df.empty:
        return pd.Series(dtype=float, name='ppv')

    df = df.sort_values('prediction', ascending=True)
    # labels=False yields integer bin numbers. duplicates must be 'raise' or
    # 'drop' (True is invalid), and retbins is not requested because its
    # tuple result cannot be stored in a column.
    df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates='drop')

    ppv = {}
    for bin_id in sorted(df['bin'].dropna().unique()):
        retained = df[df['bin'] >= bin_id]
        ppv[int(bin_id)] = float(retained['true_value'].mean())
    return pd.Series(ppv, dtype=float, name='ppv')
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def group_retention(predictions, predictor):
    """
    Computes the share of rows retained at each quantile threshold of a predictor.

    A row is retained at a threshold when its predictor value is >= that
    threshold. Thresholds are the quantile bin edges of the predictor (up to
    100 bins, duplicate edges dropped). The input frame is not modified.

    NOTE(review): the original loop had no body; the retention ratio was
    implemented from the leading comment -- confirm it matches the intent.

    Args:
        predictions (pd.DataFrame): Frame containing the predictor column.
        predictor (str): Name of the column to threshold on.

    Returns:
        list: (threshold, retained_ratio) tuples in ascending threshold order;
        empty when the predictor has no non-missing values.
    """
    values = predictions[predictor].dropna()
    if values.empty:
        return []

    # retbins=True is required to get the bin edges back as a second value.
    _, thresholds = pd.qcut(values, 100, retbins=True, duplicates='drop')
    # first i need to get the ratio of values that are retained at particular values
    tracker = [(float(th), float((values >= th).mean())) for th in thresholds]
    return tracker
|
|
90
|
-
|
|
91
|
-
|
geney/data_parsers/__init__.py
DELETED
|
File without changes
|