geney 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

Files changed (38) hide show
  1. {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/METADATA +1 -1
  2. geney-1.2.21.dist-info/RECORD +19 -0
  3. geney/Gene.py +0 -258
  4. geney/analyzers/__init__.py +0 -0
  5. geney/analyzers/benchmark_clinvar.py +0 -158
  6. geney/analyzers/characterize_epistasis.py +0 -15
  7. geney/analyzers/compare_sets.py +0 -91
  8. geney/analyzers/group_comparison.py +0 -81
  9. geney/analyzers/survival.py +0 -144
  10. geney/analyzers/tcga_annotations.py +0 -194
  11. geney/analyzers/visualize_protein_conservation.py +0 -398
  12. geney/benchmark_clinvar.py +0 -158
  13. geney/compare_sets.py +0 -91
  14. geney/data_parsers/__init__.py +0 -0
  15. geney/data_parsers/gtex.py +0 -68
  16. geney/gtex.py +0 -68
  17. geney/immunotherapy/__init__.py +0 -0
  18. geney/immunotherapy/netchop.py +0 -78
  19. geney/mutations/__init__.py +0 -0
  20. geney/mutations/variant_utils.py +0 -125
  21. geney/netchop.py +0 -79
  22. geney/oncosplice/__init__.py +0 -0
  23. geney/oncosplice_mouse.py +0 -277
  24. geney/oncosplice_pipeline.py +0 -1588
  25. geney/performance_utils.py +0 -138
  26. geney/pipelines/__init__.py +0 -0
  27. geney/pipelines/dask_utils.py +0 -153
  28. geney/splicing/__init__.py +0 -2
  29. geney/splicing/spliceai_utils.py +0 -253
  30. geney/splicing/splicing_isoform_utils.py +0 -0
  31. geney/splicing/splicing_utils.py +0 -366
  32. geney/survival.py +0 -124
  33. geney/tcga_annotations.py +0 -352
  34. geney/translation_termination/__init__.py +0 -0
  35. geney/translation_termination/tts_utils.py +0 -0
  36. geney-1.2.20.dist-info/RECORD +0 -52
  37. {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/WHEEL +0 -0
  38. {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/top_level.txt +0 -0
@@ -1,398 +0,0 @@
1
-
2
- import pandas as pd
3
- from collections import namedtuple
4
- import re
5
- from geney.oncosplice import oncosplice
6
- from geney.utils import unload_pickle
7
-
8
def create_figure_story(epistasis, to_file=None):
    """
    Plots one conservation figure per annotated isoform of a mutation.

    Parameters:
    epistasis (str): Mutation identifier; the text before the first ':' is
        used as the gene name.
    to_file: Accepted for backward compatibility only. plot_conservation, as
        defined in this module, has no output-file parameter, so the value
        is not forwarded; a warning is emitted when it is set.
    """
    g = epistasis.split(':')[0]
    out = oncosplice(epistasis, annotate=True)
    out = out[out.cons_available == 1]

    if to_file is not None:
        import warnings
        warnings.warn("to_file is ignored: plot_conservation has no file output parameter.")

    for _, row in out.iterrows():
        # Locate the longest deletion; the first one wins on ties.
        pos, max_length = 0, 0
        for i, k in row.deletions.items():
            if len(k) > max_length:
                pos, max_length = i, len(k)

        # Only deletions longer than five entries are highlighted as a region.
        del_reg = [pos, pos + max_length] if max_length > 5 else None
        mutation_loc = None if row.oncosplice_score == 0 else pos

        # NOTE(review): get_annotations only appears in commented-out code in
        # this module; confirm where it is supposed to come from.
        # gene_name is a required argument of plot_conservation and was
        # missing from the original call, which therefore always failed.
        plot_conservation(gene_name=g,
                          tid=row.transcript_id,
                          gene=f'{g}, {row.transcript_id}.{row.isoform}',
                          mutation_loc=mutation_loc,
                          target_region=del_reg, mut_name='Epistasis',
                          domain_annotations=get_annotations(row.transcript_id, 300))
37
-
38
-
39
- import numpy as np
40
- import matplotlib.pyplot as plt
41
- from matplotlib.patches import Rectangle
42
- import seaborn as sns
43
-
44
def plot_conservation(gene_name, tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=None):
    """
    Plots conservation vectors with protein domain visualization and Rate4Site scores.

    Parameters:
    gene_name (str): Name handed to unload_pickle to load the gene's data.
    tid (str): Transcript identifier used to select the transcript entry.
    gene (str): Label appended to the x-axis title.
    mutation_loc (int): Position of the mutation.
    target_region (tuple): Start and end positions of the target region.
    mut_name (str): Name of the mutation.
    domain_annotations (list): List of tuples for domain annotations (start, end, label).
        Defaults to no annotations.

    Raises:
    ValueError: If the conservation vector is missing or empty.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if domain_annotations is None:
        domain_annotations = []

    # Access conservation data. The original looked up the literal key 'tid'
    # and never used the tid argument.
    # NOTE(review): assumes the unpickled gene data is keyed by transcript id - confirm.
    _, cons_vec = unload_pickle(gene_name)[tid]['cons_vector']

    # len() works for lists and arrays alike; plain truthiness is ambiguous
    # for multi-element arrays.
    if cons_vec is None or len(cons_vec) == 0:
        raise ValueError("The conservation vector is empty.")

    sns.set_theme(style="white")
    fig, ax = plt.subplots(figsize=(max(15, len(cons_vec)/10), 3))  # Dynamic figure size

    # Plotting the conservation vectors in the main plot
    plot_conservation_vectors(ax, cons_vec)

    # Setting up primary axis for the main plot
    setup_primary_axis(ax, gene, len(cons_vec))

    # Create a separate axes for protein domain visualization
    domain_ax = create_domain_axes(fig, len(cons_vec))

    # Draw protein domains
    plot_protein_domains(domain_ax, domain_annotations, len(cons_vec))

    # Plotting Rate4Site scores on secondary y-axis
    plot_rate4site_scores(ax, cons_vec)

    # Plotting mutation location and target region, if provided
    plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name)

    plt.show()
84
-
85
def plot_conservation_vectors(ax, cons_vec):
    """Plots two max-normalized transforms of the conservation vector on ax."""
    curve_specs = (
        (76, 'b', 'Estimated Functional Residues'),  # Larger window
        (6, 'k', 'Estimated Functional Domains'),    # Smaller window
    )
    for window, colour, legend_label in curve_specs:
        curve = transform_conservation_vector(cons_vec, window)
        curve /= max(curve)  # scale in place so the peak equals 1
        positions = list(range(len(curve)))
        ax.plot(positions, curve, c=colour, label=legend_label)
94
-
95
def setup_primary_axis(ax, gene, length):
    """Labels and bounds the main axis and hides its right and top spines."""
    x_label = f'AA Position - {gene}'
    ax.set_xlabel(x_label, weight='bold')
    ax.set_xlim(0, length)
    ax.set_ylim(0, 1)
    ax.set_ylabel('Relative Importance', weight='bold')
    ax.tick_params(axis='y')
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
104
-
105
def create_domain_axes(fig, length):
    """Adds and returns a tick-less, spine-less axes strip for the domain diagram."""
    # [left, bottom, width, height] in figure coordinates.
    strip_rect = [0.125, 0.95, 0.775, 0.06]
    strip = fig.add_axes(strip_rect)
    strip.set_xlim(0, length)
    strip.set_xticks([])
    strip.set_yticks([])
    for border in strip.spines.values():
        border.set_visible(False)
    return strip
115
-
116
def plot_protein_domains(ax, domain_annotations, length):
    """Draws the full-length protein bar and overlays each (start, end, label) domain."""
    bar_height = 0.9
    backbone = Rectangle((0, 0), length, bar_height, facecolor='lightgray', edgecolor='none')
    ax.add_patch(backbone)
    for start, end, label in domain_annotations:
        box = Rectangle((start, 0), end - start, bar_height,
                        facecolor='orange', edgecolor='none', alpha=0.5)
        ax.add_patch(box)
        # The label is centred horizontally on the domain, above the bar.
        midpoint = (start + end) / 2
        ax.text(midpoint, 2.1, label, ha='center', va='center', color='black', size=8)
123
-
124
def plot_rate4site_scores(ax, cons_vec):
    """Scatters shifted and max-scaled conservation scores on a twin y-axis of ax."""
    twin = ax.twinx()
    raw = np.array(cons_vec)
    shifted = raw + abs(min(raw))      # shift by the magnitude of the minimum
    scaled = shifted / max(shifted)    # divide by the shifted maximum
    twin.set_ylim(min(scaled), max(scaled) * 1.1)
    xs = list(range(len(scaled)))
    twin.scatter(xs, scaled, color='green', label='Rate4Site Scores', alpha=0.4)
    twin.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
    twin.tick_params(axis='y', labelcolor='green')
    twin.spines['right'].set_visible(True)
    twin.spines['top'].set_visible(False)
136
-
137
def plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name):
    """Marks the mutation with a dashed line and shades the target region, when given."""
    label_y = 1.04  # text sits just above the top of the 0-1 data range

    if mutation_loc is not None:
        ax.axvline(x=mutation_loc, ymax=1, color='r', linestyle='--', alpha=0.7)
        ax.text(mutation_loc, label_y, mut_name, color='r', weight='bold', ha='center')

    if target_region is not None:
        region_start = target_region[0]
        region_width = target_region[1] - target_region[0]
        shade = Rectangle((region_start, 0), region_width, 1, alpha=0.25, facecolor='gray')
        ax.add_patch(shade)
        center_loc = region_start + 0.5 * region_width
        ax.text(center_loc, label_y, 'Deleted Region', ha='center', va='center',
                color='gray', weight='bold')
147
-
148
-
149
- # def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[], to_file=None):
150
- # _, cons_vec = access_conservation_data(tid)
151
- #
152
- # sns.set_theme(style="white")
153
- # fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
154
- #
155
- # # Plotting the conservation vectors in the main plot
156
- # temp = transform_conservation_vector(cons_vec, 76)
157
- # temp /= max(temp)
158
- # ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
159
- # temp = transform_conservation_vector(cons_vec, 6)
160
- # temp /= max(temp)
161
- # ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
162
- #
163
- # # Setting up primary axis for the main plot
164
- # ax.set_xlabel(f'AA Position - {gene}', weight='bold')
165
- # ax.set_xlim(0, len(cons_vec))
166
- # ax.set_ylim(0, 1) # Set y-limit to end at 1
167
- # ax.set_ylabel('Relative Importance', weight='bold')
168
- # ax.tick_params(axis='y')
169
- # ax.spines['right'].set_visible(False)
170
- # ax.spines['top'].set_visible(False)
171
- #
172
- # # Create a separate axes for protein domain visualization above the main plot
173
- # domain_ax_height = 0.06 # Adjust for thinner protein diagram
174
- # domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
175
- # domain_ax.set_xlim(0, len(cons_vec))
176
- # domain_ax.set_xticks([])
177
- # domain_ax.set_yticks([])
178
- # domain_ax.spines['top'].set_visible(False)
179
- # domain_ax.spines['right'].set_visible(False)
180
- # domain_ax.spines['left'].set_visible(False)
181
- # domain_ax.spines['bottom'].set_visible(False)
182
- #
183
- # # Draw the full-length protein as a base rectangle
184
- # domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
185
- #
186
- # # Overlay domain annotations
187
- # for domain in domain_annotations:
188
- # start, end, label = domain
189
- # domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
190
- # domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
191
- #
192
- # # Plotting Rate4Site scores on secondary y-axis
193
- # ax2 = ax.twinx()
194
- # c = np.array(cons_vec)
195
- # c = c + abs(min(c))
196
- # c = c/max(c)
197
- # ax2.set_ylim(min(c), max(c)*1.1)
198
- # ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
199
- # ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
200
- # ax2.tick_params(axis='y', labelcolor='green')
201
- # ax2.spines['right'].set_visible(True)
202
- # ax2.spines['top'].set_visible(False)
203
- #
204
- # # Plotting mutation location and target region
205
- # if mutation_loc is not None:
206
- # ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
207
- # ax.text(mutation_loc, 1.04, mut_name, color='r', weight='bold', ha='center')
208
- #
209
- # if target_region is not None:
210
- # ax.add_patch(Rectangle((target_region[0], 0), target_region[1] - target_region[0], 1, alpha=0.25, facecolor='gray'))
211
- # center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
212
- # ax.text(center_loc, 1.04, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
213
- #
214
- # plt.show()
215
- #
216
-
217
def merge_overlapping_regions(df):
    """
    Collapses overlapping rows of a DataFrame into merged regions.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'start', 'end', 'name'

    Returns:
    List: Region namedtuples (start, end, combined_name); combined_name holds
    the comma-joined names (underscores replaced by spaces), passed through
    split_text with a width of 35.
    """
    if df.empty:
        return []

    Region = namedtuple('Region', ['start', 'end', 'combined_name'])

    # Each group is a mutable [start, end, names] triple; rows are visited in
    # ascending 'start' order so only the most recent group can overlap.
    groups = []
    for _, record in df.sort_values(by='start').iterrows():
        begin, finish = record['start'], record['end']
        label = record['name'].replace('_', ' ')
        if groups and begin <= groups[-1][1]:
            latest = groups[-1]
            latest[1] = max(latest[1], finish)
            latest[2].append(label)
        else:
            groups.append([begin, finish, [label]])

    return [Region(begin, finish, split_text(', '.join(labels), 35))
            for begin, finish, labels in groups]
251
-
252
-
253
def split_text(text, width):
    """
    Splits a text into lines with a maximum specified width.

    Chunks are cut at fixed character counts, not at word boundaries. Newline
    characters already present in the text act as chunk boundaries and are
    not copied into the chunks, because '.' does not match a newline.

    Parameters:
    text (str): The text to be split.
    width (int): Maximum width of each line; must be at least 1.

    Returns:
    str: The text split into lines of specified width.

    Raises:
    ValueError: If width is smaller than 1. Without this check a width of 0
        surfaces as an opaque regex error and a negative width silently
        yields an empty string.
    """
    if width < 1:
        raise ValueError(f"width must be at least 1, got {width!r}")
    lines = re.findall('.{1,' + str(width) + '}', text)
    return '\n'.join(lines)
266
-
267
-
268
-
269
- ###
270
-
271
-
272
-
273
- # def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[]):
274
- # _, cons_vec = access_conservation_data(tid)
275
- #
276
- # sns.set_theme(style="white")
277
- # fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
278
- #
279
- # # Plotting the conservation vectors in the main plot
280
- # temp = transform_conservation_vector(cons_vec, 76)
281
- # temp /= max(temp)
282
- # ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
283
- # temp = transform_conservation_vector(cons_vec, 7)
284
- # temp /= max(temp)
285
- # ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
286
- #
287
- # # Setting up primary axis for the main plot
288
- # ax.set_xlabel(f'AA Position - {gene}', weight='bold')
289
- # ax.set_xlim(0, len(cons_vec))
290
- # ax.set_ylim(0, 1) # Set y-limit to end at 1
291
- # ax.set_ylabel('Relative Importance', weight='bold')
292
- # ax.tick_params(axis='y')
293
- # ax.spines['right'].set_visible(False)
294
- # ax.spines['top'].set_visible(False)
295
- #
296
- # # Create a separate axes for protein domain visualization above the main plot
297
- # domain_ax_height = 0.06 # Adjust for thinner protein diagram
298
- # domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
299
- # domain_ax.set_xlim(0, len(cons_vec))
300
- # domain_ax.set_xticks([])
301
- # domain_ax.set_yticks([])
302
- # domain_ax.spines['top'].set_visible(False)
303
- # domain_ax.spines['right'].set_visible(False)
304
- # domain_ax.spines['left'].set_visible(False)
305
- # domain_ax.spines['bottom'].set_visible(False)
306
- #
307
- # # Draw the full-length protein as a base rectangle
308
- # domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
309
- #
310
- # # Overlay domain annotations
311
- # for domain in domain_annotations:
312
- # start, end, label = domain
313
- # domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
314
- # domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
315
- #
316
- # # Plotting Rate4Site scores on secondary y-axis
317
- # ax2 = ax.twinx()
318
- # c = np.array(cons_vec)
319
- # c = c + abs(min(c))
320
- # c = c/max(c)
321
- # ax2.set_ylim(min(c), max(c)*1.1)
322
- # ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
323
- # ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
324
- # ax2.tick_params(axis='y', labelcolor='green')
325
- # ax2.spines['right'].set_visible(True)
326
- # ax2.spines['top'].set_visible(False)
327
- #
328
- # # Plotting mutation location and target region
329
- # if mutation_loc is not None:
330
- # ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
331
- # ax.text(mutation_loc, 1.01, mut_name, color='r', weight='bold', ha='center')
332
- #
333
- # if target_region is not None:
334
- # ax.add_patch(Rectangle((target_region[0], 0.85), target_region[1] - target_region[0], 0.05, alpha=0.3, facecolor='blue'))
335
- # center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
336
- # ax.text(center_loc, 0.875, 'Target Region', ha='center', va='center', color='blue', weight='bold')
337
- #
338
- # plt.show()
339
- #
340
- #
341
- # def merge_overlapping_regions(df):
342
- # # Sort the DataFrame by the 'start' column
343
- # df = df.sort_values(by='start')
344
- #
345
- # merged_regions = [] # List to store merged regions as tuples (start, end, combined_name)
346
- #
347
- # current_start = None
348
- # current_end = None
349
- # combined_names = [] # List to store names of overlapping regions
350
- #
351
- # for index, row in df.iterrows():
352
- # start = row['start']
353
- # end = row['end']
354
- # name = row['name'].replace('_', ' ')
355
- #
356
- # if current_start is None:
357
- # # Initialize the current region
358
- # current_start = start
359
- # current_end = end
360
- # combined_names.append(name)
361
- # else:
362
- # if start <= current_end:
363
- # # Regions overlap, update the current region and add the name to combined_names
364
- # current_end = max(current_end, end)
365
- # combined_names.append(name)
366
- # else:
367
- # # Regions don't overlap, add the current region to the result with combined names
368
- # combined_name = ', '.join(combined_names)
369
- # merged_regions.append((current_start, current_end, combined_name))
370
- # # Start a new current region with the current row
371
- # current_start = start
372
- # current_end = end
373
- # combined_names = [name]
374
- #
375
- # # Add the last current region to the result
376
- # if current_start is not None:
377
- # combined_name = ', '.join(combined_names)
378
- # merged_regions.append((current_start, current_end, combined_name))
379
- #
380
- # merged_regions = [(a, b, split_text(c, 35)) for a, b, c in merged_regions]
381
- # return merged_regions
382
- #
383
- # def split_text(text, width):
384
- # lines = []
385
- # while text:
386
- # # Find the index to split at or take the whole text if it's shorter than the width
387
- # split_index = min(len(text), width)
388
- # # Append the substring up to the split index to the lines list
389
- # lines.append(text[:split_index])
390
- # # Remove the processed substring from the original text
391
- # text = text[split_index:]
392
- # return '\n'.join(lines)
393
- #
394
- # def get_annotations(target_gene, w=500):
395
- # temp = PROTEIN_ANNOTATIONS[(PROTEIN_ANNOTATIONS['Transcript stable ID'] == PROTEIN_ANNOTATIONS[target_gene]) & (PROTEIN_ANNOTATIONS.length < w)].drop_duplicates(subset=['Interpro Short Description'], keep='first')
396
- # return merge_overlapping_regions(temp)
397
- #
398
- #
@@ -1,158 +0,0 @@
1
- import pandas as pd
2
- from sklearn.metrics import roc_curve, precision_recall_curve
3
- import matplotlib.pyplot as plt
4
- from datetime import datetime
5
- from pathlib import Path
6
- import subprocess
7
-
8
- from geney import config_setup
9
- from geney.utils import download_and_gunzip
10
- from geney.oncosplice import oncosplice_reduced
11
-
12
def download_and_parse_clinvar(target_path=None):
    """
    Downloads the GRCh38 ClinVar VCF via download_and_gunzip.

    Parameters:
    target_path: Destination handed to download_and_gunzip as its second
        argument. The original body referenced an undefined target_path name
        and could never run, so the destination is now an explicit argument.

    Returns:
    Whatever download_and_gunzip returns (stored as local_file).

    Raises:
    ValueError: If target_path is not supplied.
    """
    if target_path is None:
        raise ValueError("target_path is required: no download destination was given.")
    url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
    local_file = download_and_gunzip(url, target_path)
    return local_file
16
-
17
-
18
def aggregate_clinvar_results(benchmark_path, aggregate_mode=False, benchmark_feature=None, local_clinvar_df='/tamir2/nicolaslynn/data/ClinVar/clinvar_compact.csv'):
    """
    Combines per-run benchmark CSVs with ClinVar labels and prints correlations.

    Parameters:
    benchmark_path: Directory whose '*.csv' files are concatenated.
    aggregate_mode (bool): When False, only rows flagged both cons_available
        and primary_transcript are kept.
    benchmark_feature: Currently unused.
    local_clinvar_df (str): CSV merged onto the results on 'mut_id'; it is
        expected to provide the 'clinsig' column.

    Returns:
    pd.DataFrame: The merged table with 'clinsig_val' (Benign=0, Pathogenic=1)
    and a '<column>_abs' column for every column whose minimum is negative.

    Raises:
    ValueError: If benchmark_path contains no CSV files.
    KeyError: If a clinsig value is neither 'Benign' nor 'Pathogenic'.
    """
    csv_files = list(Path(benchmark_path).glob('*.csv'))
    if not csv_files:
        # pd.concat([]) would raise a ValueError that does not name the cause.
        raise ValueError(f"No CSV files found in {benchmark_path}")

    data = pd.concat([pd.read_csv(file) for file in csv_files])
    if not aggregate_mode:
        data = data[(data.cons_available) & (data.primary_transcript)]

    data = oncosplice_reduced(data)
    data = data.loc[:, ~data.columns.duplicated()]
    data = pd.merge(data, pd.read_csv(local_clinvar_df), on='mut_id')

    # Build the label map once instead of once per row; unknown labels still
    # raise KeyError as before.
    clinsig_map = {'Benign': 0, 'Pathogenic': 1}
    data['clinsig_val'] = [clinsig_map[label] for label in data['clinsig']]

    # Iterate over a snapshot because columns are added inside the loop.
    for c in list(data.columns):
        try:
            if data[c].min() < 0:
                data[f'{c}_abs'] = abs(data[c])
        except TypeError:
            # Columns that cannot be compared with 0 (e.g. text) are skipped.
            pass

    # numeric_only=True keeps corrwith consistent with corr and stops text
    # columns from breaking the correlation.
    print(data.corr(numeric_only=True))
    print(data.corrwith(data['clinsig_val'], method='spearman', numeric_only=True))
    print(data.corrwith(data['clinsig_val'], method='pearson', numeric_only=True))
    return data
38
-
39
-
40
def plot_performance(true_values, predictions):
    """
    Plots ROC, precision-recall and threshold curves for binary predictions.

    Parameters:
    true_values: Iterable of 'Benign' / 'Pathogenic' labels.
    predictions: Iterable of numeric scores; they are min-max scaled to
        [0, 1] before the curves are computed.

    Raises:
    KeyError: If a label is neither 'Benign' nor 'Pathogenic'.
    """
    clinsig_map = {'Benign': 0, 'Pathogenic': 1}
    true_values = [clinsig_map[t] for t in true_values]

    # scale_predictions exists only as a ClinVarBenchmark method, so the
    # original module-level call failed with NameError; scale inline instead.
    predictions = list(predictions)
    low, high = min(predictions), max(predictions)
    span = high - low
    # Constant scores carry no ranking information; map them to 0 rather
    # than dividing by zero.
    predictions = [(p - low) / span if span else 0.0 for p in predictions]

    fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)

    # Calculate Precision-Recall curve
    precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)

    # (x, y, title, x label, y label) for each of the four side-by-side panels.
    panels = [
        (fpr, tpr, 'ROC Curve', 'False Positive Rate', 'True Positive Rate'),
        (recall, precision, 'Precision-Recall Curve', 'Recall', 'Precision'),
        # Precision and thresholds have off-by-one lengths
        (thresholds_pr, precision[:-1], 'Precision vs. Threshold', 'Threshold', 'Precision'),
        # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
        (thresholds_roc, tpr, 'Cumulative Percentage vs. Threshold', 'Threshold',
         'Cumulative Percentage of Population'),
    ]

    plt.figure(figsize=(20, 5))
    for index, (xs, ys, title, x_label, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 4, index)
        plt.plot(xs, ys)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    plt.tight_layout()
    plt.show()
83
-
84
-
85
-
86
class ClinVarBenchmark:
    """Holds a results table with a 'clinsig' column and plots prediction performance."""

    def __init__(self, df):
        """Stores df after checking that it has a 'clinsig' column."""
        assert 'clinsig' in df.columns, 'No clinsig column found in dataframe.'
        self.df = df

    def scale_predictions(self, p):
        """
        Min-max scales p to the range [0, 1].

        Lists and tuples are converted to a float pandas Series first, since
        plain sequences do not support element-wise arithmetic. Other inputs
        are used as given and must support subtraction and division by a
        scalar. Constant input yields all zeros instead of a division by zero.
        """
        if isinstance(p, (list, tuple)):
            p = pd.Series(p, dtype=float)
        max_val = max(p)
        min_val = min(p)
        span = max_val - min_val
        if span == 0:
            return p - min_val
        return (p - min_val) / span

    def plot_performance(self, true_values, predictions):
        """
        Plots ROC, precision-recall and threshold curves.

        Parameters:
        true_values: Iterable of 'Benign' / 'Pathogenic' labels.
        predictions: Numeric scores, min-max scaled before use.

        Returns:
        None
        """
        clinsig_map = {'Benign': 0, 'Pathogenic': 1}
        # The original assigned this mapping to `predictions`, discarding the
        # real scores and passing raw string labels to roc_curve.
        true_values = [clinsig_map[t] for t in true_values]
        predictions = self.scale_predictions(predictions)

        fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)

        # Calculate Precision-Recall curve
        precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)

        # Plotting ROC curve
        plt.figure(figsize=(20, 5))

        plt.subplot(1, 4, 1)
        plt.plot(fpr, tpr)
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        # Plotting Precision-Recall curve
        plt.subplot(1, 4, 2)
        plt.plot(recall, precision)
        plt.title('Precision-Recall Curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')

        # Plotting Precision vs. Thresholds
        plt.subplot(1, 4, 3)
        plt.plot(thresholds_pr, precision[:-1])  # Precision and thresholds have off-by-one lengths
        plt.title('Precision vs. Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Precision')

        # Plotting Sample Percentage Captured vs. Thresholds
        plt.subplot(1, 4, 4)
        # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
        plt.plot(thresholds_roc, tpr)  # Update 'tpr' with the correct metric if necessary
        plt.title('Cumulative Percentage vs. Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Cumulative Percentage of Population')

        plt.tight_layout()
        plt.show()
        return None

    def report(self, feature):
        """Placeholder; not implemented."""
        pass

    def find_ppv_threshold(self, feature, ppv_threshold=0.95):
        """Placeholder; not implemented."""
        pass
147
-
148
-
149
-
150
if __name__ == '__main__':
    # Script entry point: create a dated results directory and launch the
    # benchmark pipeline module in a child process.
    import sys

    now = datetime.now()
    benchmark_path = config_setup['ONCOSPLICE'] / f'clinvar_benchmark_{now.strftime("%m_%d_%Y")}'
    print(f"Saving benchmark results to {benchmark_path}")
    benchmark_path.mkdir(parents=True, exist_ok=True)
    # sys.executable runs the child with the interpreter executing this
    # script; a bare 'python' may resolve to a different installation that
    # does not have geney available.
    subprocess.run([sys.executable, '-m', 'geney.pipelines.power_utils', '-i',
                    '/tamir2/nicolaslynn/data/ClinVar/clinvar_oncosplice_input.txt', '-r', str(benchmark_path),
                    '-n', '10', '-m', '5GB'])
158
-
geney/compare_sets.py DELETED
@@ -1,91 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sklearn.metrics import precision_score, recall_score, accuracy_score
4
- from sklearn.metrics import roc_auc_score, roc_curve
5
- import matplotlib.pyplot as plt
6
-
7
def plot_auc_curve(y_true, y_pred_proba):
    """
    Draws the ROC curve with a chance diagonal and reports the AUC.

    Args:
        y_true (array-like): True labels (0 or 1).
        y_pred_proba (array-like): Predicted probabilities for positive class.

    Returns:
        The value computed by roc_auc_score, also shown in the legend.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    auc_value = roc_auc_score(y_true, y_pred_proba)
    legend_text = f"AUC = {auc_value:.2f}"

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()
    return auc_value
30
-
31
-
32
def optimal_ppv(dataframe, feature_name, plot=False):
    """
    Finds the threshold on a feature that maximizes positive predictive value.

    Candidate thresholds are the percentile bin edges of the feature. For
    each one, rows with feature >= threshold are predicted positive and
    scored against the 'target' column with precision_score.

    Args:
        dataframe (pd.DataFrame): Input dataframe; must contain a binary
            'target' column (0 or 1).
        feature_name (str): Name of the feature column.
        plot (bool): When True, scatter PPV against threshold.

    Returns:
        tuple: (optimal PPV, threshold at which it is reached).
    """
    feature = dataframe[feature_name]
    # retbins=True returns the numeric bin edges. Without it qcut returns one
    # Interval category per row, which cannot be compared with the feature.
    _, threshold_values = pd.qcut(feature, 100, retbins=True, duplicates='drop')

    ppv_values = [
        precision_score(dataframe['target'], (feature >= threshold).astype(int))
        for threshold in threshold_values
    ]

    best_index = int(np.argmax(ppv_values))
    best_threshold = threshold_values[best_index]
    best_ppv = ppv_values[best_index]
    if plot:
        plt.figure(figsize=(8, 6))
        plt.scatter(threshold_values, ppv_values)
        plt.xlabel("Threshold")
        plt.ylabel("Positive Predictive Value (PPV)")
        plt.title("Optimal Positive Predictive Value (PPV)")
        plt.show()

    return best_ppv, best_threshold
63
-
64
-
65
def measure_prediction_quality(prediction_vector, quality_vector):
    """
    Placeholder for scoring prediction_vector against quality_vector.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None
70
-
71
-
72
-
73
def create_ppv_vector(prediction_vector, true_value_vector):
    """
    Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.

    Predictions are grouped into up to 100 quantile bins. For each bin, the
    PPV is the mean of the true values over all samples in that bin or any
    higher bin.

    NOTE(review): the original body stopped mid-expression; this completion
    assumes true values are binary (0 or 1) - confirm the intended output.

    Returns:
        pd.Series: PPV indexed by bin number, ascending; empty for empty input.
    """
    df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
    if df.empty:
        return pd.Series(dtype=float, name='ppv')

    df = df.sort_values('prediction', ascending=True)
    # labels=False yields integer bin numbers; duplicates takes 'drop' (not
    # True), and retbins is omitted so the result is a single column.
    df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates='drop')

    ppv_by_bin = {
        bin_id: df.loc[df['bin'] >= bin_id, 'true_value'].mean()
        for bin_id in sorted(df['bin'].unique())
    }
    return pd.Series(ppv_by_bin, name='ppv')
82
-
83
-
84
def group_retention(predictions, predictor):
    """
    Computes the fraction of rows retained at each percentile threshold.

    For every percentile bin edge of predictions[predictor], the retained
    fraction is the share of rows whose value is at least that edge.

    NOTE(review): the original loop had no body; this completion follows its
    leading comment ("ratio of values that are retained at particular
    values") - confirm the intended output. Unlike the original draft, the
    caller's DataFrame is not sorted in place.

    Args:
        predictions (pd.DataFrame): Table containing the predictor column.
        predictor (str): Name of the column to threshold.

    Returns:
        list: (threshold, retained_fraction) tuples in ascending threshold
        order; empty for empty input.
    """
    values = predictions[predictor].sort_values()
    if values.empty:
        return []

    # retbins=True is what makes qcut return the (categories, edges) pair.
    _, thresholds = pd.qcut(values, 100, retbins=True, duplicates='drop')
    total = len(values)
    tracker = []
    for th in thresholds:
        tracker.append((th, (values >= th).sum() / total))
    return tracker
90
-
91
-
File without changes