geney 1.2.19__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

Files changed (40)
  1. geney/__init__.py +1 -1
  2. geney/oncosplice.py +1 -3
  3. {geney-1.2.19.dist-info → geney-1.2.21.dist-info}/METADATA +1 -1
  4. geney-1.2.21.dist-info/RECORD +19 -0
  5. geney/Gene.py +0 -258
  6. geney/analyzers/__init__.py +0 -0
  7. geney/analyzers/benchmark_clinvar.py +0 -158
  8. geney/analyzers/characterize_epistasis.py +0 -15
  9. geney/analyzers/compare_sets.py +0 -91
  10. geney/analyzers/group_comparison.py +0 -81
  11. geney/analyzers/survival.py +0 -144
  12. geney/analyzers/tcga_annotations.py +0 -194
  13. geney/analyzers/visualize_protein_conservation.py +0 -398
  14. geney/benchmark_clinvar.py +0 -158
  15. geney/compare_sets.py +0 -91
  16. geney/data_parsers/__init__.py +0 -0
  17. geney/data_parsers/gtex.py +0 -68
  18. geney/gtex.py +0 -68
  19. geney/immunotherapy/__init__.py +0 -0
  20. geney/immunotherapy/netchop.py +0 -78
  21. geney/mutations/__init__.py +0 -0
  22. geney/mutations/variant_utils.py +0 -125
  23. geney/netchop.py +0 -79
  24. geney/oncosplice/__init__.py +0 -0
  25. geney/oncosplice_mouse.py +0 -277
  26. geney/oncosplice_pipeline.py +0 -1588
  27. geney/performance_utils.py +0 -138
  28. geney/pipelines/__init__.py +0 -0
  29. geney/pipelines/dask_utils.py +0 -153
  30. geney/splicing/__init__.py +0 -2
  31. geney/splicing/spliceai_utils.py +0 -253
  32. geney/splicing/splicing_isoform_utils.py +0 -0
  33. geney/splicing/splicing_utils.py +0 -366
  34. geney/survival.py +0 -124
  35. geney/tcga_annotations.py +0 -352
  36. geney/translation_termination/__init__.py +0 -0
  37. geney/translation_termination/tts_utils.py +0 -0
  38. geney-1.2.19.dist-info/RECORD +0 -52
  39. {geney-1.2.19.dist-info → geney-1.2.21.dist-info}/WHEEL +0 -0
  40. {geney-1.2.19.dist-info → geney-1.2.21.dist-info}/top_level.txt +0 -0
geney/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from .config_setup import get_config
2
2
  config_setup = get_config()
3
-
3
+ print("Hello Geney.")
4
4
  # import os
5
5
  # import json
6
6
  # from pathlib import Path
geney/oncosplice.py CHANGED
@@ -12,8 +12,7 @@ import matplotlib.pyplot as plt
12
12
  from matplotlib.patches import Rectangle
13
13
  import seaborn as sns
14
14
  from collections import namedtuple
15
-
16
-
15
+ print('hellp')
17
16
  from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle, contains, unload_json, dump_json #, is_monotonic
18
17
  from geney.Fasta_segment import Fasta_segment
19
18
 
@@ -29,7 +28,6 @@ tf.config.threading.set_inter_op_parallelism_threads(1)
29
28
  sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
30
29
  sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
31
30
 
32
-
33
31
  # Load models
34
32
  import torch
35
33
  from pkg_resources import resource_filename
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.19
3
+ Version: 1.2.21
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -0,0 +1,19 @@
1
+ geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
2
+ geney/__init__.py,sha256=knezxgbV2c2gcO2ek2-xxEC15HL4aO1WuoMiYOOvKf8,428
3
+ geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
4
+ geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
5
+ geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
6
+ geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
7
+ geney/oncosplice.py,sha256=sp6kfKbFqwpZIuLZadvCq0aj-JUnM_GE99eaGRm19eY,78240
8
+ geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
9
+ geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
10
+ geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
11
+ geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
12
+ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
14
+ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
15
+ geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
16
+ geney-1.2.21.dist-info/METADATA,sha256=PfL1XAeWg2oGBxlytfbfJARnKJ4W1GOrC9DOsSi7Jwc,1163
17
+ geney-1.2.21.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
18
+ geney-1.2.21.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
19
+ geney-1.2.21.dist-info/RECORD,,
geney/Gene.py DELETED
@@ -1,258 +0,0 @@
1
- from copy import copy
2
- from Bio.Seq import Seq
3
- from geney.mutations.variant_utils import generate_mut_variant, Mutation, find_new_tts
4
- from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle
5
- from geney.Fasta_segment import Fasta_segment
6
- from geney import config_setup
7
- from geney.translation_initiation.tis_utils import TISFInder
8
-
9
-
10
- class Gene:
11
- def __init__(self, gene_name, variation):
12
- self.gene_name = gene_name
13
- self.gene_id = ''
14
- self.rev = None
15
- self.chrm = ''
16
- self.gene_start = 0
17
- self.gene_end = 0
18
- self.transcripts = {}
19
- self.load_from_file(find_files_by_gene_name(gene_name))
20
- self.variation = variation
21
-
22
- def __repr__(self):
23
- return f'Gene(gene_name={self.gene_name})'
24
-
25
- def __len__(self):
26
- return len(self.transcripts)
27
-
28
- def __str__(self):
29
- return '{gname}, {ntranscripts} transcripts'.format(gname=self.gene_name, ntranscripts=self.__len__())
30
-
31
- def __copy__(self):
32
- cls = self.__class__
33
- result = cls.__new__(cls)
34
- result.__dict__.update(self.__dict__)
35
- return result
36
-
37
- def __getitem__(self, index):
38
- return Transcript(list(self.transcripts.values())[index])
39
-
40
- def load_from_file(self, file_name):
41
- if not file_name.exists():
42
- raise FileNotFoundError(f"File '{file_name}' not found.")
43
-
44
- self.load_from_dict(dict_data=unload_pickle(file_name))
45
- return self
46
-
47
- def load_from_dict(self, dict_data=None):
48
- for k, v in dict_data.items():
49
- setattr(self, k, v)
50
- return self
51
-
52
- # def generate_transcript(self, tid=None):
53
- # if tid == None:
54
- # tid = [k for k, v in self.transcripts.items() if v['primary_transcript']][0]
55
- # return Transcript(self.transcripts[tid])
56
-
57
- def transcript(self, tid):
58
- if tid not in self.transcripts:
59
- raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
60
- return Transcript(self.transcripts[tid]) #self.generate_transcript(tid)
61
-
62
-
63
- class Transcript:
64
- def __init__(self, d=None):
65
- self.transcript_id = None
66
- self.transcript_start = None # transcription
67
- self.transcript_end = None # transcription
68
- self.transcript_biotype = None # metadata
69
- self.acceptors, self.donors = [], [] # splicing
70
- self.TIS, self.TTS = None, None # translation
71
- self.transcript_seq, self.transcript_indices = '', [] # sequence data
72
- self.rev = None # sequence data
73
- self.chrm = '' # sequence data
74
- self.pre_mrna = '' # sequence data
75
- self.orf = '' # sequence data
76
- self.protein = '' # sequence data
77
- self.log = '' # sequence data
78
- self.primary_transcript=None # sequence data
79
- self.cons_available=False # metadata
80
- self.cons_seq = ''
81
- self.cons_vector = ''
82
- if d:
83
- self.load_from_dict(d)
84
-
85
- if self.cons_available:
86
- if '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector):
87
- self.cons_seq = self.cons_seq.replace('*', '')
88
- self.cons_vector = self.cons_vector[:-1]
89
-
90
- elif '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector) + 1:
91
- self.cons_seq = self.cons_seq.replace('*', '')
92
-
93
- else:
94
- self.cons_available = False
95
-
96
-
97
- def __repr__(self):
98
- return 'Transcript(transcript_id={tid})'.format(tid=self.transcript_id)
99
-
100
- def __len__(self):
101
- return len(self.transcript_seq)
102
-
103
- def __str__(self):
104
- return 'Transcript {tid}, Transcript Type: ' \
105
- '{protein_coding}'.format(
106
- tid=self.transcript_id, protein_coding=self.transcript_biotype)
107
-
108
- def __eq__(self, other):
109
- return self.transcript_seq == other.transcript_seq
110
-
111
- def __contains__(self, subvalue):
112
- if isinstance(subvalue, str):
113
- return subvalue in self.transcript_seq
114
- elif isinstance(subvalue, int):
115
- return subvalue in self.transcript_indices
116
- else:
117
- print(
118
- "Pass an integer to check against the span of the gene's coordinates or a string to check against the "
119
- "pre-mRNA sequence.")
120
- return False
121
-
122
- def __copy__(self, other):
123
- return copy(self)
124
-
125
- @property
126
- def constructor(self):
127
- core_attributes = ['transcript_id', 'transcript_start', 'transcript_end', 'transcript_biotype', 'acceptors', 'donors', 'TIS', 'TTS', 'rev', 'chrm']
128
- return {k: v for k, v in self.__dict__.items() if k in core_attributes}
129
-
130
- def load_from_dict(self, data):
131
- for k, v in data.items():
132
- setattr(self, k, v)
133
- self.__arrange_boundaries()
134
- self.generate_mature_mrna(inplace=True)
135
- return self
136
-
137
- @property
138
- def exons(self):
139
- return list(zip(self.acceptors, self.donors))
140
-
141
- @property
142
- def introns(self):
143
- return list(zip([v for v in self.donors if v != self.transcript_end], [v for v in self.acceptors if v != self.transcript_start]))
144
-
145
-
146
- def set_exons(self, boundaries):
147
- self.acceptors, self.donors = boundaries['acceptors'], boundaries['donors']
148
- self.__arrange_boundaries()
149
- return self
150
-
151
- @property
152
- def introns(self):
153
- return list(zip([v for v in self.donors if v != self.transcript_end], [v for v in self.acceptors if v != self.transcript_start]))
154
-
155
- def __exon_coverage_check(self):
156
- if sum([abs(a-b) + 1 for a, b in self.exons]) == len(self):
157
- return True
158
- else:
159
- return False
160
-
161
- @property
162
- def exons_pos(self):
163
- temp = self.exons
164
- if self.rev:
165
- temp = [(b, a) for a, b in temp[::-1]]
166
- return temp
167
-
168
- @property
169
- def mrna_indices(self):
170
- temp = [lst for lsts in [list(range(a, b+1)) for a, b in self.exons_pos] for lst in lsts]
171
- return sorted(temp, reverse=self.rev)
172
-
173
- @property
174
- def exonic_indices(self):
175
- return [lst for lsts in [list(range(a, b+1)) for a, b in self.exons_pos] for lst in lsts]
176
-
177
- def __arrange_boundaries(self):
178
- self.acceptors.append(self.transcript_start)
179
- self.donors.append(self.transcript_end)
180
- self.acceptors = list(set(self.acceptors))
181
- self.donors = list(set(self.donors))
182
- self.acceptors.sort(reverse=self.rev)
183
- self.donors.sort(reverse=self.rev)
184
- return self
185
-
186
- def positive_strand(self):
187
- if self.rev:
188
- return reverse_complement(self.transcript_seq)
189
- else:
190
- return self.transcript_seq
191
-
192
- def __pos2sense(self, mrna, indices):
193
- if self.rev:
194
- mrna = reverse_complement(mrna)
195
- indices = indices[::-1]
196
- return mrna, indices
197
-
198
- def pull_pre_mrna_pos(self):
199
- fasta_obj = Fasta_segment()
200
- if self.rev:
201
- return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta', self.transcript_end,
202
- self.transcript_start)
203
- else:
204
- return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta', self.transcript_start,
205
- self.transcript_end)
206
-
207
- def generate_pre_mrna_pos(self, mutations=[]):
208
- seq, indices = self.pull_pre_mrna_pos()
209
- for mutation in mutations:
210
- mutation = Mutation(mutation)
211
- seq, indices, _, _ = generate_mut_variant(seq, indices, mut=mutation)
212
-
213
- self.pre_mrna, _ = self.__pos2sense(seq, indices)
214
- return seq, indices
215
-
216
- def generate_pre_mrna(self, mutations=[], inplace=True):
217
- pre_mrna, pre_indices = self.__pos2sense(*self.generate_pre_mrna_pos(mutations))
218
- self.pre_mrna = pre_mrna
219
- if inplace:
220
- return self
221
- return pre_mrna
222
-
223
- def generate_mature_mrna_pos(self, mutations=[]):
224
- mature_mrna, mature_indices = '', []
225
- pre_seq, pre_indices = self.generate_pre_mrna_pos(mutations)
226
- for i, j in self.exons_pos:
227
- rel_start, rel_end = pre_indices.index(i), pre_indices.index(j)
228
- mature_mrna += pre_seq[rel_start:rel_end + 1]
229
- mature_indices.extend(pre_indices[rel_start:rel_end + 1])
230
- return mature_mrna, mature_indices
231
-
232
- def generate_mature_mrna(self, mutations=[], inplace=True):
233
- if inplace:
234
- self.transcript_seq, self.transcript_indices = self.__pos2sense(*self.generate_mature_mrna_pos(mutations))
235
- return self
236
- return self.__pos2sense(*self.generate_mature_mrna_pos(mutations))
237
-
238
- def generate_protein(self, inplace=True):
239
- rel_start = self.transcript_indices.index(self.TIS)
240
- rel_end = self.transcript_indices.index(self.TTS)
241
- orf = self.transcript_seq[rel_start:rel_end + 1 + 3]
242
- protein = str(Seq(orf).translate()).replace('*', '')
243
- if inplace:
244
- self.orf = orf
245
- self.protein = protein
246
- if self.protein != self.cons_seq:
247
- self.cons_available = False
248
- return self
249
- return protein
250
-
251
- def generate_translational_boundaries(self):
252
- if self.TIS not in self.transcript_indices or self.transcript_seq[self.transcript_indices.index(self.TIS):self.transcript_indices.index(self.TIS)+3] != 'ATG':
253
- new_tis = TISFInder(self.transcript_seq, self.transcript_indices)
254
- self.log += f' TIS for transcript reacquired: {self.TIS} --> {new_tis}.'
255
- self.TIS = new_tis
256
- self.TTS = find_new_tts(self.transcript_seq, self.transcript_indices, self.TIS)
257
- return self
258
-
File without changes
@@ -1,158 +0,0 @@
1
- import pandas as pd
2
- from sklearn.metrics import roc_curve, precision_recall_curve
3
- import matplotlib.pyplot as plt
4
- from datetime import datetime
5
- from pathlib import Path
6
- import subprocess
7
-
8
- from geney import config_setup
9
- from geney.utils import download_and_gunzip
10
- from geney.oncosplice import oncosplice_reduced
11
-
12
- def download_and_parse_clinvar():
13
- url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
14
- local_file = download_and_gunzip(url, target_path)
15
- return local_file
16
-
17
-
18
- def aggregate_clinvar_results(benchmark_path, aggregate_mode=False, benchmark_feature=None, local_clinvar_df='/tamir2/nicolaslynn/data/ClinVar/clinvar_compact.csv'):
19
- data = pd.concat([pd.read_csv(file) for file in Path(benchmark_path).glob('*.csv')])
20
- if not aggregate_mode:
21
- data = data[(data.cons_available) & (data.primary_transcript)]
22
-
23
- data = oncosplice_reduced(data)
24
- data = data.loc[:, ~data.columns.duplicated()]
25
- data = pd.merge(data, pd.read_csv(local_clinvar_df), on='mut_id')
26
- data['clinsig_val'] = data.apply(lambda row: {'Benign': 0, 'Pathogenic': 1}[row.clinsig], axis=1)
27
- for c in data.columns:
28
- try:
29
- if data[c].min() < 0:
30
- data[f'{c}_abs'] = abs(data[c])
31
- except TypeError:
32
- pass
33
-
34
- print(data.corr(numeric_only=True))
35
- print(data.corrwith(data['clinsig_val'], method='spearman'))
36
- print(data.corrwith(data['clinsig_val'], method='pearson'))
37
- return data
38
-
39
-
40
- def plot_performance(true_values, predictions):
41
- clinsig_map = {'Benign': 0, 'Pathogenic': 1}
42
- true_values = [clinsig_map[t] for t in true_values]
43
- predictions = scale_predictions(predictions)
44
-
45
- fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
46
-
47
- # Calculate Precision-Recall curve
48
- precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
49
-
50
- # Plotting ROC curve
51
- plt.figure(figsize=(20, 5))
52
-
53
- plt.subplot(1, 4, 1)
54
- plt.plot(fpr, tpr)
55
- plt.title('ROC Curve')
56
- plt.xlabel('False Positive Rate')
57
- plt.ylabel('True Positive Rate')
58
-
59
- # Plotting Precision-Recall curve
60
- plt.subplot(1, 4, 2)
61
- plt.plot(recall, precision)
62
- plt.title('Precision-Recall Curve')
63
- plt.xlabel('Recall')
64
- plt.ylabel('Precision')
65
-
66
- # Plotting Precision vs. Thresholds
67
- plt.subplot(1, 4, 3)
68
- plt.plot(thresholds_pr, precision[:-1]) # Precision and thresholds have off-by-one lengths
69
- plt.title('Precision vs. Threshold')
70
- plt.xlabel('Threshold')
71
- plt.ylabel('Precision')
72
-
73
- # Plotting Sample Percentage Captured vs. Thresholds
74
- plt.subplot(1, 4, 4)
75
- # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
76
- plt.plot(thresholds_roc, tpr) # Update 'tpr' with the correct metric if necessary
77
- plt.title('Cumulative Percentage vs. Threshold')
78
- plt.xlabel('Threshold')
79
- plt.ylabel('Cumulative Percentage of Population')
80
-
81
- plt.tight_layout()
82
- plt.show()
83
-
84
-
85
-
86
- class ClinVarBenchmark:
87
- def __init__(self, df):
88
- assert 'clinsig' in df.columns, 'No clinsig column found in dataframe.'
89
- self.df = df
90
-
91
-
92
- def scale_predictions(self, p):
93
- max_val = max(p)
94
- min_val = min(p)
95
- return (p - min_val) / (max_val - min_val)
96
-
97
- def plot_performance(self, true_values, predictions):
98
- clinsig_map = {'Benign': 0, 'Pathogenic': 1}
99
- predictions = [clinsig_map[t] for t in true_values]
100
- predictions = self.scale_predictions(predictions)
101
-
102
- fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
103
-
104
- # Calculate Precision-Recall curve
105
- precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
106
-
107
- # Plotting ROC curve
108
- plt.figure(figsize=(20, 5))
109
-
110
- plt.subplot(1, 4, 1)
111
- plt.plot(fpr, tpr)
112
- plt.title('ROC Curve')
113
- plt.xlabel('False Positive Rate')
114
- plt.ylabel('True Positive Rate')
115
-
116
- # Plotting Precision-Recall curve
117
- plt.subplot(1, 4, 2)
118
- plt.plot(recall, precision)
119
- plt.title('Precision-Recall Curve')
120
- plt.xlabel('Recall')
121
- plt.ylabel('Precision')
122
-
123
- # Plotting Precision vs. Thresholds
124
- plt.subplot(1, 4, 3)
125
- plt.plot(thresholds_pr, precision[:-1]) # Precision and thresholds have off-by-one lengths
126
- plt.title('Precision vs. Threshold')
127
- plt.xlabel('Threshold')
128
- plt.ylabel('Precision')
129
-
130
- # Plotting Sample Percentage Captured vs. Thresholds
131
- plt.subplot(1, 4, 4)
132
- # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
133
- plt.plot(thresholds_roc, tpr) # Update 'tpr' with the correct metric if necessary
134
- plt.title('Cumulative Percentage vs. Threshold')
135
- plt.xlabel('Threshold')
136
- plt.ylabel('Cumulative Percentage of Population')
137
-
138
- plt.tight_layout()
139
- plt.show()
140
- return None
141
-
142
- def report(self, feature):
143
- pass
144
-
145
- def find_ppv_threshold(self, feature, ppv_threshold=0.95):
146
- pass
147
-
148
-
149
-
150
- if __name__ == '__main__':
151
- now = datetime.now()
152
- benchmark_path = config_setup['ONCOSPLICE'] / f'clinvar_benchmark_{now.strftime("%m_%d_%Y")}'
153
- print(f"Saving benchmark results to {benchmark_path}")
154
- benchmark_path.mkdir(parents=True, exist_ok=True)
155
- subprocess.run(['python', '-m', 'geney.pipelines.dask_utils', '-i',
156
- '/tamir2/nicolaslynn/data/ClinVar/clinvar_oncosplice_input.txt', '-r', str(benchmark_path),
157
- '-n', '10', '-m', '5GB'])
158
-
@@ -1,15 +0,0 @@
1
- from geney.oncosplice import *
2
-
3
- class PairwiseEpistasis:
4
- def __init__(self, epistasis):
5
- # need some check here making sure format of mtuations isi good
6
- self.epistasis = epistasis
7
- self.mut_id1, self.mut_id2 = epistasis.split('|')
8
-
9
- def compare_functional_changes(self):
10
- self.results_mut1 = oncosplice(self.mut_id1, sai_threshold=0.5)
11
- self.results_mut2 = oncosplice(self.mut_id2, sai_threshold=0.5)
12
- self.results_epi = oncosplice(self.epistasis, sai_threshold=0.5)
13
-
14
- splicing1, splicing2, splicing_epi = 0, 0, 0
15
- oncosplice_score1, oncosplice_score2, oncosplice_score_epi = 0, 0, 0
@@ -1,91 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sklearn.metrics import precision_score, recall_score, accuracy_score
4
- from sklearn.metrics import roc_auc_score, roc_curve
5
- import matplotlib.pyplot as plt
6
-
7
- def plot_auc_curve(y_true, y_pred_proba):
8
- """
9
- Plots the AUC curve.
10
-
11
- Args:
12
- y_true (array-like): True labels (0 or 1).
13
- y_pred_proba (array-like): Predicted probabilities for positive class.
14
-
15
- Returns:
16
- None
17
- """
18
- fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
19
- auc_value = roc_auc_score(y_true, y_pred_proba)
20
-
21
- plt.figure(figsize=(8, 6))
22
- plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
23
- plt.plot([0, 1], [0, 1], 'k--')
24
- plt.xlabel("False Positive Rate")
25
- plt.ylabel("True Positive Rate")
26
- plt.title("Receiver Operating Characteristic (ROC) Curve")
27
- plt.legend()
28
- plt.show()
29
- return auc_value
30
-
31
-
32
- def optimal_ppv(dataframe, feature_name, plot=False):
33
- """
34
- Calculates the optimal positive predictive value (PPV) for a given feature.
35
-
36
- Args:
37
- dataframe (pd.DataFrame): Input dataframe.
38
- feature_name (str): Name of the feature column.
39
-
40
- Returns:
41
- float: Optimal PPV.
42
- """
43
- # Assuming 'target' is the binary target column (0 or 1)
44
- threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
45
- ppv_values = []
46
-
47
- for threshold in threshold_values:
48
- predictions = (dataframe[feature_name] >= threshold).astype(int)
49
- ppv = precision_score(dataframe['target'], predictions)
50
- ppv_values.append(ppv)
51
-
52
- optimal_threshold = threshold_values[np.argmax(ppv_values)]
53
- optimal_ppv = max(ppv_values)
54
- if plot:
55
- plt.figure(figsize=(8, 6))
56
- plt.scatter(threshold_values, ppv_values)
57
- plt.xlabel("Threshold")
58
- plt.ylabel("Positive Predictive Value (PPV)")
59
- plt.title("Optimal Positive Predictive Value (PPV)")
60
- plt.show()
61
-
62
- return optimal_ppv, optimal_threshold
63
-
64
-
65
- def measure_prediction_quality(prediction_vector, quality_vector):
66
- """
67
- Measure the quality of the predictions using the quality_vector as the characteristic to check.
68
- """
69
- pass
70
-
71
-
72
-
73
- def create_ppv_vector(prediction_vector, true_value_vector):
74
- """
75
- Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
76
- """
77
- df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
78
- df.sort_values('prediction', ascending=True, inplace=True)
79
- df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
80
- for bin in df.bin.unique():
81
- temp_df = df[df.bin >= bin].
82
-
83
-
84
- def group_retention(predictions, predictor):
85
- # first i need to get the ratio of values that are retained at particular values
86
- predictions.sort_values(predictor, inplace=True)
87
- _, thresholds = pd.qcut(predictions[predictor], 100, duplicates='drop')
88
- tracker = []
89
- for th in thresholds:
90
-
91
-
@@ -1,81 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sklearn.metrics import precision_score, recall_score, accuracy_score
4
- from sklearn.metrics import roc_auc_score, roc_curve
5
- import matplotlib.pyplot as plt
6
-
7
- def plot_auc_curve(y_true, y_pred_proba):
8
- """
9
- Plots the AUC curve.
10
-
11
- Args:
12
- y_true (array-like): True labels (0 or 1).
13
- y_pred_proba (array-like): Predicted probabilities for positive class.
14
-
15
- Returns:
16
- None
17
- """
18
- fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
19
- auc_value = roc_auc_score(y_true, y_pred_proba)
20
-
21
- plt.figure(figsize=(8, 6))
22
- plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
23
- plt.plot([0, 1], [0, 1], 'k--')
24
- plt.xlabel("False Positive Rate")
25
- plt.ylabel("True Positive Rate")
26
- plt.title("Receiver Operating Characteristic (ROC) Curve")
27
- plt.legend()
28
- plt.show()
29
- return auc_value
30
-
31
-
32
- def optimal_ppv(dataframe, feature_name, plot=False):
33
- """
34
- Calculates the optimal positive predictive value (PPV) for a given feature.
35
-
36
- Args:
37
- dataframe (pd.DataFrame): Input dataframe.
38
- feature_name (str): Name of the feature column.
39
-
40
- Returns:
41
- float: Optimal PPV.
42
- """
43
- # Assuming 'target' is the binary target column (0 or 1)
44
- threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
45
- ppv_values = []
46
-
47
- for threshold in threshold_values:
48
- predictions = (dataframe[feature_name] >= threshold).astype(int)
49
- ppv = precision_score(dataframe['target'], predictions)
50
- ppv_values.append(ppv)
51
-
52
- optimal_threshold = threshold_values[np.argmax(ppv_values)]
53
- optimal_ppv = max(ppv_values)
54
- if plot:
55
- plt.figure(figsize=(8, 6))
56
- plt.scatter(threshold_values, ppv_values)
57
- plt.xlabel("Threshold")
58
- plt.ylabel("Positive Predictive Value (PPV)")
59
- plt.title("Optimal Positive Predictive Value (PPV)")
60
- plt.show()
61
-
62
- return optimal_ppv, optimal_threshold
63
-
64
-
65
- def measure_prediction_quality(prediction_vector, quality_vector):
66
- """
67
- Measure the quality of the predictions using the quality_vector as the characteristic to check.
68
- """
69
- pass
70
-
71
-
72
-
73
- def create_ppv_vector(prediction_vector, true_value_vector):
74
- """
75
- Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
76
- """
77
- df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
78
- df.sort_values('prediction', ascending=True, inplace=True)
79
- df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
80
- for bin in df.bin.unique():
81
- temp_df = df[df.bin >= bin].