geney 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/oncosplice.py +1 -1
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
- geney-1.2.22.dist-info/RECORD +19 -0
- geney/Gene.py +0 -258
- geney/analyzers/__init__.py +0 -0
- geney/analyzers/benchmark_clinvar.py +0 -158
- geney/analyzers/characterize_epistasis.py +0 -15
- geney/analyzers/compare_sets.py +0 -91
- geney/analyzers/group_comparison.py +0 -81
- geney/analyzers/survival.py +0 -144
- geney/analyzers/tcga_annotations.py +0 -194
- geney/analyzers/visualize_protein_conservation.py +0 -398
- geney/benchmark_clinvar.py +0 -158
- geney/compare_sets.py +0 -91
- geney/data_parsers/__init__.py +0 -0
- geney/data_parsers/gtex.py +0 -68
- geney/gtex.py +0 -68
- geney/immunotherapy/__init__.py +0 -0
- geney/immunotherapy/netchop.py +0 -78
- geney/mutations/__init__.py +0 -0
- geney/mutations/variant_utils.py +0 -125
- geney/netchop.py +0 -79
- geney/oncosplice/__init__.py +0 -0
- geney/oncosplice_mouse.py +0 -277
- geney/oncosplice_pipeline.py +0 -1588
- geney/performance_utils.py +0 -138
- geney/pipelines/__init__.py +0 -0
- geney/pipelines/dask_utils.py +0 -153
- geney/splicing/__init__.py +0 -2
- geney/splicing/spliceai_utils.py +0 -253
- geney/splicing/splicing_isoform_utils.py +0 -0
- geney/splicing/splicing_utils.py +0 -366
- geney/survival.py +0 -124
- geney/tcga_annotations.py +0 -352
- geney/translation_termination/__init__.py +0 -0
- geney/translation_termination/tts_utils.py +0 -0
- geney-1.2.20.dist-info/RECORD +0 -52
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0
geney/oncosplice.py
CHANGED
|
@@ -1047,7 +1047,7 @@ class PredictSpliceAI:
|
|
|
1047
1047
|
|
|
1048
1048
|
# self.missplicing = run_spliceai_transcript(self.modification, transcript_data=gene_data, sai_mrg_context=sai_mrg_context, min_coverage=min_coverage, sai_threshold=0.1)
|
|
1049
1049
|
# print(f"RUNNING: {mutation.mut_id}")
|
|
1050
|
-
ref_transcript, var_transcript = Gene(mutation.mut_id.split(':')[0], organism=organism).transcript(gene_data.transcript_id), Gene(mutation.mut_id.split(':')[0], mutation.mut_id, organism=
|
|
1050
|
+
ref_transcript, var_transcript = Gene(mutation.mut_id.split(':')[0], organism=organism).transcript(gene_data.transcript_id), Gene(mutation.mut_id.split(':')[0], mutation.mut_id, organism=organism).transcript(gene_data.transcript_id)
|
|
1051
1051
|
# print(f"Second check : {ref_transcript.pre_mrna == var_transcript.pre_mrna}")
|
|
1052
1052
|
self.missplicing = find_transcript_missplicing(self.modification, ref_transcript, var_transcript, context=sai_mrg_context+min_coverage, threshold=threshold,
|
|
1053
1053
|
engine=engine)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
|
|
2
|
+
geney/__init__.py,sha256=knezxgbV2c2gcO2ek2-xxEC15HL4aO1WuoMiYOOvKf8,428
|
|
3
|
+
geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
|
|
4
|
+
geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
|
|
5
|
+
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
6
|
+
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
7
|
+
geney/oncosplice.py,sha256=AZm8Vj7z65DokPmeflwoqs2BM11neV9hQLA_Ao4ysnM,78242
|
|
8
|
+
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
9
|
+
geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
|
|
10
|
+
geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
|
|
11
|
+
geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
|
|
12
|
+
geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
|
|
14
|
+
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
15
|
+
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
16
|
+
geney-1.2.22.dist-info/METADATA,sha256=eTTiyuGPZ5lD7jV8YZXSocPyewD3OPwvgeaqiXxuVfo,1163
|
|
17
|
+
geney-1.2.22.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
18
|
+
geney-1.2.22.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
19
|
+
geney-1.2.22.dist-info/RECORD,,
|
geney/Gene.py
DELETED
|
@@ -1,258 +0,0 @@
|
|
|
1
|
-
from copy import copy
|
|
2
|
-
from Bio.Seq import Seq
|
|
3
|
-
from geney.mutations.variant_utils import generate_mut_variant, Mutation, find_new_tts
|
|
4
|
-
from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle
|
|
5
|
-
from geney.Fasta_segment import Fasta_segment
|
|
6
|
-
from geney import config_setup
|
|
7
|
-
from geney.translation_initiation.tis_utils import TISFInder
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class Gene:
|
|
11
|
-
def __init__(self, gene_name, variation):
|
|
12
|
-
self.gene_name = gene_name
|
|
13
|
-
self.gene_id = ''
|
|
14
|
-
self.rev = None
|
|
15
|
-
self.chrm = ''
|
|
16
|
-
self.gene_start = 0
|
|
17
|
-
self.gene_end = 0
|
|
18
|
-
self.transcripts = {}
|
|
19
|
-
self.load_from_file(find_files_by_gene_name(gene_name))
|
|
20
|
-
self.variation = variation
|
|
21
|
-
|
|
22
|
-
def __repr__(self):
|
|
23
|
-
return f'Gene(gene_name={self.gene_name})'
|
|
24
|
-
|
|
25
|
-
def __len__(self):
|
|
26
|
-
return len(self.transcripts)
|
|
27
|
-
|
|
28
|
-
def __str__(self):
|
|
29
|
-
return '{gname}, {ntranscripts} transcripts'.format(gname=self.gene_name, ntranscripts=self.__len__())
|
|
30
|
-
|
|
31
|
-
def __copy__(self):
|
|
32
|
-
cls = self.__class__
|
|
33
|
-
result = cls.__new__(cls)
|
|
34
|
-
result.__dict__.update(self.__dict__)
|
|
35
|
-
return result
|
|
36
|
-
|
|
37
|
-
def __getitem__(self, index):
|
|
38
|
-
return Transcript(list(self.transcripts.values())[index])
|
|
39
|
-
|
|
40
|
-
def load_from_file(self, file_name):
|
|
41
|
-
if not file_name.exists():
|
|
42
|
-
raise FileNotFoundError(f"File '{file_name}' not found.")
|
|
43
|
-
|
|
44
|
-
self.load_from_dict(dict_data=unload_pickle(file_name))
|
|
45
|
-
return self
|
|
46
|
-
|
|
47
|
-
def load_from_dict(self, dict_data=None):
|
|
48
|
-
for k, v in dict_data.items():
|
|
49
|
-
setattr(self, k, v)
|
|
50
|
-
return self
|
|
51
|
-
|
|
52
|
-
# def generate_transcript(self, tid=None):
|
|
53
|
-
# if tid == None:
|
|
54
|
-
# tid = [k for k, v in self.transcripts.items() if v['primary_transcript']][0]
|
|
55
|
-
# return Transcript(self.transcripts[tid])
|
|
56
|
-
|
|
57
|
-
def transcript(self, tid):
|
|
58
|
-
if tid not in self.transcripts:
|
|
59
|
-
raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
|
|
60
|
-
return Transcript(self.transcripts[tid]) #self.generate_transcript(tid)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
class Transcript:
|
|
64
|
-
def __init__(self, d=None):
|
|
65
|
-
self.transcript_id = None
|
|
66
|
-
self.transcript_start = None # transcription
|
|
67
|
-
self.transcript_end = None # transcription
|
|
68
|
-
self.transcript_biotype = None # metadata
|
|
69
|
-
self.acceptors, self.donors = [], [] # splicing
|
|
70
|
-
self.TIS, self.TTS = None, None # translation
|
|
71
|
-
self.transcript_seq, self.transcript_indices = '', [] # sequence data
|
|
72
|
-
self.rev = None # sequence data
|
|
73
|
-
self.chrm = '' # sequence data
|
|
74
|
-
self.pre_mrna = '' # sequence data
|
|
75
|
-
self.orf = '' # sequence data
|
|
76
|
-
self.protein = '' # sequence data
|
|
77
|
-
self.log = '' # sequence data
|
|
78
|
-
self.primary_transcript=None # sequence data
|
|
79
|
-
self.cons_available=False # metadata
|
|
80
|
-
self.cons_seq = ''
|
|
81
|
-
self.cons_vector = ''
|
|
82
|
-
if d:
|
|
83
|
-
self.load_from_dict(d)
|
|
84
|
-
|
|
85
|
-
if self.cons_available:
|
|
86
|
-
if '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector):
|
|
87
|
-
self.cons_seq = self.cons_seq.replace('*', '')
|
|
88
|
-
self.cons_vector = self.cons_vector[:-1]
|
|
89
|
-
|
|
90
|
-
elif '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector) + 1:
|
|
91
|
-
self.cons_seq = self.cons_seq.replace('*', '')
|
|
92
|
-
|
|
93
|
-
else:
|
|
94
|
-
self.cons_available = False
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def __repr__(self):
|
|
98
|
-
return 'Transcript(transcript_id={tid})'.format(tid=self.transcript_id)
|
|
99
|
-
|
|
100
|
-
def __len__(self):
|
|
101
|
-
return len(self.transcript_seq)
|
|
102
|
-
|
|
103
|
-
def __str__(self):
|
|
104
|
-
return 'Transcript {tid}, Transcript Type: ' \
|
|
105
|
-
'{protein_coding}'.format(
|
|
106
|
-
tid=self.transcript_id, protein_coding=self.transcript_biotype)
|
|
107
|
-
|
|
108
|
-
def __eq__(self, other):
|
|
109
|
-
return self.transcript_seq == other.transcript_seq
|
|
110
|
-
|
|
111
|
-
def __contains__(self, subvalue):
|
|
112
|
-
if isinstance(subvalue, str):
|
|
113
|
-
return subvalue in self.transcript_seq
|
|
114
|
-
elif isinstance(subvalue, int):
|
|
115
|
-
return subvalue in self.transcript_indices
|
|
116
|
-
else:
|
|
117
|
-
print(
|
|
118
|
-
"Pass an integer to check against the span of the gene's coordinates or a string to check against the "
|
|
119
|
-
"pre-mRNA sequence.")
|
|
120
|
-
return False
|
|
121
|
-
|
|
122
|
-
def __copy__(self, other):
|
|
123
|
-
return copy(self)
|
|
124
|
-
|
|
125
|
-
@property
|
|
126
|
-
def constructor(self):
|
|
127
|
-
core_attributes = ['transcript_id', 'transcript_start', 'transcript_end', 'transcript_biotype', 'acceptors', 'donors', 'TIS', 'TTS', 'rev', 'chrm']
|
|
128
|
-
return {k: v for k, v in self.__dict__.items() if k in core_attributes}
|
|
129
|
-
|
|
130
|
-
def load_from_dict(self, data):
|
|
131
|
-
for k, v in data.items():
|
|
132
|
-
setattr(self, k, v)
|
|
133
|
-
self.__arrange_boundaries()
|
|
134
|
-
self.generate_mature_mrna(inplace=True)
|
|
135
|
-
return self
|
|
136
|
-
|
|
137
|
-
@property
|
|
138
|
-
def exons(self):
|
|
139
|
-
return list(zip(self.acceptors, self.donors))
|
|
140
|
-
|
|
141
|
-
@property
|
|
142
|
-
def introns(self):
|
|
143
|
-
return list(zip([v for v in self.donors if v != self.transcript_end], [v for v in self.acceptors if v != self.transcript_start]))
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def set_exons(self, boundaries):
|
|
147
|
-
self.acceptors, self.donors = boundaries['acceptors'], boundaries['donors']
|
|
148
|
-
self.__arrange_boundaries()
|
|
149
|
-
return self
|
|
150
|
-
|
|
151
|
-
@property
|
|
152
|
-
def introns(self):
|
|
153
|
-
return list(zip([v for v in self.donors if v != self.transcript_end], [v for v in self.acceptors if v != self.transcript_start]))
|
|
154
|
-
|
|
155
|
-
def __exon_coverage_check(self):
|
|
156
|
-
if sum([abs(a-b) + 1 for a, b in self.exons]) == len(self):
|
|
157
|
-
return True
|
|
158
|
-
else:
|
|
159
|
-
return False
|
|
160
|
-
|
|
161
|
-
@property
|
|
162
|
-
def exons_pos(self):
|
|
163
|
-
temp = self.exons
|
|
164
|
-
if self.rev:
|
|
165
|
-
temp = [(b, a) for a, b in temp[::-1]]
|
|
166
|
-
return temp
|
|
167
|
-
|
|
168
|
-
@property
|
|
169
|
-
def mrna_indices(self):
|
|
170
|
-
temp = [lst for lsts in [list(range(a, b+1)) for a, b in self.exons_pos] for lst in lsts]
|
|
171
|
-
return sorted(temp, reverse=self.rev)
|
|
172
|
-
|
|
173
|
-
@property
|
|
174
|
-
def exonic_indices(self):
|
|
175
|
-
return [lst for lsts in [list(range(a, b+1)) for a, b in self.exons_pos] for lst in lsts]
|
|
176
|
-
|
|
177
|
-
def __arrange_boundaries(self):
|
|
178
|
-
self.acceptors.append(self.transcript_start)
|
|
179
|
-
self.donors.append(self.transcript_end)
|
|
180
|
-
self.acceptors = list(set(self.acceptors))
|
|
181
|
-
self.donors = list(set(self.donors))
|
|
182
|
-
self.acceptors.sort(reverse=self.rev)
|
|
183
|
-
self.donors.sort(reverse=self.rev)
|
|
184
|
-
return self
|
|
185
|
-
|
|
186
|
-
def positive_strand(self):
|
|
187
|
-
if self.rev:
|
|
188
|
-
return reverse_complement(self.transcript_seq)
|
|
189
|
-
else:
|
|
190
|
-
return self.transcript_seq
|
|
191
|
-
|
|
192
|
-
def __pos2sense(self, mrna, indices):
|
|
193
|
-
if self.rev:
|
|
194
|
-
mrna = reverse_complement(mrna)
|
|
195
|
-
indices = indices[::-1]
|
|
196
|
-
return mrna, indices
|
|
197
|
-
|
|
198
|
-
def pull_pre_mrna_pos(self):
|
|
199
|
-
fasta_obj = Fasta_segment()
|
|
200
|
-
if self.rev:
|
|
201
|
-
return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta', self.transcript_end,
|
|
202
|
-
self.transcript_start)
|
|
203
|
-
else:
|
|
204
|
-
return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta', self.transcript_start,
|
|
205
|
-
self.transcript_end)
|
|
206
|
-
|
|
207
|
-
def generate_pre_mrna_pos(self, mutations=[]):
|
|
208
|
-
seq, indices = self.pull_pre_mrna_pos()
|
|
209
|
-
for mutation in mutations:
|
|
210
|
-
mutation = Mutation(mutation)
|
|
211
|
-
seq, indices, _, _ = generate_mut_variant(seq, indices, mut=mutation)
|
|
212
|
-
|
|
213
|
-
self.pre_mrna, _ = self.__pos2sense(seq, indices)
|
|
214
|
-
return seq, indices
|
|
215
|
-
|
|
216
|
-
def generate_pre_mrna(self, mutations=[], inplace=True):
|
|
217
|
-
pre_mrna, pre_indices = self.__pos2sense(*self.generate_pre_mrna_pos(mutations))
|
|
218
|
-
self.pre_mrna = pre_mrna
|
|
219
|
-
if inplace:
|
|
220
|
-
return self
|
|
221
|
-
return pre_mrna
|
|
222
|
-
|
|
223
|
-
def generate_mature_mrna_pos(self, mutations=[]):
|
|
224
|
-
mature_mrna, mature_indices = '', []
|
|
225
|
-
pre_seq, pre_indices = self.generate_pre_mrna_pos(mutations)
|
|
226
|
-
for i, j in self.exons_pos:
|
|
227
|
-
rel_start, rel_end = pre_indices.index(i), pre_indices.index(j)
|
|
228
|
-
mature_mrna += pre_seq[rel_start:rel_end + 1]
|
|
229
|
-
mature_indices.extend(pre_indices[rel_start:rel_end + 1])
|
|
230
|
-
return mature_mrna, mature_indices
|
|
231
|
-
|
|
232
|
-
def generate_mature_mrna(self, mutations=[], inplace=True):
|
|
233
|
-
if inplace:
|
|
234
|
-
self.transcript_seq, self.transcript_indices = self.__pos2sense(*self.generate_mature_mrna_pos(mutations))
|
|
235
|
-
return self
|
|
236
|
-
return self.__pos2sense(*self.generate_mature_mrna_pos(mutations))
|
|
237
|
-
|
|
238
|
-
def generate_protein(self, inplace=True):
|
|
239
|
-
rel_start = self.transcript_indices.index(self.TIS)
|
|
240
|
-
rel_end = self.transcript_indices.index(self.TTS)
|
|
241
|
-
orf = self.transcript_seq[rel_start:rel_end + 1 + 3]
|
|
242
|
-
protein = str(Seq(orf).translate()).replace('*', '')
|
|
243
|
-
if inplace:
|
|
244
|
-
self.orf = orf
|
|
245
|
-
self.protein = protein
|
|
246
|
-
if self.protein != self.cons_seq:
|
|
247
|
-
self.cons_available = False
|
|
248
|
-
return self
|
|
249
|
-
return protein
|
|
250
|
-
|
|
251
|
-
def generate_translational_boundaries(self):
|
|
252
|
-
if self.TIS not in self.transcript_indices or self.transcript_seq[self.transcript_indices.index(self.TIS):self.transcript_indices.index(self.TIS)+3] != 'ATG':
|
|
253
|
-
new_tis = TISFInder(self.transcript_seq, self.transcript_indices)
|
|
254
|
-
self.log += f' TIS for transcript reacquired: {self.TIS} --> {new_tis}.'
|
|
255
|
-
self.TIS = new_tis
|
|
256
|
-
self.TTS = find_new_tts(self.transcript_seq, self.transcript_indices, self.TIS)
|
|
257
|
-
return self
|
|
258
|
-
|
geney/analyzers/__init__.py
DELETED
|
File without changes
|
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from sklearn.metrics import roc_curve, precision_recall_curve
|
|
3
|
-
import matplotlib.pyplot as plt
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import subprocess
|
|
7
|
-
|
|
8
|
-
from geney import config_setup
|
|
9
|
-
from geney.utils import download_and_gunzip
|
|
10
|
-
from geney.oncosplice import oncosplice_reduced
|
|
11
|
-
|
|
12
|
-
def download_and_parse_clinvar():
|
|
13
|
-
url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
|
|
14
|
-
local_file = download_and_gunzip(url, target_path)
|
|
15
|
-
return local_file
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def aggregate_clinvar_results(benchmark_path, aggregate_mode=False, benchmark_feature=None, local_clinvar_df='/tamir2/nicolaslynn/data/ClinVar/clinvar_compact.csv'):
|
|
19
|
-
data = pd.concat([pd.read_csv(file) for file in Path(benchmark_path).glob('*.csv')])
|
|
20
|
-
if not aggregate_mode:
|
|
21
|
-
data = data[(data.cons_available) & (data.primary_transcript)]
|
|
22
|
-
|
|
23
|
-
data = oncosplice_reduced(data)
|
|
24
|
-
data = data.loc[:, ~data.columns.duplicated()]
|
|
25
|
-
data = pd.merge(data, pd.read_csv(local_clinvar_df), on='mut_id')
|
|
26
|
-
data['clinsig_val'] = data.apply(lambda row: {'Benign': 0, 'Pathogenic': 1}[row.clinsig], axis=1)
|
|
27
|
-
for c in data.columns:
|
|
28
|
-
try:
|
|
29
|
-
if data[c].min() < 0:
|
|
30
|
-
data[f'{c}_abs'] = abs(data[c])
|
|
31
|
-
except TypeError:
|
|
32
|
-
pass
|
|
33
|
-
|
|
34
|
-
print(data.corr(numeric_only=True))
|
|
35
|
-
print(data.corrwith(data['clinsig_val'], method='spearman'))
|
|
36
|
-
print(data.corrwith(data['clinsig_val'], method='pearson'))
|
|
37
|
-
return data
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def plot_performance(true_values, predictions):
|
|
41
|
-
clinsig_map = {'Benign': 0, 'Pathogenic': 1}
|
|
42
|
-
true_values = [clinsig_map[t] for t in true_values]
|
|
43
|
-
predictions = scale_predictions(predictions)
|
|
44
|
-
|
|
45
|
-
fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
|
|
46
|
-
|
|
47
|
-
# Calculate Precision-Recall curve
|
|
48
|
-
precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
|
|
49
|
-
|
|
50
|
-
# Plotting ROC curve
|
|
51
|
-
plt.figure(figsize=(20, 5))
|
|
52
|
-
|
|
53
|
-
plt.subplot(1, 4, 1)
|
|
54
|
-
plt.plot(fpr, tpr)
|
|
55
|
-
plt.title('ROC Curve')
|
|
56
|
-
plt.xlabel('False Positive Rate')
|
|
57
|
-
plt.ylabel('True Positive Rate')
|
|
58
|
-
|
|
59
|
-
# Plotting Precision-Recall curve
|
|
60
|
-
plt.subplot(1, 4, 2)
|
|
61
|
-
plt.plot(recall, precision)
|
|
62
|
-
plt.title('Precision-Recall Curve')
|
|
63
|
-
plt.xlabel('Recall')
|
|
64
|
-
plt.ylabel('Precision')
|
|
65
|
-
|
|
66
|
-
# Plotting Precision vs. Thresholds
|
|
67
|
-
plt.subplot(1, 4, 3)
|
|
68
|
-
plt.plot(thresholds_pr, precision[:-1]) # Precision and thresholds have off-by-one lengths
|
|
69
|
-
plt.title('Precision vs. Threshold')
|
|
70
|
-
plt.xlabel('Threshold')
|
|
71
|
-
plt.ylabel('Precision')
|
|
72
|
-
|
|
73
|
-
# Plotting Sample Percentage Captured vs. Thresholds
|
|
74
|
-
plt.subplot(1, 4, 4)
|
|
75
|
-
# Assuming 'tpr' or another appropriate metric represents the cumulative percentage
|
|
76
|
-
plt.plot(thresholds_roc, tpr) # Update 'tpr' with the correct metric if necessary
|
|
77
|
-
plt.title('Cumulative Percentage vs. Threshold')
|
|
78
|
-
plt.xlabel('Threshold')
|
|
79
|
-
plt.ylabel('Cumulative Percentage of Population')
|
|
80
|
-
|
|
81
|
-
plt.tight_layout()
|
|
82
|
-
plt.show()
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
class ClinVarBenchmark:
|
|
87
|
-
def __init__(self, df):
|
|
88
|
-
assert 'clinsig' in df.columns, 'No clinsig column found in dataframe.'
|
|
89
|
-
self.df = df
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def scale_predictions(self, p):
|
|
93
|
-
max_val = max(p)
|
|
94
|
-
min_val = min(p)
|
|
95
|
-
return (p - min_val) / (max_val - min_val)
|
|
96
|
-
|
|
97
|
-
def plot_performance(self, true_values, predictions):
|
|
98
|
-
clinsig_map = {'Benign': 0, 'Pathogenic': 1}
|
|
99
|
-
predictions = [clinsig_map[t] for t in true_values]
|
|
100
|
-
predictions = self.scale_predictions(predictions)
|
|
101
|
-
|
|
102
|
-
fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
|
|
103
|
-
|
|
104
|
-
# Calculate Precision-Recall curve
|
|
105
|
-
precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
|
|
106
|
-
|
|
107
|
-
# Plotting ROC curve
|
|
108
|
-
plt.figure(figsize=(20, 5))
|
|
109
|
-
|
|
110
|
-
plt.subplot(1, 4, 1)
|
|
111
|
-
plt.plot(fpr, tpr)
|
|
112
|
-
plt.title('ROC Curve')
|
|
113
|
-
plt.xlabel('False Positive Rate')
|
|
114
|
-
plt.ylabel('True Positive Rate')
|
|
115
|
-
|
|
116
|
-
# Plotting Precision-Recall curve
|
|
117
|
-
plt.subplot(1, 4, 2)
|
|
118
|
-
plt.plot(recall, precision)
|
|
119
|
-
plt.title('Precision-Recall Curve')
|
|
120
|
-
plt.xlabel('Recall')
|
|
121
|
-
plt.ylabel('Precision')
|
|
122
|
-
|
|
123
|
-
# Plotting Precision vs. Thresholds
|
|
124
|
-
plt.subplot(1, 4, 3)
|
|
125
|
-
plt.plot(thresholds_pr, precision[:-1]) # Precision and thresholds have off-by-one lengths
|
|
126
|
-
plt.title('Precision vs. Threshold')
|
|
127
|
-
plt.xlabel('Threshold')
|
|
128
|
-
plt.ylabel('Precision')
|
|
129
|
-
|
|
130
|
-
# Plotting Sample Percentage Captured vs. Thresholds
|
|
131
|
-
plt.subplot(1, 4, 4)
|
|
132
|
-
# Assuming 'tpr' or another appropriate metric represents the cumulative percentage
|
|
133
|
-
plt.plot(thresholds_roc, tpr) # Update 'tpr' with the correct metric if necessary
|
|
134
|
-
plt.title('Cumulative Percentage vs. Threshold')
|
|
135
|
-
plt.xlabel('Threshold')
|
|
136
|
-
plt.ylabel('Cumulative Percentage of Population')
|
|
137
|
-
|
|
138
|
-
plt.tight_layout()
|
|
139
|
-
plt.show()
|
|
140
|
-
return None
|
|
141
|
-
|
|
142
|
-
def report(self, feature):
|
|
143
|
-
pass
|
|
144
|
-
|
|
145
|
-
def find_ppv_threshold(self, feature, ppv_threshold=0.95):
|
|
146
|
-
pass
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if __name__ == '__main__':
|
|
151
|
-
now = datetime.now()
|
|
152
|
-
benchmark_path = config_setup['ONCOSPLICE'] / f'clinvar_benchmark_{now.strftime("%m_%d_%Y")}'
|
|
153
|
-
print(f"Saving benchmark results to {benchmark_path}")
|
|
154
|
-
benchmark_path.mkdir(parents=True, exist_ok=True)
|
|
155
|
-
subprocess.run(['python', '-m', 'geney.pipelines.dask_utils', '-i',
|
|
156
|
-
'/tamir2/nicolaslynn/data/ClinVar/clinvar_oncosplice_input.txt', '-r', str(benchmark_path),
|
|
157
|
-
'-n', '10', '-m', '5GB'])
|
|
158
|
-
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from geney.oncosplice import *
|
|
2
|
-
|
|
3
|
-
class PairwiseEpistasis:
|
|
4
|
-
def __init__(self, epistasis):
|
|
5
|
-
# need some check here making sure format of mtuations isi good
|
|
6
|
-
self.epistasis = epistasis
|
|
7
|
-
self.mut_id1, self.mut_id2 = epistasis.split('|')
|
|
8
|
-
|
|
9
|
-
def compare_functional_changes(self):
|
|
10
|
-
self.results_mut1 = oncosplice(self.mut_id1, sai_threshold=0.5)
|
|
11
|
-
self.results_mut2 = oncosplice(self.mut_id2, sai_threshold=0.5)
|
|
12
|
-
self.results_epi = oncosplice(self.epistasis, sai_threshold=0.5)
|
|
13
|
-
|
|
14
|
-
splicing1, splicing2, splicing_epi = 0, 0, 0
|
|
15
|
-
oncosplice_score1, oncosplice_score2, oncosplice_score_epi = 0, 0, 0
|
geney/analyzers/compare_sets.py
DELETED
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import numpy as np
|
|
3
|
-
from sklearn.metrics import precision_score, recall_score, accuracy_score
|
|
4
|
-
from sklearn.metrics import roc_auc_score, roc_curve
|
|
5
|
-
import matplotlib.pyplot as plt
|
|
6
|
-
|
|
7
|
-
def plot_auc_curve(y_true, y_pred_proba):
|
|
8
|
-
"""
|
|
9
|
-
Plots the AUC curve.
|
|
10
|
-
|
|
11
|
-
Args:
|
|
12
|
-
y_true (array-like): True labels (0 or 1).
|
|
13
|
-
y_pred_proba (array-like): Predicted probabilities for positive class.
|
|
14
|
-
|
|
15
|
-
Returns:
|
|
16
|
-
None
|
|
17
|
-
"""
|
|
18
|
-
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
|
19
|
-
auc_value = roc_auc_score(y_true, y_pred_proba)
|
|
20
|
-
|
|
21
|
-
plt.figure(figsize=(8, 6))
|
|
22
|
-
plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
|
|
23
|
-
plt.plot([0, 1], [0, 1], 'k--')
|
|
24
|
-
plt.xlabel("False Positive Rate")
|
|
25
|
-
plt.ylabel("True Positive Rate")
|
|
26
|
-
plt.title("Receiver Operating Characteristic (ROC) Curve")
|
|
27
|
-
plt.legend()
|
|
28
|
-
plt.show()
|
|
29
|
-
return auc_value
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def optimal_ppv(dataframe, feature_name, plot=False):
|
|
33
|
-
"""
|
|
34
|
-
Calculates the optimal positive predictive value (PPV) for a given feature.
|
|
35
|
-
|
|
36
|
-
Args:
|
|
37
|
-
dataframe (pd.DataFrame): Input dataframe.
|
|
38
|
-
feature_name (str): Name of the feature column.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
float: Optimal PPV.
|
|
42
|
-
"""
|
|
43
|
-
# Assuming 'target' is the binary target column (0 or 1)
|
|
44
|
-
threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
|
|
45
|
-
ppv_values = []
|
|
46
|
-
|
|
47
|
-
for threshold in threshold_values:
|
|
48
|
-
predictions = (dataframe[feature_name] >= threshold).astype(int)
|
|
49
|
-
ppv = precision_score(dataframe['target'], predictions)
|
|
50
|
-
ppv_values.append(ppv)
|
|
51
|
-
|
|
52
|
-
optimal_threshold = threshold_values[np.argmax(ppv_values)]
|
|
53
|
-
optimal_ppv = max(ppv_values)
|
|
54
|
-
if plot:
|
|
55
|
-
plt.figure(figsize=(8, 6))
|
|
56
|
-
plt.scatter(threshold_values, ppv_values)
|
|
57
|
-
plt.xlabel("Threshold")
|
|
58
|
-
plt.ylabel("Positive Predictive Value (PPV)")
|
|
59
|
-
plt.title("Optimal Positive Predictive Value (PPV)")
|
|
60
|
-
plt.show()
|
|
61
|
-
|
|
62
|
-
return optimal_ppv, optimal_threshold
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def measure_prediction_quality(prediction_vector, quality_vector):
|
|
66
|
-
"""
|
|
67
|
-
Measure the quality of the predictions using the quality_vector as the characteristic to check.
|
|
68
|
-
"""
|
|
69
|
-
pass
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def create_ppv_vector(prediction_vector, true_value_vector):
|
|
74
|
-
"""
|
|
75
|
-
Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
|
|
76
|
-
"""
|
|
77
|
-
df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
|
|
78
|
-
df.sort_values('prediction', ascending=True, inplace=True)
|
|
79
|
-
df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
|
|
80
|
-
for bin in df.bin.unique():
|
|
81
|
-
temp_df = df[df.bin >= bin].
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def group_retention(predictions, predictor):
|
|
85
|
-
# first i need to get the ratio of values that are retained at particular values
|
|
86
|
-
predictions.sort_values(predictor, inplace=True)
|
|
87
|
-
_, thresholds = pd.qcut(predictions[predictor], 100, duplicates='drop')
|
|
88
|
-
tracker = []
|
|
89
|
-
for th in thresholds:
|
|
90
|
-
|
|
91
|
-
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import numpy as np
|
|
3
|
-
from sklearn.metrics import precision_score, recall_score, accuracy_score
|
|
4
|
-
from sklearn.metrics import roc_auc_score, roc_curve
|
|
5
|
-
import matplotlib.pyplot as plt
|
|
6
|
-
|
|
7
|
-
def plot_auc_curve(y_true, y_pred_proba):
|
|
8
|
-
"""
|
|
9
|
-
Plots the AUC curve.
|
|
10
|
-
|
|
11
|
-
Args:
|
|
12
|
-
y_true (array-like): True labels (0 or 1).
|
|
13
|
-
y_pred_proba (array-like): Predicted probabilities for positive class.
|
|
14
|
-
|
|
15
|
-
Returns:
|
|
16
|
-
None
|
|
17
|
-
"""
|
|
18
|
-
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
|
19
|
-
auc_value = roc_auc_score(y_true, y_pred_proba)
|
|
20
|
-
|
|
21
|
-
plt.figure(figsize=(8, 6))
|
|
22
|
-
plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
|
|
23
|
-
plt.plot([0, 1], [0, 1], 'k--')
|
|
24
|
-
plt.xlabel("False Positive Rate")
|
|
25
|
-
plt.ylabel("True Positive Rate")
|
|
26
|
-
plt.title("Receiver Operating Characteristic (ROC) Curve")
|
|
27
|
-
plt.legend()
|
|
28
|
-
plt.show()
|
|
29
|
-
return auc_value
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def optimal_ppv(dataframe, feature_name, plot=False):
|
|
33
|
-
"""
|
|
34
|
-
Calculates the optimal positive predictive value (PPV) for a given feature.
|
|
35
|
-
|
|
36
|
-
Args:
|
|
37
|
-
dataframe (pd.DataFrame): Input dataframe.
|
|
38
|
-
feature_name (str): Name of the feature column.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
float: Optimal PPV.
|
|
42
|
-
"""
|
|
43
|
-
# Assuming 'target' is the binary target column (0 or 1)
|
|
44
|
-
threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
|
|
45
|
-
ppv_values = []
|
|
46
|
-
|
|
47
|
-
for threshold in threshold_values:
|
|
48
|
-
predictions = (dataframe[feature_name] >= threshold).astype(int)
|
|
49
|
-
ppv = precision_score(dataframe['target'], predictions)
|
|
50
|
-
ppv_values.append(ppv)
|
|
51
|
-
|
|
52
|
-
optimal_threshold = threshold_values[np.argmax(ppv_values)]
|
|
53
|
-
optimal_ppv = max(ppv_values)
|
|
54
|
-
if plot:
|
|
55
|
-
plt.figure(figsize=(8, 6))
|
|
56
|
-
plt.scatter(threshold_values, ppv_values)
|
|
57
|
-
plt.xlabel("Threshold")
|
|
58
|
-
plt.ylabel("Positive Predictive Value (PPV)")
|
|
59
|
-
plt.title("Optimal Positive Predictive Value (PPV)")
|
|
60
|
-
plt.show()
|
|
61
|
-
|
|
62
|
-
return optimal_ppv, optimal_threshold
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def measure_prediction_quality(prediction_vector, quality_vector):
|
|
66
|
-
"""
|
|
67
|
-
Measure the quality of the predictions using the quality_vector as the characteristic to check.
|
|
68
|
-
"""
|
|
69
|
-
pass
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def create_ppv_vector(prediction_vector, true_value_vector):
|
|
74
|
-
"""
|
|
75
|
-
Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
|
|
76
|
-
"""
|
|
77
|
-
df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
|
|
78
|
-
df.sort_values('prediction', ascending=True, inplace=True)
|
|
79
|
-
df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
|
|
80
|
-
for bin in df.bin.unique():
|
|
81
|
-
temp_df = df[df.bin >= bin].
|