geney 1.3.79__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of geney might be problematic.

@@ -0,0 +1,143 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+ # from scipy.integrate import trapezoid
+ from geney.utils import unload_pickle, unload_json, contains
+ from lifelines.exceptions import ConvergenceError
+ from lifelines import KaplanMeierFitter
+ from lifelines.statistics import logrank_test
+ from lifelines import CoxPHFitter
+
+ pd.set_option('display.max_columns', None)
+ pd.options.mode.chained_assignment = None
+
+
+ def prepare_clinical_data(df=None):
+     if df is None:
+         CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+         df = unload_pickle(CLINICAL_DATA_FILE)
+
+     df.rename(columns={'patient_uuid': 'case_id'}, inplace=True)
+     cols = list(df.columns)
+     cols_days_to_followup = [col for col in cols if 'days_to_followup' in col] + [col for col in cols if 'days_to_last_followup' in col]
+     cols_days_to_know_alive = [col for col in cols if 'days_to_know_alive' in col] + [col for col in cols if 'days_to_last_known_alive' in col]
+     cols_days_to_death = [col for col in cols if 'days_to_death' in col]
+     cols_duration = cols_days_to_followup + cols_days_to_know_alive + cols_days_to_death
+     col_vital_status = 'days_to_death'
+     event_col_label = 'event'
+     duration_col_label = 'duration'
+     df.insert(1, event_col_label, df.apply(lambda x: int(not np.isnan(x[col_vital_status])), axis=1))
+     df.insert(1, duration_col_label, df.apply(lambda x: max([x[col] for col in cols_duration if not np.isnan(x[col])], default=-1), axis=1))
+     df[duration_col_label] /= 365
+     df = df.query(f"{duration_col_label}>=0.0")[['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
+     # df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
+     return df
+
+
+ class SurvivalAnalysis:
+     def __init__(self, clindf=None):
+         self.clindf = prepare_clinical_data(clindf)
+         self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
+         self.df = self.clindf.copy()
+         self.df['group'] = 0
+         self.df.fillna(0, inplace=True)
+         self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
+
+     def generate_clinical_dataframe(self, target_cases, control_cases=None, inplace=False, features_of_interest=[]):
+         df = self.df.copy()
+         df.loc[df[df.case_id.isin(target_cases)].index, 'group'] = 2
+         if control_cases is not None:
+             df.loc[df[df.case_id.isin(control_cases)].index, 'group'] = 1
+
+         df = df[df.group > 0]
+         df.group -= 1
+         core_features = ['duration', 'event']
+         df = df[core_features + features_of_interest]
+
+         for col in self.treatment_features:
+             if col not in df:
+                 continue
+             df.loc[df[col] > 0, col] = 1
+
+         df = df[core_features + [col for col in features_of_interest if
+                                  df[col].nunique() > 1]]  # and df[col].value_counts(normalize=True).min() >= 0.01]]
+         return df
+
+     def kaplan_meier_analysis(self, df, control_label='Unaffected Patients', target_label='Affected Patients', feature='group', plot=False, title=None, time_cap=False, savepath=None, figsize=(7, 3), tmb_p_value=None):
+         # Can only be performed on features with two unique values
+         cap_time = df.groupby(feature).duration.max().min()
+         # df['duration'] = df['duration'].clip(upper=cap_time)
+         auc_vals = []
+         results = pd.Series()
+         count = 0
+         for val in [0, 1]:
+             g = df[df[feature] == val]
+             kmf = KaplanMeierFitter()
+             label = f"{control_label} ({len(g)} cases)" if val == 0 else f"{target_label} ({len(g)} cases)"
+             if val == 0:
+                 results[control_label] = len(g)
+             else:
+                 results[target_label] = len(g)
+
+             kmf.fit(g['duration'], g['event'], label=label)
+             surv_func = kmf.survival_function_
+             filtered_surv_func = surv_func[surv_func.index <= cap_time]
+             auc = np.trapz(filtered_surv_func[label], filtered_surv_func.index)
+             # auc = trapz(surv_func[label], surv_func.index)
+             auc_vals.append(auc)
+             if plot:
+                 if count == 0:
+                     fig, ax = plt.subplots(figsize=figsize)
+                     kmf.plot_survival_function(ax=ax, ci_show=True, color="#2430e0", lw=2)
+                 else:
+                     kmf.plot_survival_function(ax=ax, ci_show=True, color="#e60215", lw=2)
+             count += 1
+
+         p_value = self.log_rank(df[df[feature] == 1], df[df[feature] == 0])
+
+         if plot:
+             ax.text(0.6, 0.6, rf'Survival $p{{v}}$: {p_value:.3e}', transform=ax.transAxes, fontsize=10,
+                     horizontalalignment='left')
+             if tmb_p_value:
+                 ax.text(0.6, 0.53, rf'TMB $p{{v}}$: {tmb_p_value:.3e}', transform=ax.transAxes, fontsize=10,
+                         horizontalalignment='left')
+             # Grid and spines
+             ax.grid(True, which="major", linestyle="--", linewidth=0.5, color="grey", alpha=0.7)
+             ax.spines['top'].set_visible(False)
+             ax.spines['right'].set_visible(False)
+             ax.tick_params(axis="both", which="major", labelsize=10)
+             if title:
+                 ax.set_title(title, fontsize=12)
+             legend = ax.legend(fontsize=9, loc='best', frameon=True)
+             legend.get_frame().set_facecolor('white')  # Set the background color to white
+             legend.get_frame().set_edgecolor('black')  # Set the edge color to black
+             plt.xlabel('Time (years)')
+             plt.ylabel('Survival Probability')
+             if time_cap:
+                 plt.xlim([0, cap_time])
+             plt.tight_layout()
+             if savepath is not None:
+                 plt.savefig(savepath, bbox_inches='tight', dpi=300)
+             plt.show()
+
+         results['p_value'] = p_value
+         results['auc_target'] = auc_vals[-1]
+         if len(auc_vals) > 1:
+             results['auc_delta'] = auc_vals[-1] - auc_vals[0]
+             results['auc_control'] = auc_vals[0]
+
+         return results
+
+     def log_rank(self, group1, group2):
+         return logrank_test(group1['duration'], group2['duration'],
+                             event_observed_A=group1['event'],
+                             event_observed_B=group2['event']).p_value
+
+     def perform_cox_analysis(self, df, features_of_interest):
+         # Very simple... will return a series with p values for each feature
+         try:
+             return CoxPHFitter().fit(df[features_of_interest + ['duration', 'event']], 'duration', 'event').summary.p
+         except ConvergenceError:
+             print("Convergence Error")
+             return pd.Series()
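
For orientation, here is a minimal usage sketch of the survival API added above: build a two-group clinical table, then run the Kaplan-Meier comparison and the Cox model. The case-ID lists below are hypothetical placeholders, and the no-argument constructor reads a cluster-local clinical pickle, so treat this as a sketch of the intended call sequence rather than a runnable test.

sa = SurvivalAnalysis()  # with no argument, loads the default clinical table via prepare_clinical_data()

target_cases = ['case-0001', 'case-0002']    # hypothetical: patients carrying the event of interest
control_cases = ['case-0100', 'case-0101']   # hypothetical: comparison patients

# duration/event table; 'group' is 1 for target cases and 0 for controls
# (covariates that end up constant across the two groups are dropped)
df = sa.generate_clinical_dataframe(target_cases, control_cases,
                                    features_of_interest=['group', 'chemotherapy'])

# log-rank p-value plus trapezoidal AUCs of the two Kaplan-Meier curves up to the shared follow-up cap
results = sa.kaplan_meier_analysis(df, plot=False)
print(results[['p_value', 'auc_target', 'auc_control', 'auc_delta']])

# Cox proportional-hazards p-values for the chosen covariate(s)
cox_p = sa.perform_cox_analysis(df, features_of_interest=['group'])
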
geney/_tcga_utils.py ADDED
@@ -0,0 +1,405 @@
+ import pandas as pd
+ import random
+ from pathlib import Path
+ from tqdm import tqdm
+
+ class TCGACase:
+     def __init__(self, df):
+         # Here we get a dataframe of mutations within a gene
+         self.df = df
+         self.calculate_vaf()
+         self.space_variants(spacer_size=50)
+         self.case_id = df.case_id.tolist()[0]
+
+     def space_variants(self, spacer_size=100, group_likelihood_threshold=0):
+         df = self.df
+         if df.empty:
+             df['group'] = 0
+             return self
+         values = sorted(df.Start_Position.unique().tolist())
+         # groups = [list(group) for key, group in groupby(values, key=lambda x: (x - values[values.index(x) - 1] >
+         #                                                 spacer_size) if values.index(x) > 0 else False)]  # Initialize variables
+         groups = []
+         current_group = []
+
+         # Iterate through the values
+         for i in range(len(values)):
+             if i == 0:
+                 current_group.append(values[i])
+             else:
+                 if values[i] - values[i - 1] <= spacer_size:
+                     current_group.append(values[i])
+                 else:
+                     groups.append(current_group)
+                     current_group = [values[i]]
+
+         # Append the last group if it's not empty
+         if current_group:
+             groups.append(current_group)
+
+         df.loc[:, 'group'] = 0
+         for i, g in enumerate(groups):
+             df.loc[df.Start_Position.isin(g), 'group'] = i
+         self.df = df
+         return self
+
+     def calculate_vaf(self):
+         df = self.df
+         df = df[df.t_depth > 0]
+         df.loc[:, 'vaf'] = df.apply(lambda row: row.t_alt_count / row.t_depth, axis=1)
+         self.df = df
+         return self
+
+     def find_overlayed_variants(self):
+         df = self.df
+         mut_counts = df.mut_id.value_counts()
+         mut_counts = mut_counts[mut_counts > 1].index
+
+         small_df = df.groupby('mut_id', as_index=False).agg({
+             't_depth': 'sum',
+             't_alt_count': 'sum',
+             't_ref_count': 'sum',
+         })
+
+         df = df.drop_duplicates(subset='mut_id', keep='first')
+
+         small_df = small_df[small_df.t_depth > 0]
+         small_df['vaf'] = small_df.t_alt_count / small_df.t_depth
+
+         small_df = small_df.set_index('mut_id')
+         df.set_index('mut_id', inplace=True)
+         df.update(small_df)
+         df.reset_index(inplace=True)
+         self.df = df
+         return self
+
+     def find_epistasis(self, pth=3, rth=0):
+         df = self.df
+         if df.empty:
+             return None
+         # df = df[df.t_alt_count > rth].sort_values('Start_Position', ascending=True)
+         df = df[(df.t_alt_count > df.t_ref_count / pth) & (df.t_alt_count >= rth)].sort_values('Start_Position',
+                                                                                                ascending=True)
+
+         # display(df[['mut_id', 't_alt_count', 't_ref_count']])
+
+         # Group by the group_key
+         grouped = df.groupby('group').agg({
+             'mut_id': lambda x: '|'.join(x),
+             't_alt_count': 'mean',
+             't_ref_count': 'mean',
+             'case_id': 'first'
+         }).reset_index(drop=True)
+
+         # Drop the group_key column
+         return grouped[grouped.mut_id.str.contains(r'\|')][['mut_id', 't_alt_count', 't_ref_count', 'case_id']]
+
+
+ class TCGAGene:
+     def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
+                  valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
+         file_path = cancer_path / gene / 'GeneMutTble.txt'
+         if not file_path.exists():
+             self.df = pd.DataFrame()
+
+         else:
+             df = pd.read_csv(file_path,
+                              usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
+                                       'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
+                                       'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                       'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type',
+                                       'Variant_Classification'] + extra_cols,
+                              low_memory=False).sort_values('Start_Position', ascending=True)
+
+             df['attention'] = True
+
+             if df.empty:
+                 self.df = df
+
+             else:
+                 df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
+                 df = df.astype({'Start_Position': int})
+
+                 if include_filter is not None:
+                     # df = df[df.FILTER == include_filter]
+                     df.loc[~df['FILTER'].str.contains(include_filter), 'attention'] = False
+
+                 elif exclude_filters is not None:
+                     for exclude_filter in exclude_filters:
+                         # df = df[~df.FILTER.str.contains(exclude_filter)]
+                         df.loc[df['FILTER'].str.contains(exclude_filter), 'attention'] = False
+
+                 if valid_cases is not None:
+                     # df = df[df.case_id.isin(valid_cases)]
+                     df.loc[~df.case_id.isin(valid_cases), 'attention'] = False
+
+                 df['mut_id'] = df.apply(lambda
+                                         row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                                         axis=1)
+                 df['mut_id_yoram'] = df.apply(lambda
+                                               row: f"{row.Gene_name}:{row.Chromosome}:{row.Variant_Classification}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                                               axis=1)
+                 silent_mut_classes = ["3'Flank", "3'UTR", "Silent", "Splice_Site", "Splice_Region", "Intron", "5'Flank",
+                                       "3'Flank"]
+                 df['silent'] = df.apply(lambda row: row.Variant_Classification in silent_mut_classes, axis=1)
+                 df['ratio'] = df.t_alt_count + df.t_ref_count
+                 df = df[df.ratio > 0]
+                 df['ratio'] = df.t_alt_count / df.ratio
+                 self.df = df
+
+     def __repr__(self):
+         return repr(self.df[self.df.attention])
+
+     @property
+     def data(self):
+         return self.df[self.df.attention]
+
+     def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
+         if mut_id is None:
+             return self.df.case_id.unique().tolist()
+         df = self.df
+         df = df[(df.mut_id == mut_id) & (df.ratio >= read_ratio)]
+         for filter in filters:
+             df = df[~df.FILTER.str.contains(filter)]
+         return df.case_id.unique().tolist()
+
+     def get_patient_muts(self, case_id=None, read_ratio=0, exclude_filters=None):
+         if case_id is None:
+             case_id = random.choice(self.affected_cases())
+         return self.df[self.df.case_id == case_id]
+
+     def get_patients_affected(self, mut_id, read_ratio=0, exclude_filters=None):
+         return self.data[self.data.mut_id == mut_id].case_id.unique().tolist()
+
+
+     def get_patients_unaffected(self, mut_id, must_contain_all=False, read_ratio=0, exclude_filters=None):
+         # returns all patients not affected by ALL the mutations in mut_id (patients containing only individual mutations are allowed) unless must_contain_all=True
+         pass
+
+     def split_patients(self, mut_id, strict=True):
+         # returns two lists: all patients affected by a mutation and all patients with none of the mutations (or the mutations but not together)
+         pass
+
+     def arrange_patients_by_project(self, mut_id):
+         # returns all the patients affected by a mutation grouped by cancer project
+         pass
+
+     def total_prevalence(self, mut_id):
+         pass
+
+     def project_prevalence(self, mut_id, df_p_proc):
+         mut_prevalence = {}
+         for i, g in tqdm(self.data.groupby(['mut_id', 'Transcript_ID'])):
+             mut_prevalence[i] = series_to_pretty_string((df_p_proc[g.case_id].value_counts() / project_counts).dropna())
+         return pd.Series(mut_prevalence)
+
+     def project_counts(self, mut_id):
+         pass
+
+     def filter_silent_muts(self):
+         self.df.loc[self.df.silent, 'attention'] = False
+         return self
+
+
+ def series_to_pretty_string(series):
+     # Format each index-value pair, applying scientific notation to floats with 3 significant figures
+     pretty_str = "\n".join([
+         f"{index}: {value:.3e}" if isinstance(value, float) else f"{index}: {value}"
+         for index, value in series.items()
+     ])
+     return pretty_str
+
+
+ # CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
+ # CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+ # CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
+ # MAF_FILE_NAME = 'GeneMutTble.txt'
+ # CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
+ # PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
+ # OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
+ # MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
+ # PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
+ # PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
+
+ # def prepare_gene_sets():
+ #     # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
+ #     # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
+ #     # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
+ #     # all_oncogenes = list(set(all_oncogenes))
+ #     return [], [], []
+ #
+ # CLIN_DF = prepare_clinical_data()
+ # TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
+ #
+ #
+ # def generate_survival_quantitative(affected_df, nonaffected_df):
+ #     if affected_df.empty or nonaffected_df.empty:
+ #         return np.nan, np.nan, np.nan
+ #     results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
+ #                            event_observed_A=affected_df['event'],
+ #                            event_observed_B=nonaffected_df['event'])
+ #     p_value = results.p_value
+ #     kmf = KaplanMeierFitter()
+ #     kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
+ #     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+ #     auc1 = np.trapz(surv_probs, times)
+ #     kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
+ #     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+ #     auc2 = np.trapz(surv_probs, times)
+ #     return p_value, auc1, auc2
+ #
+ # def generate_survival_pvalue(affected_df, unaffected_df):
+ #     results = logrank_test(affected_df['duration'], unaffected_df['duration'],
+ #                            event_observed_A=affected_df['event'],
+ #                            event_observed_B=unaffected_df['event'])
+ #
+ #     p_value = results.p_value
+ #     kmf = KaplanMeierFitter()
+ #     # Fit data
+ #     kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
+ #     ax = kmf.plot()
+ #
+ #     kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
+ #     kmf.plot(ax=ax)
+ #     plt.text(5, 0.95, f'pval: {p_value:.3e}')
+ #     plt.show()
+ #     return p_value
+ #
+ # def get_project_prevalence(cases_affected):
+ #     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+ #     prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
+ #     prevalences.fillna(0, inplace=True)
+ #     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+ #     prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
+ #     return prevalences
+ #
+ # def get_project_counts(cases_affected):
+ #     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+ #     prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
+ #     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+ #     prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
+ #     return prevalences
+ #
+ # def get_event_consequence(df):
+ #     assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequence.'
+ #     return df.iloc[0].Consequence
+ #
+ # def get_dbSNP_id(df):
+ #     return df.iloc[0].dbSNP_RS
+ #
+ # def load_variant_file(gene):
+ #     df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
+ #     df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
+ #     return df
+ #
+ # def find_event_data(event):
+ #     df = load_variant_file(event.gene)
+ #     if df.empty:
+ #         return None
+ #
+ #     df = df.query \
+ #         ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
+ #
+ #     if df.empty:
+ #         return None
+ #
+ #     if event.transcript_id is not None:
+ #         df = df[df.Transcript_ID == event.transcript_id]
+ #     df['mut_id'] = event.event_id
+ #     return df
+ #
+ #
+ # class GEvent:
+ #     def __init__(self, event_id, transcript_id=None):
+ #         self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
+ #         self.transcript_id = transcript_id
+ #         self.chromosome = f'chr{self.chromosome}'
+ #         self.start = int(self.start)
+ #         self.event_id = event_id
+ #
+ #
+ #
+ # def get_okgp_mutation_frequency(mut_id):
+ #     if mut_id in MUTATION_FREQ_DF.index:
+ #         return MUTATION_FREQ_DF.loc[mut_id].cases_affected
+ #     else:
+ #         return 0
+ #
+ # def get_df_filter_info(df):
+ #     filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
+ #     MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
+ #                              'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
+ #     filter_col_name: str = "FILTER_info"  # column name to add to the dataframe
+ #     filter_info_list: list = []
+ #     f_cnr_info = {}
+ #
+ #     for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
+ #         filter_vals = list(df_prj['FILTER'])
+ #         num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
+ #         for filter_val in filter_vals:
+ #             num_pass += ('PASS' in filter_val)
+ #             num_artifacts += any([x in filter_val for x in filter_artifact_values])
+ #             num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
+ #         num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
+ #         f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
+ #     return f_cnr_info
+ #
+ # def yoram_mutid(row):
+ #     return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
+ #
+ #
+ # def annotate_level_two(mut_id, tid):
+ #     mut = GEvent(mut_id, tid)
+ #     df = find_event_data(mut)
+ #
+ #     if df.empty or df is None:
+ #         return None
+ #
+ #     patients_affected = df.cases_affected.unique().tolist()
+ #     p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
+ #     project_prevalences = get_project_prevalence(patients_affected)
+ #     prev_dict = project_prevalences.to_dict().sort()
+ #     project_counts = get_project_counts(patients_affected)
+ #
+ #     s = pd.Series({
+ #         'mut_id': mut_id,
+ #         'yoram_mut_id': yoram_mutid(df.iloc[0]),
+ #         'transcript_id': tid,
+ #         'affected_cases': len(patients_affected),
+ #         'dbSNP_id': get_dbSNP_id(df),
+ #         'consequence': get_event_consequence(df),
+ #         'survival_p_value': p_val,
+ #         'auc_affected': auc_a,
+ #         'auc_nonaffected': auc_n,
+ #         'TSG': contains(TSGS, mut.gene),
+ #         'oncogene': contains(ONCOGENES, mut.gene),
+ #         'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
+ #         'filter_inf': get_df_filter_info(df),
+ #         'strand': df.Strand.unique().tolist()[0],
+ #         'prevalences': prev_dict
+ #     })
+ #
+ #     s['max_prev'] = project_prevalences.max()
+ #     s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
+ #     s = pd.concat([s, project_prevalences, project_counts])
+ #     del df
+ #     return s
+ #
+ # def get_mut_counts():
+ #     cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
+ #     cases = pd.Series(cases)
+ #     cases.name = 'num_cases'
+ #     cases.index.name = 'mut_id'
+ #     cases = cases.to_frame()
+ #     cases.reset_index(inplace=True)
+ #     return cases
+ #
+ #
+ def create_mut_id(row):
+     return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
+ #
+ #
+ # def is_in_exon(mut_id, tid):
+ #     from geney.Gene import Gene
+ #     transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
+ #     return int(mut_id.split(':')[2]) in transcript.exonic_indices
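
As a rough sketch of how the classes in _tcga_utils.py are meant to be chained: load one gene's MAF table into TCGAGene, narrow it down, and hand a single patient's rows to TCGACase. The gene symbol and FILTER value below are illustrative choices rather than package defaults, and the default cancer_path only exists on the original cluster, so this is a hedged usage sketch, not a guaranteed-runnable example.

gene = TCGAGene('TP53', exclude_filters=['panel_of_normals'])  # gene symbol and filter value are illustrative
gene.filter_silent_muts()                   # mask silent variant classes via the 'attention' flag

cases = gene.affected_cases()               # case_ids with at least one recorded mutation in this gene
muts = gene.get_patient_muts(cases[0])      # MAF rows for a single patient

case = TCGACase(muts)                       # computes per-variant VAF and groups variants within 50 bp
pairs = case.find_epistasis(pth=3, rth=5)   # candidate co-occurring variant groups ('|'-joined mut_ids)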