geney 1.3.78__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +55 -57
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/spliceai_utils.py +17 -17
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/METADATA +1 -1
- geney-1.4.0.dist-info/RECORD +51 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/WHEEL +1 -1
- geney-1.3.78.dist-info/RECORD +0 -31
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/top_level.txt +0 -0
geney/_survival_utils.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
# from scipy.integrate import trapezoid
|
|
6
|
+
from geney.utils import unload_pickle, unload_json, contains
|
|
7
|
+
from lifelines.exceptions import ConvergenceError
|
|
8
|
+
from lifelines import KaplanMeierFitter
|
|
9
|
+
from lifelines.statistics import logrank_test
|
|
10
|
+
from lifelines import CoxPHFitter
|
|
11
|
+
|
|
12
|
+
pd.set_option('display.max_columns', None)
|
|
13
|
+
pd.options.mode.chained_assignment = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def prepare_clinical_data(df=None):
    """Build the survival table used by the survival analyses.

    The input frame is never modified: all work happens on a renamed copy, so
    the same raw clinical frame can be passed in repeatedly.

    Args:
        df: Raw clinical dataframe. It must contain a ``days_to_death``
            column, the four treatment columns, ``Proj_name`` and either
            ``case_id`` or ``patient_uuid``. When ``None`` the frame is
            unpickled from the hard-coded default location below.

    Returns:
        A new dataframe with the columns ``duration`` (years), ``event``
        (1 when ``days_to_death`` is present, else 0), ``case_id``, the four
        treatment columns and ``Proj_name``. Rows for which no duration
        column holds a value are dropped.

    Raises:
        KeyError: if ``days_to_death`` or any of the returned columns is
            missing from the input.
    """
    if df is None:
        # NOTE(review): machine-specific default path; callers elsewhere
        # should pass ``df`` explicitly.
        CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
        df = unload_pickle(CLINICAL_DATA_FILE)

    # ``rename`` without ``inplace`` returns a new frame, which keeps the
    # caller's dataframe untouched by the column additions below.
    df = df.rename(columns={'patient_uuid': 'case_id'})

    cols = list(df.columns)
    cols_days_to_followup = ([col for col in cols if 'days_to_followup' in col]
                             + [col for col in cols if 'days_to_last_followup' in col])
    cols_days_to_know_alive = ([col for col in cols if 'days_to_know_alive' in col]
                               + [col for col in cols if 'days_to_last_known_alive' in col])
    cols_days_to_death = [col for col in cols if 'days_to_death' in col]
    cols_duration = cols_days_to_followup + cols_days_to_know_alive + cols_days_to_death

    col_vital_status = 'days_to_death'
    event_col_label = 'event'
    duration_col_label = 'duration'

    # Object-typed columns (e.g. after unpickling) are converted so the
    # column-wise max/notna below behave like the numeric NaN checks.
    day_counts = df[cols_duration].apply(pd.to_numeric)

    # A recorded death date marks the event as observed.
    event = day_counts[col_vital_status].notna().astype(int)
    # Longest known time span per patient; -1 flags "nothing recorded" so the
    # row falls out of the ``>= 0`` filter below. Days are converted to years.
    duration = day_counts.max(axis=1).fillna(-1) / 365

    df = df.assign(**{duration_col_label: duration, event_col_label: event})
    keep = df[duration_col_label] >= 0.0
    return df.loc[keep, ['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy',
                         'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SurvivalAnalysis:
    """Kaplan-Meier, log-rank and Cox helpers over a clinical survival table.

    Attributes:
        clindf: Output of ``prepare_clinical_data`` for the supplied frame.
        df: Working copy of ``clindf`` with a ``group`` column initialised to
            0 and missing values replaced by 0.
        treatment_features: Names of the treatment columns that are
            binarised by ``generate_clinical_dataframe``.
    """

    def __init__(self, clindf=None):
        """Prepare the clinical table (loading the default one when ``clindf`` is None)."""
        self.clindf = prepare_clinical_data(clindf)
        self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
        self.df = self.clindf.copy()
        self.df['group'] = 0
        self.df.fillna(0, inplace=True)

    def generate_clinical_dataframe(self, target_cases, control_cases=None, inplace=False, features_of_interest=None):
        """Return the survival rows for a target cohort and its controls.

        Args:
            target_cases: Case ids forming the target cohort.
            control_cases: Case ids forming the control cohort. A case listed
                in both collections ends up in the control cohort because the
                control label is written last. When ``None`` only target
                cases are returned.
            inplace: Unused; kept for interface compatibility.
            features_of_interest: Extra columns to keep next to ``duration``
                and ``event``. ``group`` (0 = control, 1 = target) is only
                part of the result when it is listed here.

        Returns:
            A new dataframe with ``duration``, ``event`` and every requested
            feature that takes more than one distinct value in the selected
            rows. Treatment columns among them are reduced to 0/1.
        """
        features = [] if features_of_interest is None else list(features_of_interest)
        core_features = ['duration', 'event']

        df = self.df.copy()
        # Temporary labels: 2 = target, 1 = control, 0 = not selected.
        df.loc[df.case_id.isin(target_cases), 'group'] = 2
        if control_cases is not None:
            df.loc[df.case_id.isin(control_cases), 'group'] = 1

        # Explicit copies keep the writes below off a view of ``self.df``.
        df = df[df.group > 0].copy()
        df['group'] = df['group'] - 1
        df = df[core_features + features].copy()

        for col in self.treatment_features:
            if col in df:
                # Any positive treatment value counts as "treated".
                df.loc[df[col] > 0, col] = 1

        # Constant columns carry no information for downstream models.
        informative = [col for col in features if df[col].nunique() > 1]
        return df[core_features + informative]

    def kaplan_meier_analysis(self, df, control_label='Unaffected Patients', target_label='Affected Patients', feature='group', plot=False, title=None, time_cap=False, savepath=None, figsize=(7, 3), tmb_p_value=None):
        """Fit Kaplan-Meier curves for ``feature == 0`` and ``feature == 1``.

        Args:
            df: Frame with ``duration``, ``event`` and ``feature`` columns.
            control_label: Legend/result label for rows where ``feature == 0``.
            target_label: Legend/result label for rows where ``feature == 1``.
            feature: Column used to split the rows; only 0 and 1 are used.
            plot: When true, draw both curves and call ``plt.show()``.
            title: Optional axes title.
            time_cap: When true, limit the x axis to the shared time horizon.
            savepath: Optional path the figure is saved to before showing.
            figsize: Figure size forwarded to ``plt.subplots``.
            tmb_p_value: Optional extra p-value printed on the plot.

        Returns:
            A ``pd.Series`` holding the size of each group (keyed by the two
            labels), ``p_value`` (log-rank), ``auc_target``, ``auc_delta``
            (target minus control) and ``auc_control``. The areas are taken
            up to the shortest of the per-group maximum durations.
        """
        # Shared horizon: the curves are only compared where every group
        # present in ``feature`` still has follow-up.
        cap_time = df.groupby(feature).duration.max().min()
        # ``np.trapz`` was renamed to ``np.trapezoid`` in NumPy 2.0; prefer
        # the new name and fall back on older installations.
        integrate = getattr(np, 'trapezoid', None) or np.trapz

        results = pd.Series(dtype=object)
        auc_vals = []
        ax = None
        arms = ((0, control_label, "#2430e0"), (1, target_label, "#e60215"))
        for val, arm_label, color in arms:
            g = df[df[feature] == val]
            label = f"{arm_label} ({len(g)} cases)"
            results[arm_label] = len(g)

            kmf = KaplanMeierFitter()
            kmf.fit(g['duration'], g['event'], label=label)
            surv_func = kmf.survival_function_
            filtered_surv_func = surv_func[surv_func.index <= cap_time]
            auc_vals.append(integrate(filtered_surv_func[label], filtered_surv_func.index))

            if plot:
                if ax is None:
                    _, ax = plt.subplots(figsize=figsize)
                kmf.plot_survival_function(ax=ax, ci_show=True, color=color, lw=2)

        p_value = self.log_rank(df[df[feature] == 1], df[df[feature] == 0])

        if plot:
            ax.text(0.6, 0.6, rf'Survival $p{{v}}$: {p_value:.3e}', transform=ax.transAxes, fontsize=10,
                    horizontalalignment='left')
            # ``is not None`` so that a p-value of exactly 0.0 is still shown.
            if tmb_p_value is not None:
                ax.text(0.6, 0.53, rf'TMB $p{{v}}$: {tmb_p_value:.3e}', transform=ax.transAxes, fontsize=10,
                        horizontalalignment='left')
            self._style_survival_axes(ax, title)
            plt.xlabel('Time (years)')
            plt.ylabel('Survival Probability')
            if time_cap:
                plt.xlim([0, cap_time])
            plt.tight_layout()
            if savepath is not None:
                plt.savefig(savepath, bbox_inches='tight', dpi=300)
            plt.show()

        results['p_value'] = p_value
        results['auc_target'] = auc_vals[-1]
        if len(auc_vals) > 1:
            results['auc_delta'] = auc_vals[-1] - auc_vals[0]
            results['auc_control'] = auc_vals[0]

        return results

    @staticmethod
    def _style_survival_axes(ax, title):
        """Apply the grid, spine, tick, title and legend styling of the survival plot."""
        ax.grid(True, which="major", linestyle="--", linewidth=0.5, color="grey", alpha=0.7)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.tick_params(axis="both", which="major", labelsize=10)
        if title:
            ax.set_title(title, fontsize=12)
        legend = ax.legend(fontsize=9, loc='best', frameon=True)
        legend.get_frame().set_facecolor('white')
        legend.get_frame().set_edgecolor('black')

    def log_rank(self, group1, group2):
        """Return the log-rank test p-value between two duration/event frames."""
        return logrank_test(group1['duration'], group2['duration'],
                            event_observed_A=group1['event'],
                            event_observed_B=group2['event']).p_value

    def perform_cox_analysis(self, df, features_of_interest):
        """Fit a Cox proportional-hazards model and return per-feature p-values.

        Args:
            df: Frame with ``duration``, ``event`` and the requested features.
            features_of_interest: Covariate column names.

        Returns:
            The ``p`` column of the fitted model summary, or an empty float
            Series when the fit raises ``ConvergenceError`` (a message is
            printed in that case).
        """
        columns = list(features_of_interest) + ['duration', 'event']
        try:
            return CoxPHFitter().fit(df[columns], 'duration', 'event').summary.p
        except ConvergenceError:
            print("Convergence Error")
            # Explicit dtype: a bare ``pd.Series()`` has no well-defined dtype.
            return pd.Series(dtype=float)
|
geney/_tcga_utils.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import random
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
class TCGACase:
    """Mutations of one case within a gene, annotated with VAF and position groups.

    Attributes:
        df: Working dataframe; rows with ``t_depth <= 0`` are removed and the
            columns ``vaf`` and ``group`` are added on construction.
        case_id: First ``case_id`` of the supplied frame, or ``None`` when the
            frame has no rows.
    """

    def __init__(self, df):
        """Annotate ``df`` (a dataframe of mutations within a gene) without modifying it."""
        self.df = df
        self.calculate_vaf()
        self.space_variants(spacer_size=50)
        case_ids = df.case_id.tolist()
        # An empty frame is a supported input (see ``space_variants``), so it
        # must not fail on the id lookup.
        self.case_id = case_ids[0] if case_ids else None

    def space_variants(self, spacer_size=100, group_likelihood_threshold=0):
        """Assign a ``group`` id to every row based on positional proximity.

        Distinct ``Start_Position`` values are sorted; a new group starts
        whenever the gap to the previous position exceeds ``spacer_size``.

        Args:
            spacer_size: Largest gap allowed between neighbouring positions
                of the same group.
            group_likelihood_threshold: Unused; kept for interface
                compatibility.

        Returns:
            ``self``, to allow chaining.
        """
        df = self.df.copy()
        if df.empty:
            df['group'] = 0
            self.df = df
            return self

        positions = pd.Series(sorted(df.Start_Position.unique().tolist()))
        # The first diff is NaN, which compares False, so numbering starts at
        # 0 and increases by one at every gap wider than the spacer.
        group_ids = (positions.diff() > spacer_size).cumsum()
        lookup = dict(zip(positions, group_ids))
        df['group'] = df.Start_Position.map(lookup).fillna(0).astype(int)
        self.df = df
        return self

    def calculate_vaf(self):
        """Drop rows without read depth and add ``vaf = t_alt_count / t_depth``.

        Returns:
            ``self``, to allow chaining.
        """
        covered = self.df[self.df.t_depth > 0].copy()
        covered['vaf'] = covered.t_alt_count / covered.t_depth
        self.df = covered
        return self

    def find_overlayed_variants(self):
        """Collapse rows sharing a ``mut_id`` into one row with summed read counts.

        The first row of every ``mut_id`` is kept; its ``t_depth``,
        ``t_alt_count``, ``t_ref_count`` and ``vaf`` are overwritten with the
        values computed from the per-``mut_id`` sums.

        Returns:
            ``self``, to allow chaining.
        """
        totals = self.df.groupby('mut_id', as_index=False).agg({
            't_depth': 'sum',
            't_alt_count': 'sum',
            't_ref_count': 'sum',
        })
        totals = totals[totals.t_depth > 0].copy()
        totals['vaf'] = totals.t_alt_count / totals.t_depth
        totals = totals.set_index('mut_id')

        merged = self.df.drop_duplicates(subset='mut_id', keep='first').set_index('mut_id')
        merged.update(totals)
        self.df = merged.reset_index()
        return self

    def find_epistasis(self, pth=3, rth=0):
        """Return position groups that contain more than one well-supported mutation.

        A mutation is kept when ``t_alt_count > t_ref_count / pth`` and
        ``t_alt_count >= rth``. The kept rows are aggregated per ``group``:
        mutation ids are joined with ``|`` and the read counts are averaged.

        Args:
            pth: Divisor applied to the reference count in the support test.
            rth: Minimum number of alternative reads.

        Returns:
            ``None`` when the case has no rows; otherwise a dataframe with
            ``mut_id``, ``t_alt_count``, ``t_ref_count`` and ``case_id`` for
            every group whose joined id contains ``|`` (empty when no
            mutation passes the support test).
        """
        df = self.df
        if df.empty:
            return None

        out_cols = ['mut_id', 't_alt_count', 't_ref_count', 'case_id']
        supported = df[(df.t_alt_count > df.t_ref_count / pth) & (df.t_alt_count >= rth)]
        supported = supported.sort_values('Start_Position', ascending=True)
        if supported.empty:
            # Avoids calling the ``.str`` accessor on an empty aggregation.
            return pd.DataFrame(columns=out_cols)

        grouped = supported.groupby('group').agg({
            'mut_id': lambda x: '|'.join(x),
            't_alt_count': 'mean',
            't_ref_count': 'mean',
            'case_id': 'first'
        }).reset_index(drop=True)

        # Literal match: the previous pattern '\|' relied on an invalid
        # string escape sequence.
        has_pair = grouped.mut_id.str.contains('|', regex=False)
        return grouped[has_pair][out_cols]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class TCGAGene:
    """Mutation table of one gene, read from ``<cancer_path>/<gene>/GeneMutTble.txt``.

    Rows are never removed by the filter arguments; instead the boolean
    ``attention`` column marks the rows that pass them. ``data`` exposes the
    rows with ``attention`` set, ``df`` the full table.

    Attributes:
        df: Full mutation table. It is an empty dataframe without any columns
            when the gene file does not exist.
    """

    # Columns always read from the mutation table.
    _BASE_COLUMNS = (
        'Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
        'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
        'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
        'Tumor_Seq_Allele2', 'case_id', 'Gene_name',
        'Variant_Classification',
    )
    # ``Variant_Classification`` values flagged in the ``silent`` column.
    _SILENT_CLASSES = ("3'Flank", "3'UTR", "Silent", "Splice_Site", "Splice_Region", "Intron", "5'Flank")

    def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
                 valid_cases=None, extra_cols=None, exclude_filters=None, include_filter=None):
        """Load and annotate the mutation table of ``gene``.

        Args:
            gene: Gene name; used as the directory name under ``cancer_path``.
            cancer_path: Root directory holding one sub-directory per gene.
            valid_cases: Optional collection of case ids; rows of other cases
                get ``attention = False``.
            extra_cols: Additional columns to read from the file.
            exclude_filters: Patterns; rows whose ``FILTER`` matches any of
                them get ``attention = False``. Ignored when
                ``include_filter`` is given.
            include_filter: Pattern; rows whose ``FILTER`` does not match it
                get ``attention = False``.

        The patterns are handed to ``str.contains`` and are therefore
        interpreted as regular expressions. Rows with a missing ``FILTER``
        are treated as having an empty filter string.
        """
        file_path = cancer_path / gene / 'GeneMutTble.txt'
        if not file_path.exists():
            self.df = pd.DataFrame()
            return

        usecols = list(dict.fromkeys([*self._BASE_COLUMNS, *(extra_cols or [])]))
        df = pd.read_csv(file_path, usecols=usecols, low_memory=False).sort_values('Start_Position',
                                                                                  ascending=True)
        df['attention'] = True
        if df.empty:
            self.df = df
            return

        # Copy so the column writes below do not target a slice of ``df``.
        df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])].copy()
        df = df.astype({'Start_Position': int})

        filter_text = df['FILTER'].fillna('').astype(str)
        if include_filter is not None:
            df.loc[~filter_text.str.contains(include_filter), 'attention'] = False
        elif exclude_filters is not None:
            for exclude_filter in exclude_filters:
                df.loc[filter_text.str.contains(exclude_filter), 'attention'] = False

        if valid_cases is not None:
            df.loc[~df.case_id.isin(valid_cases), 'attention'] = False

        # Built column-wise: a row-wise ``apply`` is slow and fails on a frame
        # that has columns but no rows.
        df['mut_id'] = [
            f"{gene_name}:{chrom.replace('chr', '')}:{start}:{ref}:{alt}"
            for gene_name, chrom, start, ref, alt in zip(
                df.Gene_name, df.Chromosome, df.Start_Position, df.Reference_Allele, df.Tumor_Seq_Allele2)
        ]
        df['mut_id_yoram'] = [
            f"{gene_name}:{chrom}:{variant_class}:{start}:{ref}:{alt}"
            for gene_name, chrom, variant_class, start, ref, alt in zip(
                df.Gene_name, df.Chromosome, df.Variant_Classification, df.Start_Position,
                df.Reference_Allele, df.Tumor_Seq_Allele2)
        ]
        df['silent'] = df.Variant_Classification.isin(self._SILENT_CLASSES)

        # ``ratio`` is the alternative-read fraction; rows without any read
        # are dropped because the fraction is undefined for them.
        total_reads = df.t_alt_count + df.t_ref_count
        df = df[total_reads > 0].copy()
        df['ratio'] = df.t_alt_count / (df.t_alt_count + df.t_ref_count)
        self.df = df

    def __repr__(self):
        return repr(self.data)

    @property
    def data(self):
        """Rows of ``df`` with ``attention`` set (all of ``df`` when that column is absent)."""
        if 'attention' not in self.df:
            # Gene file was missing: there is nothing to filter.
            return self.df.copy()
        return self.df[self.df['attention'].astype(bool)]

    def affected_cases(self, mut_id=None, read_ratio=0, filters=None):
        """Return the unique case ids in ``df`` (the ``attention`` flag is not consulted).

        Args:
            mut_id: When given, only cases carrying this mutation with
                ``ratio >= read_ratio`` are returned; when ``None`` every
                case in the table is returned.
            read_ratio: Minimum alternative-read fraction (only used with
                ``mut_id``).
            filters: Patterns; rows whose ``FILTER`` matches any of them are
                ignored (only used with ``mut_id``).

        Returns:
            List of case ids; empty when the gene table could not be loaded.
        """
        if 'case_id' not in self.df:
            return []
        if mut_id is None:
            return self.df.case_id.unique().tolist()

        df = self.df
        df = df[(df.mut_id == mut_id) & (df.ratio >= read_ratio)]
        for pattern in (filters or []):
            df = df[~df.FILTER.fillna('').astype(str).str.contains(pattern)]
        return df.case_id.unique().tolist()

    def get_patient_muts(self, case_id=None, read_ratio=0, exclude_filters=None):
        """Return all rows of ``df`` for ``case_id`` (a random affected case when ``None``).

        ``read_ratio`` and ``exclude_filters`` are currently unused.
        ``random.choice`` raises ``IndexError`` when there is no case to pick.
        """
        if case_id is None:
            case_id = random.choice(self.affected_cases())
        return self.df[self.df.case_id == case_id]

    def get_patients_affected(self, mut_id, read_ratio=0, exclude_filters=None):
        """Return the unique case ids in ``data`` that carry ``mut_id``.

        ``read_ratio`` and ``exclude_filters`` are currently unused.
        """
        flagged = self.data
        return flagged[flagged.mut_id == mut_id].case_id.unique().tolist()

    def get_patients_unaffected(self, mut_id, must_contain_all=False, read_ratio=0, exclude_filters=None):
        """Not implemented yet (returns ``None``).

        Intended to return all patients not affected by ALL the mutations in
        ``mut_id`` (patients containing only individual mutations are allowed)
        unless ``must_contain_all`` is true.
        """
        pass

    def split_patients(self, mut_id, strict=True):
        """Not implemented yet (returns ``None``).

        Intended to return two lists: all patients affected by a mutation and
        all patients with none of the mutations (or with the mutations but not
        together).
        """
        pass

    def arrange_patients_by_project(self, mut_id):
        """Not implemented yet (returns ``None``).

        Intended to return all patients affected by a mutation, grouped by
        cancer project.
        """
        pass

    def total_prevalence(self, mut_id):
        """Not implemented yet (returns ``None``)."""
        pass

    def project_prevalence(self, mut_id, df_p_proc, project_counts=None):
        """Return a formatted per-project prevalence string for every mutation/transcript pair.

        Args:
            mut_id: Unused; every ``(mut_id, Transcript_ID)`` group in
                ``data`` is processed.
            df_p_proc: Object indexed by case id whose selection supports
                ``value_counts()``.
                NOTE(review): presumably a Series mapping case id to project
                name; confirm against the caller.
            project_counts: Denominator of the prevalence. When ``None`` it is
                computed as ``df_p_proc.value_counts()``.
                NOTE(review): the name ``project_counts`` was previously read
                without being defined anywhere in this module; this default is
                an assumption about the intended denominator and should be
                confirmed.

        Returns:
            A ``pd.Series`` keyed by ``(mut_id, Transcript_ID)`` with the
            output of ``series_to_pretty_string`` for each group.

        Raises:
            KeyError: if ``Transcript_ID`` was not loaded (it is not among the
                default columns; request it through ``extra_cols``).
        """
        if project_counts is None:
            project_counts = df_p_proc.value_counts()
        mut_prevalence = {}
        for key, rows in tqdm(self.data.groupby(['mut_id', 'Transcript_ID'])):
            prevalence = (df_p_proc[rows.case_id].value_counts() / project_counts).dropna()
            mut_prevalence[key] = series_to_pretty_string(prevalence)
        return pd.Series(mut_prevalence)

    def project_counts(self, mut_id):
        """Not implemented yet (returns ``None``)."""
        pass

    def filter_silent_muts(self):
        """Clear ``attention`` on every row flagged ``silent`` and return ``self``."""
        self.df.loc[self.df.silent, 'attention'] = False
        return self
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def series_to_pretty_string(series):
    """Render a Series as one ``index: value`` line per entry.

    Values that are ``float`` instances are written in scientific notation
    with three digits after the decimal point; every other value uses its
    default formatting. Lines are joined with a newline, without a trailing
    one.
    """
    def _render(label, entry):
        # Only floats get the fixed scientific format.
        if isinstance(entry, float):
            return f"{label}: {entry:.3e}"
        return f"{label}: {entry}"

    rendered = []
    for label, entry in series.items():
        rendered.append(_render(label, entry))
    return "\n".join(rendered)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
|
|
214
|
+
# CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
|
|
215
|
+
# CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
|
|
216
|
+
# MAF_FILE_NAME = 'GeneMutTble.txt'
|
|
217
|
+
# CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
|
|
218
|
+
# PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
|
|
219
|
+
# OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
|
|
220
|
+
# MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
|
|
221
|
+
# PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
|
|
222
|
+
# PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
|
|
223
|
+
|
|
224
|
+
# def prepare_gene_sets():
|
|
225
|
+
# # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
|
|
226
|
+
# # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
|
|
227
|
+
# # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
|
|
228
|
+
# # all_oncogenes = list(set(all_oncogenes))
|
|
229
|
+
# return [], [], []
|
|
230
|
+
#
|
|
231
|
+
# CLIN_DF = prepare_clinical_data()
|
|
232
|
+
# TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
|
|
233
|
+
#
|
|
234
|
+
#
|
|
235
|
+
# def generate_survival_quantitative(affected_df, nonaffected_df):
|
|
236
|
+
# if affected_df.empty or nonaffected_df.empty:
|
|
237
|
+
# return np.nan, np.nan, np.nan
|
|
238
|
+
# results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
|
|
239
|
+
# event_observed_A=affected_df['event'],
|
|
240
|
+
# event_observed_B=nonaffected_df['event'])
|
|
241
|
+
# p_value = results.p_value
|
|
242
|
+
# kmf = KaplanMeierFitter()
|
|
243
|
+
# kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
|
|
244
|
+
# times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
|
|
245
|
+
# auc1 = np.trapz(surv_probs, times)
|
|
246
|
+
# kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
|
|
247
|
+
# times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
|
|
248
|
+
# auc2 = np.trapz(surv_probs, times)
|
|
249
|
+
# return p_value, auc1, auc2
|
|
250
|
+
#
|
|
251
|
+
# def generate_survival_pvalue(affected_df, unaffected_df):
|
|
252
|
+
# results = logrank_test(affected_df['duration'], unaffected_df['duration'],
|
|
253
|
+
# event_observed_A=affected_df['event'],
|
|
254
|
+
# event_observed_B=unaffected_df['event'])
|
|
255
|
+
#
|
|
256
|
+
# p_value = results.p_value
|
|
257
|
+
# kmf = KaplanMeierFitter()
|
|
258
|
+
# # Fit data
|
|
259
|
+
# kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
|
|
260
|
+
# ax = kmf.plot()
|
|
261
|
+
#
|
|
262
|
+
# kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
|
|
263
|
+
# kmf.plot(ax=ax)
|
|
264
|
+
# plt.text(5, 0.95, f'pval: {p_value:.3e}')
|
|
265
|
+
# plt.show()
|
|
266
|
+
# return p_value
|
|
267
|
+
#
|
|
268
|
+
# def get_project_prevalence(cases_affected):
|
|
269
|
+
# ca = [c for c in cases_affected if c in CASE_TRACKER.index]
|
|
270
|
+
# prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
|
|
271
|
+
# prevalences.fillna(0, inplace=True)
|
|
272
|
+
# prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
|
|
273
|
+
# prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
|
|
274
|
+
# return prevalences
|
|
275
|
+
#
|
|
276
|
+
# def get_project_counts(cases_affected):
|
|
277
|
+
# ca = [c for c in cases_affected if c in CASE_TRACKER.index]
|
|
278
|
+
# prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
|
|
279
|
+
# prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
|
|
280
|
+
# prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
|
|
281
|
+
# return prevalences
|
|
282
|
+
#
|
|
283
|
+
# def get_event_consequence(df):
|
|
284
|
+
# assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequenc.'
|
|
285
|
+
# return df.iloc[0].Consequence
|
|
286
|
+
#
|
|
287
|
+
# def get_dbSNP_id(df):
|
|
288
|
+
# return df.iloc[0].dbSNP_RS
|
|
289
|
+
#
|
|
290
|
+
# def load_variant_file(gene):
|
|
291
|
+
# df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
|
|
292
|
+
# df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
|
|
293
|
+
# return df
|
|
294
|
+
#
|
|
295
|
+
# def find_event_data(event):
|
|
296
|
+
# df = load_variant_file(event.gene)
|
|
297
|
+
# if df.empty:
|
|
298
|
+
# return None
|
|
299
|
+
#
|
|
300
|
+
# df = df.query \
|
|
301
|
+
# ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
|
|
302
|
+
#
|
|
303
|
+
# if df.empty:
|
|
304
|
+
# return None
|
|
305
|
+
#
|
|
306
|
+
# if event.transcript_id is not None:
|
|
307
|
+
# df = df[df.Transcript_ID == event.transcript_id]
|
|
308
|
+
# df['mut_id'] = event.event_id
|
|
309
|
+
# return df
|
|
310
|
+
#
|
|
311
|
+
#
|
|
312
|
+
# class GEvent:
|
|
313
|
+
# def __init__(self, event_id, transcript_id=None):
|
|
314
|
+
# self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
|
|
315
|
+
# self.transcript_id = transcript_id
|
|
316
|
+
# self.chromosome = f'chr{self.chromosome}'
|
|
317
|
+
# self.start = int(self.start)
|
|
318
|
+
# self.event_id = event_id
|
|
319
|
+
#
|
|
320
|
+
#
|
|
321
|
+
#
|
|
322
|
+
# def get_okgp_mutation_frequency(mut_id):
|
|
323
|
+
# if mut_id in MUTATION_FREQ_DF.index:
|
|
324
|
+
# return MUTATION_FREQ_DF.loc[mut_id].cases_affected
|
|
325
|
+
# else:
|
|
326
|
+
# return 0
|
|
327
|
+
#
|
|
328
|
+
# def get_df_filter_info(df):
|
|
329
|
+
# filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
|
|
330
|
+
# MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
|
|
331
|
+
# 'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
|
|
332
|
+
# filter_col_name: str = "FILTER_info" # column name to add to the dataframe
|
|
333
|
+
# filter_info_list: list = []
|
|
334
|
+
# f_cnr_info = {}
|
|
335
|
+
#
|
|
336
|
+
# for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
|
|
337
|
+
# filter_vals = list(df_prj['FILTER'])
|
|
338
|
+
# num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
|
|
339
|
+
# for filter_val in filter_vals:
|
|
340
|
+
# num_pass += ('PASS' in filter_val)
|
|
341
|
+
# num_artifacts += any([x in filter_val for x in filter_artifact_values])
|
|
342
|
+
# num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
|
|
343
|
+
# num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
|
|
344
|
+
# f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
|
|
345
|
+
# return f_cnr_info
|
|
346
|
+
#
|
|
347
|
+
# def yoram_mutid(row):
|
|
348
|
+
# return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
|
|
349
|
+
#
|
|
350
|
+
#
|
|
351
|
+
# def annotate_level_two(mut_id, tid):
|
|
352
|
+
# mut = GEvent(mut_id, tid)
|
|
353
|
+
# df = find_event_data(mut)
|
|
354
|
+
#
|
|
355
|
+
# if df.empty or df is None:
|
|
356
|
+
# return None
|
|
357
|
+
#
|
|
358
|
+
# patients_affected = df.cases_affected.unique().tolist()
|
|
359
|
+
# p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
|
|
360
|
+
# project_prevalences = get_project_prevalence(patients_affected)
|
|
361
|
+
# prev_dict = project_prevalences.to_dict().sort()
|
|
362
|
+
# project_counts = get_project_counts(patients_affected)
|
|
363
|
+
#
|
|
364
|
+
# s = pd.Series({
|
|
365
|
+
# 'mut_id': mut_id,
|
|
366
|
+
# 'yoram_mut_id': yoram_mutid(df.iloc[0]),
|
|
367
|
+
# 'transcript_id': tid,
|
|
368
|
+
# 'affected_cases': len(patients_affected),
|
|
369
|
+
# 'dbSNP_id': get_dbSNP_id(df),
|
|
370
|
+
# 'consequence': get_event_consequence(df),
|
|
371
|
+
# 'survival_p_value': p_val,
|
|
372
|
+
# 'auc_affected': auc_a,
|
|
373
|
+
# 'auc_nonaffected': auc_n,
|
|
374
|
+
# 'TSG': contains(TSGS, mut.gene),
|
|
375
|
+
# 'oncogene': contains(ONCOGENES, mut.gene),
|
|
376
|
+
# 'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
|
|
377
|
+
# 'filter_inf': get_df_filter_info(df),
|
|
378
|
+
# 'strand': df.Strand.unique().tolist()[0],
|
|
379
|
+
# 'prevalences': prev_dict
|
|
380
|
+
# })
|
|
381
|
+
#
|
|
382
|
+
# s['max_prev'] = project_prevalences.max()
|
|
383
|
+
# s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
|
|
384
|
+
# s = pd.concat([s, project_prevalences, project_counts])
|
|
385
|
+
# del df
|
|
386
|
+
# return s
|
|
387
|
+
#
|
|
388
|
+
# def get_mut_counts():
|
|
389
|
+
# cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
|
|
390
|
+
# cases = pd.Series(cases)
|
|
391
|
+
# cases.name = 'num_cases'
|
|
392
|
+
# cases.index.name = 'mut_id'
|
|
393
|
+
# cases = cases.to_frame()
|
|
394
|
+
# cases.reset_index(inplace=True)
|
|
395
|
+
# return cases
|
|
396
|
+
#
|
|
397
|
+
#
|
|
398
|
+
def create_mut_id(row):
    """Return ``Gene_name:Chromosome:Start_Position:Reference_Allele:Tumor_Seq_Allele2`` for a row.

    ``Gene_name`` is read as an attribute and the remaining fields by key, so
    ``row`` must support both access styles. The chromosome value is used as
    stored.
    """
    gene = row.Gene_name
    locus = (
        row['Chromosome'],
        row['Start_Position'],
        row['Reference_Allele'],
        row['Tumor_Seq_Allele2'],
    )
    return f"{gene}:" + ":".join(f"{part}" for part in locus)
|
|
400
|
+
#
|
|
401
|
+
#
|
|
402
|
+
# def is_in_exon(mut_id, tid):
|
|
403
|
+
# from geney.Gene import Gene
|
|
404
|
+
# transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
|
|
405
|
+
# return int(mut_id.split(':')[2]) in transcript.exonic_indices
|