geney 1.1.1__py2.py3-none-any.whl → 1.1.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/power_utils.py +25 -12
- geney/survival.py +85 -102
- geney/tcga_utils.py +366 -0
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/METADATA +2 -1
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/RECORD +7 -6
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/WHEEL +0 -0
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/top_level.txt +0 -0
geney/power_utils.py
CHANGED

@@ -1,6 +1,6 @@
 import subprocess
 import time
-from dask_jobqueue import PBSCluster
+from dask_jobqueue import PBSCluster, SLURMCluster
 from dask.distributed import Client, wait
 import os
 from tqdm import tqdm
@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2

 def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                         walltime="24:00:00", dashboard_address=":23154",
-                        log_directory="dask-logs"):
+                        log_directory="dask-logs", slurm=False):
     """
     Launch a Dask cluster using PBS.

@@ -54,16 +54,29 @@
     tuple: A tuple containing the Dask client and cluster objects.
     """
     try:
-[old lines 57-66 not shown in this view]
+        if slurm:
+            dask_cluster = SLURMCluster(
+                cores=1,
+                memory=memory_size,
+                processes=1,
+                queue=queue,
+                walltime=walltime,
+                scheduler_options={"dashboard_address": dashboard_address},
+                log_directory=log_directory,
+                job_script_prologue=[f"cd {config_setup['BASE']}"]
+            )
+        else:
+            dask_cluster = PBSCluster(
+                cores=1,
+                memory=memory_size,
+                processes=1,
+                queue=queue,
+                walltime=walltime,
+                scheduler_options={"dashboard_address": dashboard_address},
+                log_directory=log_directory,
+                job_script_prologue=[f"cd {config_setup['BASE']}"]
+            )
+
         dask_cluster.scale(num_workers)
         dask_client = Client(dask_cluster)
         return dask_client, dask_cluster
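
The change above keeps the PBS launcher as the default and routes the same call through dask_jobqueue's SLURMCluster when slurm=True. A rough usage sketch follows; it is not part of the package, the queue name and sizes are placeholders, and config_setup['BASE'] must resolve on the target cluster:

    from geney.power_utils import launch_dask_cluster

    # Default behaviour (unchanged from 1.1.1): submit workers through PBS.
    client, cluster = launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ")

    # New in 1.1.3: the same arguments, routed to SLURM instead of PBS.
    client, cluster = launch_dask_cluster(memory_size="3GB", num_workers=10,
                                          queue="tamirQ", slurm=True)

    # Run any Dask workload against the returned client, then tear the cluster down.
    futures = client.map(pow, range(100), [2] * 100)
    results = client.gather(futures)
    client.close()
    cluster.close()
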
geney/survival.py
CHANGED

@@ -12,9 +12,12 @@ from lifelines import CoxPHFitter
 pd.set_option('display.max_columns', None)
 pd.options.mode.chained_assignment = None

-[old lines 15-16 not shown in this view]
-df
+
+def prepare_clinical_data(df=None):
+    if df is None:
+        CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+        df = unload_pickle(CLINICAL_DATA_FILE)
+
     df.rename(columns={'patient_uuid': 'case_id'}, inplace=True)
     cols = list(df.columns)
     cols_days_to_followup = [col for col in cols if 'days_to_followup' in col] + [col for col in cols if 'days_to_last_followup' in col]
@@ -28,114 +31,94 @@ def prepare_clinical_data():
     df.insert(1, duration_col_label, df.apply(lambda x: max([x[col] for col in cols_duration if not np.isnan(x[col])], default=-1), axis=1))
     df[duration_col_label] /= 365
     df = df.query(f"{duration_col_label}>=0.0")[['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
-    df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
+    # df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
     return df


 class SurvivalAnalysis:
-    def __init__(self, clindf):
-        self.clindf = clindf
+    def __init__(self, clindf=None):
+        self.clindf = prepare_clinical_data(clindf)
+        self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
+        self.df = self.clindf.copy()
+        self.df['group'] = 0
+        self.df.fillna(0, inplace=True)
         self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']

-    def
-[old lines 41-44 not shown in this view]
-        df2['group'] = 1
-        df3['group'] = 1
-        df = pd.concat([df1, df2, df3])
-        core_features = ['duration', 'event', 'group']
-        treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
-        df = df[treatment_features + core_features]
-        df.fillna(0, inplace=True)
-
-        cap_time = min([df[df.group == 0].duration.max(), df[df.group == 1].duration.max()])
-        df['duration'] = df['duration'].clip(upper=cap_time)
-
-        for col in treatment_features:
-            df.loc[df[col] > 0, col] = 1
+    def generate_clinical_dataframe(self, target_cases, control_cases=None, inplace=False, features_of_interest=[]):
+        df = self.df.copy()
+        df.loc[df[df.case_id.isin(target_cases)].index, 'group'] = 2
+        if control_cases is not None:
+            df.loc[df[df.case_id.isin(control_cases)].index, 'group'] = 1

-        df = df[
-[old lines 60-61 not shown in this view]
+        df = df[df.group > 0]
+        df.group -= 1
+        core_features = ['duration', 'event']
+        df = df[core_features + features_of_interest]

-[old lines 63-66 not shown in this view]
-        group_A = df[df[feature] == 0]
-        group_B = df[df[feature] == 1]
-
-        # Create Kaplan-Meier fitter instances
-        kmf_A = KaplanMeierFitter()
-        kmf_B = KaplanMeierFitter()
-
-        # Fit the data
-        if len(group_A) < 5 or len(group_B) < 5:
-            return 0, 0
-        label1, label2 = f'Epistasis ({len(group_A)})', f'CVs Only ({len(group_B)})'
-        self.label1, self.label2 = label1, label2
-        kmf_A.fit(group_A['duration'], group_A['event'], label=self.label1)
-        kmf_B.fit(group_B['duration'], group_B['event'], label=self.label2)
-        return kmf_A, kmf_B
-
-    def get_km_aucs(self, kmf_A, kmf_B):
-        surv_func_A = kmf_A.survival_function_
-        surv_func_B = kmf_B.survival_function_
-
-        # Numerical integration using Trapezoidal rule
-        auc_A = trapz(surv_func_A[self.label1], surv_func_A.index)
-        auc_B = trapz(surv_func_B[self.label2], surv_func_B.index)
-        return auc_A, auc_B
-
-    def plot_km_curve(self, kmf_A, kmf_B):
-        # Plot the survival curves
-        ax = kmf_A.plot()
-        kmf_B.plot(ax=ax)
-
-        # Add labels and title
-        p_value = 0.01
-        ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        # ax.text(0.45, 0.85, f'AUCe: {auc_A:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        # ax.text(0.45, 0.85, f'AUCc: {auc_B:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-
-        plt.title('Kaplan-Meier Survival Curves')
-        plt.xlabel('Time')
-        plt.ylabel('Survival Probability')
-        plt.show()
-        return self
-
-    def log_rank(self, df, column):
-        group1, group2 = df[df[column] == 0], df[df[column] == 1]
-        result = logrank_test(group1['duration'], group2['duration'],
-                              event_observed_A=group1['event'],
-                              event_observed_B=group2['event'])
-        return result.p_value
-
-    def run_analysis(self, dict1, event_name):
-        try:
-            df = self.prepare_data(dict1)
-            if len(df[df.group == 0]) < 2 or len(df[df.group == 1]) < 2:
-                return None
+        for col in self.treatment_features:
+            if col not in df:
+                continue
+            df.loc[df[col] > 0, col] = 1

-[old lines 122-124 not shown in this view]
-        for column in [c for c in df.columns if c != 'duration' and c != 'event']:
-            temp[column] = self.log_rank(df, column)
+        df = df[core_features + [col for col in features_of_interest if
+                                 df[col].nunique() > 1]]  # and df[col].value_counts(normalize=True).min() >= 0.01]]
+        return df

+    def kaplan_meier_analysis(self, df, control_label='CV', target_label='Epistasis', feature='group', plot=False, time_cap=False):
+        # Can only be performed on features with two unique values
+        cap_time = df.groupby(feature).duration.max().min()
+        # df['duration'] = df['duration'].clip(upper=cap_time)
+        auc_vals = []
+        results = pd.Series()
+        count = 0
+        for val in [0, 1]:
+            g = df[df[feature] == val]
+            kmf = KaplanMeierFitter()
+            label = f"{control_label} ({len(g)} cases)" if val == 0 else f"{target_label} ({len(g)} cases)"
+            if val == 0:
+                results[control_label] = len(g)
             else:
-[old lines 129-137 not shown in this view]
+                results[target_label] = len(g)
+
+            kmf.fit(g['duration'], g['event'], label=label)
+            surv_func = kmf.survival_function_
+            auc = trapz(surv_func[label], surv_func.index)
+            auc_vals.append(auc)
+            if plot:
+                if count == 0:
+                    ax = kmf.plot()
+                else:
+                    kmf.plot(ax=ax)
+            count += 1
+        p_value = self.log_rank(df[df[feature] == 1], df[df[feature] == 0])
+
+        if plot:
+            ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12,
+                    horizontalalignment='center')
+            plt.title('Kaplan-Meier Survival Curves')
+            plt.xlabel('Time')
+            plt.ylabel('Survival Probability')
+            if time_cap:
+                plt.xlim([0, cap_time])
+            plt.show()
+
+        results['p_value'] = p_value
+        results['auc_target'] = auc_vals[-1]
+        if len(auc_vals) > 1:
+            results['auc_delta'] = auc_vals[-1] - auc_vals[0]
+            results['auc_control'] = auc_vals[0]
+
+        return results
+
+    def log_rank(self, group1, group2):
+        return logrank_test(group1['duration'], group2['duration'],
+                            event_observed_A=group1['event'],
+                            event_observed_B=group2['event']).p_value
+
+    def perform_cox_analysis(self, df, features_of_interest):
+        # Very simple... will return a series with p values for each feature
+        try:
+            return CoxPHFitter().fit(df[features_of_interest + ['duration', 'event']], 'duration', 'event').summary.p
         except ConvergenceError:
-[old lines 139-141 not shown in this view]
+            print("Convergence Error")
+            return pd.Series()
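
In effect, SurvivalAnalysis now builds its clinical table internally (via prepare_clinical_data) and splits the workflow into separate cohort-construction, Kaplan-Meier, and Cox steps. A hedged sketch of how the new API composes; the case-ID lists are placeholders, the default clinical pickle only exists on the original cluster, and 'group' must be listed in features_of_interest because generate_clinical_dataframe keeps only duration, event, and the requested columns:

    from geney.survival import SurvivalAnalysis

    # Placeholder case IDs; real values would come from e.g. TCGAGene.affected_cases().
    target_cases = ['case-uuid-a', 'case-uuid-b']     # e.g. carriers of a candidate epistatic pair
    control_cases = ['case-uuid-c', 'case-uuid-d']    # e.g. carriers of the individual variants

    sa = SurvivalAnalysis()                           # loads the default clinical pickle when no dataframe is given
    df = sa.generate_clinical_dataframe(target_cases,
                                        control_cases=control_cases,
                                        features_of_interest=['group', 'chemotherapy'])

    # Log-rank p-value, per-group counts, and survival-curve AUCs, returned as a pandas Series.
    km_results = sa.kaplan_meier_analysis(df, control_label='CV', target_label='Epistasis', plot=False)
    print(km_results[['p_value', 'auc_target', 'auc_control', 'auc_delta']])

    # Cox proportional-hazards p-values for the listed covariates (empty Series on ConvergenceError).
    cox_p = sa.perform_cox_analysis(df, ['group', 'chemotherapy'])
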
geney/tcga_utils.py
ADDED

@@ -0,0 +1,366 @@
+
+import pandas as pd
+import random
+from pathlib import Path
+class TCGACase:
+    def __init__(self, df):
+        # Here we get a dataframe of mutations within a gene
+        self.df = df
+        self.calculate_vaf()
+        self.space_variants(spacer_size=50)
+        self.case_id = df.case_id.tolist()[0]
+
+    def space_variants(self, spacer_size=100, group_likelihood_threshold=0):
+        df = self.df
+        if df.empty:
+            df['group'] = 0
+            return self
+        values = sorted(df.Start_Position.unique().tolist())
+        # groups = [list(group) for key, group in groupby(values, key=lambda x: (x - values[values.index(x) - 1] >
+        #                                                spacer_size) if values.index(x) > 0 else False)] Initialize variables
+        groups = []
+        current_group = []
+
+        # Iterate through the values
+        for i in range(len(values)):
+            if i == 0:
+                current_group.append(values[i])
+            else:
+                if values[i] - values[i - 1] <= spacer_size:
+                    current_group.append(values[i])
+                else:
+                    groups.append(current_group)
+                    current_group = [values[i]]
+
+        # Append the last group if it's not empty
+        if current_group:
+            groups.append(current_group)
+
+        df.loc[:, 'group'] = 0
+        for i, g in enumerate(groups):
+            df.loc[df.Start_Position.isin(g), 'group'] = i
+        self.df = df
+        return self
+
+    def calculate_vaf(self):
+        df = self.df
+        df = df[df.t_depth > 0]
+        df.loc[:, 'vaf'] = df.apply(lambda row: row.t_alt_count / row.t_depth, axis=1)
+        self.df = df
+        return self
+
+    def find_overlayed_variants(self):
+        df = self.df
+        mut_counts = df.mut_id.value_counts()
+        mut_counts = mut_counts[mut_counts > 1].index
+
+        small_df = df.groupby('mut_id', as_index=False).agg({
+            't_depth': 'sum',
+            't_alt_count': 'sum',
+            't_ref_count': 'sum',
+        })
+
+        df = df.drop_duplicates(subset='mut_id', keep='first')
+
+        small_df = small_df[small_df.t_depth > 0]
+        small_df['vaf'] = small_df.t_alt_count / small_df.t_depth
+
+        small_df = small_df.set_index('mut_id')
+        df.set_index('mut_id', inplace=True)
+        df.update(small_df)
+        df.reset_index(inplace=True)
+        self.df = df
+        return self
+
+    def find_epistasis(self, pth=3, rth=0):
+        df = self.df
+        if df.empty:
+            return None
+        # df = df[df.t_alt_count > rth].sort_values('Start_Position', ascending=True)
+        df = df[(df.t_alt_count > df.t_ref_count / pth) & (df.t_alt_count >= rth)].sort_values('Start_Position',
+                                                                                               ascending=True)
+
+        # display(df[['mut_id', 't_alt_count', 't_ref_count']])
+
+        # Group by the group_key
+        grouped = df.groupby('group').agg({
+            'mut_id': lambda x: '|'.join(x),
+            't_alt_count': 'mean',
+            't_ref_count': 'mean',
+            'case_id': 'first'
+        }).reset_index(drop=True)
+
+        # Drop the group_key column
+        return grouped[grouped.mut_id.str.contains('\|')][['mut_id', 't_alt_count', 't_ref_count', 'case_id']]
+
+
+class TCGAGene:
+    def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
+                 valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
+        df = pd.read_csv(cancer_path / gene / 'GeneMutTble.txt',
+                         usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
+                                  'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
+                                  'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                  'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type'] + extra_cols,
+                         low_memory=False).sort_values('Start_Position', ascending=True)
+
+        if df.empty:
+            self.df = df
+
+        else:
+            df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
+
+            if include_filter is not None:
+                df = df[df.FILTER == include_filter]
+
+            elif exclude_filters is not None:
+                for exclude_filter in exclude_filters:
+                    df = df[~df.FILTER.str.contains(exclude_filter)]
+
+            if valid_cases is not None:
+                df = df[df.case_id.isin(valid_cases)]
+
+            df['mut_id'] = df.apply(lambda
+                row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                axis=1)
+
+            df['ratio'] = df.t_alt_count + df.t_ref_count
+            df = df[df.ratio > 0]
+            df['ratio'] = df.t_alt_count / df.ratio
+            self.df = df
+
+    def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
+        if mut_id is None:
+            return self.df.case_id.unique().tolist()
+        df = self.df
+        df = df[(df.mut_id == mut_id) & (df.ratio >= read_ratio)]
+        for filter in filters:
+            df = df[~df.FILTER.str.contains(filter)]
+        return df.case_id.unique().tolist()
+
+    def get_patient_muts(self, case_id=None):
+        if case_id is None:
+            case_id = random.choice(self.affected_cases())
+        return self.df[self.df.case_id == case_id]
+
+
+class TCGAMut:
+    def __init__(self, mut_id):
+        self.num_muts = mut_id.count('|') + 1
+        data = []
+        for mut in mut_id.split('|'):
+            data.append(mut.split(':'))
+        data = pd.DataFrame(data, columns=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                           'Tumor_Seq_Allele2'])
+        data.Chromosome = data.apply(lambda row: f'chr{row.Chromosome}', axis=1)
+        data = data.astype({'Start_Position': int})
+        self.gene = data.Gene_name.unique().tolist()[0]
+        self.df = data
+
+    def find_affected_patients(self, read_ratio=0, exclude_filters=None):
+        gene = TCGAGene(self.gene, exclude_filters=exclude_filters).df
+        gene = gene[gene.ratio >= read_ratio]
+        return pd.merge(self.df, gene,
+                        on=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2'])
+
+    def find_affected_patients_list(self, read_ratio=0, exclude_filters=None):
+        df = self.find_affected_patients(read_ratio=read_ratio, exclude_filters=exclude_filters)
+        case_count = df.case_id.value_counts()
+        case_count = case_count[case_count == self.num_muts]
+        return case_count.index.tolist()
+
+
+
+# CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
+# CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+# CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
+# MAF_FILE_NAME = 'GeneMutTble.txt'
+# CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
+# PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
+# OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
+# MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
+# PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
+# PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
+
+# def prepare_gene_sets():
+#     # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
+#     # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
+#     # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
+#     # all_oncogenes = list(set(all_oncogenes))
+#     return [], [], []
+#
+# CLIN_DF = prepare_clinical_data()
+# TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
+#
+#
+# def generate_survival_quantitative(affected_df, nonaffected_df):
+#     if affected_df.empty or nonaffected_df.empty:
+#         return np.nan, np.nan, np.nan
+#     results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
+#                            event_observed_A=affected_df['event'],
+#                            event_observed_B=nonaffected_df['event'])
+#     p_value = results.p_value
+#     kmf = KaplanMeierFitter()
+#     kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
+#     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+#     auc1 = np.trapz(surv_probs, times)
+#     kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
+#     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+#     auc2 = np.trapz(surv_probs, times)
+#     return p_value, auc1, auc2
+#
+# def generate_survival_pvalue(affected_df, unaffected_df):
+#     results = logrank_test(affected_df['duration'], unaffected_df['duration'],
+#                            event_observed_A=affected_df['event'],
+#                            event_observed_B=unaffected_df['event'])
+#
+#     p_value = results.p_value
+#     kmf = KaplanMeierFitter()
+#     # Fit data
+#     kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
+#     ax = kmf.plot()
+#
+#     kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
+#     kmf.plot(ax=ax)
+#     plt.text(5, 0.95, f'pval: {p_value:.3e}')
+#     plt.show()
+#     return p_value
+#
+# def get_project_prevalence(cases_affected):
+#     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+#     prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
+#     prevalences.fillna(0, inplace=True)
+#     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+#     prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
+#     return prevalences
+#
+# def get_project_counts(cases_affected):
+#     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+#     prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
+#     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+#     prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
+#     return prevalences
+#
+# def get_event_consequence(df):
+#     assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequenc.'
+#     return df.iloc[0].Consequence
+#
+# def get_dbSNP_id(df):
+#     return df.iloc[0].dbSNP_RS
+#
+# def load_variant_file(gene):
+#     df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
+#     df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
+#     return df
+#
+# def find_event_data(event):
+#     df = load_variant_file(event.gene)
+#     if df.empty:
+#         return None
+#
+#     df = df.query \
+#         ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
+#
+#     if df.empty:
+#         return None
+#
+#     if event.transcript_id is not None:
+#         df = df[df.Transcript_ID == event.transcript_id]
+#     df['mut_id'] = event.event_id
+#     return df
+#
+#
+# class GEvent:
+#     def __init__(self, event_id, transcript_id=None):
+#         self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
+#         self.transcript_id = transcript_id
+#         self.chromosome = f'chr{self.chromosome}'
+#         self.start = int(self.start)
+#         self.event_id = event_id
+#
+#
+#
+# def get_okgp_mutation_frequency(mut_id):
+#     if mut_id in MUTATION_FREQ_DF.index:
+#         return MUTATION_FREQ_DF.loc[mut_id].cases_affected
+#     else:
+#         return 0
+#
+# def get_df_filter_info(df):
+#     filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
+#     MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
+#                              'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
+#     filter_col_name: str = "FILTER_info"  # column name to add to the dataframe
+#     filter_info_list: list = []
+#     f_cnr_info = {}
+#
+#     for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
+#         filter_vals = list(df_prj['FILTER'])
+#         num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
+#         for filter_val in filter_vals:
+#             num_pass += ('PASS' in filter_val)
+#             num_artifacts += any([x in filter_val for x in filter_artifact_values])
+#             num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
+#         num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
+#         f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
+#     return f_cnr_info
+#
+# def yoram_mutid(row):
+#     return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
+#
+#
+# def annotate_level_two(mut_id, tid):
+#     mut = GEvent(mut_id, tid)
+#     df = find_event_data(mut)
+#
+#     if df.empty or df is None:
+#         return None
+#
+#     patients_affected = df.cases_affected.unique().tolist()
+#     p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
+#     project_prevalences = get_project_prevalence(patients_affected)
+#     prev_dict = project_prevalences.to_dict().sort()
+#     project_counts = get_project_counts(patients_affected)
+#
+#     s = pd.Series({
+#         'mut_id': mut_id,
+#         'yoram_mut_id': yoram_mutid(df.iloc[0]),
+#         'transcript_id': tid,
+#         'affected_cases': len(patients_affected),
+#         'dbSNP_id': get_dbSNP_id(df),
+#         'consequence': get_event_consequence(df),
+#         'survival_p_value': p_val,
+#         'auc_affected': auc_a,
+#         'auc_nonaffected': auc_n,
+#         'TSG': contains(TSGS, mut.gene),
+#         'oncogene': contains(ONCOGENES, mut.gene),
+#         'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
+#         'filter_inf': get_df_filter_info(df),
+#         'strand': df.Strand.unique().tolist()[0],
+#         'prevalences': prev_dict
+#     })
+#
+#     s['max_prev'] = project_prevalences.max()
+#     s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
+#     s = pd.concat([s, project_prevalences, project_counts])
+#     del df
+#     return s
+#
+# def get_mut_counts():
+#     cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
+#     cases = pd.Series(cases)
+#     cases.name = 'num_cases'
+#     cases.index.name = 'mut_id'
+#     cases = cases.to_frame()
+#     cases.reset_index(inplace=True)
+#     return cases
+#
+#
+# def create_mut_id(row):
+#     return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
+#
+#
+# def is_in_exon(mut_id, tid):
+#     from geney.Gene import Gene
+#     transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
+#     return int(mut_id.split(':')[2]) in transcript.exonic_indices
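
The new module is organised around three helpers: TCGAGene loads and filters one gene's MAF table, TCGACase works on a single patient's mutations in that gene (VAF, grouping of nearby variants, candidate co-occurring variants), and TCGAMut maps a composite mut_id back to its carriers. A rough sketch of how they compose, with a placeholder gene symbol and variant IDs, and assuming the hard-coded /tamir2 data paths are reachable:

    from geney.tcga_utils import TCGAGene, TCGACase, TCGAMut

    gene = TCGAGene('TP53', exclude_filters=['oxog'])     # placeholder gene symbol
    cases = gene.affected_cases()                         # case_ids with at least one retained variant

    case = TCGACase(gene.get_patient_muts(cases[0]))      # computes per-variant VAF, groups variants within 50 bp
    candidate_pairs = case.find_epistasis(pth=3, rth=0)   # '|'-joined mut_ids for co-occurring, read-supported variants

    # A composite mut_id can then be mapped back to the cases carrying every variant in it.
    mut = TCGAMut('TP53:17:7675088:C:T|TP53:17:7675139:G:A')   # placeholder variant IDs
    carriers = mut.find_affected_patients_list(read_ratio=0.2)
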
{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.1.1
+Version: 1.1.3
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn
@@ -27,6 +27,7 @@ Requires-Dist: joblib ==1.3.2
 Requires-Dist: gtfparse ==1.3.0
 Requires-Dist: sh ==2.0.6
 Requires-Dist: termplotlib ==0.3.9
+Requires-Dist: lifelines
 Requires-Dist: notebook
 Requires-Dist: matplotlib
 Requires-Dist: dask[complete]
{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/RECORD
CHANGED

@@ -9,9 +9,10 @@ geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/netchop.py,sha256=mgKe9Yv2m1SlZUmIXBVNtH-rP5PtBn9SlEi9lE1L0SE,2821
 geney/oncosplice.py,sha256=Fyc_UtAhV3Pv0vk8V55rO_jnb2Dwj5sW98KVwP3PHwU,68964
 geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
-geney/power_utils.py,sha256=
-geney/survival.py,sha256=
+geney/power_utils.py,sha256=WRpqMnqUv1xrAeTduAUhx6YpSEJQci7bC2od12JcVtE,7267
+geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
 geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
+geney/tcga_utils.py,sha256=cX9hbDX-qECyCMSYaBL8r1FWWuju08jQvlPT3q13B3Y,15777
 geney/utils.py,sha256=YOe22gA0Oew9_QEym7ivM9sb7t3wNeHTeiSDBmvOPso,1984
 geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
@@ -39,7 +40,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-geney-1.1.
-geney-1.1.
-geney-1.1.
-geney-1.1.
+geney-1.1.3.dist-info/METADATA,sha256=ec8t6aiZh-SlD6yyhfar7GBs7ljgXw66-TBM7lPXZCo,1130
+geney-1.1.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+geney-1.1.3.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.1.3.dist-info/RECORD,,
{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/WHEEL
File without changes

{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/top_level.txt
File without changes