geney 1.1.1__py2.py3-none-any.whl → 1.1.3__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
geney/power_utils.py CHANGED
@@ -1,6 +1,6 @@
  import subprocess
  import time
- from dask_jobqueue import PBSCluster
+ from dask_jobqueue import PBSCluster, SLURMCluster
  from dask.distributed import Client, wait
  import os
  from tqdm import tqdm
@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2

  def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                          walltime="24:00:00", dashboard_address=":23154",
-                         log_directory="dask-logs"):
+                         log_directory="dask-logs", slurm=False):
      """
      Launch a Dask cluster using PBS.

@@ -54,16 +54,29 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
          tuple: A tuple containing the Dask client and cluster objects.
      """
      try:
-         dask_cluster = PBSCluster(
-             cores=1,
-             memory=memory_size,
-             processes=1,
-             queue=queue,
-             walltime=walltime,
-             scheduler_options={"dashboard_address": dashboard_address},
-             log_directory=log_directory,
-             job_script_prologue=[f"cd {config_setup['BASE']}"]
-         )
+         if slurm:
+             dask_cluster = SLURMCluster(
+                 cores=1,
+                 memory=memory_size,
+                 processes=1,
+                 queue=queue,
+                 walltime=walltime,
+                 scheduler_options={"dashboard_address": dashboard_address},
+                 log_directory=log_directory,
+                 job_script_prologue=[f"cd {config_setup['BASE']}"]
+             )
+         else:
+             dask_cluster = PBSCluster(
+                 cores=1,
+                 memory=memory_size,
+                 processes=1,
+                 queue=queue,
+                 walltime=walltime,
+                 scheduler_options={"dashboard_address": dashboard_address},
+                 log_directory=log_directory,
+                 job_script_prologue=[f"cd {config_setup['BASE']}"]
+             )
+
          dask_cluster.scale(num_workers)
          dask_client = Client(dask_cluster)
          return dask_client, dask_cluster
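
For context, below is a minimal usage sketch of the updated launcher; it is not part of the package diff. It assumes a SLURM submission host and the geney.power_utils import path shown above. Note that the docstring still says PBS, but passing the new slurm=True flag selects dask_jobqueue.SLURMCluster instead of PBSCluster.

    from geney.power_utils import launch_dask_cluster

    # Hypothetical workload: square a range of integers on the cluster.
    client, cluster = launch_dask_cluster(memory_size="3GB", num_workers=10,
                                          queue="tamirQ", walltime="24:00:00",
                                          slurm=True)
    try:
        futures = client.map(lambda x: x ** 2, range(100))
        print(sum(client.gather(futures)))
    finally:
        client.close()
        cluster.close()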
geney/survival.py CHANGED
@@ -12,9 +12,12 @@ from lifelines import CoxPHFitter
  pd.set_option('display.max_columns', None)
  pd.options.mode.chained_assignment = None

- def prepare_clinical_data():
-     CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
-     df = unload_pickle(CLINICAL_DATA_FILE)
+
+ def prepare_clinical_data(df=None):
+     if df is None:
+         CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+         df = unload_pickle(CLINICAL_DATA_FILE)
+
      df.rename(columns={'patient_uuid': 'case_id'}, inplace=True)
      cols = list(df.columns)
      cols_days_to_followup = [col for col in cols if 'days_to_followup' in col] + [col for col in cols if 'days_to_last_followup' in col]
@@ -28,114 +31,94 @@ def prepare_clinical_data():
      df.insert(1, duration_col_label, df.apply(lambda x: max([x[col] for col in cols_duration if not np.isnan(x[col])], default=-1), axis=1))
      df[duration_col_label] /= 365
      df = df.query(f"{duration_col_label}>=0.0")[['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
-     df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
+     # df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
      return df


  class SurvivalAnalysis:
-     def __init__(self, clindf):
-         self.clindf = clindf
+     def __init__(self, clindf=None):
+         self.clindf = prepare_clinical_data(clindf)
+         self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
+         self.df = self.clindf.copy()
+         self.df['group'] = 0
+         self.df.fillna(0, inplace=True)
          self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']

-     def prepare_data(self, case_dict):
-         df1 = self.clindf.query(f"case_id in {case_dict['affected']}")
-         df2 = self.clindf.query(f"case_id in {case_dict['na1']}")
-         df3 = self.clindf.query(f"case_id in {case_dict['na2']}")
-         df1['group'] = 0
-         df2['group'] = 1
-         df3['group'] = 1
-         df = pd.concat([df1, df2, df3])
-         core_features = ['duration', 'event', 'group']
-         treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
-         df = df[treatment_features + core_features]
-         df.fillna(0, inplace=True)
-
-         cap_time = min([df[df.group == 0].duration.max(), df[df.group == 1].duration.max()])
-         df['duration'] = df['duration'].clip(upper=cap_time)
-
-         for col in treatment_features:
-             df.loc[df[col] > 0, col] = 1
+     def generate_clinical_dataframe(self, target_cases, control_cases=None, inplace=False, features_of_interest=[]):
+         df = self.df.copy()
+         df.loc[df[df.case_id.isin(target_cases)].index, 'group'] = 2
+         if control_cases is not None:
+             df.loc[df[df.case_id.isin(control_cases)].index, 'group'] = 1

-         df = df[core_features + [col for col in treatment_features if
-                                  df[col].nunique() > 1 and df[col].value_counts(normalize=True).min() >= 0.01]]
-         return df
+         df = df[df.group > 0]
+         df.group -= 1
+         core_features = ['duration', 'event']
+         df = df[core_features + features_of_interest]

-     def perform_cox_analysis(self, df):
-         return CoxPHFitter().fit(df, 'duration', 'event')
-
-     def get_km_fits(self, df, feature):
-         group_A = df[df[feature] == 0]
-         group_B = df[df[feature] == 1]
-
-         # Create Kaplan-Meier fitter instances
-         kmf_A = KaplanMeierFitter()
-         kmf_B = KaplanMeierFitter()
-
-         # Fit the data
-         if len(group_A) < 5 or len(group_B) < 5:
-             return 0, 0
-         label1, label2 = f'Epistasis ({len(group_A)})', f'CVs Only ({len(group_B)})'
-         self.label1, self.label2 = label1, label2
-         kmf_A.fit(group_A['duration'], group_A['event'], label=self.label1)
-         kmf_B.fit(group_B['duration'], group_B['event'], label=self.label2)
-         return kmf_A, kmf_B
-
-     def get_km_aucs(self, kmf_A, kmf_B):
-         surv_func_A = kmf_A.survival_function_
-         surv_func_B = kmf_B.survival_function_
-
-         # Numerical integration using Trapezoidal rule
-         auc_A = trapz(surv_func_A[self.label1], surv_func_A.index)
-         auc_B = trapz(surv_func_B[self.label2], surv_func_B.index)
-         return auc_A, auc_B
-
-     def plot_km_curve(self, kmf_A, kmf_B):
-         # Plot the survival curves
-         ax = kmf_A.plot()
-         kmf_B.plot(ax=ax)
-
-         # Add labels and title
-         p_value = 0.01
-         ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-         # ax.text(0.45, 0.85, f'AUCe: {auc_A:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-         # ax.text(0.45, 0.85, f'AUCc: {auc_B:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-
-         plt.title('Kaplan-Meier Survival Curves')
-         plt.xlabel('Time')
-         plt.ylabel('Survival Probability')
-         plt.show()
-         return self
-
-     def log_rank(self, df, column):
-         group1, group2 = df[df[column] == 0], df[df[column] == 1]
-         result = logrank_test(group1['duration'], group2['duration'],
-                               event_observed_A=group1['event'],
-                               event_observed_B=group2['event'])
-         return result.p_value
-
-     def run_analysis(self, dict1, event_name):
-         try:
-             df = self.prepare_data(dict1)
-             if len(df[df.group == 0]) < 2 or len(df[df.group == 1]) < 2:
-                 return None
+         for col in self.treatment_features:
+             if col not in df:
+                 continue
+             df.loc[df[col] > 0, col] = 1

-             elif len(df[df.group == 0]) < 10 or len(df[df.group == 1]) < 10:
-                 temp = pd.Series()
-                 temp['mut_id'] = event_name
-                 for column in [c for c in df.columns if c != 'duration' and c != 'event']:
-                     temp[column] = self.log_rank(df, column)
+         df = df[core_features + [col for col in features_of_interest if
+                                  df[col].nunique() > 1]]  # and df[col].value_counts(normalize=True).min() >= 0.01]]
+         return df

+     def kaplan_meier_analysis(self, df, control_label='CV', target_label='Epistasis', feature='group', plot=False, time_cap=False):
+         # Can only be performed on features with two unique values
+         cap_time = df.groupby(feature).duration.max().min()
+         # df['duration'] = df['duration'].clip(upper=cap_time)
+         auc_vals = []
+         results = pd.Series()
+         count = 0
+         for val in [0, 1]:
+             g = df[df[feature] == val]
+             kmf = KaplanMeierFitter()
+             label = f"{control_label} ({len(g)} cases)" if val == 0 else f"{target_label} ({len(g)} cases)"
+             if val == 0:
+                 results[control_label] = len(g)
              else:
-                 auca, aucb = self.get_km_aucs(*self.get_km_fits(df, 'group'))
-                 cph = self.perform_cox_analysis(df)
-                 temp = cph.summary.p
-                 temp.name = ''
-                 temp.index.name = ''
-                 temp['auc_diff'] = auca - aucb
-                 temp['mut_id'] = event_name
-             return temp
-
+                 results[target_label] = len(g)
+
+             kmf.fit(g['duration'], g['event'], label=label)
+             surv_func = kmf.survival_function_
+             auc = trapz(surv_func[label], surv_func.index)
+             auc_vals.append(auc)
+             if plot:
+                 if count == 0:
+                     ax = kmf.plot()
+                 else:
+                     kmf.plot(ax=ax)
+             count += 1
+         p_value = self.log_rank(df[df[feature] == 1], df[df[feature] == 0])
+
+         if plot:
+             ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12,
+                     horizontalalignment='center')
+             plt.title('Kaplan-Meier Survival Curves')
+             plt.xlabel('Time')
+             plt.ylabel('Survival Probability')
+             if time_cap:
+                 plt.xlim([0, cap_time])
+             plt.show()
+
+         results['p_value'] = p_value
+         results['auc_target'] = auc_vals[-1]
+         if len(auc_vals) > 1:
+             results['auc_delta'] = auc_vals[-1] - auc_vals[0]
+             results['auc_control'] = auc_vals[0]
+
+         return results
+
+     def log_rank(self, group1, group2):
+         return logrank_test(group1['duration'], group2['duration'],
+                             event_observed_A=group1['event'],
+                             event_observed_B=group2['event']).p_value
+
+     def perform_cox_analysis(self, df, features_of_interest):
+         # Very simple... will return a series with p values for each feature
+         try:
+             return CoxPHFitter().fit(df[features_of_interest + ['duration', 'event']], 'duration', 'event').summary.p
          except ConvergenceError:
-             return None
-
-
+             print("Convergence Error")
+             return pd.Series()
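
For context, a sketch of how the reworked survival API might be driven; it is not part of the diff. The case-ID lists are hypothetical, SurvivalAnalysis() with no argument assumes the default clinical table hard-coded in prepare_clinical_data() is readable, and 'group' must be included in features_of_interest for kaplan_meier_analysis to find it.

    from geney.survival import SurvivalAnalysis

    target_cases = ['case-0001', 'case-0002']                 # hypothetical case_id values
    control_cases = ['case-0101', 'case-0102', 'case-0103']

    sa = SurvivalAnalysis()                                   # loads and prepares the default clinical table
    df = sa.generate_clinical_dataframe(target_cases,
                                        control_cases=control_cases,
                                        features_of_interest=['group', 'chemotherapy'])
    results = sa.kaplan_meier_analysis(df, plot=False)        # Series with group counts, p_value, AUC values
    cox_p = sa.perform_cox_analysis(df, ['group'])            # Series of Cox p-values (empty on ConvergenceError)
    print(results)
    print(cox_p)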
geney/tcga_utils.py ADDED
@@ -0,0 +1,366 @@
+
+ import pandas as pd
+ import random
+ from pathlib import Path
+ class TCGACase:
+     def __init__(self, df):
+         # Here we get a dataframe of mutations within a gene
+         self.df = df
+         self.calculate_vaf()
+         self.space_variants(spacer_size=50)
+         self.case_id = df.case_id.tolist()[0]
+
+     def space_variants(self, spacer_size=100, group_likelihood_threshold=0):
+         df = self.df
+         if df.empty:
+             df['group'] = 0
+             return self
+         values = sorted(df.Start_Position.unique().tolist())
+         # groups = [list(group) for key, group in groupby(values, key=lambda x: (x - values[values.index(x) - 1] >
+         # spacer_size) if values.index(x) > 0 else False)] Initialize variables
+         groups = []
+         current_group = []
+
+         # Iterate through the values
+         for i in range(len(values)):
+             if i == 0:
+                 current_group.append(values[i])
+             else:
+                 if values[i] - values[i - 1] <= spacer_size:
+                     current_group.append(values[i])
+                 else:
+                     groups.append(current_group)
+                     current_group = [values[i]]
+
+         # Append the last group if it's not empty
+         if current_group:
+             groups.append(current_group)
+
+         df.loc[:, 'group'] = 0
+         for i, g in enumerate(groups):
+             df.loc[df.Start_Position.isin(g), 'group'] = i
+         self.df = df
+         return self
+
+     def calculate_vaf(self):
+         df = self.df
+         df = df[df.t_depth > 0]
+         df.loc[:, 'vaf'] = df.apply(lambda row: row.t_alt_count / row.t_depth, axis=1)
+         self.df = df
+         return self
+
+     def find_overlayed_variants(self):
+         df = self.df
+         mut_counts = df.mut_id.value_counts()
+         mut_counts = mut_counts[mut_counts > 1].index
+
+         small_df = df.groupby('mut_id', as_index=False).agg({
+             't_depth': 'sum',
+             't_alt_count': 'sum',
+             't_ref_count': 'sum',
+         })
+
+         df = df.drop_duplicates(subset='mut_id', keep='first')
+
+         small_df = small_df[small_df.t_depth > 0]
+         small_df['vaf'] = small_df.t_alt_count / small_df.t_depth
+
+         small_df = small_df.set_index('mut_id')
+         df.set_index('mut_id', inplace=True)
+         df.update(small_df)
+         df.reset_index(inplace=True)
+         self.df = df
+         return self
+
+     def find_epistasis(self, pth=3, rth=0):
+         df = self.df
+         if df.empty:
+             return None
+         # df = df[df.t_alt_count > rth].sort_values('Start_Position', ascending=True)
+         df = df[(df.t_alt_count > df.t_ref_count / pth) & (df.t_alt_count >= rth)].sort_values('Start_Position',
+                                                                                                ascending=True)
+
+         # display(df[['mut_id', 't_alt_count', 't_ref_count']])
+
+         # Group by the group_key
+         grouped = df.groupby('group').agg({
+             'mut_id': lambda x: '|'.join(x),
+             't_alt_count': 'mean',
+             't_ref_count': 'mean',
+             'case_id': 'first'
+         }).reset_index(drop=True)
+
+         # Drop the group_key column
+         return grouped[grouped.mut_id.str.contains('\|')][['mut_id', 't_alt_count', 't_ref_count', 'case_id']]
+
+
+ class TCGAGene:
+     def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
+                  valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
+         df = pd.read_csv(cancer_path / gene / 'GeneMutTble.txt',
+                          usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
+                                   'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
+                                   'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                   'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type'] + extra_cols,
+                          low_memory=False).sort_values('Start_Position', ascending=True)
+
+         if df.empty:
+             self.df = df
+
+         else:
+             df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
+
+             if include_filter is not None:
+                 df = df[df.FILTER == include_filter]
+
+             elif exclude_filters is not None:
+                 for exclude_filter in exclude_filters:
+                     df = df[~df.FILTER.str.contains(exclude_filter)]
+
+             if valid_cases is not None:
+                 df = df[df.case_id.isin(valid_cases)]
+
+             df['mut_id'] = df.apply(lambda
+                                         row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                                     axis=1)
+
+             df['ratio'] = df.t_alt_count + df.t_ref_count
+             df = df[df.ratio > 0]
+             df['ratio'] = df.t_alt_count / df.ratio
+             self.df = df
+
+     def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
+         if mut_id is None:
+             return self.df.case_id.unique().tolist()
+         df = self.df
+         df = df[(df.mut_id == mut_id) & (df.ratio >= read_ratio)]
+         for filter in filters:
+             df = df[~df.FILTER.str.contains(filter)]
+         return df.case_id.unique().tolist()
+
+     def get_patient_muts(self, case_id=None):
+         if case_id is None:
+             case_id = random.choice(self.affected_cases())
+         return self.df[self.df.case_id == case_id]
+
+
+ class TCGAMut:
+     def __init__(self, mut_id):
+         self.num_muts = mut_id.count('|') + 1
+         data = []
+         for mut in mut_id.split('|'):
+             data.append(mut.split(':'))
+         data = pd.DataFrame(data, columns=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                            'Tumor_Seq_Allele2'])
+         data.Chromosome = data.apply(lambda row: f'chr{row.Chromosome}', axis=1)
+         data = data.astype({'Start_Position': int})
+         self.gene = data.Gene_name.unique().tolist()[0]
+         self.df = data
+
+     def find_affected_patients(self, read_ratio=0, exclude_filters=None):
+         gene = TCGAGene(self.gene, exclude_filters=exclude_filters).df
+         gene = gene[gene.ratio >= read_ratio]
+         return pd.merge(self.df, gene,
+                         on=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2'])
+
+     def find_affected_patients_list(self, read_ratio=0, exclude_filters=None):
+         df = self.find_affected_patients(read_ratio=read_ratio, exclude_filters=exclude_filters)
+         case_count = df.case_id.value_counts()
+         case_count = case_count[case_count == self.num_muts]
+         return case_count.index.tolist()
+
+
+
+ # CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
+ # CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+ # CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
+ # MAF_FILE_NAME = 'GeneMutTble.txt'
+ # CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
+ # PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
+ # OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
+ # MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
+ # PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
+ # PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
+
+ # def prepare_gene_sets():
+ #     # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
+ #     # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
+ #     # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
+ #     # all_oncogenes = list(set(all_oncogenes))
+ #     return [], [], []
+ #
+ # CLIN_DF = prepare_clinical_data()
+ # TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
+ #
+ #
+ # def generate_survival_quantitative(affected_df, nonaffected_df):
+ #     if affected_df.empty or nonaffected_df.empty:
+ #         return np.nan, np.nan, np.nan
+ #     results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
+ #                            event_observed_A=affected_df['event'],
+ #                            event_observed_B=nonaffected_df['event'])
+ #     p_value = results.p_value
+ #     kmf = KaplanMeierFitter()
+ #     kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
+ #     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+ #     auc1 = np.trapz(surv_probs, times)
+ #     kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
+ #     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+ #     auc2 = np.trapz(surv_probs, times)
+ #     return p_value, auc1, auc2
+ #
+ # def generate_survival_pvalue(affected_df, unaffected_df):
+ #     results = logrank_test(affected_df['duration'], unaffected_df['duration'],
+ #                            event_observed_A=affected_df['event'],
+ #                            event_observed_B=unaffected_df['event'])
+ #
+ #     p_value = results.p_value
+ #     kmf = KaplanMeierFitter()
+ #     # Fit data
+ #     kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
+ #     ax = kmf.plot()
+ #
+ #     kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
+ #     kmf.plot(ax=ax)
+ #     plt.text(5, 0.95, f'pval: {p_value:.3e}')
+ #     plt.show()
+ #     return p_value
+ #
+ # def get_project_prevalence(cases_affected):
+ #     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+ #     prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
+ #     prevalences.fillna(0, inplace=True)
+ #     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+ #     prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
+ #     return prevalences
+ #
+ # def get_project_counts(cases_affected):
+ #     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+ #     prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
+ #     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+ #     prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
+ #     return prevalences
+ #
+ # def get_event_consequence(df):
+ #     assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequenc.'
+ #     return df.iloc[0].Consequence
+ #
+ # def get_dbSNP_id(df):
+ #     return df.iloc[0].dbSNP_RS
+ #
+ # def load_variant_file(gene):
+ #     df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
+ #     df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
+ #     return df
+ #
+ # def find_event_data(event):
+ #     df = load_variant_file(event.gene)
+ #     if df.empty:
+ #         return None
+ #
+ #     df = df.query \
+ #         ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
+ #
+ #     if df.empty:
+ #         return None
+ #
+ #     if event.transcript_id is not None:
+ #         df = df[df.Transcript_ID == event.transcript_id]
+ #     df['mut_id'] = event.event_id
+ #     return df
+ #
+ #
+ # class GEvent:
+ #     def __init__(self, event_id, transcript_id=None):
+ #         self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
+ #         self.transcript_id = transcript_id
+ #         self.chromosome = f'chr{self.chromosome}'
+ #         self.start = int(self.start)
+ #         self.event_id = event_id
+ #
+ #
+ #
+ # def get_okgp_mutation_frequency(mut_id):
+ #     if mut_id in MUTATION_FREQ_DF.index:
+ #         return MUTATION_FREQ_DF.loc[mut_id].cases_affected
+ #     else:
+ #         return 0
+ #
+ # def get_df_filter_info(df):
+ #     filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
+ #     MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
+ #                              'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
+ #     filter_col_name: str = "FILTER_info"  # column name to add to the dataframe
+ #     filter_info_list: list = []
+ #     f_cnr_info = {}
+ #
+ #     for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
+ #         filter_vals = list(df_prj['FILTER'])
+ #         num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
+ #         for filter_val in filter_vals:
+ #             num_pass += ('PASS' in filter_val)
+ #             num_artifacts += any([x in filter_val for x in filter_artifact_values])
+ #             num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
+ #         num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
+ #         f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
+ #     return f_cnr_info
+ #
+ # def yoram_mutid(row):
+ #     return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
+ #
+ #
+ # def annotate_level_two(mut_id, tid):
+ #     mut = GEvent(mut_id, tid)
+ #     df = find_event_data(mut)
+ #
+ #     if df.empty or df is None:
+ #         return None
+ #
+ #     patients_affected = df.cases_affected.unique().tolist()
+ #     p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
+ #     project_prevalences = get_project_prevalence(patients_affected)
+ #     prev_dict = project_prevalences.to_dict().sort()
+ #     project_counts = get_project_counts(patients_affected)
+ #
+ #     s = pd.Series({
+ #         'mut_id': mut_id,
+ #         'yoram_mut_id': yoram_mutid(df.iloc[0]),
+ #         'transcript_id': tid,
+ #         'affected_cases': len(patients_affected),
+ #         'dbSNP_id': get_dbSNP_id(df),
+ #         'consequence': get_event_consequence(df),
+ #         'survival_p_value': p_val,
+ #         'auc_affected': auc_a,
+ #         'auc_nonaffected': auc_n,
+ #         'TSG': contains(TSGS, mut.gene),
+ #         'oncogene': contains(ONCOGENES, mut.gene),
+ #         'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
+ #         'filter_inf': get_df_filter_info(df),
+ #         'strand': df.Strand.unique().tolist()[0],
+ #         'prevalences': prev_dict
+ #     })
+ #
+ #     s['max_prev'] = project_prevalences.max()
+ #     s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
+ #     s = pd.concat([s, project_prevalences, project_counts])
+ #     del df
+ #     return s
+ #
+ # def get_mut_counts():
+ #     cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
+ #     cases = pd.Series(cases)
+ #     cases.name = 'num_cases'
+ #     cases.index.name = 'mut_id'
+ #     cases = cases.to_frame()
+ #     cases.reset_index(inplace=True)
+ #     return cases
+ #
+ #
+ # def create_mut_id(row):
+ #     return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
+ #
+ #
+ # def is_in_exon(mut_id, tid):
+ #     from geney.Gene import Gene
+ #     transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
+ #     return int(mut_id.split(':')[2]) in transcript.exonic_indices
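
For context, a usage sketch for the new module; it is not part of the diff. It assumes the TCGA data paths hard-coded in TCGAGene are reachable, and the gene symbol and pipe-joined mut_id below are illustrative only.

    from geney.tcga_utils import TCGACase, TCGAGene, TCGAMut

    gene = TCGAGene('TP53')                          # reads <cancer_path>/TP53/GeneMutTble.txt
    cases = gene.affected_cases()                    # all case_ids carrying a variant in the gene
    case = TCGACase(gene.get_patient_muts())         # one patient's variants, with VAF and spacing groups
    epistatic = case.find_epistasis()                # closely spaced co-occurring variants, or None

    # Hypothetical composite ID of two variants in the same gene, joined by '|'.
    mut = TCGAMut('TP53:17:7674220:C:T|TP53:17:7674250:G:A')
    carriers = mut.find_affected_patients_list()     # cases carrying every listed variant
    print(len(cases), len(carriers))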
geney-1.1.1.dist-info/METADATA → geney-1.1.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geney
- Version: 1.1.1
+ Version: 1.1.3
  Summary: A Python package for gene expression modeling.
  Home-page: https://github.com/nicolaslynn/geney
  Author: Nicolas Lynn
@@ -27,6 +27,7 @@ Requires-Dist: joblib ==1.3.2
  Requires-Dist: gtfparse ==1.3.0
  Requires-Dist: sh ==2.0.6
  Requires-Dist: termplotlib ==0.3.9
+ Requires-Dist: lifelines
  Requires-Dist: notebook
  Requires-Dist: matplotlib
  Requires-Dist: dask[complete]
geney-1.1.1.dist-info/RECORD → geney-1.1.3.dist-info/RECORD CHANGED
@@ -9,9 +9,10 @@ geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
  geney/netchop.py,sha256=mgKe9Yv2m1SlZUmIXBVNtH-rP5PtBn9SlEi9lE1L0SE,2821
  geney/oncosplice.py,sha256=Fyc_UtAhV3Pv0vk8V55rO_jnb2Dwj5sW98KVwP3PHwU,68964
  geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
- geney/power_utils.py,sha256=OP2GRwOnQ2zhBHN0Rz4EVdZLaj1GV9bR4IDRnaRysWc,6770
- geney/survival.py,sha256=zSEVY3HiKcTSR2jfjcxg_WKOe7GqXLYFby6Mj0hM6bI,6147
+ geney/power_utils.py,sha256=WRpqMnqUv1xrAeTduAUhx6YpSEJQci7bC2od12JcVtE,7267
+ geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
  geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
+ geney/tcga_utils.py,sha256=cX9hbDX-qECyCMSYaBL8r1FWWuju08jQvlPT3q13B3Y,15777
  geney/utils.py,sha256=YOe22gA0Oew9_QEym7ivM9sb7t3wNeHTeiSDBmvOPso,1984
  geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
@@ -39,7 +40,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
  geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- geney-1.1.1.dist-info/METADATA,sha256=cRGsGjHn0ZtWpktPw7AsUQ3Rl4DzHMVNQ7-HYDHPr08,1105
- geney-1.1.1.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
- geney-1.1.1.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
- geney-1.1.1.dist-info/RECORD,,
+ geney-1.1.3.dist-info/METADATA,sha256=ec8t6aiZh-SlD6yyhfar7GBs7ljgXw66-TBM7lPXZCo,1130
+ geney-1.1.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+ geney-1.1.3.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+ geney-1.1.3.dist-info/RECORD,,