geney 1.2.19__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic; see the registry's advisory page for more details.

Files changed (40)
  1. geney/__init__.py +1 -1
  2. geney/oncosplice.py +1 -3
  3. {geney-1.2.19.dist-info → geney-1.2.21.dist-info}/METADATA +1 -1
  4. geney-1.2.21.dist-info/RECORD +19 -0
  5. geney/Gene.py +0 -258
  6. geney/analyzers/__init__.py +0 -0
  7. geney/analyzers/benchmark_clinvar.py +0 -158
  8. geney/analyzers/characterize_epistasis.py +0 -15
  9. geney/analyzers/compare_sets.py +0 -91
  10. geney/analyzers/group_comparison.py +0 -81
  11. geney/analyzers/survival.py +0 -144
  12. geney/analyzers/tcga_annotations.py +0 -194
  13. geney/analyzers/visualize_protein_conservation.py +0 -398
  14. geney/benchmark_clinvar.py +0 -158
  15. geney/compare_sets.py +0 -91
  16. geney/data_parsers/__init__.py +0 -0
  17. geney/data_parsers/gtex.py +0 -68
  18. geney/gtex.py +0 -68
  19. geney/immunotherapy/__init__.py +0 -0
  20. geney/immunotherapy/netchop.py +0 -78
  21. geney/mutations/__init__.py +0 -0
  22. geney/mutations/variant_utils.py +0 -125
  23. geney/netchop.py +0 -79
  24. geney/oncosplice/__init__.py +0 -0
  25. geney/oncosplice_mouse.py +0 -277
  26. geney/oncosplice_pipeline.py +0 -1588
  27. geney/performance_utils.py +0 -138
  28. geney/pipelines/__init__.py +0 -0
  29. geney/pipelines/dask_utils.py +0 -153
  30. geney/splicing/__init__.py +0 -2
  31. geney/splicing/spliceai_utils.py +0 -253
  32. geney/splicing/splicing_isoform_utils.py +0 -0
  33. geney/splicing/splicing_utils.py +0 -366
  34. geney/survival.py +0 -124
  35. geney/tcga_annotations.py +0 -352
  36. geney/translation_termination/__init__.py +0 -0
  37. geney/translation_termination/tts_utils.py +0 -0
  38. geney-1.2.19.dist-info/RECORD +0 -52
  39. {geney-1.2.19.dist-info → geney-1.2.21.dist-info}/WHEEL +0 -0
  40. {geney-1.2.19.dist-info → geney-1.2.21.dist-info}/top_level.txt +0 -0
@@ -1,144 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- import matplotlib.pyplot as plt
4
- from pathlib import Path
5
- from scipy.integrate import trapz
6
- from geney.utils import unload_pickle, unload_json, contains
7
- from lifelines.exceptions import ConvergenceError
8
- from lifelines import KaplanMeierFitter
9
- from lifelines.statistics import logrank_test
10
- from lifelines import CoxPHFitter
11
-
12
- pd.set_option('display.max_columns', None)
13
- pd.options.mode.chained_assignment = None
14
-
15
- # epistasis_tracker = unload_pickle('epistasis2case_tracker.pkl')
16
- # mutation_tracker = unload_pickle('mutation2case_tracker.pkl')
17
-
18
- def prepare_clinical_data():
19
- CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
20
- df = unload_pickle(CLINICAL_DATA_FILE)
21
- df.rename(columns={'patient_uuid': 'case_id'}, inplace=True)
22
- cols = list(df.columns)
23
- cols_days_to_followup = [col for col in cols if 'days_to_followup' in col] + [col for col in cols if 'days_to_last_followup' in col]
24
- cols_days_to_know_alive = [col for col in cols if 'days_to_know_alive' in col] + [col for col in cols if 'days_to_last_known_alive' in col]
25
- cols_days_to_death = [col for col in cols if 'days_to_death' in col]
26
- cols_duration = cols_days_to_followup + cols_days_to_know_alive + cols_days_to_death
27
- col_vital_status = 'days_to_death'
28
- event_col_label = 'event'
29
- duration_col_label = 'duration'
30
- df.insert(1, event_col_label, df.apply(lambda x: int(not np.isnan(x[col_vital_status])), axis=1))
31
- df.insert(1, duration_col_label, df.apply(lambda x: max([x[col] for col in cols_duration if not np.isnan(x[col])], default=-1), axis=1))
32
- df[duration_col_label] /= 365
33
- df = df.query(f"{duration_col_label}>=0.0")[['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
34
- df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
35
- return df
36
-
37
-
38
- class SurvivalAnalysis:
39
- def __init__(self, clindf):
40
- self.clindf = clindf
41
- self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
42
-
43
- def prepare_data(self, case_dict):
44
- df1 = self.clindf.query(f"case_id in {case_dict['affected']}")
45
- df2 = self.clindf.query(f"case_id in {case_dict['na1']}")
46
- df3 = self.clindf.query(f"case_id in {case_dict['na2']}")
47
- df1['group'] = 0
48
- df2['group'] = 1
49
- df3['group'] = 1
50
- df = pd.concat([df1, df2, df3])
51
- core_features = ['duration', 'event', 'group']
52
- treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
53
- df = df[treatment_features + core_features]
54
- df.fillna(0, inplace=True)
55
-
56
- cap_time = min([df[df.group == 0].duration.max(), df[df.group == 1].duration.max()])
57
- df['duration'] = df['duration'].clip(upper=cap_time)
58
-
59
- for col in treatment_features:
60
- df.loc[df[col] > 0, col] = 1
61
-
62
- df = df[core_features + [col for col in treatment_features if
63
- df[col].nunique() > 1 and df[col].value_counts(normalize=True).min() >= 0.01]]
64
- return df
65
-
66
- def perform_cox_analysis(self, df):
67
- return CoxPHFitter().fit(df, 'duration', 'event')
68
-
69
- def get_km_fits(self, df, feature):
70
- group_A = df[df[feature] == 0]
71
- group_B = df[df[feature] == 1]
72
-
73
- # Create Kaplan-Meier fitter instances
74
- kmf_A = KaplanMeierFitter()
75
- kmf_B = KaplanMeierFitter()
76
-
77
- # Fit the data
78
- if len(group_A) < 5 or len(group_B) < 5:
79
- return 0, 0
80
- label1, label2 = f'Epistasis ({len(group_A)})', f'CVs Only ({len(group_B)})'
81
- self.label1, self.label2 = label1, label2
82
- kmf_A.fit(group_A['duration'], group_A['event'], label=self.label1)
83
- kmf_B.fit(group_B['duration'], group_B['event'], label=self.label2)
84
- return kmf_A, kmf_B
85
-
86
- def get_km_aucs(self, kmf_A, kmf_B):
87
- surv_func_A = kmf_A.survival_function_
88
- surv_func_B = kmf_B.survival_function_
89
-
90
- # Numerical integration using Trapezoidal rule
91
- auc_A = trapz(surv_func_A[self.label1], surv_func_A.index)
92
- auc_B = trapz(surv_func_B[self.label2], surv_func_B.index)
93
- return auc_A, auc_B
94
-
95
- def plot_km_curve(self, kmf_A, kmf_B):
96
- # Plot the survival curves
97
- ax = kmf_A.plot()
98
- kmf_B.plot(ax=ax)
99
-
100
- # Add labels and title
101
- p_value = 0.01
102
- ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
103
- # ax.text(0.45, 0.85, f'AUCe: {auc_A:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
104
- # ax.text(0.45, 0.85, f'AUCc: {auc_B:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
105
-
106
- plt.title('Kaplan-Meier Survival Curves')
107
- plt.xlabel('Time')
108
- plt.ylabel('Survival Probability')
109
- plt.show()
110
- return self
111
-
112
- def log_rank(self, df, column):
113
- group1, group2 = df[df[column] == 0], df[df[column] == 1]
114
- result = logrank_test(group1['duration'], group2['duration'],
115
- event_observed_A=group1['event'],
116
- event_observed_B=group2['event'])
117
- return result.p_value
118
-
119
- def run_analysis(self, dict1, event_name):
120
- try:
121
- df = self.prepare_data(dict1)
122
- if len(df[df.group == 0]) < 2 or len(df[df.group == 1]) < 2:
123
- return None
124
-
125
- elif len(df[df.group == 0]) < 10 or len(df[df.group == 1]) < 10:
126
- temp = pd.Series()
127
- temp['mut_id'] = event_name
128
- for column in [c for c in df.columns if c != 'duration' and c != 'event']:
129
- temp[column] = self.log_rank(df, column)
130
-
131
- else:
132
- auca, aucb = self.get_km_aucs(*self.get_km_fits(df, 'group'))
133
- cph = self.perform_cox_analysis(df)
134
- temp = cph.summary.p
135
- temp.name = ''
136
- temp.index.name = ''
137
- temp['auc_diff'] = auca - aucb
138
- temp['mut_id'] = event_name
139
- return temp
140
-
141
- except ConvergenceError:
142
- return None
143
-
144
-
@@ -1,194 +0,0 @@
1
-
2
- # CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
3
- # CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
4
- # CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
5
- # MAF_FILE_NAME = 'GeneMutTble.txt'
6
- # CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
7
- # PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
8
- # OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
9
- # MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
10
- # PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
11
- # PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
12
-
13
- def prepare_gene_sets():
14
- # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
15
- # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
16
- # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
17
- # all_oncogenes = list(set(all_oncogenes))
18
- return [], [], []
19
-
20
- CLIN_DF = prepare_clinical_data()
21
- TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
22
-
23
-
24
- def generate_survival_quantitative(affected_df, nonaffected_df):
25
- if affected_df.empty or nonaffected_df.empty:
26
- return np.nan, np.nan, np.nan
27
- results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
28
- event_observed_A=affected_df['event'],
29
- event_observed_B=nonaffected_df['event'])
30
- p_value = results.p_value
31
- kmf = KaplanMeierFitter()
32
- kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
33
- times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
34
- auc1 = np.trapz(surv_probs, times)
35
- kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
36
- times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
37
- auc2 = np.trapz(surv_probs, times)
38
- return p_value, auc1, auc2
39
-
40
- def generate_survival_pvalue(affected_df, unaffected_df):
41
- results = logrank_test(affected_df['duration'], unaffected_df['duration'],
42
- event_observed_A=affected_df['event'],
43
- event_observed_B=unaffected_df['event'])
44
-
45
- p_value = results.p_value
46
- kmf = KaplanMeierFitter()
47
- # Fit data
48
- kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
49
- ax = kmf.plot()
50
-
51
- kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
52
- kmf.plot(ax=ax)
53
- plt.text(5, 0.95, f'pval: {p_value:.3e}')
54
- plt.show()
55
- return p_value
56
-
57
- def get_project_prevalence(cases_affected):
58
- ca = [c for c in cases_affected if c in CASE_TRACKER.index]
59
- prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
60
- prevalences.fillna(0, inplace=True)
61
- prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
62
- prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
63
- return prevalences
64
-
65
- def get_project_counts(cases_affected):
66
- ca = [c for c in cases_affected if c in CASE_TRACKER.index]
67
- prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
68
- prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
69
- prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
70
- return prevalences
71
-
72
- def get_event_consequence(df):
73
- assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequenc.'
74
- return df.iloc[0].Consequence
75
-
76
- def get_dbSNP_id(df):
77
- return df.iloc[0].dbSNP_RS
78
-
79
- def load_variant_file(gene):
80
- df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
81
- df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
82
- return df
83
-
84
- def find_event_data(event):
85
- df = load_variant_file(event.gene)
86
- if df.empty:
87
- return None
88
-
89
- df = df.query \
90
- ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
91
-
92
- if df.empty:
93
- return None
94
-
95
- if event.transcript_id is not None:
96
- df = df[df.Transcript_ID == event.transcript_id]
97
- df['mut_id'] = event.event_id
98
- return df
99
-
100
-
101
- class GEvent:
102
- def __init__(self, event_id, transcript_id=None):
103
- self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
104
- self.transcript_id = transcript_id
105
- self.chromosome = f'chr{self.chromosome}'
106
- self.start = int(self.start)
107
- self.event_id = event_id
108
-
109
-
110
-
111
- def get_okgp_mutation_frequency(mut_id):
112
- if mut_id in MUTATION_FREQ_DF.index:
113
- return MUTATION_FREQ_DF.loc[mut_id].cases_affected
114
- else:
115
- return 0
116
-
117
- def get_df_filter_info(df):
118
- filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
119
- MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
120
- 'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
121
- filter_col_name: str = "FILTER_info" # column name to add to the dataframe
122
- filter_info_list: list = []
123
- f_cnr_info = {}
124
-
125
- for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
126
- filter_vals = list(df_prj['FILTER'])
127
- num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
128
- for filter_val in filter_vals:
129
- num_pass += ('PASS' in filter_val)
130
- num_artifacts += any([x in filter_val for x in filter_artifact_values])
131
- num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
132
- num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
133
- f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
134
- return f_cnr_info
135
-
136
- def yoram_mutid(row):
137
- return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
138
-
139
-
140
- def annotate_level_two(mut_id, tid):
141
- mut = GEvent(mut_id, tid)
142
- df = find_event_data(mut)
143
-
144
- if df.empty or df is None:
145
- return None
146
-
147
- patients_affected = df.cases_affected.unique().tolist()
148
- p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
149
- project_prevalences = get_project_prevalence(patients_affected)
150
- prev_dict = project_prevalences.to_dict().sort()
151
- project_counts = get_project_counts(patients_affected)
152
-
153
- s = pd.Series({
154
- 'mut_id': mut_id,
155
- 'yoram_mut_id': yoram_mutid(df.iloc[0]),
156
- 'transcript_id': tid,
157
- 'affected_cases': len(patients_affected),
158
- 'dbSNP_id': get_dbSNP_id(df),
159
- 'consequence': get_event_consequence(df),
160
- 'survival_p_value': p_val,
161
- 'auc_affected': auc_a,
162
- 'auc_nonaffected': auc_n,
163
- 'TSG': contains(TSGS, mut.gene),
164
- 'oncogene': contains(ONCOGENES, mut.gene),
165
- 'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
166
- 'filter_inf': get_df_filter_info(df),
167
- 'strand': df.Strand.unique().tolist()[0],
168
- 'prevalences': prev_dict
169
- })
170
-
171
- s['max_prev'] = project_prevalences.max()
172
- s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
173
- s = pd.concat([s, project_prevalences, project_counts])
174
- del df
175
- return s
176
-
177
- def get_mut_counts():
178
- cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
179
- cases = pd.Series(cases)
180
- cases.name = 'num_cases'
181
- cases.index.name = 'mut_id'
182
- cases = cases.to_frame()
183
- cases.reset_index(inplace=True)
184
- return cases
185
-
186
-
187
- def create_mut_id(row):
188
- return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
189
-
190
-
191
- def is_in_exon(mut_id, tid):
192
- from geney.Gene import Gene
193
- transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
194
- return int(mut_id.split(':')[2]) in transcript.exonic_indices