PyPI - geney - Versions diffs - 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl - Mend

geney 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (38)

{geney-1.2.20.dist-info → geney-1.2.21.dist-info}/METADATA +1 -1
geney-1.2.21.dist-info/RECORD +19 -0
geney/Gene.py +0 -258
geney/analyzers/__init__.py +0 -0
geney/analyzers/benchmark_clinvar.py +0 -158
geney/analyzers/characterize_epistasis.py +0 -15
geney/analyzers/compare_sets.py +0 -91
geney/analyzers/group_comparison.py +0 -81
geney/analyzers/survival.py +0 -144
geney/analyzers/tcga_annotations.py +0 -194
geney/analyzers/visualize_protein_conservation.py +0 -398
geney/benchmark_clinvar.py +0 -158
geney/compare_sets.py +0 -91
geney/data_parsers/__init__.py +0 -0
geney/data_parsers/gtex.py +0 -68
geney/gtex.py +0 -68
geney/immunotherapy/__init__.py +0 -0
geney/immunotherapy/netchop.py +0 -78
geney/mutations/__init__.py +0 -0
geney/mutations/variant_utils.py +0 -125
geney/netchop.py +0 -79
geney/oncosplice/__init__.py +0 -0
geney/oncosplice_mouse.py +0 -277
geney/oncosplice_pipeline.py +0 -1588
geney/performance_utils.py +0 -138
geney/pipelines/__init__.py +0 -0
geney/pipelines/dask_utils.py +0 -153
geney/splicing/__init__.py +0 -2
geney/splicing/spliceai_utils.py +0 -253
geney/splicing/splicing_isoform_utils.py +0 -0
geney/splicing/splicing_utils.py +0 -366
geney/survival.py +0 -124
geney/tcga_annotations.py +0 -352
geney/translation_termination/__init__.py +0 -0
geney/translation_termination/tts_utils.py +0 -0
geney-1.2.20.dist-info/RECORD +0 -52
{geney-1.2.20.dist-info → geney-1.2.21.dist-info}/WHEEL +0 -0
{geney-1.2.20.dist-info → geney-1.2.21.dist-info}/top_level.txt +0 -0

geney/analyzers/survival.py DELETED Viewed

@@ -1,144 +0,0 @@
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-from pathlib import Path
-from scipy.integrate import trapz
-from geney.utils import unload_pickle, unload_json, contains
-from lifelines.exceptions import ConvergenceError
-from lifelines import KaplanMeierFitter
-from lifelines.statistics import logrank_test
-from lifelines import CoxPHFitter
-pd.set_option('display.max_columns', None)
-pd.options.mode.chained_assignment = None
-# epistasis_tracker = unload_pickle('epistasis2case_tracker.pkl')
-# mutation_tracker = unload_pickle('mutation2case_tracker.pkl')
-def prepare_clinical_data():
-    CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
-    df = unload_pickle(CLINICAL_DATA_FILE)
-    df.rename(columns={'patient_uuid': 'case_id'}, inplace=True)
-    cols = list(df.columns)
-    cols_days_to_followup = [col for col in cols if 'days_to_followup' in col] + [col for col in cols if 'days_to_last_followup' in col]
-    cols_days_to_know_alive = [col for col in cols if 'days_to_know_alive' in col] + [col for col in cols if 'days_to_last_known_alive' in col]
-    cols_days_to_death = [col for col in cols if 'days_to_death' in col]
-    cols_duration = cols_days_to_followup + cols_days_to_know_alive + cols_days_to_death
-    col_vital_status = 'days_to_death'
-    event_col_label = 'event'
-    duration_col_label = 'duration'
-    df.insert(1, event_col_label, df.apply(lambda x: int(not np.isnan(x[col_vital_status])), axis=1))
-    df.insert(1, duration_col_label, df.apply(lambda x: max([x[col] for col in cols_duration if not np.isnan(x[col])], default=-1), axis=1))
-    df[duration_col_label] /= 365
-    df = df.query(f"{duration_col_label}>=0.0")[['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
-    df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
-    return df
-class SurvivalAnalysis:
-    def __init__(self, clindf):
-        self.clindf = clindf
-        self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
-    def prepare_data(self, case_dict):
-        df1 = self.clindf.query(f"case_id in {case_dict['affected']}")
-        df2 = self.clindf.query(f"case_id in {case_dict['na1']}")
-        df3 = self.clindf.query(f"case_id in {case_dict['na2']}")
-        df1['group'] = 0
-        df2['group'] = 1
-        df3['group'] = 1
-        df = pd.concat([df1, df2, df3])
-        core_features = ['duration', 'event', 'group']
-        treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
-        df = df[treatment_features + core_features]
-        df.fillna(0, inplace=True)
-        cap_time = min([df[df.group == 0].duration.max(), df[df.group == 1].duration.max()])
-        df['duration'] = df['duration'].clip(upper=cap_time)
-        for col in treatment_features:
-            df.loc[df[col] > 0, col] = 1
-        df = df[core_features + [col for col in treatment_features if
-                                 df[col].nunique() > 1 and df[col].value_counts(normalize=True).min() >= 0.01]]
-        return df
-    def perform_cox_analysis(self, df):
-        return CoxPHFitter().fit(df, 'duration', 'event')
-    def get_km_fits(self, df, feature):
-        group_A = df[df[feature] == 0]
-        group_B = df[df[feature] == 1]
-        # Create Kaplan-Meier fitter instances
-        kmf_A = KaplanMeierFitter()
-        kmf_B = KaplanMeierFitter()
-        # Fit the data
-        if len(group_A) < 5 or len(group_B) < 5:
-            return 0, 0
-        label1, label2 = f'Epistasis ({len(group_A)})', f'CVs Only ({len(group_B)})'
-        self.label1, self.label2 = label1, label2
-        kmf_A.fit(group_A['duration'], group_A['event'], label=self.label1)
-        kmf_B.fit(group_B['duration'], group_B['event'], label=self.label2)
-        return kmf_A, kmf_B
-    def get_km_aucs(self, kmf_A, kmf_B):
-        surv_func_A = kmf_A.survival_function_
-        surv_func_B = kmf_B.survival_function_
-        # Numerical integration using Trapezoidal rule
-        auc_A = trapz(surv_func_A[self.label1], surv_func_A.index)
-        auc_B = trapz(surv_func_B[self.label2], surv_func_B.index)
-        return auc_A, auc_B
-    def plot_km_curve(self, kmf_A, kmf_B):
-        # Plot the survival curves
-        ax = kmf_A.plot()
-        kmf_B.plot(ax=ax)
-        # Add labels and title
-        p_value = 0.01
-        ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        # ax.text(0.45, 0.85, f'AUCe: {auc_A:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        # ax.text(0.45, 0.85, f'AUCc: {auc_B:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        plt.title('Kaplan-Meier Survival Curves')
-        plt.xlabel('Time')
-        plt.ylabel('Survival Probability')
-        plt.show()
-        return self
-    def log_rank(self, df, column):
-        group1, group2 = df[df[column] == 0], df[df[column] == 1]
-        result = logrank_test(group1['duration'], group2['duration'],
-                              event_observed_A=group1['event'],
-                              event_observed_B=group2['event'])
-        return result.p_value
-    def run_analysis(self, dict1, event_name):
-        try:
-            df = self.prepare_data(dict1)
-            if len(df[df.group == 0]) < 2 or len(df[df.group == 1]) < 2:
-                return None
-            elif len(df[df.group == 0]) < 10 or len(df[df.group == 1]) < 10:
-                temp = pd.Series()
-                temp['mut_id'] = event_name
-                for column in [c for c in df.columns if c != 'duration' and c != 'event']:
-                    temp[column] = self.log_rank(df, column)
-            else:
-                auca, aucb = self.get_km_aucs(*self.get_km_fits(df, 'group'))
-                cph = self.perform_cox_analysis(df)
-                temp = cph.summary.p
-                temp.name = ''
-                temp.index.name = ''
-                temp['auc_diff'] = auca - aucb
-                temp['mut_id'] = event_name
-            return temp
-        except ConvergenceError:
-            return None

geney/analyzers/tcga_annotations.py DELETED Viewed

@@ -1,194 +0,0 @@
-# CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
-# CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
-# CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
-# MAF_FILE_NAME = 'GeneMutTble.txt'
-# CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
-# PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
-# OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
-# MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
-# PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
-# PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
-def prepare_gene_sets():
-    # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
-    # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
-    # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
-    # all_oncogenes = list(set(all_oncogenes))
-    return [], [], []
-CLIN_DF = prepare_clinical_data()
-TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
-def generate_survival_quantitative(affected_df, nonaffected_df):
-    if affected_df.empty or nonaffected_df.empty:
-        return np.nan, np.nan, np.nan
-    results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
-                           event_observed_A=affected_df['event'],
-                           event_observed_B=nonaffected_df['event'])
-    p_value = results.p_value
-    kmf = KaplanMeierFitter()
-    kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
-    times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
-    auc1 = np.trapz(surv_probs, times)
-    kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
-    times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
-    auc2 = np.trapz(surv_probs, times)
-    return p_value, auc1, auc2
-def generate_survival_pvalue(affected_df, unaffected_df):
-    results = logrank_test(affected_df['duration'], unaffected_df['duration'],
-                           event_observed_A=affected_df['event'],
-                           event_observed_B=unaffected_df['event'])
-    p_value = results.p_value
-    kmf = KaplanMeierFitter()
-    # Fit data
-    kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
-    ax = kmf.plot()
-    kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
-    kmf.plot(ax=ax)
-    plt.text(5, 0.95, f'pval: {p_value:.3e}')
-    plt.show()
-    return p_value
-def get_project_prevalence(cases_affected):
-    ca = [c for c in cases_affected if c in CASE_TRACKER.index]
-    prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
-    prevalences.fillna(0, inplace=True)
-    prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
-    prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
-    return prevalences
-def get_project_counts(cases_affected):
-    ca = [c for c in cases_affected if c in CASE_TRACKER.index]
-    prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
-    prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
-    prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
-    return prevalences
-def get_event_consequence(df):
-    assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequenc.'
-    return df.iloc[0].Consequence
-def get_dbSNP_id(df):
-    return df.iloc[0].dbSNP_RS
-def load_variant_file(gene):
-    df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
-    df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
-    return df
-def find_event_data(event):
-    df = load_variant_file(event.gene)
-    if df.empty:
-        return None
-    df = df.query \
-        ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
-    if df.empty:
-        return None
-    if event.transcript_id is not None:
-        df = df[df.Transcript_ID == event.transcript_id]
-    df['mut_id'] = event.event_id
-    return df
-class GEvent:
-    def __init__(self, event_id, transcript_id=None):
-        self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
-        self.transcript_id = transcript_id
-        self.chromosome = f'chr{self.chromosome}'
-        self.start = int(self.start)
-        self.event_id = event_id
-def get_okgp_mutation_frequency(mut_id):
-    if mut_id in MUTATION_FREQ_DF.index:
-        return MUTATION_FREQ_DF.loc[mut_id].cases_affected
-    else:
-        return 0
-def get_df_filter_info(df):
-    filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
-    MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
-                             'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
-    filter_col_name: str = "FILTER_info"  # column name to add to the dataframe
-    filter_info_list: list = []
-    f_cnr_info = {}
-    for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
-        filter_vals = list(df_prj['FILTER'])
-        num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
-        for filter_val in filter_vals:
-            num_pass += ('PASS' in filter_val)
-            num_artifacts += any([x in filter_val for x in filter_artifact_values])
-            num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
-        num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
-        f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
-    return f_cnr_info
-def yoram_mutid(row):
-    return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
-def annotate_level_two(mut_id, tid):
-    mut = GEvent(mut_id, tid)
-    df = find_event_data(mut)
-    if df.empty or df is None:
-        return None
-    patients_affected = df.cases_affected.unique().tolist()
-    p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
-    project_prevalences = get_project_prevalence(patients_affected)
-    prev_dict = project_prevalences.to_dict().sort()
-    project_counts = get_project_counts(patients_affected)
-    s = pd.Series({
-        'mut_id': mut_id,
-        'yoram_mut_id': yoram_mutid(df.iloc[0]),
-        'transcript_id': tid,
-        'affected_cases': len(patients_affected),
-        'dbSNP_id': get_dbSNP_id(df),
-        'consequence': get_event_consequence(df),
-        'survival_p_value': p_val,
-        'auc_affected': auc_a,
-        'auc_nonaffected': auc_n,
-        'TSG': contains(TSGS, mut.gene),
-        'oncogene': contains(ONCOGENES, mut.gene),
-        'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
-        'filter_inf': get_df_filter_info(df),
-        'strand': df.Strand.unique().tolist()[0],
-        'prevalences': prev_dict
-    })
-    s['max_prev'] = project_prevalences.max()
-    s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
-    s = pd.concat([s, project_prevalences, project_counts])
-    del df
-    return s
-def get_mut_counts():
-    cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
-    cases = pd.Series(cases)
-    cases.name = 'num_cases'
-    cases.index.name = 'mut_id'
-    cases = cases.to_frame()
-    cases.reset_index(inplace=True)
-    return cases
-def create_mut_id(row):
-    return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
-def is_in_exon(mut_id, tid):
-    from geney.Gene import Gene
-    transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
-    return int(mut_id.split(':')[2]) in transcript.exonic_indices

geney 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl

Potentially problematic release.

geney 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl