geney 1.1.1__py2.py3-none-any.whl → 1.1.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/power_utils.py +25 -12
- geney/survival.py +85 -102
- geney/tcga_utils.py +366 -0
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/METADATA +2 -1
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/RECORD +7 -6
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/WHEEL +0 -0
- {geney-1.1.1.dist-info → geney-1.1.3.dist-info}/top_level.txt +0 -0
geney/power_utils.py
CHANGED

@@ -1,6 +1,6 @@
 import subprocess
 import time
-from dask_jobqueue import PBSCluster
+from dask_jobqueue import PBSCluster, SLURMCluster
 from dask.distributed import Client, wait
 import os
 from tqdm import tqdm
@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2

 def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                         walltime="24:00:00", dashboard_address=":23154",
-                        log_directory="dask-logs"):
+                        log_directory="dask-logs", slurm=False):
     """
     Launch a Dask cluster using PBS.

@@ -54,16 +54,29 @@
     tuple: A tuple containing the Dask client and cluster objects.
     """
     try:
-[old lines 57-66 not shown in this view]
+        if slurm:
+            dask_cluster = SLURMCluster(
+                cores=1,
+                memory=memory_size,
+                processes=1,
+                queue=queue,
+                walltime=walltime,
+                scheduler_options={"dashboard_address": dashboard_address},
+                log_directory=log_directory,
+                job_script_prologue=[f"cd {config_setup['BASE']}"]
+            )
+        else:
+            dask_cluster = PBSCluster(
+                cores=1,
+                memory=memory_size,
+                processes=1,
+                queue=queue,
+                walltime=walltime,
+                scheduler_options={"dashboard_address": dashboard_address},
+                log_directory=log_directory,
+                job_script_prologue=[f"cd {config_setup['BASE']}"]
+            )
+
         dask_cluster.scale(num_workers)
         dask_client = Client(dask_cluster)
         return dask_client, dask_cluster
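
The change above keeps the PBS launcher as the default and routes the same call through dask_jobqueue's SLURMCluster when slurm=True. A rough usage sketch follows; it is not part of the package, the queue name and sizes are placeholders, and config_setup['BASE'] must resolve on the target cluster:

    from geney.power_utils import launch_dask_cluster

    # Default behaviour (unchanged from 1.1.1): submit workers through PBS.
    client, cluster = launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ")

    # New in 1.1.3: the same arguments, routed to SLURM instead of PBS.
    client, cluster = launch_dask_cluster(memory_size="3GB", num_workers=10,
                                          queue="tamirQ", slurm=True)

    # Run any Dask workload against the returned client, then tear the cluster down.
    futures = client.map(pow, range(100), [2] * 100)
    results = client.gather(futures)
    client.close()
    cluster.close()
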
geney/survival.py
CHANGED

@@ -12,9 +12,12 @@ from lifelines import CoxPHFitter
 pd.set_option('display.max_columns', None)
 pd.options.mode.chained_assignment = None

-[old lines 15-16 not shown in this view]
-df
+
+def prepare_clinical_data(df=None):
+    if df is None:
+        CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+        df = unload_pickle(CLINICAL_DATA_FILE)
+
     df.rename(columns={'patient_uuid': 'case_id'}, inplace=True)
     cols = list(df.columns)
     cols_days_to_followup = [col for col in cols if 'days_to_followup' in col] + [col for col in cols if 'days_to_last_followup' in col]
@@ -28,114 +31,94 @@ def prepare_clinical_data():
     df.insert(1, duration_col_label, df.apply(lambda x: max([x[col] for col in cols_duration if not np.isnan(x[col])], default=-1), axis=1))
     df[duration_col_label] /= 365
     df = df.query(f"{duration_col_label}>=0.0")[['duration', 'event', 'case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy', 'Proj_name']]
-    df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
+    # df.to_csv('/tamir2/nicolaslynn/data/tcga_metadata/tcga_clinical_data.csv')
     return df


 class SurvivalAnalysis:
-    def __init__(self, clindf):
-        self.clindf = clindf
+    def __init__(self, clindf=None):
+        self.clindf = prepare_clinical_data(clindf)
+        self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
+        self.df = self.clindf.copy()
+        self.df['group'] = 0
+        self.df.fillna(0, inplace=True)
         self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']

-    def
-[old lines 41-44 not shown in this view]
-        df2['group'] = 1
-        df3['group'] = 1
-        df = pd.concat([df1, df2, df3])
-        core_features = ['duration', 'event', 'group']
-        treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy', 'targeted_molecular_therapy']
-        df = df[treatment_features + core_features]
-        df.fillna(0, inplace=True)
-
-        cap_time = min([df[df.group == 0].duration.max(), df[df.group == 1].duration.max()])
-        df['duration'] = df['duration'].clip(upper=cap_time)
-
-        for col in treatment_features:
-            df.loc[df[col] > 0, col] = 1
+    def generate_clinical_dataframe(self, target_cases, control_cases=None, inplace=False, features_of_interest=[]):
+        df = self.df.copy()
+        df.loc[df[df.case_id.isin(target_cases)].index, 'group'] = 2
+        if control_cases is not None:
+            df.loc[df[df.case_id.isin(control_cases)].index, 'group'] = 1

-        df = df[
-[old lines 60-61 not shown in this view]
+        df = df[df.group > 0]
+        df.group -= 1
+        core_features = ['duration', 'event']
+        df = df[core_features + features_of_interest]

-[old lines 63-66 not shown in this view]
-        group_A = df[df[feature] == 0]
-        group_B = df[df[feature] == 1]
-
-        # Create Kaplan-Meier fitter instances
-        kmf_A = KaplanMeierFitter()
-        kmf_B = KaplanMeierFitter()
-
-        # Fit the data
-        if len(group_A) < 5 or len(group_B) < 5:
-            return 0, 0
-        label1, label2 = f'Epistasis ({len(group_A)})', f'CVs Only ({len(group_B)})'
-        self.label1, self.label2 = label1, label2
-        kmf_A.fit(group_A['duration'], group_A['event'], label=self.label1)
-        kmf_B.fit(group_B['duration'], group_B['event'], label=self.label2)
-        return kmf_A, kmf_B
-
-    def get_km_aucs(self, kmf_A, kmf_B):
-        surv_func_A = kmf_A.survival_function_
-        surv_func_B = kmf_B.survival_function_
-
-        # Numerical integration using Trapezoidal rule
-        auc_A = trapz(surv_func_A[self.label1], surv_func_A.index)
-        auc_B = trapz(surv_func_B[self.label2], surv_func_B.index)
-        return auc_A, auc_B
-
-    def plot_km_curve(self, kmf_A, kmf_B):
-        # Plot the survival curves
-        ax = kmf_A.plot()
-        kmf_B.plot(ax=ax)
-
-        # Add labels and title
-        p_value = 0.01
-        ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        # ax.text(0.45, 0.85, f'AUCe: {auc_A:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-        # ax.text(0.45, 0.85, f'AUCc: {auc_B:.4f}', transform=ax.transAxes, fontsize=12, horizontalalignment='center')
-
-        plt.title('Kaplan-Meier Survival Curves')
-        plt.xlabel('Time')
-        plt.ylabel('Survival Probability')
-        plt.show()
-        return self
-
-    def log_rank(self, df, column):
-        group1, group2 = df[df[column] == 0], df[df[column] == 1]
-        result = logrank_test(group1['duration'], group2['duration'],
-                              event_observed_A=group1['event'],
-                              event_observed_B=group2['event'])
-        return result.p_value
-
-    def run_analysis(self, dict1, event_name):
-        try:
-            df = self.prepare_data(dict1)
-            if len(df[df.group == 0]) < 2 or len(df[df.group == 1]) < 2:
-                return None
+        for col in self.treatment_features:
+            if col not in df:
+                continue
+            df.loc[df[col] > 0, col] = 1

-[old lines 122-124 not shown in this view]
-        for column in [c for c in df.columns if c != 'duration' and c != 'event']:
-            temp[column] = self.log_rank(df, column)
+        df = df[core_features + [col for col in features_of_interest if
+                                 df[col].nunique() > 1]]  # and df[col].value_counts(normalize=True).min() >= 0.01]]
+        return df

+    def kaplan_meier_analysis(self, df, control_label='CV', target_label='Epistasis', feature='group', plot=False, time_cap=False):
+        # Can only be performed on features with two unique values
+        cap_time = df.groupby(feature).duration.max().min()
+        # df['duration'] = df['duration'].clip(upper=cap_time)
+        auc_vals = []
+        results = pd.Series()
+        count = 0
+        for val in [0, 1]:
+            g = df[df[feature] == val]
+            kmf = KaplanMeierFitter()
+            label = f"{control_label} ({len(g)} cases)" if val == 0 else f"{target_label} ({len(g)} cases)"
+            if val == 0:
+                results[control_label] = len(g)
             else:
-[old lines 129-137 not shown in this view]
+                results[target_label] = len(g)
+
+            kmf.fit(g['duration'], g['event'], label=label)
+            surv_func = kmf.survival_function_
+            auc = trapz(surv_func[label], surv_func.index)
+            auc_vals.append(auc)
+            if plot:
+                if count == 0:
+                    ax = kmf.plot()
+                else:
+                    kmf.plot(ax=ax)
+            count += 1
+        p_value = self.log_rank(df[df[feature] == 1], df[df[feature] == 0])
+
+        if plot:
+            ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12,
+                    horizontalalignment='center')
+            plt.title('Kaplan-Meier Survival Curves')
+            plt.xlabel('Time')
+            plt.ylabel('Survival Probability')
+            if time_cap:
+                plt.xlim([0, cap_time])
+            plt.show()
+
+        results['p_value'] = p_value
+        results['auc_target'] = auc_vals[-1]
+        if len(auc_vals) > 1:
+            results['auc_delta'] = auc_vals[-1] - auc_vals[0]
+            results['auc_control'] = auc_vals[0]
+
+        return results
+
+    def log_rank(self, group1, group2):
+        return logrank_test(group1['duration'], group2['duration'],
+                            event_observed_A=group1['event'],
+                            event_observed_B=group2['event']).p_value
+
+    def perform_cox_analysis(self, df, features_of_interest):
+        # Very simple... will return a series with p values for each feature
+        try:
+            return CoxPHFitter().fit(df[features_of_interest + ['duration', 'event']], 'duration', 'event').summary.p
         except ConvergenceError:
-[old lines 139-141 not shown in this view]
+            print("Convergence Error")
+            return pd.Series()
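
In effect, SurvivalAnalysis now builds its clinical table internally (via prepare_clinical_data) and splits the workflow into separate cohort-construction, Kaplan-Meier, and Cox steps. A hedged sketch of how the new API composes; the case-ID lists are placeholders, the default clinical pickle only exists on the original cluster, and 'group' must be listed in features_of_interest because generate_clinical_dataframe keeps only duration, event, and the requested columns:

    from geney.survival import SurvivalAnalysis

    # Placeholder case IDs; real values would come from e.g. TCGAGene.affected_cases().
    target_cases = ['case-uuid-a', 'case-uuid-b']     # e.g. carriers of a candidate epistatic pair
    control_cases = ['case-uuid-c', 'case-uuid-d']    # e.g. carriers of the individual variants

    sa = SurvivalAnalysis()                           # loads the default clinical pickle when no dataframe is given
    df = sa.generate_clinical_dataframe(target_cases,
                                        control_cases=control_cases,
                                        features_of_interest=['group', 'chemotherapy'])

    # Log-rank p-value, per-group counts, and survival-curve AUCs, returned as a pandas Series.
    km_results = sa.kaplan_meier_analysis(df, control_label='CV', target_label='Epistasis', plot=False)
    print(km_results[['p_value', 'auc_target', 'auc_control', 'auc_delta']])

    # Cox proportional-hazards p-values for the listed covariates (empty Series on ConvergenceError).
    cox_p = sa.perform_cox_analysis(df, ['group', 'chemotherapy'])
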
geney/tcga_utils.py
ADDED

@@ -0,0 +1,366 @@
+
+import pandas as pd
+import random
+from pathlib import Path
+class TCGACase:
+    def __init__(self, df):
+        # Here we get a dataframe of mutations within a gene
+        self.df = df
+        self.calculate_vaf()
+        self.space_variants(spacer_size=50)
+        self.case_id = df.case_id.tolist()[0]
+
+    def space_variants(self, spacer_size=100, group_likelihood_threshold=0):
+        df = self.df
+        if df.empty:
+            df['group'] = 0
+            return self
+        values = sorted(df.Start_Position.unique().tolist())
+        # groups = [list(group) for key, group in groupby(values, key=lambda x: (x - values[values.index(x) - 1] >
+        #                                                spacer_size) if values.index(x) > 0 else False)] Initialize variables
+        groups = []
+        current_group = []
+
+        # Iterate through the values
+        for i in range(len(values)):
+            if i == 0:
+                current_group.append(values[i])
+            else:
+                if values[i] - values[i - 1] <= spacer_size:
+                    current_group.append(values[i])
+                else:
+                    groups.append(current_group)
+                    current_group = [values[i]]
+
+        # Append the last group if it's not empty
+        if current_group:
+            groups.append(current_group)
+
+        df.loc[:, 'group'] = 0
+        for i, g in enumerate(groups):
+            df.loc[df.Start_Position.isin(g), 'group'] = i
+        self.df = df
+        return self
+
+    def calculate_vaf(self):
+        df = self.df
+        df = df[df.t_depth > 0]
+        df.loc[:, 'vaf'] = df.apply(lambda row: row.t_alt_count / row.t_depth, axis=1)
+        self.df = df
+        return self
+
+    def find_overlayed_variants(self):
+        df = self.df
+        mut_counts = df.mut_id.value_counts()
+        mut_counts = mut_counts[mut_counts > 1].index
+
+        small_df = df.groupby('mut_id', as_index=False).agg({
+            't_depth': 'sum',
+            't_alt_count': 'sum',
+            't_ref_count': 'sum',
+        })
+
+        df = df.drop_duplicates(subset='mut_id', keep='first')
+
+        small_df = small_df[small_df.t_depth > 0]
+        small_df['vaf'] = small_df.t_alt_count / small_df.t_depth
+
+        small_df = small_df.set_index('mut_id')
+        df.set_index('mut_id', inplace=True)
+        df.update(small_df)
+        df.reset_index(inplace=True)
+        self.df = df
+        return self
+
+    def find_epistasis(self, pth=3, rth=0):
+        df = self.df
+        if df.empty:
+            return None
+        # df = df[df.t_alt_count > rth].sort_values('Start_Position', ascending=True)
+        df = df[(df.t_alt_count > df.t_ref_count / pth) & (df.t_alt_count >= rth)].sort_values('Start_Position',
+                                                                                               ascending=True)
+
+        # display(df[['mut_id', 't_alt_count', 't_ref_count']])
+
+        # Group by the group_key
+        grouped = df.groupby('group').agg({
+            'mut_id': lambda x: '|'.join(x),
+            't_alt_count': 'mean',
+            't_ref_count': 'mean',
+            'case_id': 'first'
+        }).reset_index(drop=True)
+
+        # Drop the group_key column
+        return grouped[grouped.mut_id.str.contains('\|')][['mut_id', 't_alt_count', 't_ref_count', 'case_id']]
+
+
+class TCGAGene:
+    def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
+                 valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
+        df = pd.read_csv(cancer_path / gene / 'GeneMutTble.txt',
+                         usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
+                                  'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
+                                  'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                  'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type'] + extra_cols,
+                         low_memory=False).sort_values('Start_Position', ascending=True)
+
+        if df.empty:
+            self.df = df
+
+        else:
+            df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
+
+            if include_filter is not None:
+                df = df[df.FILTER == include_filter]
+
+            elif exclude_filters is not None:
+                for exclude_filter in exclude_filters:
+                    df = df[~df.FILTER.str.contains(exclude_filter)]
+
+            if valid_cases is not None:
+                df = df[df.case_id.isin(valid_cases)]
+
+            df['mut_id'] = df.apply(lambda
+                row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                axis=1)
+
+            df['ratio'] = df.t_alt_count + df.t_ref_count
+            df = df[df.ratio > 0]
+            df['ratio'] = df.t_alt_count / df.ratio
+            self.df = df
+
+    def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
+        if mut_id is None:
+            return self.df.case_id.unique().tolist()
+        df = self.df
+        df = df[(df.mut_id == mut_id) & (df.ratio >= read_ratio)]
+        for filter in filters:
+            df = df[~df.FILTER.str.contains(filter)]
+        return df.case_id.unique().tolist()
+
+    def get_patient_muts(self, case_id=None):
+        if case_id is None:
+            case_id = random.choice(self.affected_cases())
+        return self.df[self.df.case_id == case_id]
+
+
+class TCGAMut:
+    def __init__(self, mut_id):
+        self.num_muts = mut_id.count('|') + 1
+        data = []
+        for mut in mut_id.split('|'):
+            data.append(mut.split(':'))
+        data = pd.DataFrame(data, columns=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                           'Tumor_Seq_Allele2'])
+        data.Chromosome = data.apply(lambda row: f'chr{row.Chromosome}', axis=1)
+        data = data.astype({'Start_Position': int})
+        self.gene = data.Gene_name.unique().tolist()[0]
+        self.df = data
+
+    def find_affected_patients(self, read_ratio=0, exclude_filters=None):
+        gene = TCGAGene(self.gene, exclude_filters=exclude_filters).df
+        gene = gene[gene.ratio >= read_ratio]
+        return pd.merge(self.df, gene,
+                        on=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2'])
+
+    def find_affected_patients_list(self, read_ratio=0, exclude_filters=None):
+        df = self.find_affected_patients(read_ratio=read_ratio, exclude_filters=exclude_filters)
+        case_count = df.case_id.value_counts()
+        case_count = case_count[case_count == self.num_muts]
+        return case_count.index.tolist()
+
+
+
+# CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
+# CLINICAL_DATA_FILE = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
+# CANCER_DATA_PATH = Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes')
+# MAF_FILE_NAME = 'GeneMutTble.txt'
+# CASE_TRACKER = pd.read_csv('/tamir2/nicolaslynn/projects/TCGAParsed/case2proj.csv', index_col=0)
+# PROJ_COUNTS = CASE_TRACKER.proj.value_counts()
+# OKGP_DATA_FILE = Path('/tamir2/nicolaslynn/projects/1000GenomesProjMutations/parsed_1kgp_mutations_in_target_genes.csv')
+# MUTATION_FREQ_DF = pd.read_csv(OKGP_DATA_FILE, index_col=0)
+# PROTEIN_ANNOTATIONS = pd.read_csv('/tamir2/nicolaslynn/data/BioMart/protein_annotations.csv').rename(columns={'Interpro start': 'start', 'Interpro end': 'end', 'Interpro Short Description': 'name'})[['Gene stable ID', 'Transcript stable ID', 'start', 'end', 'name']]
+# PROTEIN_ANNOTATIONS['length'] = PROTEIN_ANNOTATIONS.apply(lambda row: abs(row.start - row.end), axis=1)
+
+# def prepare_gene_sets():
+#     # gene_annotations_file = Path('/tamir2/nicolaslynn/data/COSMIC/cancer_gene_roles.csv')
+#     # GENE_DF = pd.read_csv(gene_annotations_file, index_col=0)
+#     # all_oncogenes = GENE_DF[GENE_DF.OG==True].index.tolist()
+#     # all_oncogenes = list(set(all_oncogenes))
+#     return [], [], []
+#
+# CLIN_DF = prepare_clinical_data()
+# TSGS, ONCOGENES, CANCER_GENES = prepare_gene_sets()
+#
+#
+# def generate_survival_quantitative(affected_df, nonaffected_df):
+#     if affected_df.empty or nonaffected_df.empty:
+#         return np.nan, np.nan, np.nan
+#     results = logrank_test(affected_df['duration'], nonaffected_df['duration'],
+#                            event_observed_A=affected_df['event'],
+#                            event_observed_B=nonaffected_df['event'])
+#     p_value = results.p_value
+#     kmf = KaplanMeierFitter()
+#     kmf.fit(affected_df['duration'], affected_df['event'], label=f'With Epistasis ({len(affected_df)})')
+#     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+#     auc1 = np.trapz(surv_probs, times)
+#     kmf.fit(nonaffected_df['duration'], nonaffected_df['event'], label=f'Without Epistasis ({len(nonaffected_df)})')
+#     times, surv_probs = kmf.survival_function_.index.values, kmf.survival_function_.values.flatten()
+#     auc2 = np.trapz(surv_probs, times)
+#     return p_value, auc1, auc2
+#
+# def generate_survival_pvalue(affected_df, unaffected_df):
+#     results = logrank_test(affected_df['duration'], unaffected_df['duration'],
+#                            event_observed_A=affected_df['event'],
+#                            event_observed_B=unaffected_df['event'])
+#
+#     p_value = results.p_value
+#     kmf = KaplanMeierFitter()
+#     # Fit data
+#     kmf.fit(affected_df['duration'], affected_df['event'], label=f'Without Epistasis ({len(affected_df)})')
+#     ax = kmf.plot()
+#
+#     kmf.fit(unaffected_df['duration'], unaffected_df['event'], label=f'With Epistasis ({len(unaffected_df)})')
+#     kmf.plot(ax=ax)
+#     plt.text(5, 0.95, f'pval: {p_value:.3e}')
+#     plt.show()
+#     return p_value
+#
+# def get_project_prevalence(cases_affected):
+#     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+#     prevalences = CASE_TRACKER.loc[ca].proj.value_counts() / PROJ_COUNTS
+#     prevalences.fillna(0, inplace=True)
+#     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+#     prevalences.index = [s.replace('TCGA', 'prev') for s in prevalences.index]
+#     return prevalences
+#
+# def get_project_counts(cases_affected):
+#     ca = [c for c in cases_affected if c in CASE_TRACKER.index]
+#     prevalences = CASE_TRACKER.loc[ca].proj.value_counts()
+#     prevalences = prevalences[[i for i in prevalences.index if 'TCGA' in i]]
+#     prevalences.index = [s.replace('TCGA_', '') for s in prevalences.index]
+#     return prevalences
+#
+# def get_event_consequence(df):
+#     assert df.Transcript_ID.nunique() == 1, 'Too many transcripts to return a single consequenc.'
+#     return df.iloc[0].Consequence
+#
+# def get_dbSNP_id(df):
+#     return df.iloc[0].dbSNP_RS
+#
+# def load_variant_file(gene):
+#     df = pd.read_csv(CANCER_DATA_PATH / gene / MAF_FILE_NAME, low_memory=False)
+#     df['mut_id'] = df.apply(lambda row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}", axis=1)
+#     return df
+#
+# def find_event_data(event):
+#     df = load_variant_file(event.gene)
+#     if df.empty:
+#         return None
+#
+#     df = df.query \
+#         ('Chromosome == @event.chromosome & Start_Position == @event.start & Reference_Allele == @event.ref & Tumor_Seq_Allele2 == @event.alt')
+#
+#     if df.empty:
+#         return None
+#
+#     if event.transcript_id is not None:
+#         df = df[df.Transcript_ID == event.transcript_id]
+#     df['mut_id'] = event.event_id
+#     return df
+#
+#
+# class GEvent:
+#     def __init__(self, event_id, transcript_id=None):
+#         self.gene, self.chromosome, self.start, self.ref, self.alt = event_id.split(':')
+#         self.transcript_id = transcript_id
+#         self.chromosome = f'chr{self.chromosome}'
+#         self.start = int(self.start)
+#         self.event_id = event_id
+#
+#
+#
+# def get_okgp_mutation_frequency(mut_id):
+#     if mut_id in MUTATION_FREQ_DF.index:
+#         return MUTATION_FREQ_DF.loc[mut_id].cases_affected
+#     else:
+#         return 0
+#
+# def get_df_filter_info(df):
+#     filter_artifact_values: list = ["oxog", "bPcr", "bSeq"]
+#     MuTect2_filters: list = ['Germline risk', 't_lod_fstar', 'alt_allele_in_normal', 'panel_of_normals', 'clustered_events',
+#                              'str_contraction', 'multi_event_alt_allele_in_normal', 'homologous_mapping_event', 'triallelic_site']
+#     filter_col_name: str = "FILTER_info"  # column name to add to the dataframe
+#     filter_info_list: list = []
+#     f_cnr_info = {}
+#
+#     for j, (prj, df_prj) in enumerate(df.groupby('Proj_name')):
+#         filter_vals = list(df_prj['FILTER'])
+#         num_pass, num_artifacts, num_mutect2_filters = 0, 0, 0
+#         for filter_val in filter_vals:
+#             num_pass += ('PASS' in filter_val)
+#             num_artifacts += any([x in filter_val for x in filter_artifact_values])
+#             num_mutect2_filters += any([x in filter_val for x in MuTect2_filters])
+#         num_rest = max(0, (len(filter_vals) - num_pass - num_artifacts - num_mutect2_filters))
+#         f_cnr_info[str(prj)[5:]] = (num_pass, num_mutect2_filters, num_artifacts, num_rest)
+#     return f_cnr_info
+#
+# def yoram_mutid(row):
+#     return f'{row.Gene_name}:{row.Chromosome}:{row.Consequence}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}'
+#
+#
+# def annotate_level_two(mut_id, tid):
+#     mut = GEvent(mut_id, tid)
+#     df = find_event_data(mut)
+#
+#     if df.empty or df is None:
+#         return None
+#
+#     patients_affected = df.cases_affected.unique().tolist()
+#     p_val, auc_a, auc_n = generate_survival_quantitative(CLIN_DF[CLIN_DF.case_id.isin(patients_affected)], CLIN_DF[~CLIN_DF.case_id.isin(patients_affected)])
+#     project_prevalences = get_project_prevalence(patients_affected)
+#     prev_dict = project_prevalences.to_dict().sort()
+#     project_counts = get_project_counts(patients_affected)
+#
+#     s = pd.Series({
+#         'mut_id': mut_id,
+#         'yoram_mut_id': yoram_mutid(df.iloc[0]),
+#         'transcript_id': tid,
+#         'affected_cases': len(patients_affected),
+#         'dbSNP_id': get_dbSNP_id(df),
+#         'consequence': get_event_consequence(df),
+#         'survival_p_value': p_val,
+#         'auc_affected': auc_a,
+#         'auc_nonaffected': auc_n,
+#         'TSG': contains(TSGS, mut.gene),
+#         'oncogene': contains(ONCOGENES, mut.gene),
+#         'cases_1kgp': get_okgp_mutation_frequency(mut.event_id),
+#         'filter_inf': get_df_filter_info(df),
+#         'strand': df.Strand.unique().tolist()[0],
+#         'prevalences': prev_dict
+#     })
+#
+#     s['max_prev'] = project_prevalences.max()
+#     s['rel_proj'] = ','.join([c.split('_')[-1] for c in project_prevalences[project_prevalences == project_prevalences.max()].index.tolist()])
+#     s = pd.concat([s, project_prevalences, project_counts])
+#     del df
+#     return s
+#
+# def get_mut_counts():
+#     cases = unload_json('/tamir2/nicolaslynn/projects/TCGAParsed/recurring_single_muts_tcga.json')
+#     cases = pd.Series(cases)
+#     cases.name = 'num_cases'
+#     cases.index.name = 'mut_id'
+#     cases = cases.to_frame()
+#     cases.reset_index(inplace=True)
+#     return cases
+#
+#
+# def create_mut_id(row):
+#     return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
+#
+#
+# def is_in_exon(mut_id, tid):
+#     from geney.Gene import Gene
+#     transcript = Gene(mut_id.split(':')[0]).generate_transcript(tid)
+#     return int(mut_id.split(':')[2]) in transcript.exonic_indices
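
The new module is organised around three helpers: TCGAGene loads and filters one gene's MAF table, TCGACase works on a single patient's mutations in that gene (VAF, grouping of nearby variants, candidate co-occurring variants), and TCGAMut maps a composite mut_id back to its carriers. A rough sketch of how they compose, with a placeholder gene symbol and variant IDs, and assuming the hard-coded /tamir2 data paths are reachable:

    from geney.tcga_utils import TCGAGene, TCGACase, TCGAMut

    gene = TCGAGene('TP53', exclude_filters=['oxog'])     # placeholder gene symbol
    cases = gene.affected_cases()                         # case_ids with at least one retained variant

    case = TCGACase(gene.get_patient_muts(cases[0]))      # computes per-variant VAF, groups variants within 50 bp
    candidate_pairs = case.find_epistasis(pth=3, rth=0)   # '|'-joined mut_ids for co-occurring, read-supported variants

    # A composite mut_id can then be mapped back to the cases carrying every variant in it.
    mut = TCGAMut('TP53:17:7675088:C:T|TP53:17:7675139:G:A')   # placeholder variant IDs
    carriers = mut.find_affected_patients_list(read_ratio=0.2)
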
{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.1.1
+Version: 1.1.3
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn
@@ -27,6 +27,7 @@ Requires-Dist: joblib ==1.3.2
 Requires-Dist: gtfparse ==1.3.0
 Requires-Dist: sh ==2.0.6
 Requires-Dist: termplotlib ==0.3.9
+Requires-Dist: lifelines
 Requires-Dist: notebook
 Requires-Dist: matplotlib
 Requires-Dist: dask[complete]
{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/RECORD
CHANGED

@@ -9,9 +9,10 @@ geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/netchop.py,sha256=mgKe9Yv2m1SlZUmIXBVNtH-rP5PtBn9SlEi9lE1L0SE,2821
 geney/oncosplice.py,sha256=Fyc_UtAhV3Pv0vk8V55rO_jnb2Dwj5sW98KVwP3PHwU,68964
 geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
-geney/power_utils.py,sha256=
-geney/survival.py,sha256=
+geney/power_utils.py,sha256=WRpqMnqUv1xrAeTduAUhx6YpSEJQci7bC2od12JcVtE,7267
+geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
 geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
+geney/tcga_utils.py,sha256=cX9hbDX-qECyCMSYaBL8r1FWWuju08jQvlPT3q13B3Y,15777
 geney/utils.py,sha256=YOe22gA0Oew9_QEym7ivM9sb7t3wNeHTeiSDBmvOPso,1984
 geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
@@ -39,7 +40,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-geney-1.1.
-geney-1.1.
-geney-1.1.
-geney-1.1.
+geney-1.1.3.dist-info/METADATA,sha256=ec8t6aiZh-SlD6yyhfar7GBs7ljgXw66-TBM7lPXZCo,1130
+geney-1.1.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+geney-1.1.3.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.1.3.dist-info/RECORD,,
{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/WHEEL
File without changes

{geney-1.1.1.dist-info → geney-1.1.3.dist-info}/top_level.txt
File without changes