PyPI - geney - Versions diffs - 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl - Mend

geney 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (39)

geney/oncosplice.py +1 -1
{geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
geney-1.2.22.dist-info/RECORD +19 -0
geney/Gene.py +0 -258
geney/analyzers/__init__.py +0 -0
geney/analyzers/benchmark_clinvar.py +0 -158
geney/analyzers/characterize_epistasis.py +0 -15
geney/analyzers/compare_sets.py +0 -91
geney/analyzers/group_comparison.py +0 -81
geney/analyzers/survival.py +0 -144
geney/analyzers/tcga_annotations.py +0 -194
geney/analyzers/visualize_protein_conservation.py +0 -398
geney/benchmark_clinvar.py +0 -158
geney/compare_sets.py +0 -91
geney/data_parsers/__init__.py +0 -0
geney/data_parsers/gtex.py +0 -68
geney/gtex.py +0 -68
geney/immunotherapy/__init__.py +0 -0
geney/immunotherapy/netchop.py +0 -78
geney/mutations/__init__.py +0 -0
geney/mutations/variant_utils.py +0 -125
geney/netchop.py +0 -79
geney/oncosplice/__init__.py +0 -0
geney/oncosplice_mouse.py +0 -277
geney/oncosplice_pipeline.py +0 -1588
geney/performance_utils.py +0 -138
geney/pipelines/__init__.py +0 -0
geney/pipelines/dask_utils.py +0 -153
geney/splicing/__init__.py +0 -2
geney/splicing/spliceai_utils.py +0 -253
geney/splicing/splicing_isoform_utils.py +0 -0
geney/splicing/splicing_utils.py +0 -366
geney/survival.py +0 -124
geney/tcga_annotations.py +0 -352
geney/translation_termination/__init__.py +0 -0
geney/translation_termination/tts_utils.py +0 -0
geney-1.2.20.dist-info/RECORD +0 -52
{geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
{geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0

geney/oncosplice.py CHANGED Viewed

@@ -1047,7 +1047,7 @@ class PredictSpliceAI:
             # self.missplicing = run_spliceai_transcript(self.modification, transcript_data=gene_data, sai_mrg_context=sai_mrg_context, min_coverage=min_coverage, sai_threshold=0.1)
             # print(f"RUNNING: {mutation.mut_id}")
-            ref_transcript, var_transcript = Gene(mutation.mut_id.split(':')[0], organism=organism).transcript(gene_data.transcript_id), Gene(mutation.mut_id.split(':')[0], mutation.mut_id, organism='mm39').transcript(gene_data.transcript_id)
+            ref_transcript, var_transcript = Gene(mutation.mut_id.split(':')[0], organism=organism).transcript(gene_data.transcript_id), Gene(mutation.mut_id.split(':')[0], mutation.mut_id, organism=organism).transcript(gene_data.transcript_id)
             # print(f"Second check : {ref_transcript.pre_mrna == var_transcript.pre_mrna}")
             self.missplicing = find_transcript_missplicing(self.modification, ref_transcript, var_transcript, context=sai_mrg_context+min_coverage, threshold=threshold,
                                 engine=engine)

{geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.2.20
+Version: 1.2.22
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn

geney-1.2.22.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
+geney/__init__.py,sha256=knezxgbV2c2gcO2ek2-xxEC15HL4aO1WuoMiYOOvKf8,428
+geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
+geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
+geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
+geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
+geney/oncosplice.py,sha256=AZm8Vj7z65DokPmeflwoqs2BM11neV9hQLA_Ao4ysnM,78242
+geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
+geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
+geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
+geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
+geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
+geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
+geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
+geney-1.2.22.dist-info/METADATA,sha256=eTTiyuGPZ5lD7jV8YZXSocPyewD3OPwvgeaqiXxuVfo,1163
+geney-1.2.22.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
+geney-1.2.22.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.2.22.dist-info/RECORD,,

geney/Gene.py DELETED Viewed

@@ -1,258 +0,0 @@
-from copy import copy
-from Bio.Seq import Seq
-from geney.mutations.variant_utils import generate_mut_variant, Mutation, find_new_tts
-from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle
-from geney.Fasta_segment import Fasta_segment
-from geney import config_setup
-from geney.translation_initiation.tis_utils import TISFInder
-class Gene:
-    def __init__(self, gene_name, variation):
-        self.gene_name = gene_name
-        self.gene_id = ''
-        self.rev = None
-        self.chrm = ''
-        self.gene_start = 0
-        self.gene_end = 0
-        self.transcripts = {}
-        self.load_from_file(find_files_by_gene_name(gene_name))
-        self.variation = variation
-    def __repr__(self):
-        return f'Gene(gene_name={self.gene_name})'
-    def __len__(self):
-        return len(self.transcripts)
-    def __str__(self):
-        return '{gname}, {ntranscripts} transcripts'.format(gname=self.gene_name, ntranscripts=self.__len__())
-    def __copy__(self):
-        cls = self.__class__
-        result = cls.__new__(cls)
-        result.__dict__.update(self.__dict__)
-        return result
-    def __getitem__(self, index):
-        return Transcript(list(self.transcripts.values())[index])
-    def load_from_file(self, file_name):
-        if not file_name.exists():
-            raise FileNotFoundError(f"File '{file_name}' not found.")
-        self.load_from_dict(dict_data=unload_pickle(file_name))
-        return self
-    def load_from_dict(self, dict_data=None):
-        for k, v in dict_data.items():
-            setattr(self, k, v)
-        return self
-    # def generate_transcript(self, tid=None):
-    #     if tid == None:
-    #         tid = [k for k, v in self.transcripts.items() if v['primary_transcript']][0]
-    #     return Transcript(self.transcripts[tid])
-    def transcript(self, tid):
-        if tid not in self.transcripts:
-            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
-        return Transcript(self.transcripts[tid]) #self.generate_transcript(tid)
-class Transcript:
-    def __init__(self, d=None):
-        self.transcript_id = None
-        self.transcript_start = None                            # transcription
-        self.transcript_end = None                              # transcription
-        self.transcript_biotype = None                          # metadata
-        self.acceptors, self.donors = [], []                    # splicing
-        self.TIS, self.TTS = None, None                         # translation
-        self.transcript_seq, self.transcript_indices = '', []   # sequence data
-        self.rev = None                                         # sequence data
-        self.chrm = ''                                          # sequence data
-        self.pre_mrna = ''                                      # sequence data
-        self.orf = ''                                           # sequence data
-        self.protein = ''                                       # sequence data
-        self.log = ''                                           # sequence data
-        self.primary_transcript=None                            # sequence data
-        self.cons_available=False                               # metadata
-        self.cons_seq = ''
-        self.cons_vector = ''
-        if d:
-            self.load_from_dict(d)
-        if self.cons_available:
-            if '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector):
-                self.cons_seq = self.cons_seq.replace('*', '')
-                self.cons_vector = self.cons_vector[:-1]
-            elif '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector) + 1:
-                self.cons_seq = self.cons_seq.replace('*', '')
-            else:
-                self.cons_available = False
-    def __repr__(self):
-        return 'Transcript(transcript_id={tid})'.format(tid=self.transcript_id)
-    def __len__(self):
-        return len(self.transcript_seq)
-    def __str__(self):
-        return 'Transcript {tid}, Transcript Type: ' \
-               '{protein_coding}'.format(
-                tid=self.transcript_id, protein_coding=self.transcript_biotype)
-    def __eq__(self, other):
-        return self.transcript_seq == other.transcript_seq
-    def __contains__(self, subvalue):
-        if isinstance(subvalue, str):
-            return subvalue in self.transcript_seq
-        elif isinstance(subvalue, int):
-            return subvalue in self.transcript_indices
-        else:
-            print(
-                "Pass an integer to check against the span of the gene's coordinates or a string to check against the "
-                "pre-mRNA sequence.")
-            return False
-    def __copy__(self, other):
-        return copy(self)
-    @property
-    def constructor(self):
-        core_attributes = ['transcript_id', 'transcript_start', 'transcript_end', 'transcript_biotype', 'acceptors', 'donors', 'TIS', 'TTS', 'rev', 'chrm']
-        return {k: v for k, v in self.__dict__.items() if k in core_attributes}
-    def load_from_dict(self, data):
-        for k, v in data.items():
-            setattr(self, k, v)
-        self.__arrange_boundaries()
-        self.generate_mature_mrna(inplace=True)
-        return self
-    @property
-    def exons(self):
-        return list(zip(self.acceptors, self.donors))
-    @property
-    def introns(self):
-        return list(zip([v for v in self.donors if v != self.transcript_end], [v for v in self.acceptors if v != self.transcript_start]))
-    def set_exons(self, boundaries):
-        self.acceptors, self.donors = boundaries['acceptors'], boundaries['donors']
-        self.__arrange_boundaries()
-        return self
-    @property
-    def introns(self):
-        return list(zip([v for v in self.donors if v != self.transcript_end], [v for v in self.acceptors if v != self.transcript_start]))
-    def __exon_coverage_check(self):
-        if sum([abs(a-b) + 1 for a, b in self.exons]) == len(self):
-            return True
-        else:
-            return False
-    @property
-    def exons_pos(self):
-        temp = self.exons
-        if self.rev:
-            temp = [(b, a) for a, b in temp[::-1]]
-        return temp
-    @property
-    def mrna_indices(self):
-        temp = [lst for lsts in [list(range(a, b+1)) for a, b in self.exons_pos] for lst in lsts]
-        return sorted(temp, reverse=self.rev)
-    @property
-    def exonic_indices(self):
-        return [lst for lsts in [list(range(a, b+1)) for a, b in self.exons_pos] for lst in lsts]
-    def __arrange_boundaries(self):
-        self.acceptors.append(self.transcript_start)
-        self.donors.append(self.transcript_end)
-        self.acceptors = list(set(self.acceptors))
-        self.donors = list(set(self.donors))
-        self.acceptors.sort(reverse=self.rev)
-        self.donors.sort(reverse=self.rev)
-        return self
-    def positive_strand(self):
-        if self.rev:
-            return reverse_complement(self.transcript_seq)
-        else:
-            return self.transcript_seq
-    def __pos2sense(self, mrna, indices):
-        if self.rev:
-            mrna = reverse_complement(mrna)
-            indices = indices[::-1]
-        return mrna, indices
-    def pull_pre_mrna_pos(self):
-        fasta_obj = Fasta_segment()
-        if self.rev:
-            return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta', self.transcript_end,
-                                                                   self.transcript_start)
-        else:
-            return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta', self.transcript_start,
-                                                                   self.transcript_end)
-    def generate_pre_mrna_pos(self, mutations=[]):
-        seq, indices = self.pull_pre_mrna_pos()
-        for mutation in mutations:
-            mutation = Mutation(mutation)
-            seq, indices, _, _ = generate_mut_variant(seq, indices, mut=mutation)
-        self.pre_mrna, _ = self.__pos2sense(seq, indices)
-        return seq, indices
-    def generate_pre_mrna(self, mutations=[], inplace=True):
-        pre_mrna, pre_indices = self.__pos2sense(*self.generate_pre_mrna_pos(mutations))
-        self.pre_mrna = pre_mrna
-        if inplace:
-            return self
-        return pre_mrna
-    def generate_mature_mrna_pos(self, mutations=[]):
-        mature_mrna, mature_indices = '', []
-        pre_seq, pre_indices = self.generate_pre_mrna_pos(mutations)
-        for i, j in self.exons_pos:
-            rel_start, rel_end = pre_indices.index(i), pre_indices.index(j)
-            mature_mrna += pre_seq[rel_start:rel_end + 1]
-            mature_indices.extend(pre_indices[rel_start:rel_end + 1])
-        return mature_mrna, mature_indices
-    def generate_mature_mrna(self, mutations=[], inplace=True):
-        if inplace:
-            self.transcript_seq, self.transcript_indices = self.__pos2sense(*self.generate_mature_mrna_pos(mutations))
-            return self
-        return self.__pos2sense(*self.generate_mature_mrna_pos(mutations))
-    def generate_protein(self, inplace=True):
-        rel_start = self.transcript_indices.index(self.TIS)
-        rel_end = self.transcript_indices.index(self.TTS)
-        orf = self.transcript_seq[rel_start:rel_end + 1 + 3]
-        protein = str(Seq(orf).translate()).replace('*', '')
-        if inplace:
-            self.orf = orf
-            self.protein = protein
-            if self.protein != self.cons_seq:
-                self.cons_available = False
-            return self
-        return protein
-    def generate_translational_boundaries(self):
-        if self.TIS not in self.transcript_indices or self.transcript_seq[self.transcript_indices.index(self.TIS):self.transcript_indices.index(self.TIS)+3] != 'ATG':
-            new_tis = TISFInder(self.transcript_seq, self.transcript_indices)
-            self.log += f' TIS for transcript reacquired: {self.TIS} --> {new_tis}.'
-            self.TIS = new_tis
-        self.TTS = find_new_tts(self.transcript_seq, self.transcript_indices, self.TIS)
-        return self

geney/analyzers/__init__.py DELETED Viewed

File without changes

geney/analyzers/benchmark_clinvar.py DELETED Viewed

@@ -1,158 +0,0 @@
-import pandas as pd
-from sklearn.metrics import roc_curve, precision_recall_curve
-import matplotlib.pyplot as plt
-from datetime import datetime
-from pathlib import Path
-import subprocess
-from geney import config_setup
-from geney.utils import download_and_gunzip
-from geney.oncosplice import oncosplice_reduced
-def download_and_parse_clinvar():
-    url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
-    local_file = download_and_gunzip(url, target_path)
-    return local_file
-def aggregate_clinvar_results(benchmark_path, aggregate_mode=False, benchmark_feature=None, local_clinvar_df='/tamir2/nicolaslynn/data/ClinVar/clinvar_compact.csv'):
-    data = pd.concat([pd.read_csv(file) for file in Path(benchmark_path).glob('*.csv')])
-    if not aggregate_mode:
-        data = data[(data.cons_available) & (data.primary_transcript)]
-    data = oncosplice_reduced(data)
-    data = data.loc[:, ~data.columns.duplicated()]
-    data = pd.merge(data, pd.read_csv(local_clinvar_df), on='mut_id')
-    data['clinsig_val'] = data.apply(lambda row: {'Benign': 0, 'Pathogenic': 1}[row.clinsig], axis=1)
-    for c in data.columns:
-        try:
-            if data[c].min() < 0:
-                data[f'{c}_abs'] = abs(data[c])
-        except TypeError:
-            pass
-    print(data.corr(numeric_only=True))
-    print(data.corrwith(data['clinsig_val'], method='spearman'))
-    print(data.corrwith(data['clinsig_val'], method='pearson'))
-    return data
-def plot_performance(true_values, predictions):
-    clinsig_map = {'Benign': 0, 'Pathogenic': 1}
-    true_values = [clinsig_map[t] for t in true_values]
-    predictions = scale_predictions(predictions)
-    fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
-    # Calculate Precision-Recall curve
-    precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
-    # Plotting ROC curve
-    plt.figure(figsize=(20, 5))
-    plt.subplot(1, 4, 1)
-    plt.plot(fpr, tpr)
-    plt.title('ROC Curve')
-    plt.xlabel('False Positive Rate')
-    plt.ylabel('True Positive Rate')
-    # Plotting Precision-Recall curve
-    plt.subplot(1, 4, 2)
-    plt.plot(recall, precision)
-    plt.title('Precision-Recall Curve')
-    plt.xlabel('Recall')
-    plt.ylabel('Precision')
-    # Plotting Precision vs. Thresholds
-    plt.subplot(1, 4, 3)
-    plt.plot(thresholds_pr, precision[:-1])  # Precision and thresholds have off-by-one lengths
-    plt.title('Precision vs. Threshold')
-    plt.xlabel('Threshold')
-    plt.ylabel('Precision')
-    # Plotting Sample Percentage Captured vs. Thresholds
-    plt.subplot(1, 4, 4)
-    # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
-    plt.plot(thresholds_roc, tpr)  # Update 'tpr' with the correct metric if necessary
-    plt.title('Cumulative Percentage vs. Threshold')
-    plt.xlabel('Threshold')
-    plt.ylabel('Cumulative Percentage of Population')
-    plt.tight_layout()
-    plt.show()
-class ClinVarBenchmark:
-    def __init__(self, df):
-        assert 'clinsig' in df.columns, 'No clinsig column found in dataframe.'
-        self.df = df
-    def scale_predictions(self, p):
-        max_val = max(p)
-        min_val = min(p)
-        return (p - min_val) / (max_val - min_val)
-    def plot_performance(self, true_values, predictions):
-        clinsig_map = {'Benign': 0, 'Pathogenic': 1}
-        predictions = [clinsig_map[t] for t in true_values]
-        predictions = self.scale_predictions(predictions)
-        fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
-        # Calculate Precision-Recall curve
-        precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
-        # Plotting ROC curve
-        plt.figure(figsize=(20, 5))
-        plt.subplot(1, 4, 1)
-        plt.plot(fpr, tpr)
-        plt.title('ROC Curve')
-        plt.xlabel('False Positive Rate')
-        plt.ylabel('True Positive Rate')
-        # Plotting Precision-Recall curve
-        plt.subplot(1, 4, 2)
-        plt.plot(recall, precision)
-        plt.title('Precision-Recall Curve')
-        plt.xlabel('Recall')
-        plt.ylabel('Precision')
-        # Plotting Precision vs. Thresholds
-        plt.subplot(1, 4, 3)
-        plt.plot(thresholds_pr, precision[:-1])  # Precision and thresholds have off-by-one lengths
-        plt.title('Precision vs. Threshold')
-        plt.xlabel('Threshold')
-        plt.ylabel('Precision')
-        # Plotting Sample Percentage Captured vs. Thresholds
-        plt.subplot(1, 4, 4)
-        # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
-        plt.plot(thresholds_roc, tpr)  # Update 'tpr' with the correct metric if necessary
-        plt.title('Cumulative Percentage vs. Threshold')
-        plt.xlabel('Threshold')
-        plt.ylabel('Cumulative Percentage of Population')
-        plt.tight_layout()
-        plt.show()
-        return None
-    def report(self, feature):
-        pass
-    def find_ppv_threshold(self, feature, ppv_threshold=0.95):
-        pass
-if __name__ ==  '__main__':
-    now = datetime.now()
-    benchmark_path = config_setup['ONCOSPLICE'] / f'clinvar_benchmark_{now.strftime("%m_%d_%Y")}'
-    print(f"Saving benchmark results to {benchmark_path}")
-    benchmark_path.mkdir(parents=True, exist_ok=True)
-    subprocess.run(['python', '-m', 'geney.pipelines.dask_utils', '-i',
-                    '/tamir2/nicolaslynn/data/ClinVar/clinvar_oncosplice_input.txt', '-r', str(benchmark_path),
-                    '-n', '10', '-m', '5GB'])

geney/analyzers/characterize_epistasis.py DELETED Viewed

@@ -1,15 +0,0 @@
-from geney.oncosplice import *
-class PairwiseEpistasis:
-    def __init__(self, epistasis):
-        # need some check here making sure format of mtuations isi good
-        self.epistasis = epistasis
-        self.mut_id1, self.mut_id2 = epistasis.split('|')
-    def compare_functional_changes(self):
-        self.results_mut1 = oncosplice(self.mut_id1, sai_threshold=0.5)
-        self.results_mut2 = oncosplice(self.mut_id2, sai_threshold=0.5)
-        self.results_epi = oncosplice(self.epistasis, sai_threshold=0.5)
-        splicing1, splicing2, splicing_epi = 0, 0, 0
-        oncosplice_score1, oncosplice_score2, oncosplice_score_epi = 0, 0, 0

geney/analyzers/compare_sets.py DELETED Viewed

@@ -1,91 +0,0 @@
-import pandas as pd
-import numpy as np
-from sklearn.metrics import precision_score, recall_score, accuracy_score
-from sklearn.metrics import roc_auc_score, roc_curve
-import matplotlib.pyplot as plt
-def plot_auc_curve(y_true, y_pred_proba):
-    """
-    Plots the AUC curve.
-    Args:
-        y_true (array-like): True labels (0 or 1).
-        y_pred_proba (array-like): Predicted probabilities for positive class.
-    Returns:
-        None
-    """
-    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-    auc_value = roc_auc_score(y_true, y_pred_proba)
-    plt.figure(figsize=(8, 6))
-    plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
-    plt.plot([0, 1], [0, 1], 'k--')
-    plt.xlabel("False Positive Rate")
-    plt.ylabel("True Positive Rate")
-    plt.title("Receiver Operating Characteristic (ROC) Curve")
-    plt.legend()
-    plt.show()
-    return auc_value
-def optimal_ppv(dataframe, feature_name, plot=False):
-    """
-    Calculates the optimal positive predictive value (PPV) for a given feature.
-    Args:
-        dataframe (pd.DataFrame): Input dataframe.
-        feature_name (str): Name of the feature column.
-    Returns:
-        float: Optimal PPV.
-    """
-    # Assuming 'target' is the binary target column (0 or 1)
-    threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
-    ppv_values = []
-    for threshold in threshold_values:
-        predictions = (dataframe[feature_name] >= threshold).astype(int)
-        ppv = precision_score(dataframe['target'], predictions)
-        ppv_values.append(ppv)
-    optimal_threshold = threshold_values[np.argmax(ppv_values)]
-    optimal_ppv = max(ppv_values)
-    if plot:
-        plt.figure(figsize=(8, 6))
-        plt.scatter(threshold_values, ppv_values)
-        plt.xlabel("Threshold")
-        plt.ylabel("Positive Predictive Value (PPV)")
-        plt.title("Optimal Positive Predictive Value (PPV)")
-        plt.show()
-    return optimal_ppv, optimal_threshold
-def measure_prediction_quality(prediction_vector, quality_vector):
-    """
-    Measure the quality of the predictions using the quality_vector as the characteristic to check.
-    """
-    pass
-def create_ppv_vector(prediction_vector, true_value_vector):
-    """
-    Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
-    """
-    df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
-    df.sort_values('prediction', ascending=True, inplace=True)
-    df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
-    for bin in df.bin.unique():
-        temp_df = df[df.bin >= bin].
-def group_retention(predictions, predictor):
-    # first i need to get the ratio of values that are retained at particular values
-    predictions.sort_values(predictor, inplace=True)
-    _, thresholds = pd.qcut(predictions[predictor], 100, duplicates='drop')
-    tracker = []
-    for th in thresholds:

geney/analyzers/group_comparison.py DELETED Viewed

@@ -1,81 +0,0 @@
-import pandas as pd
-import numpy as np
-from sklearn.metrics import precision_score, recall_score, accuracy_score
-from sklearn.metrics import roc_auc_score, roc_curve
-import matplotlib.pyplot as plt
-def plot_auc_curve(y_true, y_pred_proba):
-    """
-    Plots the AUC curve.
-    Args:
-        y_true (array-like): True labels (0 or 1).
-        y_pred_proba (array-like): Predicted probabilities for positive class.
-    Returns:
-        None
-    """
-    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-    auc_value = roc_auc_score(y_true, y_pred_proba)
-    plt.figure(figsize=(8, 6))
-    plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
-    plt.plot([0, 1], [0, 1], 'k--')
-    plt.xlabel("False Positive Rate")
-    plt.ylabel("True Positive Rate")
-    plt.title("Receiver Operating Characteristic (ROC) Curve")
-    plt.legend()
-    plt.show()
-    return auc_value
-def optimal_ppv(dataframe, feature_name, plot=False):
-    """
-    Calculates the optimal positive predictive value (PPV) for a given feature.
-    Args:
-        dataframe (pd.DataFrame): Input dataframe.
-        feature_name (str): Name of the feature column.
-    Returns:
-        float: Optimal PPV.
-    """
-    # Assuming 'target' is the binary target column (0 or 1)
-    threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
-    ppv_values = []
-    for threshold in threshold_values:
-        predictions = (dataframe[feature_name] >= threshold).astype(int)
-        ppv = precision_score(dataframe['target'], predictions)
-        ppv_values.append(ppv)
-    optimal_threshold = threshold_values[np.argmax(ppv_values)]
-    optimal_ppv = max(ppv_values)
-    if plot:
-        plt.figure(figsize=(8, 6))
-        plt.scatter(threshold_values, ppv_values)
-        plt.xlabel("Threshold")
-        plt.ylabel("Positive Predictive Value (PPV)")
-        plt.title("Optimal Positive Predictive Value (PPV)")
-        plt.show()
-    return optimal_ppv, optimal_threshold
-def measure_prediction_quality(prediction_vector, quality_vector):
-    """
-    Measure the quality of the predictions using the quality_vector as the characteristic to check.
-    """
-    pass
-def create_ppv_vector(prediction_vector, true_value_vector):
-    """
-    Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
-    """
-    df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
-    df.sort_values('prediction', ascending=True, inplace=True)
-    df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
-    for bin in df.bin.unique():
-        temp_df = df[df.bin >= bin].

geney 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl

Potentially problematic release.

geney 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl