PyPI - python-katlas - Versions diffs - 0.0.1__py3-none-any.whl - Mend

python-katlas 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

katlas/__init__.py +1 -0
katlas/_modidx.py +110 -0
katlas/core.py +769 -0
katlas/dl.py +355 -0
katlas/feature.py +290 -0
katlas/imports.py +7 -0
katlas/plot.py +663 -0
katlas/train.py +231 -0
python_katlas-0.0.1.dist-info/LICENSE +201 -0
python_katlas-0.0.1.dist-info/METADATA +402 -0
python_katlas-0.0.1.dist-info/RECORD +14 -0
python_katlas-0.0.1.dist-info/WHEEL +5 -0
python_katlas-0.0.1.dist-info/entry_points.txt +2 -0
python_katlas-0.0.1.dist-info/top_level.txt +1 -0

katlas/core.py ADDED Viewed

@@ -0,0 +1,769 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
+# %% auto 0
+__all__ = ['param_PSPA_st', 'param_PSPA_y', 'param_PSPA', 'param_CDDM', 'param_CDDM_upper', 'Data', 'CPTAC', 'convert_string',
+           'checker', 'STY2sty', 'cut_seq', 'get_dict', 'multiply_func', 'multiply', 'sumup', 'predict_kinase',
+           'predict_kinase_df', 'get_pct', 'get_pct_df', 'get_unique_site', 'extract_site_seq', 'get_freq',
+           'query_gene', 'get_ttest', 'get_metaP', 'raw2norm', 'get_one_kinase']
+# %% ../nbs/00_core.ipynb 4
+import math, pandas as pd, numpy as np, seaborn as sns
+from tqdm import tqdm
+from scipy.stats import chi2
+from typing import Callable
+from functools import partial
+from joblib import Parallel, delayed
+from scipy.stats import ttest_ind
+from statsmodels.stats.multitest import multipletests
+# %% ../nbs/00_core.ipynb 7
+class Data:
+    "A class for fetching various datasets."
+    @staticmethod
+    def fetch_data(url):
+        "Fetches the data from the given URL and returns a DataFrame"
+        df = pd.read_parquet(url)
+        if 'Unnamed: 0' in df.columns:
+            df = df.rename(columns={'Unnamed: 0': 'kinase'})
+        return df
+    #---------------------------kinase-------------------------------
+    # kinase info
+    KINASE_INFO_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/kinase_info.parquet"
+    @staticmethod
+    def get_kinase_info():
+        return Data.fetch_data(Data.KINASE_INFO_URL)
+    #---------------------------PSPA-------------------------------
+    # PSPA tyrosine normalized data
+    PSPA_TYR_NORM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_tyr_norm.parquet"
+    @staticmethod
+    def get_pspa_tyr_norm():
+        "PSPA tyrosine kinase normalized data"
+        return Data.fetch_data(Data.PSPA_TYR_NORM_URL)
+    # PSPA ST kinase normalized data
+    PSPA_ST_NORM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_st_norm.parquet"
+    @staticmethod
+    def get_pspa_st_norm():
+        "PSPA Ser/Thr kinase normalized data"
+        return Data.fetch_data(Data.PSPA_ST_NORM_URL)
+    # PSPA all kinase normalized data
+    PSPA_ALL_NORM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_all_norm.parquet"
+    @staticmethod
+    def get_pspa_all_norm():
+        "PSPA Ser/Thr and Tyr kinase normalized data"
+        return Data.fetch_data(Data.PSPA_ALL_NORM_URL)
+    # scoring human all capital phosphoproteome via PSPA
+    PSPA_ST_PCT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_pct_st.parquet"
+    # PSPA_TYR_PCT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/pspa_pct_tyr.parquet"
+    @staticmethod
+    def get_pspa_st_pct():
+        return Data.fetch_data(Data.PSPA_ST_PCT_URL)
+    PSPA_TYR_PCT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_pct_tyr.parquet"
+    @staticmethod
+    def get_pspa_tyr_pct():
+        return Data.fetch_data(Data.PSPA_TYR_PCT_URL)
+    # PSPA number of random amino acids
+    PSPA_NUM_RANDOM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_divide_num.csv"
+    @staticmethod
+    def get_num_dict():
+        "PSPA number of random amino acids"
+        num = pd.read_csv(Data.PSPA_NUM_RANDOM_URL)
+        num_dict = num.set_index('kinase')['num_random_aa'].to_dict()
+        return num_dict
+    #---------------------------CDDM-------------------------------
+        # Kinase substrate datasets
+    KS_DATASET_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_datasets.parquet"
+    @staticmethod
+    def get_ks_dataset():
+        df = Data.fetch_data(Data.KS_DATASET_URL)
+        #Convert the number in the column name into integer
+        df.columns = [int(col) if col.lstrip('-').isdigit() else col for col in df.columns]
+        return df
+    # CDDM reference
+    CDDM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_main.parquet"
+    @staticmethod
+    def get_cddm():
+        return Data.fetch_data(Data.CDDM_URL)
+    CDDM_UPPER_URL ="https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_main_upper.parquet"
+    @staticmethod
+    def get_cddm_upper():
+        return Data.fetch_data(Data.CDDM_UPPER_URL)
+    # CDDM of other kinase with mutation
+    CDDM_OTHERS_URL="https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_others.parquet"
+    @staticmethod
+    def get_cddm_others():
+        return Data.fetch_data(Data.CDDM_OTHERS_URL)
+    CDDM_OTHERS_INFO_URL="https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_others_info.parquet"
+    @staticmethod
+    def get_cddm_others_info():
+        return Data.fetch_data(Data.CDDM_OTHERS_INFO_URL)
+    #---------------------------CDDM+PSPA-------------------------------
+    # Combined PSPA and CDDM
+    COMBINE_URL =  "https://github.com/sky1ove/katlas/raw/main/dataset/combine_main.parquet"
+    @staticmethod
+    def get_combine():
+        return Data.fetch_data(Data.COMBINE_URL)
+    #---------------------------Amino acid-------------------------------
+    # Amino acid info
+    AA_INFO_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/amino_acids/aa_info.parquet"
+    AA_RDKIT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/amino_acids/aa_rdkit.parquet"
+    AA_MORGAN_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/amino_acids/aa_morgan.parquet"
+    @staticmethod
+    def get_aa_info():
+        return Data.fetch_data(Data.AA_INFO_URL)
+    @staticmethod
+    def get_aa_rdkit():
+        return Data.fetch_data(Data.AA_RDKIT_URL)
+    @staticmethod
+    def get_aa_morgan():
+        return Data.fetch_data(Data.AA_MORGAN_URL)
+    #---------------------------phosphoproteomics dataset-------------------------------
+    # For reference of linkedomicsKB, contains unique EnsemblProteinID+site, more sites
+    CPTAC_KB_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/linkedOmicsKB_ref_pan.parquet"
+    @staticmethod
+    def get_cptac_ensembl_site():
+        "For reference of linkedomicsKB, contains unique EnsemblProteinID+site"
+        return Data.fetch_data(Data.CPTAC_KB_URL)
+    # From the above, but keep the unique site seq, with gene_site separated by |
+    CPTAC_UNIQUE_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/cptac_unique_site.parquet"
+    @staticmethod
+    def get_cptac_unique_site():
+        "Unique site sequence of CPTAC"
+        return Data.fetch_data(Data.CPTAC_UNIQUE_URL)
+    # for reference of linkedomics, contains unique Gene+site, fewer cases
+    CPTAC_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/linkedOmics_ref_pan.parquet"
+    @staticmethod
+    def get_cptac_gene_site():
+        "For reference of linkedomics, contains unique Gene+site, fewer cases than unique EnsemblID+site"
+        return Data.fetch_data(Data.CPTAC_URL)
+    # from PhosphositePlus, contains Gene+site
+    PSP_HUMAN_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/psp_human.parquet"
+    @staticmethod
+    def get_psp_human_site():
+        "PhosphositePlus human, contains Gene+site"
+        return Data.fetch_data(Data.PSP_HUMAN_URL)
+    # from ochoa et al. The functional landscape of the human phosphoproteome
+    OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/ochoa_site.parquet"
+    @staticmethod
+    def get_ochoa_site():
+        "Ochoa et al. dataset"
+        return Data.fetch_data(Data.OCHOA_URL)
+    # combine ochoa and PSP low throughput data
+    COMBINE_PSP_OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/combine_site_ochoa_psp.parquet"
+    @staticmethod
+    def get_combine_site_psp_ochoa():
+        "Combined Ochoa and PhosphoSitePlus"
+        df = Data.fetch_data(Data.COMBINE_PSP_OCHOA_URL)
+        #Convert the number in the column name into integer
+        df.columns = [int(col) if col.lstrip('-').isdigit() else col for col in df.columns]
+        return df
+# %% ../nbs/00_core.ipynb 12
+class CPTAC:
+    "A class for fetching CPTAC phosphoproteomics data."
+#     # Phosphoproteomics (Tumor)
+#     HNSCC = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/HNSCC/HNSCC_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     GBM = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/GBM/GBM_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     COAD = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/COAD/COAD_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     CCRCC = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/CCRCC/CCRCC_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     LSCC = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/LSCC/LSCC_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     BRCA = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/BRCA/BRCA_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     UCEC = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/UCEC/UCEC_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     LUAD = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/LUAD/LUAD_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     PDAC = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/PDAC/PDAC_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     OV = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/OV/OV_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt"
+#     # Phosphoproteomics (Normal)
+#     HNSCC_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/HNSCC/HNSCC_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     GBM_normal = None
+#     COAD_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/COAD/COAD_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     CCRCC_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/CCRCC/CCRCC_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     LSCC_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/LSCC/LSCC_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     BRCA_normal = None
+#     UCEC_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/UCEC/UCEC_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     LUAD_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/LUAD/LUAD_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     PDAC_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/PDAC/PDAC_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     OV_normal = "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/OV/OV_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt"
+#     # Ensemble ID gene mapping
+#     HNSCC_ID = "https://zenodo.org/records/8196130/files/bcm-hnscc-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     GBM_ID = "https://zenodo.org/records/8196130/files/bcm-gbm-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     COAD_ID = "https://zenodo.org/records/8196130/files/bcm-coad-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     CCRCC_ID = "https://zenodo.org/records/8196130/files/bcm-ccrcc-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     LSCC_ID = "https://zenodo.org/records/8196130/files/bcm-lscc-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     BRCA_ID = "https://zenodo.org/records/8196130/files/bcm-brca-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     UCEC_ID = "https://zenodo.org/records/8196130/files/bcm-ucec-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     LUAD_ID = "https://zenodo.org/records/8196130/files/bcm-luad-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     PDAC_ID = "https://zenodo.org/records/8196130/files/bcm-pdac-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+#     OV_ID = "https://zenodo.org/records/8196130/files/bcm-ov-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+    @staticmethod
+    def _fetch_data(cancer: str, # cancer type CPTAC
+                    is_Tumor: bool=True, # tumor tissue or normal
+                    is_KB: bool=False, # whether it is for LinkedOmicsKB or LinkedOmics
+                   ):
+        "Fetches the data from the given URL and returns a DataFrame"
+        # URL of ID and data
+        sample_type = "Tumor" if is_Tumor else "Normal"
+        ID_URL = f"https://zenodo.org/records/8196130/files/bcm-{cancer.lower()}-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
+        DATA_URL = f"https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/{cancer.upper()}/{cancer.upper()}_phospho_site_abundance_log2_reference_intensity_normalized_{sample_type}.txt"
+        # Load ID data
+        ref = pd.read_csv(ID_URL, compression='gzip', sep='\t')[['protein','gene','gene_name']].drop_duplicates().reset_index(drop=True)
+        # Load CPTAC phosphoproteomics data
+        try:
+            raw = pd.read_csv(DATA_URL, sep='\t')
+        except Exception as e:
+            print(f'{cancer} has {e}')
+        else:
+            info = pd.DataFrame({'gene':raw.idx.str.split('|').str[0],
+                                 'site':raw.idx.str.split('|').str[2],
+                                 'site_seq':raw.idx.str.split('|').str[3]})
+            print(f'the {cancer} dataset length is: {info.shape[0]}')
+            # Merge ensembl ID with gene name
+            info = info.merge(ref,'left')
+            print(f'after id mapping, the length is {info.shape[0]}')
+            print(f'{info.gene_name.isna().sum()} sites does not have a mapped gene name')
+            info['gene_site'] = info['gene_name'] + '_' + info['site']
+            info['protein_site'] = info['protein'].str.split('.').str[0] + '_' + info['site']
+            info = info.drop_duplicates(subset="protein_site" if is_KB else "gene_site").reset_index(drop=True)
+            print(f'after removing duplicates of protein_site, the length is {info.shape[0]}')
+            return info
+    @staticmethod
+    def list_cancer():
+        "Get available CPTAC cancer type"
+        return ['HNSCC','GBM','COAD','CCRCC','LSCC','BRCA','UCEC','LUAD','PDAC','OV']
+    @staticmethod
+    def get_id(cancer_type: str,
+               is_Tumor: bool=True, # tumor tissue or normal
+               is_KB: bool=False, # whether it is for LinkedOmicsKB or LinkedOmics
+              ):
+        "Get CPTAC phosphorylation sites information given a cancer type"
+        assert cancer_type in CPTAC.list_cancer(), "cancer type is not included, check available cancer types from CPTAC.list_cancer()"
+        return CPTAC._fetch_data(cancer_type,is_Tumor, is_KB)
+# %% ../nbs/00_core.ipynb 19
+def convert_string(input_string:str):
+    "Convert amino acids of lower case other than s,t,y to capital; convert rare amino acids to _"
+    allowed_chars = 'PGACSTVILMFYWHKRQNDEsty'
+    result = ""
+    for char in input_string:
+        # convert non-s/t/y to upper case
+        result_char = char if char in ['s', 't', 'y'] else char.upper()
+        # Replace with underscore if the character is not in the allowed set
+        result += result_char if result_char in allowed_chars else '_'
+    return result
+# %% ../nbs/00_core.ipynb 22
+def checker(input_string):
+    "Check if the input string contains non-s/t/y at the middle position"
+    acceptor = input_string[len(input_string)//2]
+    assert acceptor.lower() in list('sty'),f"{input_string} has {acceptor} at position 0; need to have one of s,t and y"
+def STY2sty(input_string: str):
+    "Replace 'STY' with 'sty'"
+    return input_string.replace('S', 's').replace('T', 't').replace('Y', 'y')
+# %% ../nbs/00_core.ipynb 24
+def cut_seq(input_string: str, # site sequence
+            min_position: int, # minimum position relative to its center
+            max_position: int, # maximum position relative to its center
+            ):
+    "Extract sequence based on a range relative to its center position"
+    # Find the center position of the string
+    center_position = len(input_string) // 2
+    # Calculate the start and end indices
+    start_index = max(center_position + min_position, 0)  # Ensure start_index is not negative
+    end_index = min(center_position + max_position + 1, len(input_string))  # Ensure end_index does not exceed string length
+    # Extract and return the substring
+    return input_string[start_index:end_index]
+# %% ../nbs/00_core.ipynb 26
+def get_dict(input_string:str, # phosphorylation site sequence
+            ):
+    "Get a dictionary of input string; no need for the star in the middle; make sure it is 15 or 10 length"
+    center_index = len(input_string) // 2
+    center_char = input_string[center_index]
+    result = []
+    for i, char in enumerate(input_string):
+        position = i - center_index
+        if char.isalpha():
+            result.append(f"{position}{char}")
+    return result
+# %% ../nbs/00_core.ipynb 29
+def multiply_func(values, # list of values, possibilities of amino acids at certain positions
+             factor=17, # scale factor
+            ):
+    "Multiply the possibilities of the amino acids at each position in a phosphorylation site"
+    # Using the logarithmic property: log(a*b) = log(a) + log(b)
+    # Compute the sum of the logarithms of the values and the scale factor
+    log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(factor)
+    return log_sum
+# %% ../nbs/00_core.ipynb 33
+class multiply:
+    "Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
+    def __init__(self):
+        self.num_dict = Data.get_num_dict()
+    def func(self, values, kinase):
+        # Check if any values are less than or equal to zero
+        if np.any(np.array(values) == 0):
+            return np.nan
+        else:
+            # Retrieve the divide factor from the dictionary
+            self.divide = self.num_dict[kinase]
+            # Using the logarithmic property: log(a*b) = log(a) + log(b)
+            # Compute the sum of the logarithms of the values and the divide factor
+            log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(self.divide)
+            return log_sum
+# %% ../nbs/00_core.ipynb 37
+def sumup(values, # list of values, possibilities of amino acids at certain positions
+          kinase=None,
+         ):
+    "Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
+    return sum(values)
+# %% ../nbs/00_core.ipynb 40
+def predict_kinase(input_string: str, # site sequence
+                   ref: pd.DataFrame, # reference dataframe for scoring
+                   func: Callable, # function to calculate score
+                   to_lower: bool=False, # convert capital STY to lower case
+                   verbose=True
+                   ):
+    "Predict kinase given a phosphorylation site sequence"
+    # check whether the middle position is STY (Serine, Threonine, Tyrosine)
+    checker(input_string)
+    # Convert rare amino acids to '_', and if specified, convert STY to lowercase
+    input_string = convert_string(input_string)
+    # If to_lower is True, convert STY in the sequence to lower case
+    if to_lower:
+        input_string = STY2sty(input_string)
+    results = [] # Initialize a list to store the scores for each kinase
+    # Iterate over each kinase and its associated data in the reference dataframe
+    for kinase, row in ref.iterrows():
+        # Convert the row into a dictionary, excluding NaN values, to create a PSSM dictionary for a kinase
+        r_dict = row.dropna().to_dict()
+        # Extract position+amino acid name from the input string and filter them against the name in PSSM
+        pos_aa_name = get_dict(input_string)
+        pos_aa_name = [key for key in pos_aa_name if key in r_dict.keys()]
+        # Collect corresponding PSSM values for these positions and amino acids
+        pos_aa_val = [r_dict[key] for key in pos_aa_name] # Further checks for NaN values
+        # Calculate the score for this kinase using the specified function
+        score = func(pos_aa_val, kinase)
+        results.append(score)
+    # If verbose is True, print the positions and amino acids considered
+    if verbose:
+        print(f'considering string: {pos_aa_name}')
+    # Convert the list of results into a pandas Series, index by the kinase, sort by score in descending order
+    out = pd.Series(results, index=ref.index).sort_values(ascending=False)
+    return out.round(3)  # Return the scores rounded to three decimal places
+# %% ../nbs/00_core.ipynb 42
+# PSPA
+param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply().func} # Johnson et al. Nature official
+param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply().func}
+param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply().func}
+# Kinase-substrate dataset, CDDM
+param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
+param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup} # specific for all uppercase
+# %% ../nbs/00_core.ipynb 46
+def predict_kinase_df(df, seq_col, ref, func, to_lower=False):
+    print('input dataframe has a length', df.shape[0])
+    print('Preprocessing')
+    # Make a copy of df to avoid changes to the original dataframe
+    df = df.copy()
+    # Check whether the middle position of each sequence is one of S, T, or Y
+    df[seq_col].apply(checker)
+    # Convert rare amino acids to '_', and potentially change case of STY based on settings
+    df[seq_col] = df[seq_col].apply(convert_string)
+    # Optionally convert STY to lowercase in each sequence
+    if to_lower:
+        df[seq_col] = df[seq_col].apply(STY2sty)
+    # Adjust sequence lengths to match the reference matrix's expected inputs
+    max_value = ref.columns.str[:-1].astype(int).max() # Get the highest position index from the reference columns
+    min_value = ref.columns.str[:-1].astype(int).min() # Get the lowest position index
+    df[seq_col] = df[seq_col].apply(partial(cut_seq, min_position=min_value, max_position=max_value))
+    print('Finish preprocessing')
+    results = []
+    # Extract numerical part of reference DataFrame columns, sort them
+    num = list(set(ref.columns.str[:-1].astype(int)))
+    num.sort()
+    print(f'Calculating position: {num}')
+    # Transform reference DataFrame to a dictionary and clean up NaN values
+    ref_dict = ref.T.to_dict()
+    ref_dict = {
+        outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
+        for outer_k, outer_v in ref_dict.items()}
+    # Function to process each kinase with its dictionary, using parallel processing
+    def process_kinase(kinase, r_dict):
+        return [func(np.array([r_dict.get(key) for key in get_dict(input_string) if key in r_dict]), kinase) for input_string in df[seq_col]]
+    # Process all kinases in parallel, using tqdm for progress tracking
+    results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
+    # Return results as a DataFrame
+    return pd.DataFrame(results, index=ref.index, columns=df.index).T
+# %% ../nbs/00_core.ipynb 55
+def get_pct(site,ref,func,pct_ref):
+    "Replicate the precentile results from The Kinase Library."
+    # As here we try to replicate the results, we use site.upper(); consider removing it for future version.
+    score = predict_kinase(site.upper(),ref=ref,func=func)
+    percentiles = {}
+    for kinase in score.index:
+        # Get the values from `ref` for this kinase
+        ref_values = pct_ref[kinase].values
+        # Calculate how many values in `ref` are less than the new score
+        less = np.sum(ref_values < score[kinase])
+        # Calculate how many values are equal to the new score
+        equal = np.sum(ref_values == score[kinase])
+        # Calculate the percentile rank
+        percentile = (less + 0.5 * equal) / len(ref_values) * 100
+        percentiles[kinase] = percentile
+    pct = pd.Series(percentiles)
+    final = pd.concat([score,pct],axis=1)
+    final.columns=['log2(score)','percentile']
+    return final
+# %% ../nbs/00_core.ipynb 61
+def get_pct_df(score_df, # output from predict_kinase_df
+               pct_ref, # a reference df for percentile calculation
+              ):
+    "Replicate the precentile results from The Kinase Library."
+    # Create an array to hold percentile ranks
+    percentiles = np.zeros(score_df.shape)
+    # Calculate percentiles for each column in a vectorized manner
+    for i, kinase in tqdm(enumerate(score_df.columns),total=len(score_df.columns)):
+        ref_values = np.sort(pct_ref[kinase].values)
+        # Use searchsorted to find indices where the scores would be inserted to maintain order
+        indices = np.searchsorted(ref_values, score_df[kinase].values, side='right')
+        # Calculate percentile ranks
+        percentiles[:, i] = indices / len(ref_values) * 100
+    # Convert the array to a DataFrame with appropriate indices and columns
+    percentiles_df = pd.DataFrame(percentiles, index=score_df.index, columns=score_df.columns).astype(float).round(3)
+    return percentiles_df
+# %% ../nbs/00_core.ipynb 66
+def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
+                    seq_col: str='site_seq', # column name of site sequence
+                    id_col: str='gene_site' # column name of site id
+                   ):
+    "Remove duplicates among phosphorylation sites; return df with new columns of acceptor and number of duplicates"
+    unique = df.groupby(seq_col).agg(
+        {id_col: lambda r: '|'.join(r.unique())} )
+    unique['num_site'] = unique[id_col].str.split('|').apply(len)
+    unique = unique.reset_index()
+    position = len(unique[seq_col][0])//2
+    unique['acceptor'] = unique[seq_col].str[position]
+    return unique
+# %% ../nbs/00_core.ipynb 69
+def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
+                     seq_col: str, # column name of protein sequence
+                     position_col: str # column name of position 0
+                    ):
+    "Extract -7 to +7 site sequence from protein sequence"
+    data = []
+    for i, r in tqdm(df.iterrows(),total=len(df)):
+        position = r[position_col] - 1
+        start = position - 7
+        end = position + 8
+        # Extract the subsequence
+        subseq = r[seq_col][max(0, start):min(len(r[seq_col]), end)]
+        # Pad the subsequence if needed
+        if start < 0:
+            subseq = "_" * abs(start) + subseq
+        if end > len(r[seq_col]):
+            subseq = subseq + "_" * (end - len(r[seq_col]))
+        data.append(subseq)
+    return np.array(data)
+# %% ../nbs/00_core.ipynb 74
+def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
+             aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
+             aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
+             position = [i for i in range(-7,8)], # position to include in the full matrix
+             position_paper = [-5,-4,-3,-2,-1,1,2,3,4] # position to include in the partial matrix
+             ):
+    "Get frequency matrix given a dataframe of phosphorylation sites for a single kinase"
+    #Count frequency for each amino acid at each position
+    melted_k = df_k.melt(
+                    value_vars=[i for i in range(-7, 8)],
+                    var_name='Position',
+                    value_name='aa')
+    # Group by Position and Amino Acid and count occurrences
+    grouped = melted_k.groupby(['Position', 'aa']).size().reset_index(name='Count')
+    # Remove wired amino acid
+    aa_include = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
+    grouped = grouped[grouped.aa.isin(aa_include)].reset_index(drop=True)
+    # get pivot table
+    pivot_k = grouped.pivot(index='aa', columns='Position', values='Count').fillna(0)
+    # Get frequency by dividing the sum of each column
+    freq_k = pivot_k/pivot_k.sum()
+    # data from the kinase-substrate dataset, and format is Lew's paper's format
+    paper = freq_k.reindex(index=aa_order_paper,columns=position_paper,fill_value=0)
+    # full pivot data from kinase-substrate dataset
+    full = freq_k.reindex(index=aa_order,columns=position, fill_value=0)
+    return paper,full
+# %% ../nbs/00_core.ipynb 78
+def query_gene(df,gene):
+    "Query gene in the phosphoproteomics dataset"
+    # query gene in the dataframe
+    df_gene = df[df.gene_site.str.contains(f'{gene}_')]
+    # sort dataframe based on position
+    sort_position = df_gene.gene_site.str.split('_').str[-1].str[1:].astype(int).sort_values().index
+    df_gene = df_gene.loc[sort_position]
+    return df_gene
+# %% ../nbs/00_core.ipynb 82
+def get_ttest(df,
+              columns1, # list of column names for group1
+              columns2, # list of column names for group2
+              FC_method = 'median', # or mean
+              alpha=0.05, # significance level in multipletests for p_adj
+              correction_method='fdr_bh', # method in multipletests for p_adj
+             ):
+    """
+    Performs t-tests and calculates log2 fold change between two groups of columns in a DataFrame.
+    NaN p-values are excluded from the multiple testing correction.
+    Returns:
+    DataFrame: Results including log2FC, p-values, adjusted p-values, significance, signed log10 P value, and signed log10 Padj
+    """
+    group1 = df[columns1]
+    group2 = df[columns2]
+    # Compute median values for each gene in both groups
+    if FC_method == "median":
+        m1 = group1.median(axis=1)
+        m2 = group2.median(axis=1)
+    elif FC_method == "mean":
+        m1 = group1.mean(axis=1)
+        m2 = group2.mean(axis=1)
+    # As phosphoproteomics data has already been log transformed, we can directly use subtraction
+    FCs = m2 - m1
+    # Perform t-tests and handle NaN p-values
+    t_results = [ttest_ind(group1.loc[idx], group2.loc[idx], nan_policy='omit') for idx in tqdm(df.index, desc="Computing t-tests")]
+    # Exclude NaN p-values before multiple testing correction
+    p_values = [result.pvalue if result.pvalue is not np.nan else np.nan for result in t_results]
+    valid_p_values = np.array(p_values, dtype=float)  # Ensure the correct data type
+    # valid_p_values = np.array(p_values)
+    valid_p_values = valid_p_values[~np.isnan(valid_p_values)]
+    # Adjust for multiple testing on valid p-values only
+    reject, pvals_corrected, _, _ = multipletests(valid_p_values, alpha=alpha, method=correction_method)
+    # Create a full list of corrected p-values including NaNs
+    full_pvals_corrected = np.empty_like(p_values)
+    full_pvals_corrected[:] = np.nan
+    np.place(full_pvals_corrected, ~np.isnan(p_values), pvals_corrected)
+    # Adjust the significance accordingly
+    full_reject = np.zeros_like(p_values, dtype=bool)
+    np.place(full_reject, ~np.isnan(p_values), reject)
+    # Create DataFrame with results
+    results = pd.DataFrame({
+        'log2FC': FCs,
+        'p_value': p_values,
+        'p_adj': full_pvals_corrected,
+        'significant': full_reject
+    })
+    results['p_value'] = results['p_value'].astype(float)
+    def get_signed_logP(r,p_col):
+        log10 = -np.log10(r[p_col])
+        return -log10 if r['log2FC']<0 else log10
+    results['signed_logP'] = results.apply(partial(get_signed_logP,p_col='p_value'),axis=1)
+    results['signed_logPadj'] = results.apply(partial(get_signed_logP,p_col='p_adj'),axis=1)
+    return results
+# %% ../nbs/00_core.ipynb 83
+def get_metaP(p_values):
+    "Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
+    logs = [math.log(abs(p))*-1 if p<0 else math.log(abs(p)) for p in p_values]
+    chi_square_stat = -2 * sum(logs)
+    degrees_of_freedom = 2 * len(p_values)
+    score = stats.chi2.sf(abs(chi_square_stat), degrees_of_freedom)*-1 if chi_square_stat<0 else chi2.sf(abs(chi_square_stat), degrees_of_freedom)
+    return score
+# %% ../nbs/00_core.ipynb 86
+def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
+             PDHK: bool=False, # whether this kinase belongs to PDHK family
+            ):
+    "Normalize single ST kinase data"
+    columns_to_exclude = ['S', 'T', 'C', 't', 'y']
+    if PDHK:
+        columns_to_exclude.append('Y')
+        divisor = 16
+    else:
+        divisor = 17
+    s = df.drop(columns=columns_to_exclude).sum(1)
+    df2 = df.div(s, axis=0)
+    df2.C = df2.C / (df2.C.median() * divisor)
+    df2['S'] = df2.drop(columns=columns_to_exclude).median(1)
+    df2['T'] = df2.drop(columns=columns_to_exclude).median(1)
+    df2 = round(df2, 4)
+    return df2
+# %% ../nbs/00_core.ipynb 88
+def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
+                   kinase:str, # a specific kinase
+                   normalize: bool=False, # normalize according to the paper; special for PDHK1/4
+                   drop_s: bool= True, # drop s as s is a duplicates of t in PSPA
+                  ):
+    "Obtain a specific kinase data from stacked dataframe"
+    p = pd.DataFrame(df.loc[kinase],columns = [kinase]).reset_index().rename(columns={'index':'substrate'})
+    p['position'] = p.substrate.str.extract('(-?\d+)')
+    p['aa'] = p.substrate.str[-1]
+    p.position = p.position.astype(int)
+    pp = p.pivot(index='position', columns='aa', values=kinase)
+    if drop_s:
+        if 's' in pp.columns:
+            pp = pp.drop(columns=['s'])
+    if normalize:
+        pp = raw2norm(pp, PDHK=True if kinase == 'PDHK1' or kinase == 'PDHK4' else False)
+    return pp