PyPI - python-katlas - Versions diffs - 0.0.8__tar.gz → 0.1.0__tar.gz - Mend

python-katlas 0.0.8.tar.gz → 0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{python-katlas-0.0.8/python_katlas.egg-info → python-katlas-0.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: python-katlas
-Version: 0.0.8
+Version: 0.1.0
 Summary: tools for predicting kinome specificities
 Home-page: https://github.com/sky1ove/python-katlas
 Author: lily

python-katlas-0.1.0/katlas/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.0.9"

{python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/_modidx.py RENAMED Viewed

@@ -49,8 +49,6 @@ d = { 'settings': { 'branch': 'main',
                              'katlas.core.get_ttest': ('core.html#get_ttest', 'katlas/core.py'),
                              'katlas.core.get_unique_site': ('core.html#get_unique_site', 'katlas/core.py'),
                              'katlas.core.multiply': ('core.html#multiply', 'katlas/core.py'),
-                             'katlas.core.multiply.__init__': ('core.html#multiply.__init__', 'katlas/core.py'),
-                             'katlas.core.multiply.func': ('core.html#multiply.func', 'katlas/core.py'),
                              'katlas.core.multiply_func': ('core.html#multiply_func', 'katlas/core.py'),
                              'katlas.core.predict_kinase': ('core.html#predict_kinase', 'katlas/core.py'),
                              'katlas.core.predict_kinase_df': ('core.html#predict_kinase_df', 'katlas/core.py'),

{python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/core.py RENAMED Viewed

@@ -14,7 +14,6 @@ from tqdm import tqdm
 from scipy.stats import chi2
 from typing import Callable
 from functools import partial
-from joblib import Parallel, delayed
 from scipy.stats import ttest_ind
 from statsmodels.stats.multitest import multipletests
@@ -375,35 +374,30 @@ def multiply_func(values, # list of values, possibilities of amino acids at cert
     return log_sum
 # %% ../nbs/00_core.ipynb 33
-class multiply:
+def multiply(values, kinase, num_dict=Data.get_num_dict()):
     "Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
-    def __init__(self):
-        self.num_dict = Data.get_num_dict()
-    def func(self, values, kinase):
-        # Check if any values are less than or equal to zero
-        if np.any(np.array(values) == 0):
-            return np.nan
-        else:
-            # Retrieve the divide factor from the dictionary
-            self.divide = self.num_dict[kinase]
-            # Using the logarithmic property: log(a*b) = log(a) + log(b)
-            # Compute the sum of the logarithms of the values and the divide factor
-            log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(self.divide)
+    # Check if any values are less than or equal to zero
+    if np.any(np.array(values) == 0):
+        return np.nan
+    else:
+        # Retrieve the divide factor from the dictionary
+        divide_factor = num_dict[kinase]
+        # Using the logarithmic property: log(a*b) = log(a) + log(b)
+        # Compute the sum of the logarithms of the values and the divide factor
+        log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide_factor)
-            return log_sum
+        return log_sum
-# %% ../nbs/00_core.ipynb 37
+# %% ../nbs/00_core.ipynb 36
 def sumup(values, # list of values, possibilities of amino acids at certain positions
           kinase=None,
          ):
     "Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
     return sum(values)
-# %% ../nbs/00_core.ipynb 40
+# %% ../nbs/00_core.ipynb 39
 def predict_kinase(input_string: str, # site sequence
                    ref: pd.DataFrame, # reference dataframe for scoring
                    func: Callable, # function to calculate score
@@ -455,19 +449,20 @@ def predict_kinase(input_string: str, # site sequence
     return out.round(3)  # Return the scores rounded to three decimal places
-# %% ../nbs/00_core.ipynb 42
+# %% ../nbs/00_core.ipynb 41
 # PSPA
-param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply().func} # Johnson et al. Nature official
-param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply().func}
-param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply().func}
+param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official
+param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}
+param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}
 # Kinase-substrate dataset, CDDM
-param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
-param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
+param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}
+param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase
 # %% ../nbs/00_core.ipynb 46
 def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
     print('input dataframe has a length', df.shape[0])
     print('Preprocessing')
@@ -494,29 +489,68 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
     print('Finish preprocessing')
-    results = []
-    # Extract numerical part of reference DataFrame columns, sort them
-    num = list(set(ref.columns.str[:-1].astype(int)))
-    num.sort()
-    print(f'Calculating position: {num}')
-    # Transform reference DataFrame to a dictionary and clean up NaN values
-    ref_dict = ref.T.to_dict()
-    ref_dict = {
-        outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
-        for outer_k, outer_v in ref_dict.items()}
-    # Function to process each kinase with its dictionary, using parallel processing
-    def process_kinase(kinase, r_dict):
-        return [func(np.array([r_dict.get(key) for key in get_dict(input_string) if key in r_dict]), kinase) for input_string in df[seq_col]]
+    # wide form to long form
+    df['keys'] = df['site_seq'].apply(get_dict)
+    input_keys_df  = df[['keys']].explode('keys').reset_index()
+    input_keys_df.columns = ['input_index', 'key']
-    # Process all kinases in parallel, using tqdm for progress tracking
-    results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
-    # Return results as a DataFrame
-    return pd.DataFrame(results, index=ref.index, columns=df.index).T
+    ref_T = ref.T
+    input_keys_df = input_keys_df.set_index('key')
+    print('Merging reference')
+    merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')
+    print('Finish merging')
+    if func == sumup:
+        grouped_df = merged_df.groupby('input_index').sum()
+        out = grouped_df.reindex(df.index)
+    elif func==multiply:
+        # Get the list of kinases and num_dict
+        kinases = ref_T.columns
+        num_dict = Data.get_num_dict()
+        out = {}
+        for kinase in tqdm(kinases):
+            divide_factor = num_dict[kinase]
+            # Extract data for this kinase
+            kinase_df = merged_df[['input_index', kinase]].copy()
+            kinase_df = kinase_df.rename(columns={kinase: 'value'})
-# %% ../nbs/00_core.ipynb 55
+            # Compute log_value
+            kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))
+            # Group by 'input_index' and compute sum and count
+            grouped = kinase_df.dropna().groupby('input_index')
+            sum_log_values = grouped['log_value'].sum()
+            len_values = grouped['log_value'].count()
+            # Compute log_sum using the formula
+            log_sum = sum_log_values + (len_values - 1) * np.log2(divide_factor)
+            # Find all 'input_index' where 'log_value' is NaN
+            nan_input_indices = kinase_df.loc[kinase_df['value']==0, 'input_index'].unique()
+            # Set log_sum at those indices to NaN
+            log_sum.loc[nan_input_indices] = np.nan
+            # Assign the computed values to the results DataFrame
+            out[kinase] = log_sum
+        out = pd.DataFrame(out).reindex(df.index)
+    else:
+        grouped_df = merged_df.drop(columns=['key']).groupby('input_index').agg(func)
+        out = grouped_df.reindex(df.index)
+    # Return results as a DataFrame
+    return out
+# %% ../nbs/00_core.ipynb 56
 def get_pct(site,ref,func,pct_ref):
     "Replicate the precentile results from The Kinase Library."
@@ -541,7 +575,7 @@ def get_pct(site,ref,func,pct_ref):
     final.columns=['log2(score)','percentile']
     return final
-# %% ../nbs/00_core.ipynb 61
+# %% ../nbs/00_core.ipynb 62
 def get_pct_df(score_df, # output from predict_kinase_df
                pct_ref, # a reference df for percentile calculation
               ):
@@ -566,7 +600,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
     return percentiles_df
-# %% ../nbs/00_core.ipynb 66
+# %% ../nbs/00_core.ipynb 67
 def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
                     seq_col: str='site_seq', # column name of site sequence
                     id_col: str='gene_site' # column name of site id
@@ -582,7 +616,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
     return unique
-# %% ../nbs/00_core.ipynb 69
+# %% ../nbs/00_core.ipynb 70
 def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
                      seq_col: str, # column name of protein sequence
                      position_col: str # column name of position 0
@@ -608,7 +642,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
     return np.array(data)
-# %% ../nbs/00_core.ipynb 74
+# %% ../nbs/00_core.ipynb 75
 def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
              aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
              aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
@@ -649,7 +683,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
     return paper,full
-# %% ../nbs/00_core.ipynb 78
+# %% ../nbs/00_core.ipynb 79
 def query_gene(df,gene):
     "Query gene in the phosphoproteomics dataset"
@@ -663,7 +697,7 @@ def query_gene(df,gene):
     return df_gene
-# %% ../nbs/00_core.ipynb 82
+# %% ../nbs/00_core.ipynb 83
 def get_ttest(df,
               columns1, # list of column names for group1
               columns2, # list of column names for group2
@@ -733,7 +767,7 @@ def get_ttest(df,
     return results
-# %% ../nbs/00_core.ipynb 83
+# %% ../nbs/00_core.ipynb 84
 def get_metaP(p_values):
     "Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
@@ -745,7 +779,7 @@ def get_metaP(p_values):
     return score
-# %% ../nbs/00_core.ipynb 86
+# %% ../nbs/00_core.ipynb 87
 def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
              PDHK: bool=False, # whether this kinase belongs to PDHK family
             ):
@@ -768,7 +802,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
     return df2
-# %% ../nbs/00_core.ipynb 88
+# %% ../nbs/00_core.ipynb 89
 def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
                    kinase:str, # a specific kinase
                    normalize: bool=False, # normalize according to the paper; special for PDHK1/4

{python-katlas-0.0.8 → python-katlas-0.1.0/python_katlas.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: python-katlas
-Version: 0.0.8
+Version: 0.1.0
 Summary: tools for predicting kinome specificities
 Home-page: https://github.com/sky1ove/python-katlas
 Author: lily

{python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/requires.txt RENAMED Viewed

@@ -1,7 +1,6 @@
 statsmodels
 fastparquet
 tqdm
-joblib
 [dev]
 nbdev

{python-katlas-0.0.8 → python-katlas-0.1.0}/settings.ini RENAMED Viewed

@@ -5,7 +5,7 @@
 ### Python library ###
 repo = python-katlas
 lib_name = %(repo)s
-version = 0.0.8
+version = 0.1.0
 min_python = 3.7
 license = apache2
 black_formatting = False
@@ -38,6 +38,6 @@ status = 3
 user = sky1ove
 ### Optional ###
-requirements = statsmodels fastparquet tqdm joblib
+requirements = statsmodels fastparquet tqdm
 dev_requirements = nbdev pyngrok fastai>=2.7.12 fastbook fairscale fair-esm logomaker seaborn rdkit umap-learn adjustText bokeh scikit-learn>=1.3.0 openpyxl
 # console_scripts =