python-katlas 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- katlas/__init__.py +1 -1
- katlas/_modidx.py +0 -2
- katlas/core.py +77 -52
- {python_katlas-0.0.8.dist-info → python_katlas-0.0.9.dist-info}/METADATA +1 -2
- python_katlas-0.0.9.dist-info/RECORD +14 -0
- python_katlas-0.0.8.dist-info/RECORD +0 -14
- {python_katlas-0.0.8.dist-info → python_katlas-0.0.9.dist-info}/LICENSE +0 -0
- {python_katlas-0.0.8.dist-info → python_katlas-0.0.9.dist-info}/WHEEL +0 -0
- {python_katlas-0.0.8.dist-info → python_katlas-0.0.9.dist-info}/entry_points.txt +0 -0
- {python_katlas-0.0.8.dist-info → python_katlas-0.0.9.dist-info}/top_level.txt +0 -0
katlas/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.
|
|
1
|
+
__version__ = "0.0.8"
|
katlas/_modidx.py
CHANGED
|
@@ -49,8 +49,6 @@ d = { 'settings': { 'branch': 'main',
|
|
|
49
49
|
'katlas.core.get_ttest': ('core.html#get_ttest', 'katlas/core.py'),
|
|
50
50
|
'katlas.core.get_unique_site': ('core.html#get_unique_site', 'katlas/core.py'),
|
|
51
51
|
'katlas.core.multiply': ('core.html#multiply', 'katlas/core.py'),
|
|
52
|
-
'katlas.core.multiply.__init__': ('core.html#multiply.__init__', 'katlas/core.py'),
|
|
53
|
-
'katlas.core.multiply.func': ('core.html#multiply.func', 'katlas/core.py'),
|
|
54
52
|
'katlas.core.multiply_func': ('core.html#multiply_func', 'katlas/core.py'),
|
|
55
53
|
'katlas.core.predict_kinase': ('core.html#predict_kinase', 'katlas/core.py'),
|
|
56
54
|
'katlas.core.predict_kinase_df': ('core.html#predict_kinase_df', 'katlas/core.py'),
|
katlas/core.py
CHANGED
|
@@ -14,7 +14,6 @@ from tqdm import tqdm
|
|
|
14
14
|
from scipy.stats import chi2
|
|
15
15
|
from typing import Callable
|
|
16
16
|
from functools import partial
|
|
17
|
-
from joblib import Parallel, delayed
|
|
18
17
|
from scipy.stats import ttest_ind
|
|
19
18
|
from statsmodels.stats.multitest import multipletests
|
|
20
19
|
|
|
@@ -375,35 +374,30 @@ def multiply_func(values, # list of values, possibilities of amino acids at cert
|
|
|
375
374
|
return log_sum
|
|
376
375
|
|
|
377
376
|
# %% ../nbs/00_core.ipynb 33
|
|
378
|
-
|
|
377
|
+
def multiply(values, kinase, num_dict=Data.get_num_dict()):
|
|
379
378
|
"Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
|
|
380
|
-
def __init__(self):
|
|
381
|
-
self.num_dict = Data.get_num_dict()
|
|
382
|
-
|
|
383
|
-
def func(self, values, kinase):
|
|
384
|
-
|
|
385
|
-
# Check if any values are less than or equal to zero
|
|
386
|
-
if np.any(np.array(values) == 0):
|
|
387
|
-
return np.nan
|
|
388
|
-
|
|
389
|
-
else:
|
|
390
|
-
# Retrieve the divide factor from the dictionary
|
|
391
|
-
self.divide = self.num_dict[kinase]
|
|
392
379
|
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
380
|
+
# Check if any values are less than or equal to zero
|
|
381
|
+
if np.any(np.array(values) == 0):
|
|
382
|
+
return np.nan
|
|
383
|
+
else:
|
|
384
|
+
# Retrieve the divide factor from the dictionary
|
|
385
|
+
divide_factor = num_dict[kinase]
|
|
386
|
+
|
|
387
|
+
# Using the logarithmic property: log(a*b) = log(a) + log(b)
|
|
388
|
+
# Compute the sum of the logarithms of the values and the divide factor
|
|
389
|
+
log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide_factor)
|
|
396
390
|
|
|
397
|
-
|
|
391
|
+
return log_sum
|
|
398
392
|
|
|
399
|
-
# %% ../nbs/00_core.ipynb
|
|
393
|
+
# %% ../nbs/00_core.ipynb 36
|
|
400
394
|
def sumup(values, # list of values, possibilities of amino acids at certain positions
|
|
401
395
|
kinase=None,
|
|
402
396
|
):
|
|
403
397
|
"Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
|
|
404
398
|
return sum(values)
|
|
405
399
|
|
|
406
|
-
# %% ../nbs/00_core.ipynb
|
|
400
|
+
# %% ../nbs/00_core.ipynb 39
|
|
407
401
|
def predict_kinase(input_string: str, # site sequence
|
|
408
402
|
ref: pd.DataFrame, # reference dataframe for scoring
|
|
409
403
|
func: Callable, # function to calculate score
|
|
@@ -455,18 +449,18 @@ def predict_kinase(input_string: str, # site sequence
|
|
|
455
449
|
|
|
456
450
|
return out.round(3) # Return the scores rounded to three decimal places
|
|
457
451
|
|
|
458
|
-
# %% ../nbs/00_core.ipynb
|
|
452
|
+
# %% ../nbs/00_core.ipynb 41
|
|
459
453
|
# PSPA
|
|
460
|
-
param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply
|
|
461
|
-
param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply
|
|
462
|
-
param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply
|
|
454
|
+
param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official
|
|
455
|
+
param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}
|
|
456
|
+
param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}
|
|
463
457
|
|
|
464
458
|
|
|
465
459
|
# Kinase-substrate dataset, CDDM
|
|
466
460
|
param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
|
|
467
461
|
param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
|
|
468
462
|
|
|
469
|
-
# %% ../nbs/00_core.ipynb
|
|
463
|
+
# %% ../nbs/00_core.ipynb 45
|
|
470
464
|
def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
|
|
471
465
|
print('input dataframe has a length', df.shape[0])
|
|
472
466
|
print('Preprocessing')
|
|
@@ -494,29 +488,60 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
|
|
|
494
488
|
|
|
495
489
|
print('Finish preprocessing')
|
|
496
490
|
|
|
497
|
-
results = []
|
|
498
|
-
# Extract numerical part of reference DataFrame columns, sort them
|
|
499
|
-
num = list(set(ref.columns.str[:-1].astype(int)))
|
|
500
|
-
num.sort()
|
|
501
|
-
print(f'Calculating position: {num}')
|
|
502
|
-
# Transform reference DataFrame to a dictionary and clean up NaN values
|
|
503
|
-
ref_dict = ref.T.to_dict()
|
|
504
|
-
ref_dict = {
|
|
505
|
-
outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
|
|
506
|
-
for outer_k, outer_v in ref_dict.items()}
|
|
507
491
|
|
|
508
|
-
#
|
|
509
|
-
|
|
510
|
-
|
|
492
|
+
# wide form to long form
|
|
493
|
+
df['keys'] = df['site_seq'].apply(get_dict)
|
|
494
|
+
input_keys_df = df[['keys']].explode('keys').reset_index()
|
|
495
|
+
input_keys_df.columns = ['input_index', 'key']
|
|
496
|
+
ref_T = ref.T
|
|
511
497
|
|
|
512
|
-
|
|
513
|
-
results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
|
|
498
|
+
merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')
|
|
514
499
|
|
|
515
|
-
|
|
516
|
-
|
|
500
|
+
if func == sumup:
|
|
501
|
+
grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()
|
|
502
|
+
out = grouped_df.reindex(df.index)
|
|
503
|
+
|
|
504
|
+
elif func==multiply:
|
|
505
|
+
# Get the list of kinases and num_dict
|
|
506
|
+
kinases = ref_T.columns
|
|
507
|
+
num_dict = Data.get_num_dict()
|
|
508
|
+
|
|
509
|
+
out = {}
|
|
510
|
+
for kinase in tqdm(kinases):
|
|
511
|
+
divide_factor = num_dict[kinase]
|
|
512
|
+
# Extract data for this kinase
|
|
513
|
+
kinase_df = merged_df[['input_index', kinase]].copy()
|
|
514
|
+
kinase_df = kinase_df.rename(columns={kinase: 'value'})
|
|
515
|
+
|
|
516
|
+
# Compute log_value
|
|
517
|
+
kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)
|
|
518
|
+
|
|
519
|
+
# Group by 'input_index' and compute sum and count
|
|
520
|
+
grouped = kinase_df.dropna().groupby('input_index')
|
|
521
|
+
sum_log_values = grouped['log_value'].sum()
|
|
522
|
+
len_values = grouped['log_value'].count()
|
|
517
523
|
|
|
524
|
+
# Compute log_sum using the formula
|
|
525
|
+
log_sum = sum_log_values + (len_values - 1) * np.log2(divide_factor)
|
|
518
526
|
|
|
519
|
-
#
|
|
527
|
+
# Find all 'input_index' where 'log_value' is NaN
|
|
528
|
+
nan_input_indices = kinase_df.loc[kinase_df['value']==0, 'input_index'].unique()
|
|
529
|
+
# Set log_sum at those indices to NaN
|
|
530
|
+
log_sum.loc[nan_input_indices] = np.nan
|
|
531
|
+
|
|
532
|
+
# Assign the computed values to the results DataFrame
|
|
533
|
+
out[kinase] = log_sum
|
|
534
|
+
|
|
535
|
+
out = pd.DataFrame(out).reindex(df.index)
|
|
536
|
+
|
|
537
|
+
else:
|
|
538
|
+
grouped_df = merged_df.drop(columns=['key']).groupby('input_index').agg(func)
|
|
539
|
+
out = grouped_df.reindex(df.index)
|
|
540
|
+
|
|
541
|
+
# Return results as a DataFrame
|
|
542
|
+
return out
|
|
543
|
+
|
|
544
|
+
# %% ../nbs/00_core.ipynb 54
|
|
520
545
|
def get_pct(site,ref,func,pct_ref):
|
|
521
546
|
|
|
522
547
|
"Replicate the precentile results from The Kinase Library."
|
|
@@ -541,7 +566,7 @@ def get_pct(site,ref,func,pct_ref):
|
|
|
541
566
|
final.columns=['log2(score)','percentile']
|
|
542
567
|
return final
|
|
543
568
|
|
|
544
|
-
# %% ../nbs/00_core.ipynb
|
|
569
|
+
# %% ../nbs/00_core.ipynb 60
|
|
545
570
|
def get_pct_df(score_df, # output from predict_kinase_df
|
|
546
571
|
pct_ref, # a reference df for percentile calculation
|
|
547
572
|
):
|
|
@@ -566,7 +591,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
|
|
|
566
591
|
|
|
567
592
|
return percentiles_df
|
|
568
593
|
|
|
569
|
-
# %% ../nbs/00_core.ipynb
|
|
594
|
+
# %% ../nbs/00_core.ipynb 65
|
|
570
595
|
def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
|
|
571
596
|
seq_col: str='site_seq', # column name of site sequence
|
|
572
597
|
id_col: str='gene_site' # column name of site id
|
|
@@ -582,7 +607,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
|
|
|
582
607
|
|
|
583
608
|
return unique
|
|
584
609
|
|
|
585
|
-
# %% ../nbs/00_core.ipynb
|
|
610
|
+
# %% ../nbs/00_core.ipynb 68
|
|
586
611
|
def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
|
|
587
612
|
seq_col: str, # column name of protein sequence
|
|
588
613
|
position_col: str # column name of position 0
|
|
@@ -608,7 +633,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
|
|
|
608
633
|
|
|
609
634
|
return np.array(data)
|
|
610
635
|
|
|
611
|
-
# %% ../nbs/00_core.ipynb
|
|
636
|
+
# %% ../nbs/00_core.ipynb 73
|
|
612
637
|
def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
|
|
613
638
|
aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
|
|
614
639
|
aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
|
|
@@ -649,7 +674,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
|
|
|
649
674
|
|
|
650
675
|
return paper,full
|
|
651
676
|
|
|
652
|
-
# %% ../nbs/00_core.ipynb
|
|
677
|
+
# %% ../nbs/00_core.ipynb 77
|
|
653
678
|
def query_gene(df,gene):
|
|
654
679
|
|
|
655
680
|
"Query gene in the phosphoproteomics dataset"
|
|
@@ -663,7 +688,7 @@ def query_gene(df,gene):
|
|
|
663
688
|
|
|
664
689
|
return df_gene
|
|
665
690
|
|
|
666
|
-
# %% ../nbs/00_core.ipynb
|
|
691
|
+
# %% ../nbs/00_core.ipynb 81
|
|
667
692
|
def get_ttest(df,
|
|
668
693
|
columns1, # list of column names for group1
|
|
669
694
|
columns2, # list of column names for group2
|
|
@@ -733,7 +758,7 @@ def get_ttest(df,
|
|
|
733
758
|
|
|
734
759
|
return results
|
|
735
760
|
|
|
736
|
-
# %% ../nbs/00_core.ipynb
|
|
761
|
+
# %% ../nbs/00_core.ipynb 82
|
|
737
762
|
def get_metaP(p_values):
|
|
738
763
|
|
|
739
764
|
"Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
|
|
@@ -745,7 +770,7 @@ def get_metaP(p_values):
|
|
|
745
770
|
|
|
746
771
|
return score
|
|
747
772
|
|
|
748
|
-
# %% ../nbs/00_core.ipynb
|
|
773
|
+
# %% ../nbs/00_core.ipynb 85
|
|
749
774
|
def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
|
|
750
775
|
PDHK: bool=False, # whether this kinase belongs to PDHK family
|
|
751
776
|
):
|
|
@@ -768,7 +793,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
|
|
|
768
793
|
|
|
769
794
|
return df2
|
|
770
795
|
|
|
771
|
-
# %% ../nbs/00_core.ipynb
|
|
796
|
+
# %% ../nbs/00_core.ipynb 87
|
|
772
797
|
def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
|
|
773
798
|
kinase:str, # a specific kinase
|
|
774
799
|
normalize: bool=False, # normalize according to the paper; special for PDHK1/4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: python-katlas
|
|
3
|
-
Version: 0.0.8
|
|
3
|
+
Version: 0.0.9
|
|
4
4
|
Summary: tools for predicting kinome specificities
|
|
5
5
|
Home-page: https://github.com/sky1ove/python-katlas
|
|
6
6
|
Author: lily
|
|
@@ -21,7 +21,6 @@ License-File: LICENSE
|
|
|
21
21
|
Requires-Dist: statsmodels
|
|
22
22
|
Requires-Dist: fastparquet
|
|
23
23
|
Requires-Dist: tqdm
|
|
24
|
-
Requires-Dist: joblib
|
|
25
24
|
Provides-Extra: dev
|
|
26
25
|
Requires-Dist: nbdev ; extra == 'dev'
|
|
27
26
|
Requires-Dist: pyngrok ; extra == 'dev'
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
katlas/__init__.py,sha256=wOJN3HxAgnSon5vWYU3Txm2UZ_7tBHDKXUKZIH-mXX8,22
|
|
2
|
+
katlas/_modidx.py,sha256=N4eRU0s8l9NGhHf-PZynH4p87iyw9ksebr7DKd8SLR0,11027
|
|
3
|
+
katlas/core.py,sha256=eSQonCHahRPgSbNGVhrbPlYkhSqarnFWm_-juS4gCY0,36654
|
|
4
|
+
katlas/dl.py,sha256=gV-rwTLU9IudwFyNJKo-MP8nmOzhqURZQt3rJalHoG8,10903
|
|
5
|
+
katlas/feature.py,sha256=Wv94R0hnAovErifV2x5ky8uvRMNSGPjnS4ivyWVoZps,11548
|
|
6
|
+
katlas/imports.py,sha256=-ZphRU8K1KspxMpgRxisE0OskrCw3S8JR8tvmeXBRY0,147
|
|
7
|
+
katlas/plot.py,sha256=_bzvMwxzAjXg0Zxb5Sv7bfCIopxJegjCA-DutjrLybo,23668
|
|
8
|
+
katlas/train.py,sha256=OhiR2ev1UhPBBSWLncwfNjy_UImSmB_kVhZp7DyCQ50,7671
|
|
9
|
+
python_katlas-0.0.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
10
|
+
python_katlas-0.0.9.dist-info/METADATA,sha256=hGnYSHNOMM3r60gt9VcOgo3se2r5H43lJFYciFkJ1_Q,15460
|
|
11
|
+
python_katlas-0.0.9.dist-info/WHEEL,sha256=EVRjI69F5qVjm_YgqcTXPnTAv3BfSUr0WVAHuSP3Xoo,92
|
|
12
|
+
python_katlas-0.0.9.dist-info/entry_points.txt,sha256=SF3xDlCmE84ECTBIMDo_FNg1aXGX2-lXkCvH5o4VgpM,34
|
|
13
|
+
python_katlas-0.0.9.dist-info/top_level.txt,sha256=pKBKw9KOSJgnnFkoilkDij_iJ_tJbIO4XnrSXIleqNc,7
|
|
14
|
+
python_katlas-0.0.9.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
katlas/__init__.py,sha256=R9xOYoYrWKcfO5zvTeGC3m_eDNOvxMd8CocQs2tLufo,22
|
|
2
|
-
katlas/_modidx.py,sha256=W8FiqZd2da7DE4nVJaoG56fLD8I1fkmfyXQwHqOFYZg,11245
|
|
3
|
-
katlas/core.py,sha256=8jPQS49qIitfbfyQOCljcql8K05aTCWKHjMCXcdcwWs,35871
|
|
4
|
-
katlas/dl.py,sha256=gV-rwTLU9IudwFyNJKo-MP8nmOzhqURZQt3rJalHoG8,10903
|
|
5
|
-
katlas/feature.py,sha256=Wv94R0hnAovErifV2x5ky8uvRMNSGPjnS4ivyWVoZps,11548
|
|
6
|
-
katlas/imports.py,sha256=-ZphRU8K1KspxMpgRxisE0OskrCw3S8JR8tvmeXBRY0,147
|
|
7
|
-
katlas/plot.py,sha256=_bzvMwxzAjXg0Zxb5Sv7bfCIopxJegjCA-DutjrLybo,23668
|
|
8
|
-
katlas/train.py,sha256=OhiR2ev1UhPBBSWLncwfNjy_UImSmB_kVhZp7DyCQ50,7671
|
|
9
|
-
python_katlas-0.0.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
10
|
-
python_katlas-0.0.8.dist-info/METADATA,sha256=86tumS-YaIzTRrU7Tu7xcA0WlYbsdn1zhCAUC9Ba7OE,15482
|
|
11
|
-
python_katlas-0.0.8.dist-info/WHEEL,sha256=EVRjI69F5qVjm_YgqcTXPnTAv3BfSUr0WVAHuSP3Xoo,92
|
|
12
|
-
python_katlas-0.0.8.dist-info/entry_points.txt,sha256=SF3xDlCmE84ECTBIMDo_FNg1aXGX2-lXkCvH5o4VgpM,34
|
|
13
|
-
python_katlas-0.0.8.dist-info/top_level.txt,sha256=pKBKw9KOSJgnnFkoilkDij_iJ_tJbIO4XnrSXIleqNc,7
|
|
14
|
-
python_katlas-0.0.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|