python-katlas 0.0.8__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python-katlas-0.0.8/python_katlas.egg-info → python-katlas-0.1.0}/PKG-INFO +1 -1
- python-katlas-0.1.0/katlas/__init__.py +1 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/_modidx.py +0 -2
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/core.py +87 -53
- {python-katlas-0.0.8 → python-katlas-0.1.0/python_katlas.egg-info}/PKG-INFO +1 -1
- {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/requires.txt +0 -1
- {python-katlas-0.0.8 → python-katlas-0.1.0}/settings.ini +2 -2
- python-katlas-0.0.8/katlas/__init__.py +0 -1
- {python-katlas-0.0.8 → python-katlas-0.1.0}/LICENSE +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/MANIFEST.in +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/README.md +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/dl.py +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/feature.py +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/imports.py +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/plot.py +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/train.py +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/SOURCES.txt +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/dependency_links.txt +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/entry_points.txt +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/not-zip-safe +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/top_level.txt +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/setup.cfg +0 -0
- {python-katlas-0.0.8 → python-katlas-0.1.0}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.9"
|
|
@@ -49,8 +49,6 @@ d = { 'settings': { 'branch': 'main',
|
|
|
49
49
|
'katlas.core.get_ttest': ('core.html#get_ttest', 'katlas/core.py'),
|
|
50
50
|
'katlas.core.get_unique_site': ('core.html#get_unique_site', 'katlas/core.py'),
|
|
51
51
|
'katlas.core.multiply': ('core.html#multiply', 'katlas/core.py'),
|
|
52
|
-
'katlas.core.multiply.__init__': ('core.html#multiply.__init__', 'katlas/core.py'),
|
|
53
|
-
'katlas.core.multiply.func': ('core.html#multiply.func', 'katlas/core.py'),
|
|
54
52
|
'katlas.core.multiply_func': ('core.html#multiply_func', 'katlas/core.py'),
|
|
55
53
|
'katlas.core.predict_kinase': ('core.html#predict_kinase', 'katlas/core.py'),
|
|
56
54
|
'katlas.core.predict_kinase_df': ('core.html#predict_kinase_df', 'katlas/core.py'),
|
|
@@ -14,7 +14,6 @@ from tqdm import tqdm
|
|
|
14
14
|
from scipy.stats import chi2
|
|
15
15
|
from typing import Callable
|
|
16
16
|
from functools import partial
|
|
17
|
-
from joblib import Parallel, delayed
|
|
18
17
|
from scipy.stats import ttest_ind
|
|
19
18
|
from statsmodels.stats.multitest import multipletests
|
|
20
19
|
|
|
@@ -375,35 +374,30 @@ def multiply_func(values, # list of values, possibilities of amino acids at cert
|
|
|
375
374
|
return log_sum
|
|
376
375
|
|
|
377
376
|
# %% ../nbs/00_core.ipynb 33
|
|
378
|
-
|
|
377
|
+
def multiply(values, kinase, num_dict=Data.get_num_dict()):
|
|
379
378
|
"Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
|
|
380
|
-
def __init__(self):
|
|
381
|
-
self.num_dict = Data.get_num_dict()
|
|
382
|
-
|
|
383
|
-
def func(self, values, kinase):
|
|
384
|
-
|
|
385
|
-
# Check if any values are less than or equal to zero
|
|
386
|
-
if np.any(np.array(values) == 0):
|
|
387
|
-
return np.nan
|
|
388
|
-
|
|
389
|
-
else:
|
|
390
|
-
# Retrieve the divide factor from the dictionary
|
|
391
|
-
self.divide = self.num_dict[kinase]
|
|
392
379
|
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
380
|
+
# Check if any values are less than or equal to zero
|
|
381
|
+
if np.any(np.array(values) == 0):
|
|
382
|
+
return np.nan
|
|
383
|
+
else:
|
|
384
|
+
# Retrieve the divide factor from the dictionary
|
|
385
|
+
divide_factor = num_dict[kinase]
|
|
386
|
+
|
|
387
|
+
# Using the logarithmic property: log(a*b) = log(a) + log(b)
|
|
388
|
+
# Compute the sum of the logarithms of the values and the divide factor
|
|
389
|
+
log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide_factor)
|
|
396
390
|
|
|
397
|
-
|
|
391
|
+
return log_sum
|
|
398
392
|
|
|
399
|
-
# %% ../nbs/00_core.ipynb
|
|
393
|
+
# %% ../nbs/00_core.ipynb 36
|
|
400
394
|
def sumup(values, # list of values, possibilities of amino acids at certain positions
|
|
401
395
|
kinase=None,
|
|
402
396
|
):
|
|
403
397
|
"Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
|
|
404
398
|
return sum(values)
|
|
405
399
|
|
|
406
|
-
# %% ../nbs/00_core.ipynb
|
|
400
|
+
# %% ../nbs/00_core.ipynb 39
|
|
407
401
|
def predict_kinase(input_string: str, # site sequence
|
|
408
402
|
ref: pd.DataFrame, # reference dataframe for scoring
|
|
409
403
|
func: Callable, # function to calculate score
|
|
@@ -455,19 +449,20 @@ def predict_kinase(input_string: str, # site sequence
|
|
|
455
449
|
|
|
456
450
|
return out.round(3) # Return the scores rounded to three decimal places
|
|
457
451
|
|
|
458
|
-
# %% ../nbs/00_core.ipynb
|
|
452
|
+
# %% ../nbs/00_core.ipynb 41
|
|
459
453
|
# PSPA
|
|
460
|
-
param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply
|
|
461
|
-
param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply
|
|
462
|
-
param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply
|
|
454
|
+
param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official
|
|
455
|
+
param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}
|
|
456
|
+
param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}
|
|
463
457
|
|
|
464
458
|
|
|
465
459
|
# Kinase-substrate dataset, CDDM
|
|
466
|
-
param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
|
|
467
|
-
param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
|
|
460
|
+
param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}
|
|
461
|
+
param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase
|
|
468
462
|
|
|
469
463
|
# %% ../nbs/00_core.ipynb 46
|
|
470
464
|
def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
|
|
465
|
+
|
|
471
466
|
print('input dataframe has a length', df.shape[0])
|
|
472
467
|
print('Preprocessing')
|
|
473
468
|
|
|
@@ -494,29 +489,68 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
|
|
|
494
489
|
|
|
495
490
|
print('Finish preprocessing')
|
|
496
491
|
|
|
497
|
-
results = []
|
|
498
|
-
# Extract numerical part of reference DataFrame columns, sort them
|
|
499
|
-
num = list(set(ref.columns.str[:-1].astype(int)))
|
|
500
|
-
num.sort()
|
|
501
|
-
print(f'Calculating position: {num}')
|
|
502
|
-
# Transform reference DataFrame to a dictionary and clean up NaN values
|
|
503
|
-
ref_dict = ref.T.to_dict()
|
|
504
|
-
ref_dict = {
|
|
505
|
-
outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
|
|
506
|
-
for outer_k, outer_v in ref_dict.items()}
|
|
507
492
|
|
|
508
|
-
#
|
|
509
|
-
|
|
510
|
-
|
|
493
|
+
# wide form to long form
|
|
494
|
+
df['keys'] = df['site_seq'].apply(get_dict)
|
|
495
|
+
input_keys_df = df[['keys']].explode('keys').reset_index()
|
|
496
|
+
input_keys_df.columns = ['input_index', 'key']
|
|
511
497
|
|
|
512
|
-
# Process all kinases in parallel, using tqdm for progress tracking
|
|
513
|
-
results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
|
|
514
498
|
|
|
515
|
-
|
|
516
|
-
|
|
499
|
+
ref_T = ref.T
|
|
500
|
+
|
|
501
|
+
input_keys_df = input_keys_df.set_index('key')
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
print('Merging reference')
|
|
505
|
+
merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')
|
|
517
506
|
|
|
507
|
+
print('Finish merging')
|
|
508
|
+
|
|
509
|
+
if func == sumup:
|
|
510
|
+
grouped_df = merged_df.groupby('input_index').sum()
|
|
511
|
+
out = grouped_df.reindex(df.index)
|
|
512
|
+
|
|
513
|
+
elif func==multiply:
|
|
514
|
+
# Get the list of kinases and num_dict
|
|
515
|
+
kinases = ref_T.columns
|
|
516
|
+
num_dict = Data.get_num_dict()
|
|
517
|
+
|
|
518
|
+
out = {}
|
|
519
|
+
for kinase in tqdm(kinases):
|
|
520
|
+
divide_factor = num_dict[kinase]
|
|
521
|
+
# Extract data for this kinase
|
|
522
|
+
kinase_df = merged_df[['input_index', kinase]].copy()
|
|
523
|
+
kinase_df = kinase_df.rename(columns={kinase: 'value'})
|
|
518
524
|
|
|
519
|
-
#
|
|
525
|
+
# Compute log_value
|
|
526
|
+
kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))
|
|
527
|
+
|
|
528
|
+
# Group by 'input_index' and compute sum and count
|
|
529
|
+
grouped = kinase_df.dropna().groupby('input_index')
|
|
530
|
+
sum_log_values = grouped['log_value'].sum()
|
|
531
|
+
len_values = grouped['log_value'].count()
|
|
532
|
+
|
|
533
|
+
# Compute log_sum using the formula
|
|
534
|
+
log_sum = sum_log_values + (len_values - 1) * np.log2(divide_factor)
|
|
535
|
+
|
|
536
|
+
# Find all 'input_index' where 'log_value' is NaN
|
|
537
|
+
nan_input_indices = kinase_df.loc[kinase_df['value']==0, 'input_index'].unique()
|
|
538
|
+
# Set log_sum at those indices to NaN
|
|
539
|
+
log_sum.loc[nan_input_indices] = np.nan
|
|
540
|
+
|
|
541
|
+
# Assign the computed values to the results DataFrame
|
|
542
|
+
out[kinase] = log_sum
|
|
543
|
+
|
|
544
|
+
out = pd.DataFrame(out).reindex(df.index)
|
|
545
|
+
|
|
546
|
+
else:
|
|
547
|
+
grouped_df = merged_df.drop(columns=['key']).groupby('input_index').agg(func)
|
|
548
|
+
out = grouped_df.reindex(df.index)
|
|
549
|
+
|
|
550
|
+
# Return results as a DataFrame
|
|
551
|
+
return out
|
|
552
|
+
|
|
553
|
+
# %% ../nbs/00_core.ipynb 56
|
|
520
554
|
def get_pct(site,ref,func,pct_ref):
|
|
521
555
|
|
|
522
556
|
"Replicate the precentile results from The Kinase Library."
|
|
@@ -541,7 +575,7 @@ def get_pct(site,ref,func,pct_ref):
|
|
|
541
575
|
final.columns=['log2(score)','percentile']
|
|
542
576
|
return final
|
|
543
577
|
|
|
544
|
-
# %% ../nbs/00_core.ipynb
|
|
578
|
+
# %% ../nbs/00_core.ipynb 62
|
|
545
579
|
def get_pct_df(score_df, # output from predict_kinase_df
|
|
546
580
|
pct_ref, # a reference df for percentile calculation
|
|
547
581
|
):
|
|
@@ -566,7 +600,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
|
|
|
566
600
|
|
|
567
601
|
return percentiles_df
|
|
568
602
|
|
|
569
|
-
# %% ../nbs/00_core.ipynb
|
|
603
|
+
# %% ../nbs/00_core.ipynb 67
|
|
570
604
|
def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
|
|
571
605
|
seq_col: str='site_seq', # column name of site sequence
|
|
572
606
|
id_col: str='gene_site' # column name of site id
|
|
@@ -582,7 +616,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
|
|
|
582
616
|
|
|
583
617
|
return unique
|
|
584
618
|
|
|
585
|
-
# %% ../nbs/00_core.ipynb
|
|
619
|
+
# %% ../nbs/00_core.ipynb 70
|
|
586
620
|
def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
|
|
587
621
|
seq_col: str, # column name of protein sequence
|
|
588
622
|
position_col: str # column name of position 0
|
|
@@ -608,7 +642,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
|
|
|
608
642
|
|
|
609
643
|
return np.array(data)
|
|
610
644
|
|
|
611
|
-
# %% ../nbs/00_core.ipynb
|
|
645
|
+
# %% ../nbs/00_core.ipynb 75
|
|
612
646
|
def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
|
|
613
647
|
aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
|
|
614
648
|
aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
|
|
@@ -649,7 +683,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
|
|
|
649
683
|
|
|
650
684
|
return paper,full
|
|
651
685
|
|
|
652
|
-
# %% ../nbs/00_core.ipynb
|
|
686
|
+
# %% ../nbs/00_core.ipynb 79
|
|
653
687
|
def query_gene(df,gene):
|
|
654
688
|
|
|
655
689
|
"Query gene in the phosphoproteomics dataset"
|
|
@@ -663,7 +697,7 @@ def query_gene(df,gene):
|
|
|
663
697
|
|
|
664
698
|
return df_gene
|
|
665
699
|
|
|
666
|
-
# %% ../nbs/00_core.ipynb
|
|
700
|
+
# %% ../nbs/00_core.ipynb 83
|
|
667
701
|
def get_ttest(df,
|
|
668
702
|
columns1, # list of column names for group1
|
|
669
703
|
columns2, # list of column names for group2
|
|
@@ -733,7 +767,7 @@ def get_ttest(df,
|
|
|
733
767
|
|
|
734
768
|
return results
|
|
735
769
|
|
|
736
|
-
# %% ../nbs/00_core.ipynb
|
|
770
|
+
# %% ../nbs/00_core.ipynb 84
|
|
737
771
|
def get_metaP(p_values):
|
|
738
772
|
|
|
739
773
|
"Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
|
|
@@ -745,7 +779,7 @@ def get_metaP(p_values):
|
|
|
745
779
|
|
|
746
780
|
return score
|
|
747
781
|
|
|
748
|
-
# %% ../nbs/00_core.ipynb
|
|
782
|
+
# %% ../nbs/00_core.ipynb 87
|
|
749
783
|
def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
|
|
750
784
|
PDHK: bool=False, # whether this kinase belongs to PDHK family
|
|
751
785
|
):
|
|
@@ -768,7 +802,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
|
|
|
768
802
|
|
|
769
803
|
return df2
|
|
770
804
|
|
|
771
|
-
# %% ../nbs/00_core.ipynb
|
|
805
|
+
# %% ../nbs/00_core.ipynb 89
|
|
772
806
|
def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
|
|
773
807
|
kinase:str, # a specific kinase
|
|
774
808
|
normalize: bool=False, # normalize according to the paper; special for PDHK1/4
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
### Python library ###
|
|
6
6
|
repo = python-katlas
|
|
7
7
|
lib_name = %(repo)s
|
|
8
|
-
version = 0.0
|
|
8
|
+
version = 0.1.0
|
|
9
9
|
min_python = 3.7
|
|
10
10
|
license = apache2
|
|
11
11
|
black_formatting = False
|
|
@@ -38,6 +38,6 @@ status = 3
|
|
|
38
38
|
user = sky1ove
|
|
39
39
|
|
|
40
40
|
### Optional ###
|
|
41
|
-
requirements = statsmodels fastparquet tqdm
|
|
41
|
+
requirements = statsmodels fastparquet tqdm
|
|
42
42
|
dev_requirements = nbdev pyngrok fastai>=2.7.12 fastbook fairscale fair-esm logomaker seaborn rdkit umap-learn adjustText bokeh scikit-learn>=1.3.0 openpyxl
|
|
43
43
|
# console_scripts =
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.0.7"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|