python-katlas 0.0.8__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {python-katlas-0.0.8/python_katlas.egg-info → python-katlas-0.1.0}/PKG-INFO +1 -1
  2. python-katlas-0.1.0/katlas/__init__.py +1 -0
  3. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/_modidx.py +0 -2
  4. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/core.py +87 -53
  5. {python-katlas-0.0.8 → python-katlas-0.1.0/python_katlas.egg-info}/PKG-INFO +1 -1
  6. {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/requires.txt +0 -1
  7. {python-katlas-0.0.8 → python-katlas-0.1.0}/settings.ini +2 -2
  8. python-katlas-0.0.8/katlas/__init__.py +0 -1
  9. {python-katlas-0.0.8 → python-katlas-0.1.0}/LICENSE +0 -0
  10. {python-katlas-0.0.8 → python-katlas-0.1.0}/MANIFEST.in +0 -0
  11. {python-katlas-0.0.8 → python-katlas-0.1.0}/README.md +0 -0
  12. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/dl.py +0 -0
  13. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/feature.py +0 -0
  14. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/imports.py +0 -0
  15. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/plot.py +0 -0
  16. {python-katlas-0.0.8 → python-katlas-0.1.0}/katlas/train.py +0 -0
  17. {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/SOURCES.txt +0 -0
  18. {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/dependency_links.txt +0 -0
  19. {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/entry_points.txt +0 -0
  20. {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/not-zip-safe +0 -0
  21. {python-katlas-0.0.8 → python-katlas-0.1.0}/python_katlas.egg-info/top_level.txt +0 -0
  22. {python-katlas-0.0.8 → python-katlas-0.1.0}/setup.cfg +0 -0
  23. {python-katlas-0.0.8 → python-katlas-0.1.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: python-katlas
3
- Version: 0.0.8
3
+ Version: 0.1.0
4
4
  Summary: tools for predicting kinome specificities
5
5
  Home-page: https://github.com/sky1ove/python-katlas
6
6
  Author: lily
@@ -0,0 +1 @@
1
+ __version__ = "0.0.9"
@@ -49,8 +49,6 @@ d = { 'settings': { 'branch': 'main',
49
49
  'katlas.core.get_ttest': ('core.html#get_ttest', 'katlas/core.py'),
50
50
  'katlas.core.get_unique_site': ('core.html#get_unique_site', 'katlas/core.py'),
51
51
  'katlas.core.multiply': ('core.html#multiply', 'katlas/core.py'),
52
- 'katlas.core.multiply.__init__': ('core.html#multiply.__init__', 'katlas/core.py'),
53
- 'katlas.core.multiply.func': ('core.html#multiply.func', 'katlas/core.py'),
54
52
  'katlas.core.multiply_func': ('core.html#multiply_func', 'katlas/core.py'),
55
53
  'katlas.core.predict_kinase': ('core.html#predict_kinase', 'katlas/core.py'),
56
54
  'katlas.core.predict_kinase_df': ('core.html#predict_kinase_df', 'katlas/core.py'),
@@ -14,7 +14,6 @@ from tqdm import tqdm
14
14
  from scipy.stats import chi2
15
15
  from typing import Callable
16
16
  from functools import partial
17
- from joblib import Parallel, delayed
18
17
  from scipy.stats import ttest_ind
19
18
  from statsmodels.stats.multitest import multipletests
20
19
 
@@ -375,35 +374,30 @@ def multiply_func(values, # list of values, possibilities of amino acids at cert
375
374
  return log_sum
376
375
 
377
376
  # %% ../nbs/00_core.ipynb 33
378
- class multiply:
377
+ def multiply(values, kinase, num_dict=Data.get_num_dict()):
379
378
  "Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
380
- def __init__(self):
381
- self.num_dict = Data.get_num_dict()
382
-
383
- def func(self, values, kinase):
384
-
385
- # Check if any values are less than or equal to zero
386
- if np.any(np.array(values) == 0):
387
- return np.nan
388
-
389
- else:
390
- # Retrieve the divide factor from the dictionary
391
- self.divide = self.num_dict[kinase]
392
379
 
393
- # Using the logarithmic property: log(a*b) = log(a) + log(b)
394
- # Compute the sum of the logarithms of the values and the divide factor
395
- log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(self.divide)
380
+ # Check if any values are less than or equal to zero
381
+ if np.any(np.array(values) == 0):
382
+ return np.nan
383
+ else:
384
+ # Retrieve the divide factor from the dictionary
385
+ divide_factor = num_dict[kinase]
386
+
387
+ # Using the logarithmic property: log(a*b) = log(a) + log(b)
388
+ # Compute the sum of the logarithms of the values and the divide factor
389
+ log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide_factor)
396
390
 
397
- return log_sum
391
+ return log_sum
398
392
 
399
- # %% ../nbs/00_core.ipynb 37
393
+ # %% ../nbs/00_core.ipynb 36
400
394
  def sumup(values, # list of values, possibilities of amino acids at certain positions
401
395
  kinase=None,
402
396
  ):
403
397
  "Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
404
398
  return sum(values)
405
399
 
406
- # %% ../nbs/00_core.ipynb 40
400
+ # %% ../nbs/00_core.ipynb 39
407
401
  def predict_kinase(input_string: str, # site sequence
408
402
  ref: pd.DataFrame, # reference dataframe for scoring
409
403
  func: Callable, # function to calculate score
@@ -455,19 +449,20 @@ def predict_kinase(input_string: str, # site sequence
455
449
 
456
450
  return out.round(3) # Return the scores rounded to three decimal places
457
451
 
458
- # %% ../nbs/00_core.ipynb 42
452
+ # %% ../nbs/00_core.ipynb 41
459
453
  # PSPA
460
- param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply().func} # Johnson et al. Nature official
461
- param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply().func}
462
- param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply().func}
454
+ param_PSPA_st = {'ref':Data.get_pspa_st_norm().astype('float32'), 'func':multiply} # Johnson et al. Nature official
455
+ param_PSPA_y = {'ref':Data.get_pspa_tyr_norm().astype('float32'), 'func':multiply}
456
+ param_PSPA = {'ref':Data.get_pspa_all_norm().astype('float32'), 'func':multiply}
463
457
 
464
458
 
465
459
  # Kinase-substrate dataset, CDDM
466
- param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
467
- param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
460
+ param_CDDM = {'ref':Data.get_cddm().astype('float32'), 'func':sumup}
461
+ param_CDDM_upper = {'ref':Data.get_cddm_upper().astype('float32'), 'func':sumup, 'to_upper':True} # specific for all uppercase
468
462
 
469
463
  # %% ../nbs/00_core.ipynb 46
470
464
  def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
465
+
471
466
  print('input dataframe has a length', df.shape[0])
472
467
  print('Preprocessing')
473
468
 
@@ -494,29 +489,68 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
494
489
 
495
490
  print('Finish preprocessing')
496
491
 
497
- results = []
498
- # Extract numerical part of reference DataFrame columns, sort them
499
- num = list(set(ref.columns.str[:-1].astype(int)))
500
- num.sort()
501
- print(f'Calculating position: {num}')
502
- # Transform reference DataFrame to a dictionary and clean up NaN values
503
- ref_dict = ref.T.to_dict()
504
- ref_dict = {
505
- outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
506
- for outer_k, outer_v in ref_dict.items()}
507
492
 
508
- # Function to process each kinase with its dictionary, using parallel processing
509
- def process_kinase(kinase, r_dict):
510
- return [func(np.array([r_dict.get(key) for key in get_dict(input_string) if key in r_dict]), kinase) for input_string in df[seq_col]]
493
+ # wide form to long form
494
+ df['keys'] = df['site_seq'].apply(get_dict)
495
+ input_keys_df = df[['keys']].explode('keys').reset_index()
496
+ input_keys_df.columns = ['input_index', 'key']
511
497
 
512
- # Process all kinases in parallel, using tqdm for progress tracking
513
- results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
514
498
 
515
- # Return results as a DataFrame
516
- return pd.DataFrame(results, index=ref.index, columns=df.index).T
499
+ ref_T = ref.T
500
+
501
+ input_keys_df = input_keys_df.set_index('key')
502
+
503
+
504
+ print('Merging reference')
505
+ merged_df = input_keys_df.merge(ref_T, left_index=True, right_index=True, how='inner')
517
506
 
507
+ print('Finish merging')
508
+
509
+ if func == sumup:
510
+ grouped_df = merged_df.groupby('input_index').sum()
511
+ out = grouped_df.reindex(df.index)
512
+
513
+ elif func==multiply:
514
+ # Get the list of kinases and num_dict
515
+ kinases = ref_T.columns
516
+ num_dict = Data.get_num_dict()
517
+
518
+ out = {}
519
+ for kinase in tqdm(kinases):
520
+ divide_factor = num_dict[kinase]
521
+ # Extract data for this kinase
522
+ kinase_df = merged_df[['input_index', kinase]].copy()
523
+ kinase_df = kinase_df.rename(columns={kinase: 'value'})
518
524
 
519
- # %% ../nbs/00_core.ipynb 55
525
+ # Compute log_value
526
+ kinase_df['log_value'] = np.log2(kinase_df['value'].where(kinase_df['value'] > 0))
527
+
528
+ # Group by 'input_index' and compute sum and count
529
+ grouped = kinase_df.dropna().groupby('input_index')
530
+ sum_log_values = grouped['log_value'].sum()
531
+ len_values = grouped['log_value'].count()
532
+
533
+ # Compute log_sum using the formula
534
+ log_sum = sum_log_values + (len_values - 1) * np.log2(divide_factor)
535
+
536
+ # Find all 'input_index' where 'log_value' is NaN
537
+ nan_input_indices = kinase_df.loc[kinase_df['value']==0, 'input_index'].unique()
538
+ # Set log_sum at those indices to NaN
539
+ log_sum.loc[nan_input_indices] = np.nan
540
+
541
+ # Assign the computed values to the results DataFrame
542
+ out[kinase] = log_sum
543
+
544
+ out = pd.DataFrame(out).reindex(df.index)
545
+
546
+ else:
547
+ grouped_df = merged_df.drop(columns=['key']).groupby('input_index').agg(func)
548
+ out = grouped_df.reindex(df.index)
549
+
550
+ # Return results as a DataFrame
551
+ return out
552
+
553
+ # %% ../nbs/00_core.ipynb 56
520
554
  def get_pct(site,ref,func,pct_ref):
521
555
 
522
556
  "Replicate the precentile results from The Kinase Library."
@@ -541,7 +575,7 @@ def get_pct(site,ref,func,pct_ref):
541
575
  final.columns=['log2(score)','percentile']
542
576
  return final
543
577
 
544
- # %% ../nbs/00_core.ipynb 61
578
+ # %% ../nbs/00_core.ipynb 62
545
579
  def get_pct_df(score_df, # output from predict_kinase_df
546
580
  pct_ref, # a reference df for percentile calculation
547
581
  ):
@@ -566,7 +600,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
566
600
 
567
601
  return percentiles_df
568
602
 
569
- # %% ../nbs/00_core.ipynb 66
603
+ # %% ../nbs/00_core.ipynb 67
570
604
  def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
571
605
  seq_col: str='site_seq', # column name of site sequence
572
606
  id_col: str='gene_site' # column name of site id
@@ -582,7 +616,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
582
616
 
583
617
  return unique
584
618
 
585
- # %% ../nbs/00_core.ipynb 69
619
+ # %% ../nbs/00_core.ipynb 70
586
620
  def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
587
621
  seq_col: str, # column name of protein sequence
588
622
  position_col: str # column name of position 0
@@ -608,7 +642,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
608
642
 
609
643
  return np.array(data)
610
644
 
611
- # %% ../nbs/00_core.ipynb 74
645
+ # %% ../nbs/00_core.ipynb 75
612
646
  def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
613
647
  aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
614
648
  aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
@@ -649,7 +683,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
649
683
 
650
684
  return paper,full
651
685
 
652
- # %% ../nbs/00_core.ipynb 78
686
+ # %% ../nbs/00_core.ipynb 79
653
687
  def query_gene(df,gene):
654
688
 
655
689
  "Query gene in the phosphoproteomics dataset"
@@ -663,7 +697,7 @@ def query_gene(df,gene):
663
697
 
664
698
  return df_gene
665
699
 
666
- # %% ../nbs/00_core.ipynb 82
700
+ # %% ../nbs/00_core.ipynb 83
667
701
  def get_ttest(df,
668
702
  columns1, # list of column names for group1
669
703
  columns2, # list of column names for group2
@@ -733,7 +767,7 @@ def get_ttest(df,
733
767
 
734
768
  return results
735
769
 
736
- # %% ../nbs/00_core.ipynb 83
770
+ # %% ../nbs/00_core.ipynb 84
737
771
  def get_metaP(p_values):
738
772
 
739
773
  "Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
@@ -745,7 +779,7 @@ def get_metaP(p_values):
745
779
 
746
780
  return score
747
781
 
748
- # %% ../nbs/00_core.ipynb 86
782
+ # %% ../nbs/00_core.ipynb 87
749
783
  def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
750
784
  PDHK: bool=False, # whether this kinase belongs to PDHK family
751
785
  ):
@@ -768,7 +802,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
768
802
 
769
803
  return df2
770
804
 
771
- # %% ../nbs/00_core.ipynb 88
805
+ # %% ../nbs/00_core.ipynb 89
772
806
  def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
773
807
  kinase:str, # a specific kinase
774
808
  normalize: bool=False, # normalize according to the paper; special for PDHK1/4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: python-katlas
3
- Version: 0.0.8
3
+ Version: 0.1.0
4
4
  Summary: tools for predicting kinome specificities
5
5
  Home-page: https://github.com/sky1ove/python-katlas
6
6
  Author: lily
@@ -1,7 +1,6 @@
1
1
  statsmodels
2
2
  fastparquet
3
3
  tqdm
4
- joblib
5
4
 
6
5
  [dev]
7
6
  nbdev
@@ -5,7 +5,7 @@
5
5
  ### Python library ###
6
6
  repo = python-katlas
7
7
  lib_name = %(repo)s
8
- version = 0.0.8
8
+ version = 0.1.0
9
9
  min_python = 3.7
10
10
  license = apache2
11
11
  black_formatting = False
@@ -38,6 +38,6 @@ status = 3
38
38
  user = sky1ove
39
39
 
40
40
  ### Optional ###
41
- requirements = statsmodels fastparquet tqdm joblib
41
+ requirements = statsmodels fastparquet tqdm
42
42
  dev_requirements = nbdev pyngrok fastai>=2.7.12 fastbook fairscale fair-esm logomaker seaborn rdkit umap-learn adjustText bokeh scikit-learn>=1.3.0 openpyxl
43
43
  # console_scripts =
@@ -1 +0,0 @@
1
- __version__ = "0.0.7"
File without changes
File without changes
File without changes
File without changes
File without changes