python-katlas 0.0.8__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {python-katlas-0.0.8/python_katlas.egg-info → python-katlas-0.0.9}/PKG-INFO +1 -1
  2. python-katlas-0.0.9/katlas/__init__.py +1 -0
  3. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/_modidx.py +0 -2
  4. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/core.py +77 -52
  5. {python-katlas-0.0.8 → python-katlas-0.0.9/python_katlas.egg-info}/PKG-INFO +1 -1
  6. {python-katlas-0.0.8 → python-katlas-0.0.9}/python_katlas.egg-info/requires.txt +0 -1
  7. {python-katlas-0.0.8 → python-katlas-0.0.9}/settings.ini +2 -2
  8. python-katlas-0.0.8/katlas/__init__.py +0 -1
  9. {python-katlas-0.0.8 → python-katlas-0.0.9}/LICENSE +0 -0
  10. {python-katlas-0.0.8 → python-katlas-0.0.9}/MANIFEST.in +0 -0
  11. {python-katlas-0.0.8 → python-katlas-0.0.9}/README.md +0 -0
  12. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/dl.py +0 -0
  13. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/feature.py +0 -0
  14. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/imports.py +0 -0
  15. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/plot.py +0 -0
  16. {python-katlas-0.0.8 → python-katlas-0.0.9}/katlas/train.py +0 -0
  17. {python-katlas-0.0.8 → python-katlas-0.0.9}/python_katlas.egg-info/SOURCES.txt +0 -0
  18. {python-katlas-0.0.8 → python-katlas-0.0.9}/python_katlas.egg-info/dependency_links.txt +0 -0
  19. {python-katlas-0.0.8 → python-katlas-0.0.9}/python_katlas.egg-info/entry_points.txt +0 -0
  20. {python-katlas-0.0.8 → python-katlas-0.0.9}/python_katlas.egg-info/not-zip-safe +0 -0
  21. {python-katlas-0.0.8 → python-katlas-0.0.9}/python_katlas.egg-info/top_level.txt +0 -0
  22. {python-katlas-0.0.8 → python-katlas-0.0.9}/setup.cfg +0 -0
  23. {python-katlas-0.0.8 → python-katlas-0.0.9}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: python-katlas
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: tools for predicting kinome specificities
5
5
  Home-page: https://github.com/sky1ove/python-katlas
6
6
  Author: lily
@@ -0,0 +1 @@
1
+ __version__ = "0.0.8"
@@ -49,8 +49,6 @@ d = { 'settings': { 'branch': 'main',
49
49
  'katlas.core.get_ttest': ('core.html#get_ttest', 'katlas/core.py'),
50
50
  'katlas.core.get_unique_site': ('core.html#get_unique_site', 'katlas/core.py'),
51
51
  'katlas.core.multiply': ('core.html#multiply', 'katlas/core.py'),
52
- 'katlas.core.multiply.__init__': ('core.html#multiply.__init__', 'katlas/core.py'),
53
- 'katlas.core.multiply.func': ('core.html#multiply.func', 'katlas/core.py'),
54
52
  'katlas.core.multiply_func': ('core.html#multiply_func', 'katlas/core.py'),
55
53
  'katlas.core.predict_kinase': ('core.html#predict_kinase', 'katlas/core.py'),
56
54
  'katlas.core.predict_kinase_df': ('core.html#predict_kinase_df', 'katlas/core.py'),
@@ -14,7 +14,6 @@ from tqdm import tqdm
14
14
  from scipy.stats import chi2
15
15
  from typing import Callable
16
16
  from functools import partial
17
- from joblib import Parallel, delayed
18
17
  from scipy.stats import ttest_ind
19
18
  from statsmodels.stats.multitest import multipletests
20
19
 
@@ -375,35 +374,30 @@ def multiply_func(values, # list of values, possibilities of amino acids at cert
375
374
  return log_sum
376
375
 
377
376
  # %% ../nbs/00_core.ipynb 33
378
- class multiply:
377
+ def multiply(values, kinase, num_dict=Data.get_num_dict()):
379
378
  "Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
380
- def __init__(self):
381
- self.num_dict = Data.get_num_dict()
382
-
383
- def func(self, values, kinase):
384
-
385
- # Check if any values are less than or equal to zero
386
- if np.any(np.array(values) == 0):
387
- return np.nan
388
-
389
- else:
390
- # Retrieve the divide factor from the dictionary
391
- self.divide = self.num_dict[kinase]
392
379
 
393
- # Using the logarithmic property: log(a*b) = log(a) + log(b)
394
- # Compute the sum of the logarithms of the values and the divide factor
395
- log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(self.divide)
380
+ # Check if any values are less than or equal to zero
381
+ if np.any(np.array(values) == 0):
382
+ return np.nan
383
+ else:
384
+ # Retrieve the divide factor from the dictionary
385
+ divide_factor = num_dict[kinase]
386
+
387
+ # Using the logarithmic property: log(a*b) = log(a) + log(b)
388
+ # Compute the sum of the logarithms of the values and the divide factor
389
+ log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide_factor)
396
390
 
397
- return log_sum
391
+ return log_sum
398
392
 
399
- # %% ../nbs/00_core.ipynb 37
393
+ # %% ../nbs/00_core.ipynb 36
400
394
  def sumup(values, # list of values, possibilities of amino acids at certain positions
401
395
  kinase=None,
402
396
  ):
403
397
  "Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
404
398
  return sum(values)
405
399
 
406
- # %% ../nbs/00_core.ipynb 40
400
+ # %% ../nbs/00_core.ipynb 39
407
401
  def predict_kinase(input_string: str, # site sequence
408
402
  ref: pd.DataFrame, # reference dataframe for scoring
409
403
  func: Callable, # function to calculate score
@@ -455,18 +449,18 @@ def predict_kinase(input_string: str, # site sequence
455
449
 
456
450
  return out.round(3) # Return the scores rounded to three decimal places
457
451
 
458
- # %% ../nbs/00_core.ipynb 42
452
+ # %% ../nbs/00_core.ipynb 41
459
453
  # PSPA
460
- param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply().func} # Johnson et al. Nature official
461
- param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply().func}
462
- param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply().func}
454
+ param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official
455
+ param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}
456
+ param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}
463
457
 
464
458
 
465
459
  # Kinase-substrate dataset, CDDM
466
460
  param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
467
461
  param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
468
462
 
469
- # %% ../nbs/00_core.ipynb 46
463
+ # %% ../nbs/00_core.ipynb 45
470
464
  def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
471
465
  print('input dataframe has a length', df.shape[0])
472
466
  print('Preprocessing')
@@ -494,29 +488,60 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
494
488
 
495
489
  print('Finish preprocessing')
496
490
 
497
- results = []
498
- # Extract numerical part of reference DataFrame columns, sort them
499
- num = list(set(ref.columns.str[:-1].astype(int)))
500
- num.sort()
501
- print(f'Calculating position: {num}')
502
- # Transform reference DataFrame to a dictionary and clean up NaN values
503
- ref_dict = ref.T.to_dict()
504
- ref_dict = {
505
- outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
506
- for outer_k, outer_v in ref_dict.items()}
507
491
 
508
- # Function to process each kinase with its dictionary, using parallel processing
509
- def process_kinase(kinase, r_dict):
510
- return [func(np.array([r_dict.get(key) for key in get_dict(input_string) if key in r_dict]), kinase) for input_string in df[seq_col]]
492
+ # wide form to long form
493
+ df['keys'] = df['site_seq'].apply(get_dict)
494
+ input_keys_df = df[['keys']].explode('keys').reset_index()
495
+ input_keys_df.columns = ['input_index', 'key']
496
+ ref_T = ref.T
511
497
 
512
- # Process all kinases in parallel, using tqdm for progress tracking
513
- results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
498
+ merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')
514
499
 
515
- # Return results as a DataFrame
516
- return pd.DataFrame(results, index=ref.index, columns=df.index).T
500
+ if func == sumup:
501
+ grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()
502
+ out = grouped_df.reindex(df.index)
503
+
504
+ elif func==multiply:
505
+ # Get the list of kinases and num_dict
506
+ kinases = ref_T.columns
507
+ num_dict = Data.get_num_dict()
508
+
509
+ out = {}
510
+ for kinase in tqdm(kinases):
511
+ divide_factor = num_dict[kinase]
512
+ # Extract data for this kinase
513
+ kinase_df = merged_df[['input_index', kinase]].copy()
514
+ kinase_df = kinase_df.rename(columns={kinase: 'value'})
515
+
516
+ # Compute log_value
517
+ kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)
518
+
519
+ # Group by 'input_index' and compute sum and count
520
+ grouped = kinase_df.dropna().groupby('input_index')
521
+ sum_log_values = grouped['log_value'].sum()
522
+ len_values = grouped['log_value'].count()
517
523
 
524
+ # Compute log_sum using the formula
525
+ log_sum = sum_log_values + (len_values - 1) * np.log2(divide_factor)
518
526
 
519
- # %% ../nbs/00_core.ipynb 55
527
+ # Find all 'input_index' where 'log_value' is NaN
528
+ nan_input_indices = kinase_df.loc[kinase_df['value']==0, 'input_index'].unique()
529
+ # Set log_sum at those indices to NaN
530
+ log_sum.loc[nan_input_indices] = np.nan
531
+
532
+ # Assign the computed values to the results DataFrame
533
+ out[kinase] = log_sum
534
+
535
+ out = pd.DataFrame(out).reindex(df.index)
536
+
537
+ else:
538
+ grouped_df = merged_df.drop(columns=['key']).groupby('input_index').agg(func)
539
+ out = grouped_df.reindex(df.index)
540
+
541
+ # Return results as a DataFrame
542
+ return out
543
+
544
+ # %% ../nbs/00_core.ipynb 54
520
545
  def get_pct(site,ref,func,pct_ref):
521
546
 
522
547
  "Replicate the precentile results from The Kinase Library."
@@ -541,7 +566,7 @@ def get_pct(site,ref,func,pct_ref):
541
566
  final.columns=['log2(score)','percentile']
542
567
  return final
543
568
 
544
- # %% ../nbs/00_core.ipynb 61
569
+ # %% ../nbs/00_core.ipynb 60
545
570
  def get_pct_df(score_df, # output from predict_kinase_df
546
571
  pct_ref, # a reference df for percentile calculation
547
572
  ):
@@ -566,7 +591,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
566
591
 
567
592
  return percentiles_df
568
593
 
569
- # %% ../nbs/00_core.ipynb 66
594
+ # %% ../nbs/00_core.ipynb 65
570
595
  def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
571
596
  seq_col: str='site_seq', # column name of site sequence
572
597
  id_col: str='gene_site' # column name of site id
@@ -582,7 +607,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
582
607
 
583
608
  return unique
584
609
 
585
- # %% ../nbs/00_core.ipynb 69
610
+ # %% ../nbs/00_core.ipynb 68
586
611
  def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
587
612
  seq_col: str, # column name of protein sequence
588
613
  position_col: str # column name of position 0
@@ -608,7 +633,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
608
633
 
609
634
  return np.array(data)
610
635
 
611
- # %% ../nbs/00_core.ipynb 74
636
+ # %% ../nbs/00_core.ipynb 73
612
637
  def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
613
638
  aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
614
639
  aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
@@ -649,7 +674,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
649
674
 
650
675
  return paper,full
651
676
 
652
- # %% ../nbs/00_core.ipynb 78
677
+ # %% ../nbs/00_core.ipynb 77
653
678
  def query_gene(df,gene):
654
679
 
655
680
  "Query gene in the phosphoproteomics dataset"
@@ -663,7 +688,7 @@ def query_gene(df,gene):
663
688
 
664
689
  return df_gene
665
690
 
666
- # %% ../nbs/00_core.ipynb 82
691
+ # %% ../nbs/00_core.ipynb 81
667
692
  def get_ttest(df,
668
693
  columns1, # list of column names for group1
669
694
  columns2, # list of column names for group2
@@ -733,7 +758,7 @@ def get_ttest(df,
733
758
 
734
759
  return results
735
760
 
736
- # %% ../nbs/00_core.ipynb 83
761
+ # %% ../nbs/00_core.ipynb 82
737
762
  def get_metaP(p_values):
738
763
 
739
764
  "Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
@@ -745,7 +770,7 @@ def get_metaP(p_values):
745
770
 
746
771
  return score
747
772
 
748
- # %% ../nbs/00_core.ipynb 86
773
+ # %% ../nbs/00_core.ipynb 85
749
774
  def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
750
775
  PDHK: bool=False, # whether this kinase belongs to PDHK family
751
776
  ):
@@ -768,7 +793,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
768
793
 
769
794
  return df2
770
795
 
771
- # %% ../nbs/00_core.ipynb 88
796
+ # %% ../nbs/00_core.ipynb 87
772
797
  def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
773
798
  kinase:str, # a specific kinase
774
799
  normalize: bool=False, # normalize according to the paper; special for PDHK1/4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: python-katlas
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: tools for predicting kinome specificities
5
5
  Home-page: https://github.com/sky1ove/python-katlas
6
6
  Author: lily
@@ -1,7 +1,6 @@
1
1
  statsmodels
2
2
  fastparquet
3
3
  tqdm
4
- joblib
5
4
 
6
5
  [dev]
7
6
  nbdev
@@ -5,7 +5,7 @@
5
5
  ### Python library ###
6
6
  repo = python-katlas
7
7
  lib_name = %(repo)s
8
- version = 0.0.8
8
+ version = 0.0.9
9
9
  min_python = 3.7
10
10
  license = apache2
11
11
  black_formatting = False
@@ -38,6 +38,6 @@ status = 3
38
38
  user = sky1ove
39
39
 
40
40
  ### Optional ###
41
- requirements = statsmodels fastparquet tqdm joblib
41
+ requirements = statsmodels fastparquet tqdm
42
42
  dev_requirements = nbdev pyngrok fastai>=2.7.12 fastbook fairscale fair-esm logomaker seaborn rdkit umap-learn adjustText bokeh scikit-learn>=1.3.0 openpyxl
43
43
  # console_scripts =
@@ -1 +0,0 @@
1
- __version__ = "0.0.7"
File without changes
File without changes
File without changes
File without changes
File without changes