python-katlas 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
katlas/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.6"
1
+ __version__ = "0.0.8"
katlas/_modidx.py CHANGED
@@ -19,6 +19,8 @@ d = { 'settings': { 'branch': 'main',
19
19
  'katlas.core.Data.get_cddm_others_info': ('core.html#data.get_cddm_others_info', 'katlas/core.py'),
20
20
  'katlas.core.Data.get_cddm_upper': ('core.html#data.get_cddm_upper', 'katlas/core.py'),
21
21
  'katlas.core.Data.get_combine': ('core.html#data.get_combine', 'katlas/core.py'),
22
+ 'katlas.core.Data.get_combine_site_phosphorylated': ( 'core.html#data.get_combine_site_phosphorylated',
23
+ 'katlas/core.py'),
22
24
  'katlas.core.Data.get_combine_site_psp_ochoa': ('core.html#data.get_combine_site_psp_ochoa', 'katlas/core.py'),
23
25
  'katlas.core.Data.get_cptac_ensembl_site': ('core.html#data.get_cptac_ensembl_site', 'katlas/core.py'),
24
26
  'katlas.core.Data.get_cptac_gene_site': ('core.html#data.get_cptac_gene_site', 'katlas/core.py'),
@@ -47,8 +49,6 @@ d = { 'settings': { 'branch': 'main',
47
49
  'katlas.core.get_ttest': ('core.html#get_ttest', 'katlas/core.py'),
48
50
  'katlas.core.get_unique_site': ('core.html#get_unique_site', 'katlas/core.py'),
49
51
  'katlas.core.multiply': ('core.html#multiply', 'katlas/core.py'),
50
- 'katlas.core.multiply.__init__': ('core.html#multiply.__init__', 'katlas/core.py'),
51
- 'katlas.core.multiply.func': ('core.html#multiply.func', 'katlas/core.py'),
52
52
  'katlas.core.multiply_func': ('core.html#multiply_func', 'katlas/core.py'),
53
53
  'katlas.core.predict_kinase': ('core.html#predict_kinase', 'katlas/core.py'),
54
54
  'katlas.core.predict_kinase_df': ('core.html#predict_kinase_df', 'katlas/core.py'),
katlas/core.py CHANGED
@@ -1,3 +1,5 @@
1
+ """Core functions in Katlas library"""
2
+
1
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
2
4
 
3
5
  # %% auto 0
@@ -12,7 +14,6 @@ from tqdm import tqdm
12
14
  from scipy.stats import chi2
13
15
  from typing import Callable
14
16
  from functools import partial
15
- from joblib import Parallel, delayed
16
17
  from scipy.stats import ttest_ind
17
18
  from statsmodels.stats.multitest import multipletests
18
19
 
@@ -177,7 +178,7 @@ class Data:
177
178
  return Data.fetch_data(Data.OCHOA_URL)
178
179
 
179
180
  # combine ochoa and PSP low throughput data
180
- COMBINE_PSP_OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/combine_site_ochoa_psp.parquet"
181
+ COMBINE_PSP_OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/combine_site_psp_ochoa.parquet"
181
182
  @staticmethod
182
183
  def get_combine_site_psp_ochoa():
183
184
  "Combined Ochoa and PhosphoSitePlus"
@@ -187,6 +188,17 @@ class Data:
187
188
  df.columns = [int(col) if col.lstrip('-').isdigit() else col for col in df.columns]
188
189
  return df
189
190
 
191
+ # combine ochoa and PSP low throughput data
192
+ P_COMBINE_PSP_OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/phosphorylated_combine_site.parquet"
193
+ @staticmethod
194
+ def get_combine_site_phosphorylated():
195
+ "Combined Ochoa and PhosphoSitePlus with phosphorylation status"
196
+ df = Data.fetch_data(Data.P_COMBINE_PSP_OCHOA_URL)
197
+
198
+ #Convert the number in the column name into integer
199
+ df.columns = [int(col) if col.lstrip('-').isdigit() else col for col in df.columns]
200
+ return df
201
+
190
202
 
191
203
  # %% ../nbs/00_core.ipynb 12
192
204
  class CPTAC:
@@ -362,35 +374,30 @@ def multiply_func(values, # list of values, possibilities of amino acids at cert
362
374
  return log_sum
363
375
 
364
376
  # %% ../nbs/00_core.ipynb 33
365
- class multiply:
377
+ def multiply(values, kinase, num_dict=Data.get_num_dict()):
366
378
  "Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."
367
- def __init__(self):
368
- self.num_dict = Data.get_num_dict()
369
-
370
- def func(self, values, kinase):
371
-
372
- # Check if any values are less than or equal to zero
373
- if np.any(np.array(values) == 0):
374
- return np.nan
375
-
376
- else:
377
- # Retrieve the divide factor from the dictionary
378
- self.divide = self.num_dict[kinase]
379
379
 
380
- # Using the logarithmic property: log(a*b) = log(a) + log(b)
381
- # Compute the sum of the logarithms of the values and the divide factor
382
- log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(self.divide)
380
+ # Check if any values are less than or equal to zero
381
+ if np.any(np.array(values) == 0):
382
+ return np.nan
383
+ else:
384
+ # Retrieve the divide factor from the dictionary
385
+ divide_factor = num_dict[kinase]
386
+
387
+ # Using the logarithmic property: log(a*b) = log(a) + log(b)
388
+ # Compute the sum of the logarithms of the values and the divide factor
389
+ log_sum = np.sum(np.log2(values)) + (len(values) - 1) * np.log2(divide_factor)
383
390
 
384
- return log_sum
391
+ return log_sum
385
392
 
386
- # %% ../nbs/00_core.ipynb 37
393
+ # %% ../nbs/00_core.ipynb 36
387
394
  def sumup(values, # list of values, possibilities of amino acids at certain positions
388
395
  kinase=None,
389
396
  ):
390
397
  "Sum up the possibilities of the amino acids at each position in a phosphorylation site sequence"
391
398
  return sum(values)
392
399
 
393
- # %% ../nbs/00_core.ipynb 40
400
+ # %% ../nbs/00_core.ipynb 39
394
401
  def predict_kinase(input_string: str, # site sequence
395
402
  ref: pd.DataFrame, # reference dataframe for scoring
396
403
  func: Callable, # function to calculate score
@@ -442,18 +449,18 @@ def predict_kinase(input_string: str, # site sequence
442
449
 
443
450
  return out.round(3) # Return the scores rounded to three decimal places
444
451
 
445
- # %% ../nbs/00_core.ipynb 42
452
+ # %% ../nbs/00_core.ipynb 41
446
453
  # PSPA
447
- param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply().func} # Johnson et al. Nature official
448
- param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply().func}
449
- param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply().func}
454
+ param_PSPA_st = {'ref':Data.get_pspa_st_norm(), 'func':multiply} # Johnson et al. Nature official
455
+ param_PSPA_y = {'ref':Data.get_pspa_tyr_norm(), 'func':multiply}
456
+ param_PSPA = {'ref':Data.get_pspa_all_norm(), 'func':multiply}
450
457
 
451
458
 
452
459
  # Kinase-substrate dataset, CDDM
453
460
  param_CDDM = {'ref':Data.get_cddm(), 'func':sumup}
454
461
  param_CDDM_upper = {'ref':Data.get_cddm_upper(), 'func':sumup, 'to_upper':True} # specific for all uppercase
455
462
 
456
- # %% ../nbs/00_core.ipynb 46
463
+ # %% ../nbs/00_core.ipynb 45
457
464
  def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
458
465
  print('input dataframe has a length', df.shape[0])
459
466
  print('Preprocessing')
@@ -481,29 +488,60 @@ def predict_kinase_df(df, seq_col, ref, func, to_lower=False, to_upper=False):
481
488
 
482
489
  print('Finish preprocessing')
483
490
 
484
- results = []
485
- # Extract numerical part of reference DataFrame columns, sort them
486
- num = list(set(ref.columns.str[:-1].astype(int)))
487
- num.sort()
488
- print(f'Calculating position: {num}')
489
- # Transform reference DataFrame to a dictionary and clean up NaN values
490
- ref_dict = ref.T.to_dict()
491
- ref_dict = {
492
- outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
493
- for outer_k, outer_v in ref_dict.items()}
494
491
 
495
- # Function to process each kinase with its dictionary, using parallel processing
496
- def process_kinase(kinase, r_dict):
497
- return [func(np.array([r_dict.get(key) for key in get_dict(input_string) if key in r_dict]), kinase) for input_string in df[seq_col]]
492
+ # wide form to long form
493
+ df['keys'] = df['site_seq'].apply(get_dict)
494
+ input_keys_df = df[['keys']].explode('keys').reset_index()
495
+ input_keys_df.columns = ['input_index', 'key']
496
+ ref_T = ref.T
498
497
 
499
- # Process all kinases in parallel, using tqdm for progress tracking
500
- results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict) for kinase, r_dict in tqdm(ref_dict.items()))
498
+ merged_df = input_keys_df.merge(ref_T, left_on='key', right_index=True, how='inner')
501
499
 
502
- # Return results as a DataFrame
503
- return pd.DataFrame(results, index=ref.index, columns=df.index).T
500
+ if func == sumup:
501
+ grouped_df = merged_df.drop(columns=['key']).groupby('input_index').sum()
502
+ out = grouped_df.reindex(df.index)
503
+
504
+ elif func==multiply:
505
+ # Get the list of kinases and num_dict
506
+ kinases = ref_T.columns
507
+ num_dict = Data.get_num_dict()
508
+
509
+ out = {}
510
+ for kinase in tqdm(kinases):
511
+ divide_factor = num_dict[kinase]
512
+ # Extract data for this kinase
513
+ kinase_df = merged_df[['input_index', kinase]].copy()
514
+ kinase_df = kinase_df.rename(columns={kinase: 'value'})
515
+
516
+ # Compute log_value
517
+ kinase_df['log_value'] = np.log2(kinase_df['value'],where=kinase_df['value']>0)
518
+
519
+ # Group by 'input_index' and compute sum and count
520
+ grouped = kinase_df.dropna().groupby('input_index')
521
+ sum_log_values = grouped['log_value'].sum()
522
+ len_values = grouped['log_value'].count()
504
523
 
524
+ # Compute log_sum using the formula
525
+ log_sum = sum_log_values + (len_values - 1) * np.log2(divide_factor)
505
526
 
506
- # %% ../nbs/00_core.ipynb 55
527
+ # Find all 'input_index' where 'log_value' is NaN
528
+ nan_input_indices = kinase_df.loc[kinase_df['value']==0, 'input_index'].unique()
529
+ # Set log_sum at those indices to NaN
530
+ log_sum.loc[nan_input_indices] = np.nan
531
+
532
+ # Assign the computed values to the results DataFrame
533
+ out[kinase] = log_sum
534
+
535
+ out = pd.DataFrame(out).reindex(df.index)
536
+
537
+ else:
538
+ grouped_df = merged_df.drop(columns=['key']).groupby('input_index').agg(func)
539
+ out = grouped_df.reindex(df.index)
540
+
541
+ # Return results as a DataFrame
542
+ return out
543
+
544
+ # %% ../nbs/00_core.ipynb 54
507
545
  def get_pct(site,ref,func,pct_ref):
508
546
 
509
547
  "Replicate the precentile results from The Kinase Library."
@@ -528,7 +566,7 @@ def get_pct(site,ref,func,pct_ref):
528
566
  final.columns=['log2(score)','percentile']
529
567
  return final
530
568
 
531
- # %% ../nbs/00_core.ipynb 61
569
+ # %% ../nbs/00_core.ipynb 60
532
570
  def get_pct_df(score_df, # output from predict_kinase_df
533
571
  pct_ref, # a reference df for percentile calculation
534
572
  ):
@@ -553,7 +591,7 @@ def get_pct_df(score_df, # output from predict_kinase_df
553
591
 
554
592
  return percentiles_df
555
593
 
556
- # %% ../nbs/00_core.ipynb 66
594
+ # %% ../nbs/00_core.ipynb 65
557
595
  def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
558
596
  seq_col: str='site_seq', # column name of site sequence
559
597
  id_col: str='gene_site' # column name of site id
@@ -569,7 +607,7 @@ def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphoryla
569
607
 
570
608
  return unique
571
609
 
572
- # %% ../nbs/00_core.ipynb 69
610
+ # %% ../nbs/00_core.ipynb 68
573
611
  def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequence
574
612
  seq_col: str, # column name of protein sequence
575
613
  position_col: str # column name of position 0
@@ -595,7 +633,7 @@ def extract_site_seq(df: pd.DataFrame, # dataframe that contains protein sequenc
595
633
 
596
634
  return np.array(data)
597
635
 
598
- # %% ../nbs/00_core.ipynb 74
636
+ # %% ../nbs/00_core.ipynb 73
599
637
  def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
600
638
  aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix
601
639
  aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
@@ -636,7 +674,7 @@ def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains
636
674
 
637
675
  return paper,full
638
676
 
639
- # %% ../nbs/00_core.ipynb 78
677
+ # %% ../nbs/00_core.ipynb 77
640
678
  def query_gene(df,gene):
641
679
 
642
680
  "Query gene in the phosphoproteomics dataset"
@@ -650,7 +688,7 @@ def query_gene(df,gene):
650
688
 
651
689
  return df_gene
652
690
 
653
- # %% ../nbs/00_core.ipynb 82
691
+ # %% ../nbs/00_core.ipynb 81
654
692
  def get_ttest(df,
655
693
  columns1, # list of column names for group1
656
694
  columns2, # list of column names for group2
@@ -720,7 +758,7 @@ def get_ttest(df,
720
758
 
721
759
  return results
722
760
 
723
- # %% ../nbs/00_core.ipynb 83
761
+ # %% ../nbs/00_core.ipynb 82
724
762
  def get_metaP(p_values):
725
763
 
726
764
  "Use Fisher's method to calculate a combined p value given a list of p values; this function also allows negative p values (negative correlation)"
@@ -732,7 +770,7 @@ def get_metaP(p_values):
732
770
 
733
771
  return score
734
772
 
735
- # %% ../nbs/00_core.ipynb 86
773
+ # %% ../nbs/00_core.ipynb 85
736
774
  def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
737
775
  PDHK: bool=False, # whether this kinase belongs to PDHK family
738
776
  ):
@@ -755,7 +793,7 @@ def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and s
755
793
 
756
794
  return df2
757
795
 
758
- # %% ../nbs/00_core.ipynb 88
796
+ # %% ../nbs/00_core.ipynb 87
759
797
  def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
760
798
  kinase:str, # a specific kinase
761
799
  normalize: bool=False, # normalize according to the paper; special for PDHK1/4
katlas/dl.py CHANGED
@@ -1,3 +1,5 @@
1
+ """A collection of deep learning tools via Fastai"""
2
+
1
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/04_DL.ipynb.
2
4
 
3
5
  # %% auto 0
katlas/feature.py CHANGED
@@ -1,3 +1,5 @@
1
+ """A collection of tools to extract features from SMILES, proteins, etc."""
2
+
1
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_feature.ipynb.
2
4
 
3
5
  # %% auto 0
katlas/plot.py CHANGED
@@ -1,3 +1,5 @@
1
+ """A collection of plot functions"""
2
+
1
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_plot.ipynb.
2
4
 
3
5
  # %% auto 0
katlas/train.py CHANGED
@@ -1,3 +1,5 @@
1
+ """A collection of machine learning tools"""
2
+
1
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_ML.ipynb.
2
4
 
3
5
  # %% auto 0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: python-katlas
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: tools for predicting kinome specificities
5
5
  Home-page: https://github.com/sky1ove/python-katlas
6
6
  Author: lily
@@ -21,7 +21,6 @@ License-File: LICENSE
21
21
  Requires-Dist: statsmodels
22
22
  Requires-Dist: fastparquet
23
23
  Requires-Dist: tqdm
24
- Requires-Dist: joblib
25
24
  Provides-Extra: dev
26
25
  Requires-Dist: nbdev ; extra == 'dev'
27
26
  Requires-Dist: pyngrok ; extra == 'dev'
@@ -0,0 +1,14 @@
1
+ katlas/__init__.py,sha256=wOJN3HxAgnSon5vWYU3Txm2UZ_7tBHDKXUKZIH-mXX8,22
2
+ katlas/_modidx.py,sha256=N4eRU0s8l9NGhHf-PZynH4p87iyw9ksebr7DKd8SLR0,11027
3
+ katlas/core.py,sha256=eSQonCHahRPgSbNGVhrbPlYkhSqarnFWm_-juS4gCY0,36654
4
+ katlas/dl.py,sha256=gV-rwTLU9IudwFyNJKo-MP8nmOzhqURZQt3rJalHoG8,10903
5
+ katlas/feature.py,sha256=Wv94R0hnAovErifV2x5ky8uvRMNSGPjnS4ivyWVoZps,11548
6
+ katlas/imports.py,sha256=-ZphRU8K1KspxMpgRxisE0OskrCw3S8JR8tvmeXBRY0,147
7
+ katlas/plot.py,sha256=_bzvMwxzAjXg0Zxb5Sv7bfCIopxJegjCA-DutjrLybo,23668
8
+ katlas/train.py,sha256=OhiR2ev1UhPBBSWLncwfNjy_UImSmB_kVhZp7DyCQ50,7671
9
+ python_katlas-0.0.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
10
+ python_katlas-0.0.9.dist-info/METADATA,sha256=hGnYSHNOMM3r60gt9VcOgo3se2r5H43lJFYciFkJ1_Q,15460
11
+ python_katlas-0.0.9.dist-info/WHEEL,sha256=EVRjI69F5qVjm_YgqcTXPnTAv3BfSUr0WVAHuSP3Xoo,92
12
+ python_katlas-0.0.9.dist-info/entry_points.txt,sha256=SF3xDlCmE84ECTBIMDo_FNg1aXGX2-lXkCvH5o4VgpM,34
13
+ python_katlas-0.0.9.dist-info/top_level.txt,sha256=pKBKw9KOSJgnnFkoilkDij_iJ_tJbIO4XnrSXIleqNc,7
14
+ python_katlas-0.0.9.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- katlas/__init__.py,sha256=QiiYsv0kcJaB8wCWyT-FnI2b6be87HA-CrrIUn8LQhg,22
2
- katlas/_modidx.py,sha256=COjpHUccTtv9jU9ifJgAn1uHwwYCpwyyFpM_ifl42Ew,11010
3
- katlas/core.py,sha256=a4dYSYUSLsixcXmLVJwso3iwHUjlN4XJZ4Vbnr4_I98,35274
4
- katlas/dl.py,sha256=Rm1EO6oGTiHpqp4EA2xAvbIUnh608FPYOdzndRGKVkc,10849
5
- katlas/feature.py,sha256=3zgTuCnXqH1e0LGZ2Hkvan852PiaIHxj27cg_TJfKzo,11471
6
- katlas/imports.py,sha256=-ZphRU8K1KspxMpgRxisE0OskrCw3S8JR8tvmeXBRY0,147
7
- katlas/plot.py,sha256=vB3gv0aaCNERW1CtdDWqM4jIZOx1auGWwi_1I22xBa0,23630
8
- katlas/train.py,sha256=s0ucsZVaixCTZPz-XAI2J7zQDeGkiYEJKOc2dFTYsAc,7625
9
- python_katlas-0.0.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
10
- python_katlas-0.0.7.dist-info/METADATA,sha256=azPEmysqMFYpCJlC4gDDA1Yyprb5914xceAysBb7G5Q,15482
11
- python_katlas-0.0.7.dist-info/WHEEL,sha256=EVRjI69F5qVjm_YgqcTXPnTAv3BfSUr0WVAHuSP3Xoo,92
12
- python_katlas-0.0.7.dist-info/entry_points.txt,sha256=SF3xDlCmE84ECTBIMDo_FNg1aXGX2-lXkCvH5o4VgpM,34
13
- python_katlas-0.0.7.dist-info/top_level.txt,sha256=pKBKw9KOSJgnnFkoilkDij_iJ_tJbIO4XnrSXIleqNc,7
14
- python_katlas-0.0.7.dist-info/RECORD,,