python-katlas 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
katlas/core.py ADDED
@@ -0,0 +1,769 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['param_PSPA_st', 'param_PSPA_y', 'param_PSPA', 'param_CDDM', 'param_CDDM_upper', 'Data', 'CPTAC', 'convert_string',
5
+ 'checker', 'STY2sty', 'cut_seq', 'get_dict', 'multiply_func', 'multiply', 'sumup', 'predict_kinase',
6
+ 'predict_kinase_df', 'get_pct', 'get_pct_df', 'get_unique_site', 'extract_site_seq', 'get_freq',
7
+ 'query_gene', 'get_ttest', 'get_metaP', 'raw2norm', 'get_one_kinase']
8
+
9
+ # %% ../nbs/00_core.ipynb 4
10
+ import math, pandas as pd, numpy as np, seaborn as sns
11
+ from tqdm import tqdm
12
+ from scipy.stats import chi2
13
+ from typing import Callable
14
+ from functools import partial
15
+ from joblib import Parallel, delayed
16
+ from scipy.stats import ttest_ind
17
+ from statsmodels.stats.multitest import multipletests
18
+
19
+ # %% ../nbs/00_core.ipynb 7
20
class Data:
    """A class for fetching various datasets.

    Each ``get_*`` static method reads one file from the katlas GitHub
    repository; the data is read again on every call (nothing is cached here).
    """

    @staticmethod
    def fetch_data(url):
        "Fetches the parquet data from the given URL and returns a DataFrame"
        df = pd.read_parquet(url)
        # Rename a leftover unnamed first column to `kinase`
        if 'Unnamed: 0' in df.columns:
            df = df.rename(columns={'Unnamed: 0': 'kinase'})
        return df

    @staticmethod
    def _to_int_columns(df):
        "Convert column labels that are strings of (optionally negative) digits into integers, in place"

        def convert(col):
            # Labels that are not strings (e.g. already integers) are kept as they are
            if isinstance(col, str) and col.lstrip('-').isdigit():
                return int(col)
            return col

        df.columns = [convert(col) for col in df.columns]
        return df

    #---------------------------kinase-------------------------------
    # kinase info
    KINASE_INFO_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/kinase_info.parquet"

    @staticmethod
    def get_kinase_info():
        "Kinase information table"
        return Data.fetch_data(Data.KINASE_INFO_URL)

    #---------------------------PSPA-------------------------------
    # PSPA tyrosine normalized data
    PSPA_TYR_NORM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_tyr_norm.parquet"

    @staticmethod
    def get_pspa_tyr_norm():
        "PSPA tyrosine kinase normalized data"
        return Data.fetch_data(Data.PSPA_TYR_NORM_URL)

    # PSPA ST kinase normalized data
    PSPA_ST_NORM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_st_norm.parquet"

    @staticmethod
    def get_pspa_st_norm():
        "PSPA Ser/Thr kinase normalized data"
        return Data.fetch_data(Data.PSPA_ST_NORM_URL)

    # PSPA all kinase normalized data
    PSPA_ALL_NORM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_all_norm.parquet"

    @staticmethod
    def get_pspa_all_norm():
        "PSPA Ser/Thr and Tyr kinase normalized data"
        return Data.fetch_data(Data.PSPA_ALL_NORM_URL)

    # scoring human all capital phosphoproteome via PSPA
    PSPA_ST_PCT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_pct_st.parquet"

    @staticmethod
    def get_pspa_st_pct():
        "PSPA Ser/Thr percentile reference (pspa_pct_st.parquet)"
        return Data.fetch_data(Data.PSPA_ST_PCT_URL)

    PSPA_TYR_PCT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_pct_tyr.parquet"

    @staticmethod
    def get_pspa_tyr_pct():
        "PSPA Tyr percentile reference (pspa_pct_tyr.parquet)"
        return Data.fetch_data(Data.PSPA_TYR_PCT_URL)

    # PSPA number of random amino acids
    PSPA_NUM_RANDOM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/PSPA/pspa_divide_num.csv"

    @staticmethod
    def get_num_dict():
        "PSPA number of random amino acids, as a dict mapping `kinase` to `num_random_aa`"
        num = pd.read_csv(Data.PSPA_NUM_RANDOM_URL)
        return num.set_index('kinase')['num_random_aa'].to_dict()

    #---------------------------CDDM-------------------------------
    # Kinase substrate datasets
    KS_DATASET_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_datasets.parquet"

    @staticmethod
    def get_ks_dataset():
        "Kinase-substrate dataset; numeric column names are converted to integers"
        return Data._to_int_columns(Data.fetch_data(Data.KS_DATASET_URL))

    # CDDM reference
    CDDM_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_main.parquet"

    @staticmethod
    def get_cddm():
        "CDDM reference (ks_main.parquet)"
        return Data.fetch_data(Data.CDDM_URL)

    CDDM_UPPER_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_main_upper.parquet"

    @staticmethod
    def get_cddm_upper():
        "CDDM reference, upper-case variant (ks_main_upper.parquet)"
        return Data.fetch_data(Data.CDDM_UPPER_URL)

    # CDDM of other kinase with mutation
    CDDM_OTHERS_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_others.parquet"

    @staticmethod
    def get_cddm_others():
        "CDDM of other kinases with mutation (ks_others.parquet)"
        return Data.fetch_data(Data.CDDM_OTHERS_URL)

    CDDM_OTHERS_INFO_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/CDDM/ks_others_info.parquet"

    @staticmethod
    def get_cddm_others_info():
        "Information table for the other-kinase CDDM (ks_others_info.parquet)"
        return Data.fetch_data(Data.CDDM_OTHERS_INFO_URL)

    #---------------------------CDDM+PSPA-------------------------------
    # Combined PSPA and CDDM
    COMBINE_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/combine_main.parquet"

    @staticmethod
    def get_combine():
        "Combined PSPA and CDDM (combine_main.parquet)"
        return Data.fetch_data(Data.COMBINE_URL)

    #---------------------------Amino acid-------------------------------
    # Amino acid info
    AA_INFO_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/amino_acids/aa_info.parquet"
    AA_RDKIT_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/amino_acids/aa_rdkit.parquet"
    AA_MORGAN_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/amino_acids/aa_morgan.parquet"

    @staticmethod
    def get_aa_info():
        "Amino acid info (aa_info.parquet)"
        return Data.fetch_data(Data.AA_INFO_URL)

    @staticmethod
    def get_aa_rdkit():
        "Amino acid table from aa_rdkit.parquet"
        return Data.fetch_data(Data.AA_RDKIT_URL)

    @staticmethod
    def get_aa_morgan():
        "Amino acid table from aa_morgan.parquet"
        return Data.fetch_data(Data.AA_MORGAN_URL)

    #---------------------------phosphoproteomics dataset-------------------------------
    # For reference of linkedomicsKB, contains unique EnsemblProteinID+site, more sites
    CPTAC_KB_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/linkedOmicsKB_ref_pan.parquet"

    @staticmethod
    def get_cptac_ensembl_site():
        "For reference of linkedomicsKB, contains unique EnsemblProteinID+site"
        return Data.fetch_data(Data.CPTAC_KB_URL)

    # From the above, but keep the unique site seq, with gene_site separated by |
    CPTAC_UNIQUE_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/cptac_unique_site.parquet"

    @staticmethod
    def get_cptac_unique_site():
        "Unique site sequence of CPTAC"
        return Data.fetch_data(Data.CPTAC_UNIQUE_URL)

    # for reference of linkedomics, contains unique Gene+site, fewer cases
    CPTAC_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/linkedOmics_ref_pan.parquet"

    @staticmethod
    def get_cptac_gene_site():
        "For reference of linkedomics, contains unique Gene+site, fewer cases than unique EnsemblID+site"
        return Data.fetch_data(Data.CPTAC_URL)

    # from PhosphositePlus, contains Gene+site
    PSP_HUMAN_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/psp_human.parquet"

    @staticmethod
    def get_psp_human_site():
        "PhosphositePlus human, contains Gene+site"
        return Data.fetch_data(Data.PSP_HUMAN_URL)

    # from ochoa et al. The functional landscape of the human phosphoproteome
    OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/ochoa_site.parquet"

    @staticmethod
    def get_ochoa_site():
        "Ochoa et al. dataset"
        return Data.fetch_data(Data.OCHOA_URL)

    # combine ochoa and PSP low throughput data
    COMBINE_PSP_OCHOA_URL = "https://github.com/sky1ove/katlas/raw/main/dataset/phosphosites/combine_site_ochoa_psp.parquet"

    @staticmethod
    def get_combine_site_psp_ochoa():
        "Combined Ochoa and PhosphoSitePlus; numeric column names are converted to integers"
        return Data._to_int_columns(Data.fetch_data(Data.COMBINE_PSP_OCHOA_URL))
189
+
190
+
191
+ # %% ../nbs/00_core.ipynb 12
192
class CPTAC:
    """A class for fetching CPTAC phosphoproteomics data.

    The ID-mapping and abundance URLs are built from the cancer type and the
    sample type (Tumor/Normal). The URL table this class originally carried
    as comments listed no Normal file for GBM and BRCA, so a failed download
    is reported and `None` is returned instead of raising.
    """

    # Ensembl ID / gene mapping; {cancer} is the lower-case cancer type
    _ID_URL = "https://zenodo.org/records/8196130/files/bcm-{cancer}-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
    # Phosphosite abundance; {cancer} is the upper-case cancer type
    _DATA_URL = ("https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/"
                 "{cancer}/{cancer}_phospho_site_abundance_log2_reference_intensity_normalized_{sample_type}.txt")

    @staticmethod
    def _fetch_data(cancer: str,  # cancer type CPTAC
                    is_Tumor: bool=True,  # tumor tissue or normal
                    is_KB: bool=False,  # whether it is for LinkedOmicsKB or LinkedOmics
                    ):
        "Fetch the site table of one cancer type; returns a DataFrame, or None if the abundance file cannot be read"

        # URL of ID and data
        sample_type = "Tumor" if is_Tumor else "Normal"
        id_url = CPTAC._ID_URL.format(cancer=cancer.lower())
        data_url = CPTAC._DATA_URL.format(cancer=cancer.upper(), sample_type=sample_type)

        # Load ID data
        ref = (pd.read_csv(id_url, compression='gzip', sep='\t')[['protein', 'gene', 'gene_name']]
               .drop_duplicates()
               .reset_index(drop=True))

        # Load CPTAC phosphoproteomics data; best effort, a missing file is only reported
        try:
            raw = pd.read_csv(data_url, sep='\t')
        except Exception as e:
            print(f'{cancer} has {e}')
            return None

        # `idx` is a '|'-separated identifier; fields 0, 2 and 3 are used
        parts = raw['idx'].str.split('|')
        info = pd.DataFrame({'gene': parts.str[0],
                             'site': parts.str[2],
                             'site_seq': parts.str[3]})

        print(f'the {cancer} dataset length is: {info.shape[0]}')

        # Merge ensembl ID with gene name
        info = info.merge(ref, how='left')
        print(f'after id mapping, the length is {info.shape[0]}')

        print(f'{info.gene_name.isna().sum()} sites does not have a mapped gene name')

        info['gene_site'] = info['gene_name'] + '_' + info['site']
        # Drop the version suffix after '.' from the protein ID
        info['protein_site'] = info['protein'].str.split('.').str[0] + '_' + info['site']

        dedup_col = "protein_site" if is_KB else "gene_site"
        info = info.drop_duplicates(subset=dedup_col).reset_index(drop=True)
        # Report the column that was actually used for de-duplication
        print(f'after removing duplicates of {dedup_col}, the length is {info.shape[0]}')

        return info

    @staticmethod
    def list_cancer():
        "Get available CPTAC cancer type"
        return ['HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC', 'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV']

    @staticmethod
    def get_id(cancer_type: str,
               is_Tumor: bool=True,  # tumor tissue or normal
               is_KB: bool=False,  # whether it is for LinkedOmicsKB or LinkedOmics
               ):
        "Get CPTAC phosphorylation sites information given a cancer type"
        assert cancer_type in CPTAC.list_cancer(), "cancer type is not included, check available cancer types from CPTAC.list_cancer()"
        return CPTAC._fetch_data(cancer_type, is_Tumor, is_KB)
288
+
289
+ # %% ../nbs/00_core.ipynb 19
290
# Characters kept as they are: the 20 standard amino acids plus phosphorylated s/t/y
_ALLOWED_AA = frozenset('PGACSTVILMFYWHKRQNDEsty')
_KEEP_LOWER = frozenset('sty')


def convert_string(input_string: str):
    """Convert amino acids of lower case other than s,t,y to capital; convert rare amino acids to _

    The output always has the same length as the input: every input character
    yields exactly one output character.
    """
    converted = []
    for char in input_string:
        # convert non-s/t/y to upper case
        result_char = char if char in _KEEP_LOWER else char.upper()
        # Set membership (not a substring test) so that a character whose
        # upper-case form is several letters (e.g. a ligature -> 'ST') is
        # replaced by a single underscore instead of being expanded.
        converted.append(result_char if result_char in _ALLOWED_AA else '_')
    return ''.join(converted)
302
+
303
+ # %% ../nbs/00_core.ipynb 22
304
def checker(input_string):
    """Check that the middle position of the input string is one of s/t/y (either case).

    Returns None when the check passes and raises AssertionError otherwise.
    The error is raised explicitly so that the check still runs under `python -O`.
    """
    if not input_string:
        raise AssertionError("input string is empty; need to have one of s,t and y at position 0")

    # The middle position (index len//2) is treated as position 0
    acceptor = input_string[len(input_string) // 2]
    if acceptor.lower() not in ('s', 't', 'y'):
        raise AssertionError(f"{input_string} has {acceptor} at position 0; need to have one of s,t and y")
308
+
309
# Translation table mapping upper-case S/T/Y to their lower-case form
_STY_TO_LOWER = str.maketrans('STY', 'sty')


def STY2sty(input_string: str):
    "Return the string with every 'S', 'T' and 'Y' lower-cased; all other characters are unchanged"
    return input_string.translate(_STY_TO_LOWER)
312
+
313
+ # %% ../nbs/00_core.ipynb 24
314
def cut_seq(input_string: str,  # site sequence
            min_position: int,  # minimum position relative to its center
            max_position: int,  # maximum position relative to its center
            ):
    "Return the part of the sequence between min_position and max_position (inclusive), counted from the center"

    length = len(input_string)
    middle = length // 2

    # First index to keep; clamp to the start of the string
    lower = middle + min_position
    if lower < 0:
        lower = 0

    # One past the last index to keep; clamp to the end of the string
    upper = middle + max_position + 1
    if upper > length:
        upper = length

    return input_string[lower:upper]
330
+
331
+ # %% ../nbs/00_core.ipynb 26
332
def get_dict(input_string: str,  # phosphorylation site sequence
             ):
    """Get the list of position+amino acid keys of a site sequence, e.g. '-1A', '0s', '2T'.

    Positions are counted relative to the middle character (index len//2),
    which is position 0. Non-alphabetic characters such as '_' are skipped.
    An empty string gives an empty list.
    """
    center_index = len(input_string) // 2
    return [f"{i - center_index}{char}"
            for i, char in enumerate(input_string)
            if char.isalpha()]
349
+
350
+ # %% ../nbs/00_core.ipynb 29
351
def multiply_func(values,  # list of values, possibilities of amino acids at certain positions
                  factor=17,  # scale factor
                  ):
    "Return the log2 of the product of the values, scaled by `factor` once per value after the first"

    # log2(a*b) = log2(a) + log2(b): work in log space instead of multiplying
    log_values = np.log2(values)
    scale_term = (len(values) - 1) * np.log2(factor)

    return np.sum(log_values) + scale_term
363
+
364
+ # %% ../nbs/00_core.ipynb 33
365
class multiply:
    "Multiply values, consider the dynamics of scale factor, which is PSPA random aa number."

    def __init__(self):
        # Maps kinase name -> number of random amino acids (the scale factor)
        self.num_dict = Data.get_num_dict()

    def func(self, values, kinase):
        """Return the log2 of the scaled product of `values` for `kinase`.

        Returns NaN when any value is less than or equal to zero, because the
        logarithm is undefined there.
        """
        # Check if any values are less than or equal to zero
        if np.any(np.asarray(values) <= 0):
            return np.nan

        # Retrieve the divide factor from the dictionary (kept on the instance as before)
        self.divide = self.num_dict[kinase]

        # Using the logarithmic property: log(a*b) = log(a) + log(b)
        # Compute the sum of the logarithms of the values and the divide factor
        return np.sum(np.log2(values)) + (len(values) - 1) * np.log2(self.divide)
385
+
386
+ # %% ../nbs/00_core.ipynb 37
387
def sumup(values,  # list of values, possibilities of amino acids at certain positions
          kinase=None,  # not used; accepted so the signature matches the other scoring functions
          ):
    "Return the sum of the values"
    total = sum(values)
    return total
392
+
393
+ # %% ../nbs/00_core.ipynb 40
394
def predict_kinase(input_string: str,  # site sequence
                   ref: pd.DataFrame,  # reference dataframe for scoring
                   func: Callable,  # function to calculate score
                   to_lower: bool=False,  # convert capital STY to lower case
                   verbose=True
                   ):
    """Predict kinase given a phosphorylation site sequence.

    Returns a Series of scores indexed like `ref`, sorted in descending order
    and rounded to three decimal places.
    """

    # check whether the middle position is STY (Serine, Threonine, Tyrosine)
    checker(input_string)

    # Convert rare amino acids to '_'
    input_string = convert_string(input_string)

    # If to_lower is True, convert STY in the sequence to lower case
    if to_lower:
        input_string = STY2sty(input_string)

    # The position+amino acid keys depend only on the sequence: compute them once
    seq_keys = get_dict(input_string)

    results = []  # scores, one per kinase (row of ref)
    pos_aa_name = []  # keys used for the last kinase; stays empty if ref has no rows

    # Iterate over each kinase and its associated data in the reference dataframe
    for kinase, row in ref.iterrows():

        # Convert the row into a dictionary, excluding NaN values, to create a PSSM dictionary for a kinase
        r_dict = row.dropna().to_dict()

        # Keep only the keys that exist in this kinase's PSSM
        pos_aa_name = [key for key in seq_keys if key in r_dict]

        # Collect corresponding PSSM values for these positions and amino acids
        pos_aa_val = [r_dict[key] for key in pos_aa_name]

        # Calculate the score for this kinase using the specified function
        results.append(func(pos_aa_val, kinase))

    # If verbose is True, print the positions and amino acids considered (for the last kinase)
    if verbose:
        print(f'considering string: {pos_aa_name}')

    # Convert the list of results into a pandas Series, index by the kinase, sort by score in descending order
    out = pd.Series(results, index=ref.index).sort_values(ascending=False)

    return out.round(3)  # Return the scores rounded to three decimal places
439
+
440
+ # %% ../nbs/00_core.ipynb 42
441
# PSPA
# One `multiply` instance is shared by the three PSPA parameter sets, so the
# random-amino-acid table is fetched once instead of three times.
_pspa_func = multiply().func
param_PSPA_st = {'ref': Data.get_pspa_st_norm(), 'func': _pspa_func}  # Johnson et al. Nature official
param_PSPA_y = {'ref': Data.get_pspa_tyr_norm(), 'func': _pspa_func}
param_PSPA = {'ref': Data.get_pspa_all_norm(), 'func': _pspa_func}


# Kinase-substrate dataset, CDDM
param_CDDM = {'ref': Data.get_cddm(), 'func': sumup}
param_CDDM_upper = {'ref': Data.get_cddm_upper(), 'func': sumup}  # specific for all uppercase
450
+
451
+ # %% ../nbs/00_core.ipynb 46
452
def predict_kinase_df(df, seq_col, ref, func, to_lower=False):
    """Score every sequence in `df[seq_col]` against every kinase (row) of `ref`.

    `ref` columns are position+amino acid labels such as '-5P'. Returns a
    DataFrame with one row per row of `df` and one column per kinase.
    The input dataframe is not modified.
    """
    print('input dataframe has a length', df.shape[0])
    print('Preprocessing')

    # Make a copy of df to avoid changes to the original dataframe
    df = df.copy()

    # Check whether the middle position of each sequence is one of S, T, or Y
    df[seq_col].apply(checker)

    # Convert rare amino acids to '_'
    df[seq_col] = df[seq_col].apply(convert_string)

    # Optionally convert STY to lowercase in each sequence
    if to_lower:
        df[seq_col] = df[seq_col].apply(STY2sty)

    # Positions covered by the reference: the column label without its last character
    positions = ref.columns.str[:-1].astype(int)
    max_value = positions.max()
    min_value = positions.min()

    # Adjust sequence lengths to match the reference matrix's expected inputs
    df[seq_col] = df[seq_col].apply(partial(cut_seq, min_position=min_value, max_position=max_value))

    print('Finish preprocessing')

    num = sorted(set(positions))
    print(f'Calculating position: {num}')

    # Transform reference DataFrame to a dictionary and clean up NaN values
    ref_dict = {
        outer_k: {inner_k: val for inner_k, val in outer_v.items() if not pd.isna(val)}
        for outer_k, outer_v in ref.T.to_dict().items()}

    # The keys of a sequence do not depend on the kinase: compute them once per sequence
    seq_keys = [get_dict(input_string) for input_string in df[seq_col]]

    # Score all sequences for one kinase
    def process_kinase(kinase, r_dict):
        return [func(np.array([r_dict[key] for key in keys if key in r_dict]), kinase)
                for keys in seq_keys]

    # Process all kinases in parallel, using tqdm for progress tracking
    results = Parallel(n_jobs=-1)(delayed(process_kinase)(kinase, r_dict)
                                  for kinase, r_dict in tqdm(ref_dict.items()))

    # Return results as a DataFrame
    return pd.DataFrame(results, index=ref.index, columns=df.index).T
496
+
497
+
498
+ # %% ../nbs/00_core.ipynb 55
499
def get_pct(site, ref, func, pct_ref):
    """Replicate the percentile results from The Kinase Library.

    Returns a DataFrame indexed by kinase with columns 'log2(score)' and
    'percentile'. `pct_ref` must have one column per kinase holding the
    reference scores.
    """

    # As here we try to replicate the results, we use site.upper(); consider removing it for future version.
    score = predict_kinase(site.upper(), ref=ref, func=func)

    def percentile_of(kinase):
        # Percentile rank of this kinase's score within its reference values;
        # ties count for half.
        ref_values = pct_ref[kinase].values
        value = score[kinase]
        below = np.sum(ref_values < value)
        ties = np.sum(ref_values == value)
        return (below + 0.5 * ties) / len(ref_values) * 100

    pct = pd.Series({kinase: percentile_of(kinase) for kinase in score.index})

    final = pd.concat([score, pct], axis=1)
    final.columns = ['log2(score)', 'percentile']
    return final
522
+
523
+ # %% ../nbs/00_core.ipynb 61
524
def get_pct_df(score_df,  # output from predict_kinase_df
               pct_ref,  # a reference df for percentile calculation
               ):
    """Replicate the percentile results from The Kinase Library.

    Returns a DataFrame shaped like `score_df` holding, for every score, the
    percentage of reference values of that kinase that are less than or equal
    to it, rounded to three decimal places.
    """
    # NOTE(review): ties count fully here (side='right'), whereas `get_pct`
    # counts ties for half - confirm which one is intended.

    kinases = score_df.columns
    percentiles = np.zeros(score_df.shape)

    for col_idx, kinase in tqdm(enumerate(kinases), total=len(kinases)):
        sorted_ref = np.sort(pct_ref[kinase].values)

        # Number of reference values <= each score
        rank = np.searchsorted(sorted_ref, score_df[kinase].values, side='right')

        percentiles[:, col_idx] = rank / len(sorted_ref) * 100

    percentiles_df = pd.DataFrame(percentiles, index=score_df.index, columns=score_df.columns)
    return percentiles_df.astype(float).round(3)
547
+
548
+ # %% ../nbs/00_core.ipynb 66
549
def get_unique_site(df: pd.DataFrame = None,  # dataframe that contains phosphorylation sites
                    seq_col: str='site_seq',  # column name of site sequence
                    id_col: str='gene_site'  # column name of site id
                    ):
    """Remove duplicates among phosphorylation sites; return df with new columns of acceptor and number of duplicates.

    The result has one row per unique sequence with columns
    [seq_col, id_col, 'num_site', 'acceptor']; the ids of identical sequences
    are joined with '|'. An empty input gives an empty frame with these columns.
    """
    # Nothing to group: return the expected columns instead of failing on the first-row lookup
    if df.empty:
        return pd.DataFrame(columns=[seq_col, id_col, 'num_site', 'acceptor'])

    unique = df.groupby(seq_col).agg(
        {id_col: lambda r: '|'.join(r.unique())})
    unique['num_site'] = unique[id_col].str.split('|').str.len()
    unique = unique.reset_index()

    # The acceptor is the middle character; its index is taken from the first sequence
    position = len(unique[seq_col].iloc[0]) // 2
    unique['acceptor'] = unique[seq_col].str[position]

    return unique
563
+
564
+ # %% ../nbs/00_core.ipynb 69
565
def extract_site_seq(df: pd.DataFrame,  # dataframe that contains protein sequence
                     seq_col: str,  # column name of protein sequence
                     position_col: str,  # column name of position 0
                     flank: int=7,  # number of residues kept on each side of position 0
                     ):
    """Extract -flank to +flank site sequence from protein sequence (default -7 to +7).

    The position is 1-based. Windows that run past either end of the protein
    are padded with '_'. Returns a numpy array with one string per row of `df`.
    """
    data = []
    # tolist() gives plain Python values, so the positions can be used for slicing
    rows = zip(df[seq_col].tolist(), df[position_col].tolist())
    for seq, site_position in tqdm(rows, total=len(df)):
        position = site_position - 1  # 0-based index of the site
        start = position - flank
        end = position + flank + 1

        # Extract the subsequence
        subseq = seq[max(0, start):min(len(seq), end)]

        # Pad the subsequence if needed
        if start < 0:
            subseq = "_" * abs(start) + subseq
        if end > len(seq):
            subseq = subseq + "_" * (end - len(seq))

        data.append(subseq)

    return np.array(data)
589
+
590
+ # %% ../nbs/00_core.ipynb 74
591
def get_freq(df_k: pd.DataFrame,  # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
             aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'],  # amino acid to include in the full matrix
             aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'],  # amino acid to include in the partial matrix
             position = [i for i in range(-7,8)],  # position to include in the full matrix
             position_paper = [-5,-4,-3,-2,-1,1,2,3,4]  # position to include in the partial matrix
             ):
    """Get frequency matrix given a dataframe of phosphorylation sites for a single kinase.

    `df_k` needs the integer columns -7..7, one amino acid per cell.
    Returns `(paper, full)`: two amino-acid x position frequency tables,
    reindexed to the requested amino acids/positions (missing labels are 0).
    """

    # Long format: one row per (position, amino acid) observation
    long_form = df_k.melt(value_vars=list(range(-7, 8)),
                          var_name='Position',
                          value_name='aa')

    # Count how often each amino acid occurs at each position
    counts = long_form.groupby(['Position', 'aa']).size().reset_index(name='Count')

    # Keep only the standard amino acids and phosphorylated s/t/y
    standard_aa = list('PGACSTVILMFYWHKRQNDEsty')
    counts = counts[counts.aa.isin(standard_aa)].reset_index(drop=True)

    # Amino acid x position count table
    count_matrix = counts.pivot(index='aa', columns='Position', values='Count').fillna(0)

    # Frequency: divide every column by its total
    freq_k = count_matrix / count_matrix.sum()

    # full pivot data from kinase-substrate dataset
    full = freq_k.reindex(index=aa_order, columns=position, fill_value=0)

    # same data restricted to the paper's amino acids and positions
    paper = freq_k.reindex(index=aa_order_paper, columns=position_paper, fill_value=0)

    return paper, full
630
+
631
+ # %% ../nbs/00_core.ipynb 78
632
def query_gene(df, gene):
    """Query gene in the phosphoproteomics dataset.

    Returns the rows of `df` whose `gene_site` contains an id of this gene
    (`<gene>_<site>`, either alone or as one of several ids joined with '|'),
    sorted by the site position of the last id in `gene_site`.
    The gene name is matched literally, not as a regular expression.
    """
    import re

    # Anchor the gene name at the start of an id, so that e.g. 'AK1' does not
    # also match 'PAK1_...'; rows with a missing gene_site are skipped.
    pattern = rf'(?:^|\|){re.escape(gene)}_'
    df_gene = df[df.gene_site.str.contains(pattern, regex=True, na=False)]

    # sort dataframe based on position: the digits after the residue letter, e.g. 'S144' -> 144
    sort_position = df_gene.gene_site.str.split('_').str[-1].str[1:].astype(int).sort_values().index
    df_gene = df_gene.loc[sort_position]

    return df_gene
644
+
645
+ # %% ../nbs/00_core.ipynb 82
646
def get_ttest(df,
              columns1,  # list of column names for group1
              columns2,  # list of column names for group2
              FC_method = 'median',  # or mean
              alpha=0.05,  # significance level in multipletests for p_adj
              correction_method='fdr_bh',  # method in multipletests for p_adj
              ):
    """
    Performs t-tests and calculates log2 fold change between two groups of columns in a DataFrame.
    NaN p-values are excluded from the multiple testing correction.

    The fold change is group2 minus group1 of the row-wise median (or mean).

    Returns:
        DataFrame: Results including log2FC, p-values, adjusted p-values, significance, signed log10 P value, and signed log10 Padj

    Raises:
        ValueError: if FC_method is neither 'median' nor 'mean'
    """
    group1 = df[columns1]
    group2 = df[columns2]

    # Compute median/mean values for each row in both groups
    if FC_method == "median":
        m1 = group1.median(axis=1)
        m2 = group2.median(axis=1)
    elif FC_method == "mean":
        m1 = group1.mean(axis=1)
        m2 = group2.mean(axis=1)
    else:
        raise ValueError(f"FC_method must be 'median' or 'mean', got {FC_method!r}")

    # As phosphoproteomics data has already been log transformed, we can directly use subtraction
    FCs = m2 - m1

    # Perform t-tests row by row, ignoring NaN values within a row
    t_results = [ttest_ind(group1.loc[idx], group2.loc[idx], nan_policy='omit')
                 for idx in tqdm(df.index, desc="Computing t-tests")]

    p_values = np.array([result.pvalue for result in t_results], dtype=float)
    valid = ~np.isnan(p_values)

    # Rows without a valid p-value keep NaN as p_adj and False as significant
    full_pvals_corrected = np.full(p_values.shape, np.nan)
    full_reject = np.zeros(p_values.shape, dtype=bool)

    # Adjust for multiple testing on valid p-values only
    if valid.any():
        reject, pvals_corrected, _, _ = multipletests(p_values[valid], alpha=alpha, method=correction_method)
        full_pvals_corrected[valid] = pvals_corrected
        full_reject[valid] = reject

    # Create DataFrame with results
    results = pd.DataFrame({
        'log2FC': FCs,
        'p_value': p_values,
        'p_adj': full_pvals_corrected,
        'significant': full_reject
    })

    # -log10(p), with the sign of the fold change (negative only when log2FC < 0)
    sign = np.where(results['log2FC'] < 0, -1.0, 1.0)
    results['signed_logP'] = sign * -np.log10(results['p_value'])
    results['signed_logPadj'] = sign * -np.log10(results['p_adj'])

    return results
714
+
715
+ # %% ../nbs/00_core.ipynb 83
716
def get_metaP(p_values):
    """Use Fisher's method to calculate a combined p value given a list of p values.

    This function also allows negative p values (negative correlation): the
    sign of a p value flips the sign of its log contribution, and the combined
    p value is returned negative when the summed statistic is negative.
    """
    logs = [math.log(abs(p))*-1 if p<0 else math.log(abs(p)) for p in p_values]
    chi_square_stat = -2 * sum(logs)
    degrees_of_freedom = 2 * len(p_values)

    # Survival function of the chi-square distribution at the magnitude of the statistic
    score = chi2.sf(abs(chi_square_stat), degrees_of_freedom)

    return -score if chi_square_stat < 0 else score
726
+
727
+ # %% ../nbs/00_core.ipynb 86
728
def raw2norm(df: pd.DataFrame,  # single kinase's df has position as index, and single amino acid as columns
             PDHK: bool=False,  # whether this kinase belongs to PDHK family
             ):
    """Normalize single ST kinase data.

    Every row is divided by its sum over the columns that are not excluded,
    C is rescaled by its median times the divisor, and S and T are set to the
    row median of the non-excluded columns. Values are rounded to 4 decimals.
    """
    excluded = ['S', 'T', 'C', 't', 'y']
    divisor = 17

    # PDHK kinases additionally exclude Y, leaving one column fewer
    if PDHK:
        excluded.append('Y')
        divisor = 16

    # Row-wise normalization by the sum of the non-excluded columns
    row_totals = df.drop(columns=excluded).sum(1)
    df2 = df.div(row_totals, axis=0)

    df2['C'] = df2['C'] / (df2['C'].median() * divisor)

    # S and T are excluded columns, so both receive the same row median
    row_median = df2.drop(columns=excluded).median(1)
    df2['S'] = row_median
    df2['T'] = row_median

    return round(df2, 4)
749
+
750
+ # %% ../nbs/00_core.ipynb 88
751
def get_one_kinase(df: pd.DataFrame,  # stacked dataframe (paper's raw data)
                   kinase: str,  # a specific kinase
                   normalize: bool=False,  # normalize according to the paper; special for PDHK1/4
                   drop_s: bool=True,  # drop s as s is a duplicates of t in PSPA
                   ):
    """Obtain a specific kinase data from stacked dataframe.

    The row `kinase` of `df`, whose column labels look like '-5P', is reshaped
    into a table with position as index and amino acid as columns.
    """
    p = pd.DataFrame(df.loc[kinase], columns=[kinase]).reset_index().rename(columns={'index': 'substrate'})

    # Split a label such as '-5P' into its position (-5) and its amino acid (last character)
    p['position'] = p.substrate.str.extract(r'(-?\d+)', expand=False).astype(int)
    p['aa'] = p.substrate.str[-1]

    pp = p.pivot(index='position', columns='aa', values=kinase)

    if drop_s and 's' in pp.columns:
        pp = pp.drop(columns=['s'])

    if normalize:
        pp = raw2norm(pp, PDHK=kinase in ('PDHK1', 'PDHK4'))
    return pp