PySAR 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pySAR/pySAR.py ADDED
@@ -0,0 +1,962 @@
1
+ ################################################################################
2
+ ################# pySAR #################
3
+ ################################################################################
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import os
8
+ import warnings
9
+ from difflib import get_close_matches
10
+ import json
11
+ import textwrap
12
+
13
+ from aaindex import aaindex1
14
+ from .model import Model
15
+ from .pyDSP import PyDSP
16
+ from .evaluate import Evaluate
17
+ from .utils import Map, valid_sequence, remove_gaps, zero_padding, save_results
18
+ from .plots import plot_reg
19
+ from .descriptors import Descriptors
20
+
21
+ class PySAR():
22
+ """
23
+ The PySAR class is the main class for the pySAR software. The class allows for
24
+ the encoding of protein sequences via a plethora of techniques, mainly via AAI
25
+ Indices and/or structural, biochemical and physicochemical protein descriptors that are
26
+ then used as features in the building of predictive regression ML models created to map the
27
+ protein sequences to a sought-after activity/fitness value (activity attribute), this is
28
+ known as a Sequence Activity Relationship (SAR) or Sequence Function Relationship (SFR).
29
+ Creating this mapping from sequence to activity/fitness then allows for the future prediction
30
+ of the sought activity/fitness value for unseen protein sequences.
31
+
32
+ Three main encoding strategies are possible in the class and in the software,
33
+ namely using AAI Indices or protein descriptors as well as AAI Indices + Descriptors.
34
+ Additionally, the protein sequences can be encoded using Digital Signal Processing (DSP)
35
+ techniques, mainly through the use of informational protein spectra, this is achieved
36
+ via the pyDSP class in the software. This class accepts strings or lists of AAI Indices
37
+ or descriptors and then passes these through a pipeline to get the required numerical
38
+ encoding of the respective sequences. The calculated encodings of the sequences are
39
+ used as features in the building of the predictive ML models that will then predict the
40
+ activity values for new unseen protein sequences. After the encoding process,
41
+ various metrics will be captured and stored in a local output folder according to the
42
+ OUTPUT_FOLDER global var as well as a regression plot showing how well the model,
43
+ and the selected protein feature attributes, fit to the test data of unseen protein
44
+ sequences.
45
+
46
+ The class has one main input parameter (config_file), that is the filename or filepath
47
+ to the configuration file that contains all the required parameters for the encoding
48
+ strategy/process. The class also accepts a variable number of keyword arguments
49
+ (**kwargs) that will override the config file parameter values of the same name if
50
+ they are passed in.
51
+
52
+ Parameters
53
+ ==========
54
+ :config_file : str
55
+ path to configuration file.
56
+ **kwargs: dict
57
+ keyword arguments and values passed into constructor. The keywords should be
58
+ the same name and form of those in the configuration file. The keyword values
59
+ input take precedence over those in the config files.
60
+
61
+ Methods
62
+ =======
63
+ read_data():
64
+ read dataset of protein sequences.
65
+ preprocessing():
66
+ pre-process / clean protein sequence dataset.
67
+ get_aai_encoding(aai_indices=None):
68
+ get AAI encoding for user inputted index/indices.
69
+ encode_aai(aai_indices=None, show_plot=False, print_results=True, output_folder=""):
70
+ full pipeline for encoding protein sequences according to user specified
71
+ index/indices from the respective records in the AAI database using the
72
+ get_aai_encoding() function, and outputting the results with all the predictability
73
+ metrics. Also applying a DSP pipeline if applicable.
74
+ get_descriptor_encoding(descriptors=None):
75
+ calculate user inputted descriptor/descriptors using the input protein sequences
76
+ and protpy package.
77
+ encode_descriptor(descriptors=None, show_plot=False, print_results=True, output_folder=""):
78
+ full pipeline for encoding protein sequences according to user inputted descriptor/descriptors,
79
+ calculated using the get_descriptor_encoding() function and the protpy package and outputting
80
+ the results with all the predictability metrics.
81
+ encode_aai_descriptor(aai_indices=None, descriptors=None, show_plot=False, print_results=True, output_folder=""):
82
+ full pipeline for encoding protein sequences according to user specified index/indices
83
+ in concatenation with descriptor/descriptors using the get_aai_encoding() and
84
+ get_descriptor_encoding() functions. Output the results with all the predictability
85
+ metrics.
86
+ output_results(results):
87
+ print out the predictive model parameters/attributes and its results.
88
+ """
89
def __init__(self, config_file="", **kwargs):
    """
    Load the JSON configuration file, resolve all parameters (keyword arguments
    take precedence over the config values of the same name), read the dataset
    and pre-process it.

    Parameters
    ==========
    :config_file: str (default="")
        filename or filepath of the JSON configuration file. A ".json" extension
        is appended if none is present. The path is tried as given first and
        then inside a local "config" directory.
    **kwargs: dict
        keyword arguments overriding the config file parameters of the same name.

    Raises
    ======
    TypeError
        if config_file is not a string.
    OSError
        if the config file cannot be found in any of the searched locations.
    ValueError
        if the config file is not valid JSON.
    """
    self.kwargs = kwargs  # keyword arguments override config parameters
    self.config_parameters = {}

    #config file must be passed in as a filepath string
    if not isinstance(config_file, str):
        raise TypeError(f'JSON config file must be a filepath of type string, got type {type(config_file)}.')

    #append extension if only filename input
    if os.path.splitext(config_file)[1] == '':
        config_file = config_file + '.json'

    #set class config file after normalization
    self.config_file = config_file

    #look for the file as given, then inside the local config directory
    search_paths = (self.config_file, os.path.join('config', self.config_file))
    config_filepath = next((path for path in search_paths if os.path.isfile(path)), None)
    if config_filepath is None:
        #report the locations that were actually searched (the path variable is still unset here)
        raise OSError(f'JSON config file {self.config_file} not found at path: {" or ".join(search_paths)}.')

    #open json config file and read in parameters
    try:
        with open(config_filepath, encoding="utf-8") as f:
            self.config_parameters = json.load(f)
    except json.JSONDecodeError as exc:
        raise ValueError(f'Error parsing config JSON file: {config_filepath}.') from exc

    #create instance of Map class so parameters can be accessed via dot notation
    self.config_parameters = Map(self.config_parameters)

    #dataset parameters
    self.dataset = self.kwargs.get('dataset') if 'dataset' in self.kwargs else self.config_parameters.dataset["dataset"]
    self.sequence_col = self.kwargs.get('sequence_col') if 'sequence_col' in self.kwargs else self.config_parameters.dataset["sequence_col"]
    self.activity_col = self.kwargs.get('activity_col') if 'activity_col' in self.kwargs else self.config_parameters.dataset["activity"]

    #model parameters
    self.model_parameters = self.kwargs.get('model_parameters') if 'model_parameters' in self.kwargs else self.config_parameters.model["parameters"]
    self.algorithm = self.kwargs.get('algorithm') if 'algorithm' in self.kwargs else self.config_parameters.model["algorithm"]
    self.test_split = self.kwargs.get('test_split') if 'test_split' in self.kwargs else self.config_parameters.model["test_split"]

    #aai parameters
    self.aai_indices = None

    #descriptors parameters
    self.descriptors = None

    #pyDSP parameters - use_dsp, spectrum, window function, window filter
    self.use_dsp = self.kwargs.get('use_dsp') if 'use_dsp' in self.kwargs else self.config_parameters.pyDSP["use_dsp"]
    self.dsp_parameters = self.kwargs.get('dsp_parameters') if 'dsp_parameters' in self.kwargs else self.config_parameters.pyDSP
    self.filter_parameters = self.kwargs.get('filter_parameters') if 'filter_parameters' in self.kwargs else self.dsp_parameters["filter"]
    self.spectrum = self.kwargs.get('spectrum') if 'spectrum' in self.kwargs else self.config_parameters.pyDSP["spectrum"]
    self.window_type = self.kwargs.get('window_type') if 'window_type' in self.kwargs else self.config_parameters.pyDSP["window"]["type"]
    self.filter_type = self.kwargs.get('filter_type') if 'filter_type' in self.kwargs else self.config_parameters.pyDSP["filter"]["type"]

    #set use_dsp variable to true if any of the DSP parameters passed in as kwargs
    if any(k in self.kwargs for k in ('spectrum', 'window_type', 'filter_type')):
        self.use_dsp = True

    #import and read dataset
    self.data = self.read_data()

    #the columns are indexed below, before preprocessing() gets the chance to
    #resolve an approximate column name, so resolve close matches up front;
    #an exact name is left untouched and an unmatched name still fails on lookup
    for attr in ('sequence_col', 'activity_col'):
        col_name = getattr(self, attr)
        if isinstance(col_name, str) and col_name not in self.data.columns:
            matches = get_close_matches(col_name, self.data.columns, cutoff=0.6)
            if matches:
                setattr(self, attr, matches[0])

    #array of protein sequences
    self.sequences = self.data[self.sequence_col]

    #array of activity values
    self.activity = self.data[self.activity_col]

    #pre-process dataset and protein sequences
    self.preprocessing()

    #get number of sequences and the length of the longest sequence
    self.num_seqs = len(self.sequences)
    self.sequence_length = len(max(self.sequences, key=len))

    #feature space dimensions used in building the model
    self.feature_space = ()

    #create instance of Descriptors class using config file, protein sequences and any kwargs
    self.descriptor = Descriptors(self.config_file, protein_seqs=self.sequences, **self.kwargs)
171
+
172
def read_data(self):
    """
    Load the CSV file referenced by the 'dataset' attribute into a dataframe.

    Parameters
    ==========
    None

    Returns
    =======
    :data: pd.DataFrame
        dataframe of imported dataset.

    Raises
    ======
    OSError
        if the dataset path is not an existing file, or if reading the csv fails.
    """
    #guard clause: the dataset path has to point at an existing file
    dataset_exists = os.path.isfile(self.dataset)
    if not dataset_exists:
        raise OSError(f'Dataset filepath is not correct: {self.dataset}.')

    #parse the comma separated file, first row holds the column names;
    #any failure while reading is surfaced as an OSError with the cause attached
    try:
        frame = pd.read_csv(self.dataset, sep=",", header=0)
    except Exception as read_error:
        raise OSError(f'Error opening dataset file: {self.dataset}.') from read_error

    return frame
195
+
196
def preprocessing(self):
    """
    Pre-process protein sequences in dataset. Resolve the sequence and activity
    column names to their closest match in the dataset columns, remove any gaps
    in the sequences, check for invalid amino acids and replace any NAN or
    +/- infinity activity values with 0.

    Parameters
    ==========
    None

    Returns
    =======
    None

    Raises
    ======
    ValueError
        if no close match is found for the sequence or activity column name,
        or if invalid amino acids are reported for the sequences.
    """
    #get closest match for sequence column name in dataset, raise error if none found
    sequence_col_matches = get_close_matches(self.sequence_col, self.data.columns, cutoff=0.6)
    if not sequence_col_matches:
        raise ValueError(f'Sequence column ({self.sequence_col}) not present in dataset columns:\n{self.data.columns}.')
    self.sequence_col = sequence_col_matches[0]

    #remove any gaps found in sequences in dataset
    self.sequences = remove_gaps(self.sequences)

    #verify no invalid amino acids found in sequences, if so then raise error;
    #identity check so a non-None array-like result is never compared element-wise
    invalid_seqs = valid_sequence(self.sequences)
    if invalid_seqs is not None:
        raise ValueError(f'Invalid amino acids found in protein sequence dataset: {invalid_seqs}.')

    #get closest match for activity column name in dataset, raise error if none found
    activity_matches = get_close_matches(self.activity_col, self.data.columns, cutoff=0.6)
    if not activity_matches:
        raise ValueError(f'Activity column ({self.activity_col}) not present in dataset columns:\n{list(self.data.columns)}.')
    self.activity_col = activity_matches[0]

    #map +/- infinity onto NAN once, so counting and replacing share the same series
    sanitised_activity = self.data[self.activity_col].replace([np.inf, -np.inf], np.nan)
    nan_count = sanitised_activity.isna().sum()
    if nan_count > 0:
        warnings.warn(
            f'{nan_count} missing/infinite activity value(s) in column "{self.activity_col}" '
            f'replaced with 0. Consider reviewing or dropping these rows.',
            UserWarning, stacklevel=2
        )

    #replace any Null/NAN's (including former infinities) in the activity values with 0
    self.data[self.activity_col] = sanitised_activity.fillna(0)

    #refresh self.activity to reflect the updated (NaN-replaced) column
    self.activity = self.data[self.activity_col]
251
+
252
def get_aai_encoding(self, aai_indices=None):
    """
    Get AAI index encoding values for input index/indices and their respective
    record values from the AAI database. Each amino acid of every protein
    sequence in the dataset is replaced by its value in the AAI record. The
    index/indices should be given as accession numbers of the AAI database.
    If multiple indices/accession numbers are input then the protein sequences
    are encoded with each index and the encodings concatenated column-wise.

    Parameters
    ==========
    :aai_indices: str/list (default=None)
        string or list of AAI indices/accession numbers. Multiple indices in
        one string are separated by commas; surrounding whitespace is ignored.

    Returns
    =======
    :encoded_seqs: np.ndarray
        array of the encoded protein sequences in dataset via user input index/indices.

    Raises
    ======
    ValueError
        if aai_indices is None, an empty string or an empty list.
    TypeError
        if aai_indices is neither a string nor a list.
    """
    #validate AAI indices are present in the input parameter, if not raise error
    if aai_indices is None:
        raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')

    #check input indices is of correct type (str/list), if not raise type error
    if not isinstance(aai_indices, (str, list)):
        raise TypeError(f"Input indices parameter must be a string or list, got {type(aai_indices)}.")

    #empty string or empty list carries no index to encode with
    if not aai_indices:
        raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')

    #cast index string to list, split multiple indices using comma
    if isinstance(aai_indices, str):
        aai_indices = aai_indices.split(',')

    #remove whitespace around each accession number, e.g from "A, B" style input
    aai_indices = [index.strip() if isinstance(index, str) else index for index in aai_indices]

    #encode the sequences with each index, keeping one encoded array per index
    encoded_per_index = []
    for accession in aai_indices:

        #get values from aaindex record using its accession number and the aaindex package
        aa_values = aaindex1[accession].values

        #map each amino acid of each sequence to its AAI value; iterating the
        #sequences directly does not depend on them having a 0..n-1 index
        all_seqs = [[aa_values[aa] for aa in sequence] for sequence in self.sequences]

        #zero-pad encoding list so that sequences are all the same length
        all_seqs = zero_padding(all_seqs)

        #convert list of lists into array
        encoded_per_index.append(np.array(all_seqs, dtype="float32"))

    #single index: its encoding is the result, else concatenate the encodings column-wise
    if len(encoded_per_index) == 1:
        return encoded_per_index[0]
    return np.concatenate(encoded_per_index, axis=1)
323
+
324
def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, output_folder=""):
    """
    Full pipeline for encoding protein sequences in dataset using the input AAI indices
    from the AAI database. If multiple indices/accession numbers are input then each is
    calculated and the encodings concatenated. A predictive regression ML model is built
    from the encoded AAI feature data, fitted, and evaluated on the test split.

    If the use_dsp attribute is true then each AAI encoding is passed through the
    PyDSP class and its spectrum encoding is used as the feature data instead. The
    regression plot and results are handed to plot_reg() and save_results() together
    with the output_folder parameter.

    Parameters
    ==========
    :aai_indices: str/list (default=None)
        string or list of indices/accession numbers from the AAI. The input list
        itself is not modified.
    :show_plot: bool (default=False)
        passed to plot_reg() to control whether the regression plot is displayed.
    :print_results: bool (default=True)
        if true, output verbose output of results and parameters from encoding process.
    :output_folder: str (default="")
        output folder passed to plot_reg() and save_results().

    Returns
    =======
    :aai_df: pd.Dataframe
        pandas Dataframe storing metrics and results of encoding.

    Raises
    ======
    ValueError
        if aai_indices is None or empty, or if use_dsp is set while the spectrum
        attribute is None or empty.
    TypeError
        if aai_indices is neither a string nor a list.
    """
    #validate AAI indices are present in the input parameter
    if aai_indices is None:
        raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')

    #check input indices is of correct type (str/list), if not raise type error
    if not isinstance(aai_indices, (str, list)):
        raise TypeError(f"Input indices parameter must be a string or list, got {type(aai_indices)}.")

    #empty string or empty list carries no index to encode with
    if not aai_indices:
        raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')

    #normalise input into a list of individual indices with whitespace removed
    if isinstance(aai_indices, str):
        indices = aai_indices.replace(' ', '').split(',')
    elif len(aai_indices) == 1:
        #list of one element that may hold multiple comma seperated indices
        indices = aai_indices[0].replace(' ', '').split(',')
    else:
        indices = [index.replace(' ', '') if isinstance(index, str) else index for index in aai_indices]

    #sort indices into alphabetical order; sorted() builds a new list so the
    #list passed in by the caller is never reordered
    self.aai_indices = sorted(indices)

    #spectrum is required whenever DSP is applied, validate it once before any encoding
    if self.use_dsp and (self.spectrum is None or self.spectrum == ""):
        raise ValueError(f'Spectrum cannot be None or empty: {self.spectrum}.')

    #iterate over each index, calculate its encoding, apply DSP functionality if applicable
    encodings = []
    for index in self.aai_indices:

        #get AAI index encoding for the current index
        encoded_seqs = self.get_aai_encoding(index)

        #if use_dsp true then use the spectrum encoding from the PyDSP class,
        #else use the AAI index encoding itself as the feature/training data (X)
        if self.use_dsp:
            #NOTE(review): only the config file is forwarded to PyDSP here; any DSP
            #overrides held in self.kwargs are not passed on - confirm this is intended
            pyDSP = PyDSP(self.config_file, protein_seqs=encoded_seqs)
            encodings.append(pd.DataFrame(pyDSP.spectrum_encoding))
        else:
            encodings.append(pd.DataFrame(encoded_seqs))

    #concat the encodings of all indices column-wise into the training data
    aai_encoding_df = pd.concat(encodings, axis=1)

    #renaming columns in format aai_X, where X is the 1-based column position
    aai_encoding_df.columns = ["aai_" + str(x) for x in range(1, len(aai_encoding_df.columns) + 1)]

    #set class variable to the training data feature space
    self.feature_space = aai_encoding_df.shape

    #create instance of model class of type specified by algorithm parameter using X and Y data
    self.model = Model(aai_encoding_df, self.activity, self.algorithm, parameters=self.model_parameters)

    #updating algorithm attribute
    self.algorithm = repr(self.model)

    #get training and test dataset split from model class, only Y_test is needed here
    _, _, _, Y_test = self.model.train_test_split(test_split=self.test_split)

    #fit predictive model
    self.model.fit()

    #predict activity values for test data
    Y_pred = self.model.predict()

    #create instance of Evaluate class which will get all the evaluation metrics
    evaluation = Evaluate(Y_test, Y_pred)

    #comma seperated list of the categories of all indices (self.aai_indices is always a list here)
    index_cat = ', '.join(aaindex1[index].category for index in self.aai_indices)

    #create output dataframe, set first row to attribute/metric values
    aai_df = pd.DataFrame(columns=['Index', 'Category', 'R2', 'RMSE', 'MSE', 'MAE', 'RPD', 'Explained Variance'])
    aai_df.loc[0] = [', '.join(self.aai_indices), str(index_cat).strip(), evaluation.r2, evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]

    #convert index and category from default Object type -> String datatypes
    aai_df['Index'] = aai_df['Index'].astype(pd.StringDtype())
    aai_df['Category'] = aai_df['Category'].astype(pd.StringDtype())

    #print out results from encoding
    if print_results:
        self.output_results(aai_df)

    #plot regression plot for predictive model
    plot_reg(Y_test, Y_pred, evaluation.r2, output_folder, show_plot)

    #save results of encoding to output folder specified by input param
    save_results(aai_df, 'aai_results', output_folder=output_folder)

    return aai_df
453
+
454
def get_descriptor_encoding(self, descriptors=None):
    """
    Calculate inputted descriptor(s) using the Descriptors instance stored on the
    class. Each input name is resolved to its closest match among the valid
    descriptor names using the difflib library. If a single descriptor is input
    then its values are returned, if several are input then each descriptor's
    values are calculated and concatenated column-wise.

    Parameters
    ==========
    :descriptors: str/list (default=None)
        string or list of protein descriptor names. Multiple descriptors in one
        string are separated by commas; surrounding whitespace is ignored.

    Returns
    =======
    :encoded_desc: pd.DataFrame
        pandas dataframe of calculated descriptor values according to user
        inputted descriptor(s).

    Raises
    ======
    ValueError
        if descriptors is None or empty, if no close match exists for an input
        descriptor name, or if a calculated descriptor dataframe is empty.
    TypeError
        if descriptors is neither a string nor a list.
    """
    #raise error if no descriptors specified in input
    if descriptors is None:
        raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')

    #check input descriptor is of correct type str or list, if not raise type error
    if not isinstance(descriptors, (str, list)):
        raise TypeError(f"Input descriptor parameter must be a str or list, got {type(descriptors)}.")

    #empty string or empty list carries no descriptor to calculate
    if not descriptors:
        raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')

    #cast descriptors parameter to a list if it is a str by creating comma seperated list
    if isinstance(descriptors, str):
        descriptors = descriptors.split(',')

    #remove any leading or trailing whitespace from descriptors
    descriptors = [de.strip() for de in descriptors]

    #reuse cached Descriptors instance created in __init__
    descr = self.descriptor

    #resolve every input name to its closest valid descriptor name
    resolved_names = []
    for name in descriptors:
        desc_matches = get_close_matches(name, descr.valid_descriptors, cutoff=0.6)
        if not desc_matches:
            #report the descriptor name the user passed in, not its list position
            raise ValueError(f'No approximate descriptor found from one input: {name}.')
        resolved_names.append(desc_matches[0])

    #calculate each descriptor's values using the Descriptors instance
    encoded_desc_vals = []
    for name in resolved_names:
        encoding = descr.get_descriptor_encoding(name)
        #raise value error if descriptor is empty
        if encoding.empty:
            raise ValueError(f'Descriptor cannot be empty or None: {name}.')
        encoded_desc_vals.append(encoding)

    #concatenate dataframes of descriptors
    return pd.concat(encoded_desc_vals, axis=1)
523
+
524
def encode_descriptor(self, descriptors=None, show_plot=False, print_results=True, output_folder=""):
    """
    Full pipeline for encoding the protein sequences in the dataset using protein
    descriptors. Each input descriptor name is resolved to its closest valid
    descriptor, calculated via get_descriptor_encoding() and, if multiple descriptors
    are input, the encodings are concatenated. A predictive ML regression model is
    built from the descriptor feature data, fitted, and evaluated on the test split.
    The regression plot and results are handed to plot_reg() and save_results()
    together with the output_folder parameter.

    Parameters
    ==========
    :descriptors: str/list (default=None)
        string or list of protein descriptor names. The input list itself is
        not modified.
    :show_plot: bool (default=False)
        passed to plot_reg() to control whether the regression plot is displayed.
    :print_results: bool (default=True)
        if true, output verbose output of results and parameters from encoding process.
    :output_folder: str (default="")
        output folder passed to plot_reg() and save_results().

    Returns
    =======
    :desc_df: pd.DataFrame
        pandas dataframe storing metrics and results of encoding.

    Raises
    ======
    ValueError
        if descriptors is None or empty, or if no close match exists for an
        input descriptor name.
    TypeError
        if descriptors is neither a string nor a list.
    """
    #raise error if no descriptor specified in input
    if descriptors is None:
        raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')

    #check input descriptor is of correct type (str or list), if not raise type error
    if not isinstance(descriptors, (str, list)):
        raise TypeError(f"Input descriptor parameter must be a string or list, got {type(descriptors)}.")

    #empty string or empty list carries no descriptor to encode with
    if not descriptors:
        raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')

    #if multiple descriptors input as str, split into comma seperated list;
    #a list input is copied so the caller's list is never modified
    if isinstance(descriptors, str):
        names = descriptors.replace(' ', '').split(',')
    else:
        names = list(descriptors)

    #if list of multiple descriptors input in one string, seperate into individual elements
    if len(names) == 1:
        names = names[0].replace(' ', '').split(',')

    #set class attribute to the descriptors sorted into alphabetical order
    self.descriptors = sorted(names)

    #reuse cached Descriptors instance created in __init__
    descr = self.descriptor

    #iterate over each input descriptor, resolve its name and calculate its encoding
    encodings = []
    for position, name in enumerate(self.descriptors):

        #get closest matching descriptor from descriptor input parameter using difflib library
        desc_matches = get_close_matches(name, descr.valid_descriptors, cutoff=0.6)
        if not desc_matches:
            raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n{}.'.
                format(name, descr.valid_descriptors))
        self.descriptors[position] = desc_matches[0]

        #calculate encoding of current descriptor
        encodings.append(self.get_descriptor_encoding(descriptors=self.descriptors[position]))

    #concatenate encodings of all descriptors column-wise into the training data
    descriptor_encoding_df = pd.concat(encodings, axis=1)

    #set class variable to the training data feature space
    self.feature_space = descriptor_encoding_df.shape

    #create instance of model class of type specified by algorithm parameter using X and Y data
    self.model = Model(descriptor_encoding_df, self.activity, self.algorithm, parameters=self.model_parameters)

    #updating algorithm attribute
    self.algorithm = repr(self.model)

    #get training and test dataset split using Model class, only Y_test is needed here
    _, _, _, Y_test = self.model.train_test_split(test_split=self.test_split)

    #fit predictive model
    self.model.fit()

    #predict activity values for test data
    Y_pred = self.model.predict()

    #create instance of Evaluate class which will get all the evaluation metrics
    evaluation = Evaluate(Y_test, Y_pred)

    #comma seperated list of the groups of all descriptors (self.descriptors is always a list here)
    desc_group = ', '.join(descr.descriptor_groups[name] for name in self.descriptors)

    #pandas dataframe to store all output results, first row holds the metric values
    desc_df = pd.DataFrame(columns=['Descriptor', 'Group', 'R2', 'RMSE', 'MSE', 'MAE', 'RPD', 'Explained Variance'])
    desc_df.loc[0] = [', '.join(self.descriptors), desc_group, evaluation.r2, evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]

    #convert Descriptor and Group from default Object type -> String datatypes
    desc_df['Descriptor'] = desc_df['Descriptor'].astype(pd.StringDtype())
    desc_df['Group'] = desc_df['Group'].astype(pd.StringDtype())

    #ensure aai indices attribute doesn't show up in output results
    self.aai_indices = None

    #print out results from encoding
    if print_results:
        self.output_results(desc_df)

    #plot regression plot for predictive model
    plot_reg(Y_test, Y_pred, evaluation.r2, output_folder, show_plot)

    #save results of encoding to output folder
    save_results(desc_df, 'desc_results', output_folder=output_folder)

    #reset descriptors instance variable
    self.descriptors = None

    return desc_df
653
+
654
def encode_aai_descriptor(self, aai_indices=None, descriptors=None, show_plot=False, print_results=True, output_folder=""):
    """
    Encode the protein sequences using both AAI indices and protein descriptors,
    then build and evaluate a predictive regression model on the combined features.

    The encoding of each index from get_aai_encoding() and the encoding of each
    descriptor from get_descriptor_encoding() are concatenated column-wise to
    form the training data. A Model is fitted using this data and the activity
    values, and its predictions on the test split are scored via Evaluate. The
    regression plot and the results csv are exported via plot_reg() and
    save_results(). NOTE(review): any DSP transformation of the AAI encodings is
    presumably applied inside get_aai_encoding() - it is not visible in this method.

    The method sets the aai_indices, descriptors, feature_space, model and
    algorithm instance attributes. The input lists are never modified; sorted
    copies are stored on the instance.

    Parameters
    ==========
    :aai_indices: str/list (default=None)
        string or list of indices/accession numbers from the AAI database. A string,
        or a list of one string, may hold several comma seperated indices.
    :descriptors: str/list (default=None)
        string or list of protein descriptors names. A string, or a list of one
        string, may hold several comma seperated descriptors. Each name is
        replaced by its closest match in the list of valid descriptors.
    :show_plot: bool (default=False)
        passed through to plot_reg() to control whether the regression plot is displayed.
    :print_results: bool (default=True)
        if true, output verbose output of results and parameters from encoding process.
    :output_folder: str (default="")
        output folder passed through to plot_reg() and save_results().

    Returns
    =======
    :aai_desc_df: pd.DataFrame
        single row pandas dataframe storing the indices, their categories, the
        descriptors, their groups and the evaluation metrics of the model.

    Raises
    ======
    ValueError: either input parameter is None, an empty string or an empty list,
        the AAI encoding is empty, or a descriptor has no close match.
    TypeError: either input parameter is not a string or a list.
    """
    def _is_empty(value):
        #only None and zero-length strings/lists count as empty, other types fall through to the type check
        return value is None or (isinstance(value, (str, list)) and len(value) == 0)

    def _as_sorted_list(value):
        #a list of one element or a plain string can hold several comma seperated names
        if isinstance(value, list) and len(value) == 1:
            return sorted(value[0].replace(' ', '').split(','))
        if isinstance(value, str):
            return sorted(value.replace(' ', '').split(','))
        #sorted() returns a new list so the list passed in by the caller is left untouched
        return sorted(value)

    #validate AAI indices and Descriptors are present in the input parameters, return error if either is empty
    if _is_empty(descriptors) or _is_empty(aai_indices):
        raise ValueError('AAI Indices and Descriptor input parameters must not be empty or None.')

    #check input descriptor & indices are of correct type (str/list), if not raise type error
    if not isinstance(aai_indices, (str, list)) or not isinstance(descriptors, (str, list)):
        raise TypeError("Input AAI indices and descriptors parameter must be of type string or list.")

    #set instance attributes to alphabetically sorted lists of individual indices/descriptors
    self.aai_indices = _as_sorted_list(aai_indices)
    self.descriptors = _as_sorted_list(descriptors)

    #calculate the encoding of each index, concatenating them column-wise in a single operation
    aai_frames = [pd.DataFrame(self.get_aai_encoding(index)) for index in self.aai_indices]
    aai_encoding_df = pd.concat(aai_frames, axis=1) if aai_frames else pd.DataFrame()

    #renaming columns in format aai_X, where X is the position of the feature in the encoding
    aai_encoding_df.columns = ["aai_" + str(x) for x in range(1, len(aai_encoding_df.columns) + 1)]

    #if AAI indices encoding is empty, raise error
    if aai_encoding_df.empty:
        raise ValueError(f'AAI Indices encoding cannot be empty or None: {aai_indices}.')

    #reuse the Descriptors instance stored on the class instance
    descr = self.descriptor

    #calculate the encoding of each descriptor after resolving its name to the closest valid descriptor
    descriptor_frames = []
    for position, requested in enumerate(self.descriptors):

        #get closest matching descriptor from descriptor input parameter using difflib library
        desc_matches = get_close_matches(requested, descr.valid_descriptors, cutoff=0.6)
        if not desc_matches:
            raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n{}.'.
                format(requested, descr.valid_descriptors))
        self.descriptors[position] = desc_matches[0]

        descriptor_frames.append(self.get_descriptor_encoding(descriptors=self.descriptors[position]))

    descriptor_encoding_df = pd.concat(descriptor_frames, axis=1) if descriptor_frames else pd.DataFrame()

    #reset index of both encodings so that their rows align when concatenated
    aai_encoding_df.reset_index(inplace=True, drop=True)
    descriptor_encoding_df.reset_index(inplace=True, drop=True)

    #concatenate AAI index and Descriptor features to get training data (X)
    X = pd.concat([aai_encoding_df, descriptor_encoding_df], axis=1)

    #set class variable to the training data feature space
    self.feature_space = X.shape

    #create instance of model class of type specified by algorithm parameter using X and Y data
    self.model = Model(X, self.activity, self.algorithm, parameters=self.model_parameters)

    #updating algorithm attribute
    self.algorithm = repr(self.model)

    #get training and test dataset split using Model class, only the test labels are needed here
    _, _, _, Y_test = self.model.train_test_split(test_split=self.test_split)

    #fit predictive model
    self.model.fit()

    #predict activity values for test data
    Y_pred = self.model.predict()

    #create instance of Evaluate class which will get all the evaluation metrics
    evaluation = Evaluate(Y_test, Y_pred)

    #get categories for all indices and groups for all descriptors as comma seperated strings
    index_cat = ', '.join(aaindex1[index].category for index in self.aai_indices)
    desc_group = ', '.join(descr.descriptor_groups[name] for name in self.descriptors)

    #create output results dataframe and add the metric values to it
    aai_desc_df = pd.DataFrame(columns=['Index', 'Category', 'Descriptor', 'Group', 'R2', 'RMSE',
        'MSE', 'MAE', 'RPD', 'Explained Variance'])
    aai_desc_df.loc[0] = [', '.join(self.aai_indices), index_cat.strip(), ', '.join(self.descriptors), desc_group,
        evaluation.r2, evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]

    #convert Index, Category, Descriptor and Group from default Object type -> String datatypes
    for text_column in ('Index', 'Category', 'Descriptor', 'Group'):
        aai_desc_df[text_column] = aai_desc_df[text_column].astype(pd.StringDtype())

    #print out results from encoding
    if print_results:
        self.output_results(aai_desc_df)

    #plot regression plot for predictive model
    plot_reg(Y_test, Y_pred, evaluation.r2, output_folder, show_plot)

    #save results of encoding to output folder
    save_results(aai_desc_df, 'aai_desc_results', output_folder=output_folder)

    return aai_desc_df
830
+
831
def output_results(self, results):
    """
    Print out the predictive model parameters/attributes and its results.

    Parameters
    ==========
    :results: pd.DataFrame
        dataframe of metrics with the columns 'R2', 'RMSE', 'MSE', 'MAE', 'RPD'
        and 'Explained Variance'. The first value of each column is printed.

    Returns
    =======
    None
    """
    #max line width used when wrapping the aai indices, descriptors and model parameters text
    line_length = 90

    def _as_list(names):
        #split a comma seperated string into individual names so that it is not
        #counted and joined character by character
        if isinstance(names, str):
            return names.replace(' ', '').split(',')
        return names

    print('##########################################################################################')
    print('###################################### Parameters ########################################\n')
    if self.aai_indices is not None:
        indices = _as_list(self.aai_indices)
        #only list the indices when there are at most 10 of them
        if len(indices) <= 10:
            print(textwrap.fill('# AAI Indices: {}'.format(', '.join(indices)), line_length))
    if self.use_dsp:
        #adjacent string literals are used instead of a backslash continuation inside the
        #literal, which would embed the source indentation in the printed text
        print('# DSP Parameters:\n # Spectrum: {}\n # Window Function: {}'
              '\n # Filter Function: {}'.format(self.spectrum, self.window_type, self.filter_type))
    if self.descriptors is not None:
        print(textwrap.fill('# Descriptors: {}'.format(', '.join(_as_list(self.descriptors))), line_length))
    print('# Configuration File: {}\n# Dataset: {}\n# Number of Sequences/Sequence Length: {} x {}'
          '\n# Target Activity: {}'.format(os.path.basename(self.config_file), self.dataset,
          self.num_seqs, self.sequence_length, self.activity_col))
    print(f"# Algorithm: {repr(self.model)}")
    if not self.model_parameters:
        #no parameters were input so print those of the underlying model object
        print(textwrap.fill(f'# Model Parameters: {self.model.model.get_params()}', line_length))
    else:
        print(textwrap.fill(f'# Model Parameters: {self.model_parameters}', line_length))
    print(f'# Test Split: {self.test_split}\n# Feature Space: {self.feature_space}')

    print('\n##########################################################################################')
    print('######################################## Results #########################################\n')
    print('# R2: {}'.format(results['R2'].values[0]))
    print('# RMSE: {} '.format(results['RMSE'].values[0]))
    print('# MSE: {} '.format(results['MSE'].values[0]))
    print('# MAE: {}'.format(results['MAE'].values[0]))
    print('# RPD {}'.format(results['RPD'].values[0]))
    print('# Explained Variance {}\n'.format(results['Explained Variance'].values[0]))
    print('##########################################################################################\n')
875
+
876
+ ###################### Getters & Setters ######################
877
+
878
@property
def dataset(self):
    """ Return the value stored in the private _dataset attribute. """
    return self._dataset

@dataset.setter
def dataset(self, value):
    """ Store the given value in the private _dataset attribute. """
    self._dataset = value
885
+
886
@property
def sequences(self):
    """ Return the value stored in the private _sequences attribute. """
    return self._sequences

@sequences.setter
def sequences(self, value):
    """ Store the given value in the private _sequences attribute. """
    self._sequences = value
893
+
894
@property
def sequence_col(self):
    """ Return the value stored in the private _sequence_col attribute. """
    return self._sequence_col

@sequence_col.setter
def sequence_col(self, value):
    """ Store the given value in the private _sequence_col attribute. """
    self._sequence_col = value
901
+
902
@property
def activity_col(self):
    """ Return the value stored in the private _activity_col attribute. """
    return self._activity_col

@activity_col.setter
def activity_col(self, value):
    """ Store the given value in the private _activity_col attribute. """
    self._activity_col = value
909
+
910
@property
def activity(self):
    """ Return the value stored in the private _activity attribute. """
    return self._activity

@activity.setter
def activity(self, value):
    """ Store the given value in the private _activity attribute. """
    self._activity = value
917
+
918
@property
def algorithm(self):
    """ Return the value stored in the private _algorithm attribute. """
    return self._algorithm

@algorithm.setter
def algorithm(self, value):
    """ Store the given value in the private _algorithm attribute. """
    self._algorithm = value
925
+
926
@property
def model_parameters(self):
    """ Return the value stored in the private _model_parameters attribute. """
    return self._model_parameters

@model_parameters.setter
def model_parameters(self, value):
    """ Store the given value in the private _model_parameters attribute. """
    self._model_parameters = value
933
+
934
@property
def test_split(self):
    """ Return the value stored in the private _test_split attribute. """
    return self._test_split

@test_split.setter
def test_split(self, value):
    """ Store the given value in the private _test_split attribute. """
    self._test_split = value
941
+
942
@property
def num_seqs(self):
    """ Return the value stored in the private _num_seqs attribute. """
    return self._num_seqs

@num_seqs.setter
def num_seqs(self, value):
    """ Store the given value in the private _num_seqs attribute. """
    self._num_seqs = value
949
+
950
@property
def sequence_length(self):
    """ Return the value stored in the private _sequence_length attribute. """
    return self._sequence_length

@sequence_length.setter
def sequence_length(self, value):
    """ Store the given value in the private _sequence_length attribute. """
    self._sequence_length = value
957
+
958
+ # def __str__(self):
959
+ # return "Instance of PySAR class, using parameters: {}.".format(self.__dict__)
960
+
961
+ # def __repr__(self):
962
+ # return "<PySAR: {}>".format(self)