PySAR 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +53 -0
- pySAR/__init__.py +28 -0
- pySAR/descriptors.py +2893 -0
- pySAR/encoding.py +986 -0
- pySAR/evaluate.py +231 -0
- pySAR/globals_.py +21 -0
- pySAR/model.py +559 -0
- pySAR/plots.py +92 -0
- pySAR/py.typed +0 -0
- pySAR/pyDSP.py +582 -0
- pySAR/pySAR.py +962 -0
- pySAR/utils.py +283 -0
- pysar-2.5.0.dist-info/METADATA +740 -0
- pysar-2.5.0.dist-info/RECORD +17 -0
- pysar-2.5.0.dist-info/WHEEL +5 -0
- pysar-2.5.0.dist-info/licenses/LICENSE +21 -0
- pysar-2.5.0.dist-info/top_level.txt +2 -0
pySAR/pySAR.py
ADDED
|
@@ -0,0 +1,962 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# pySAR #################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import os
|
|
8
|
+
import warnings
|
|
9
|
+
from difflib import get_close_matches
|
|
10
|
+
import json
|
|
11
|
+
import textwrap
|
|
12
|
+
|
|
13
|
+
from aaindex import aaindex1
|
|
14
|
+
from .model import Model
|
|
15
|
+
from .pyDSP import PyDSP
|
|
16
|
+
from .evaluate import Evaluate
|
|
17
|
+
from .utils import Map, valid_sequence, remove_gaps, zero_padding, save_results
|
|
18
|
+
from .plots import plot_reg
|
|
19
|
+
from .descriptors import Descriptors
|
|
20
|
+
|
|
21
|
+
class PySAR():
|
|
22
|
+
"""
|
|
23
|
+
The PySAR class is the main class for the pySAR software. The class allows for
|
|
24
|
+
the encoding of protein sequences via a plethora of techniques, mainly via AAI
|
|
25
|
+
Indices and or structural, biochemical and physicochemical protein descriptors that are
|
|
26
|
+
then used as features in the building of predictive regression ML models created to map the
|
|
27
|
+
protein sequences to a sought-after activity/fitness value (activity attribute), this is
|
|
28
|
+
known as a Sequence Activity Relationship (SAR) or Sequence Function Relationship (SFR).
|
|
29
|
+
Creating this mapping from sequence to activity/fitness then allows for the future prediction
|
|
30
|
+
of the sought activity/fitness value for unseen protein sequences.
|
|
31
|
+
|
|
32
|
+
Three main encoding strategies are possible in the class and in the software,
|
|
33
|
+
namely using AAI Indices or protein descriptors as well as AAI Indices + Descriptors.
|
|
34
|
+
Additionally, the protein sequences can be encoded using Digital Signal Processing (DSP)
|
|
35
|
+
techniques, mainly through the use of informational protein spectra, this is achieved
|
|
36
|
+
via the pyDSP class in the software. This class accepts strings or lists of AAI Indices
|
|
37
|
+
or descriptors and then passes these through a pipeline to get the required numerical
|
|
38
|
+
encoding of the respective sequences. The calculated encodings of the sequences are
|
|
39
|
+
used as features in the building of the predictive ML models that will then predict the
|
|
40
|
+
activity values for new unseen protein sequences. After the encoding process,
|
|
41
|
+
various metrics will be captured and stored in a local output folder according to the
|
|
42
|
+
OUTPUT_FOLDER global var as well as a regression plot showing how well the model,
|
|
43
|
+
and the selected protein feature attributes, fit to the test data of unseen protein
|
|
44
|
+
sequences.
|
|
45
|
+
|
|
46
|
+
The class has one main input parameter (config_file), that is the filename or filepath
|
|
47
|
+
to the configuration file that contains all the required parameters for the encoding
|
|
48
|
+
strategy/process. The class also accepts a variable number of keyword arguments
|
|
49
|
+
(**kwargs) that will override the config file parameter values of the same name if
|
|
50
|
+
they are passed in.
|
|
51
|
+
|
|
52
|
+
Parameters
|
|
53
|
+
==========
|
|
54
|
+
:config_file : str
|
|
55
|
+
path to configuration file.
|
|
56
|
+
**kwargs: dict
|
|
57
|
+
keyword arguments and values passed into constructor. The keywords should be
|
|
58
|
+
the same name and form of those in the configuration file. The keyword values
|
|
59
|
+
input take precedence over those in the config files.
|
|
60
|
+
|
|
61
|
+
Methods
|
|
62
|
+
=======
|
|
63
|
+
read_data():
|
|
64
|
+
read dataset of protein sequences.
|
|
65
|
+
preprocessing():
|
|
66
|
+
pre-process / clean protein sequence dataset.
|
|
67
|
+
get_aai_encoding(indices):
|
|
68
|
+
get AAI encoding for user inputted index/indices.
|
|
69
|
+
encode_aai(aai_indices=None, show_plot=False, print_results=True, output_folder=""):
|
|
70
|
+
full pipeline for encoding protein sequences according to user specified
|
|
71
|
+
index/indices from the respective records in the AAI database using the
|
|
72
|
+
get_aai_encoding() function, and outputting the results with all the predictability
|
|
73
|
+
metrics. Also applying a DSP pipeline if applicable.
|
|
74
|
+
get_descriptor_encoding(descriptors=None):
|
|
75
|
+
calculate user inputted descriptor/descriptors using the input protein sequences
|
|
76
|
+
and protpy package.
|
|
77
|
+
encode_descriptor(descriptors=None, show_plot=False, print_results=True, output_folder=""):
|
|
78
|
+
full pipeline for encoding protein sequences according to user inputted descriptor/descriptors,
|
|
79
|
+
calculated using the get_descriptor_encoding() function and the protpy package and outputting
|
|
80
|
+
the results with all the predictability metrics.
|
|
81
|
+
encode_aai_descriptor(aai_indices=None, descriptors=None, show_plot=False, print_results=True, output_folder=""):
|
|
82
|
+
full pipeline for encoding protein sequences according to user specified index/indices
|
|
83
|
+
in concatenation with descriptor/descriptors using the get_aai_encoding() and
|
|
84
|
+
get_descriptor_encoding() functions. Output the results with all the predictability
|
|
85
|
+
metrics.
|
|
86
|
+
output_results(results):
|
|
87
|
+
print out the predictive model parameters/attributes and its results.
|
|
88
|
+
"""
|
|
89
|
+
def __init__(self, config_file="", **kwargs):
|
|
90
|
+
|
|
91
|
+
self.kwargs = kwargs # keyword arguments override config parameters
|
|
92
|
+
self.config_parameters = {}
|
|
93
|
+
|
|
94
|
+
config_filepath = ""
|
|
95
|
+
|
|
96
|
+
#open json config file and read in parameters
|
|
97
|
+
if not isinstance(config_file, str):
|
|
98
|
+
raise TypeError(f'JSON config file must be a filepath of type string, got type {type(config_file)}.')
|
|
99
|
+
|
|
100
|
+
#append extension if only filename input
|
|
101
|
+
if (os.path.splitext(config_file)[1] == ''):
|
|
102
|
+
config_file = config_file + '.json'
|
|
103
|
+
|
|
104
|
+
#set class config file after normalization
|
|
105
|
+
self.config_file = config_file
|
|
106
|
+
|
|
107
|
+
if (os.path.isfile(self.config_file)):
|
|
108
|
+
config_filepath = self.config_file
|
|
109
|
+
elif (os.path.isfile(os.path.join('config', self.config_file))):
|
|
110
|
+
config_filepath = os.path.join('config', self.config_file)
|
|
111
|
+
else:
|
|
112
|
+
raise OSError(f'JSON config file {self.config_file} not found at path: {config_filepath}.')
|
|
113
|
+
try:
|
|
114
|
+
with open(config_filepath) as f:
|
|
115
|
+
self.config_parameters = json.load(f)
|
|
116
|
+
except json.JSONDecodeError as exc:
|
|
117
|
+
raise ValueError(f'Error parsing config JSON file: {config_filepath}.') from exc
|
|
118
|
+
|
|
119
|
+
#create instance of Map class so parameters can be accessed via dot notation
|
|
120
|
+
self.config_parameters = Map(self.config_parameters)
|
|
121
|
+
|
|
122
|
+
#dataset parameters
|
|
123
|
+
self.dataset = self.kwargs.get('dataset') if 'dataset' in self.kwargs else self.config_parameters.dataset["dataset"]
|
|
124
|
+
self.sequence_col = self.kwargs.get('sequence_col') if 'sequence_col' in self.kwargs else self.config_parameters.dataset["sequence_col"]
|
|
125
|
+
self.activity_col = self.kwargs.get('activity_col') if 'activity_col' in self.kwargs else self.config_parameters.dataset["activity"]
|
|
126
|
+
|
|
127
|
+
#model parameters
|
|
128
|
+
self.model_parameters = self.kwargs.get('model_parameters') if 'model_parameters' in self.kwargs else self.config_parameters.model["parameters"]
|
|
129
|
+
self.algorithm = self.kwargs.get('algorithm') if 'algorithm' in self.kwargs else self.config_parameters.model["algorithm"]
|
|
130
|
+
self.test_split = self.kwargs.get('test_split') if 'test_split' in self.kwargs else self.config_parameters.model["test_split"]
|
|
131
|
+
|
|
132
|
+
#aai parameters
|
|
133
|
+
self.aai_indices = None
|
|
134
|
+
|
|
135
|
+
#descriptors parameters
|
|
136
|
+
self.descriptors = None
|
|
137
|
+
|
|
138
|
+
#pyDSP parameters - use_dsp, spectrum, window function, window filter
|
|
139
|
+
self.use_dsp = self.kwargs.get('use_dsp') if 'use_dsp' in self.kwargs else self.config_parameters.pyDSP["use_dsp"]
|
|
140
|
+
self.dsp_parameters = self.kwargs.get('dsp_parameters') if 'dsp_parameters' in self.kwargs else self.config_parameters.pyDSP
|
|
141
|
+
self.filter_parameters = self.kwargs.get('filter_parameters') if 'filter_parameters' in self.kwargs else self.dsp_parameters["filter"]
|
|
142
|
+
self.spectrum = self.kwargs.get('spectrum') if 'spectrum' in self.kwargs else self.config_parameters.pyDSP["spectrum"]
|
|
143
|
+
self.window_type = self.kwargs.get('window_type') if 'window_type' in self.kwargs else self.config_parameters.pyDSP["window"]["type"]
|
|
144
|
+
self.filter_type = self.kwargs.get('filter_type') if 'filter_type' in self.kwargs else self.config_parameters.pyDSP["filter"]["type"]
|
|
145
|
+
|
|
146
|
+
#set use_dsp variable to true if any of the DSP parameters passed in as kwargs
|
|
147
|
+
if any(k in self.kwargs for k in ('spectrum', 'window_type', 'filter_type')):
|
|
148
|
+
self.use_dsp = True
|
|
149
|
+
|
|
150
|
+
#import and read dataset
|
|
151
|
+
self.data = self.read_data()
|
|
152
|
+
|
|
153
|
+
#array of protein sequences
|
|
154
|
+
self.sequences = self.data[self.sequence_col]
|
|
155
|
+
|
|
156
|
+
#array of activity values
|
|
157
|
+
self.activity = self.data[self.activity_col]
|
|
158
|
+
|
|
159
|
+
#pre-process dataset and protein sequences
|
|
160
|
+
self.preprocessing()
|
|
161
|
+
|
|
162
|
+
#get number of rows and cols of dataset
|
|
163
|
+
self.num_seqs = len(self.sequences)
|
|
164
|
+
self.sequence_length = len(max(self.sequences, key=len))
|
|
165
|
+
|
|
166
|
+
#feature space dimensions used in building the model
|
|
167
|
+
self.feature_space = ()
|
|
168
|
+
|
|
169
|
+
#create instance of Descriptors class using config file, protein sequences and any kwargs
|
|
170
|
+
self.descriptor = Descriptors(self.config_file, protein_seqs=self.sequences, **self.kwargs)
|
|
171
|
+
|
|
172
|
+
def read_data(self):
|
|
173
|
+
"""
|
|
174
|
+
Read in dataset according to file name from 'dataset' attribute.
|
|
175
|
+
|
|
176
|
+
Parameters
|
|
177
|
+
==========
|
|
178
|
+
None
|
|
179
|
+
|
|
180
|
+
Returns
|
|
181
|
+
=======
|
|
182
|
+
:data: pd.DataFrame
|
|
183
|
+
dataframe of imported dataset.
|
|
184
|
+
"""
|
|
185
|
+
#read in dataset csv if found in path, if not raise error
|
|
186
|
+
if not (os.path.isfile(self.dataset)):
|
|
187
|
+
raise OSError(f'Dataset filepath is not correct: {self.dataset}.')
|
|
188
|
+
|
|
189
|
+
#read in dataset csv
|
|
190
|
+
try:
|
|
191
|
+
data = pd.read_csv(self.dataset, sep=",", header=0)
|
|
192
|
+
return data
|
|
193
|
+
except Exception as e:
|
|
194
|
+
raise OSError(f'Error opening dataset file: {self.dataset}.') from e
|
|
195
|
+
|
|
196
|
+
def preprocessing(self):
|
|
197
|
+
"""
|
|
198
|
+
Pre-process protein sequences in dataset. Validate column names, check
|
|
199
|
+
for invalid amino acids in sequences, remove any gaps in sequence and
|
|
200
|
+
remove any NAN or +/- infinity values.
|
|
201
|
+
|
|
202
|
+
Parameters
|
|
203
|
+
==========
|
|
204
|
+
None
|
|
205
|
+
|
|
206
|
+
Returns
|
|
207
|
+
=======
|
|
208
|
+
None
|
|
209
|
+
"""
|
|
210
|
+
#get closest match for sequence column name in dataset
|
|
211
|
+
sequence_col_matches = get_close_matches(self.sequence_col, self.data.columns, cutoff=0.6)
|
|
212
|
+
|
|
213
|
+
#set sequence col to the first match found, else raise error
|
|
214
|
+
if (sequence_col_matches != []):
|
|
215
|
+
self.sequence_col = sequence_col_matches[0]
|
|
216
|
+
else:
|
|
217
|
+
raise ValueError(f'Sequence column ({self.sequence_col}) not present in dataset columns:\n{self.data.columns}.')
|
|
218
|
+
|
|
219
|
+
#remove any gaps found in sequences in dataset
|
|
220
|
+
self.sequences = remove_gaps(self.sequences)
|
|
221
|
+
|
|
222
|
+
#verify no invalid amino acids found in sequences, if so then raise error
|
|
223
|
+
invalid_seqs = valid_sequence(self.sequences)
|
|
224
|
+
if (invalid_seqs != None):
|
|
225
|
+
raise ValueError(f'Invalid amino acids found in protein sequence dataset: {invalid_seqs}.')
|
|
226
|
+
|
|
227
|
+
#get closest match for activity column name in dataset
|
|
228
|
+
activity_matches = get_close_matches(self.activity_col, self.data.columns, cutoff=0.6)
|
|
229
|
+
|
|
230
|
+
#set activity col to the first match found, else raise error
|
|
231
|
+
if (activity_matches != []):
|
|
232
|
+
self.activity_col = activity_matches[0]
|
|
233
|
+
else:
|
|
234
|
+
raise ValueError(f'Activity column ({self.activity_col}) not present in dataset columns:\n{list(self.data.columns)}.')
|
|
235
|
+
|
|
236
|
+
#remove any +/- infinity values or any Null/NAN's from activity values
|
|
237
|
+
nan_count = self.data[self.activity_col].replace([np.inf, -np.inf], np.nan).isna().sum()
|
|
238
|
+
if nan_count > 0:
|
|
239
|
+
warnings.warn(
|
|
240
|
+
f'{nan_count} missing/infinite activity value(s) in column "{self.activity_col}" '
|
|
241
|
+
f'replaced with 0. Consider reviewing or dropping these rows.',
|
|
242
|
+
UserWarning, stacklevel=2
|
|
243
|
+
)
|
|
244
|
+
self.data[self.activity_col] = (
|
|
245
|
+
self.data[self.activity_col]
|
|
246
|
+
.replace([np.inf, -np.inf], np.nan)
|
|
247
|
+
.fillna(0)
|
|
248
|
+
)
|
|
249
|
+
#refresh self.activity to reflect the updated (NaN-replaced) column
|
|
250
|
+
self.activity = self.data[self.activity_col]
|
|
251
|
+
|
|
252
|
+
def get_aai_encoding(self, aai_indices=None):
|
|
253
|
+
"""
|
|
254
|
+
Get AAI index encoding values for input index/indices and their respective
|
|
255
|
+
record values from the AAI database. Encode each amino acid in the protein
|
|
256
|
+
sequences in the dataset to the respective values specified in the AAI
|
|
257
|
+
The index/indices should be in the form of the properties accession number
|
|
258
|
+
which is the 10 length alphanumeric code that represents each property within
|
|
259
|
+
the AAI database. If multiple indices/accession numbers input then encode
|
|
260
|
+
protein sequences with each index and concatenate.
|
|
261
|
+
|
|
262
|
+
Parameters
|
|
263
|
+
==========
|
|
264
|
+
:aai_indices: str/list (default=None)
|
|
265
|
+
string or list of AAI indices/accession numbers.
|
|
266
|
+
|
|
267
|
+
Returns
|
|
268
|
+
=======
|
|
269
|
+
:encoded_seqs: np.ndarray
|
|
270
|
+
array of the encoded protein sequences in dataset via user input index/indices.
|
|
271
|
+
"""
|
|
272
|
+
#validate AAI indices are present in the input parameter, if not raise error
|
|
273
|
+
if (aai_indices == None or aai_indices == ""):
|
|
274
|
+
raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')
|
|
275
|
+
|
|
276
|
+
#check input indices is of correct type (str/list), if not raise type error
|
|
277
|
+
if (not isinstance(aai_indices, str) and (not isinstance(aai_indices, list))):
|
|
278
|
+
raise TypeError(f"Input indices parameter must be a string or list, got {type(aai_indices)}.")
|
|
279
|
+
|
|
280
|
+
#cast index string to list, split multiple indices using comma
|
|
281
|
+
if (isinstance(aai_indices, str)):
|
|
282
|
+
if (',' in aai_indices):
|
|
283
|
+
aai_indices = aai_indices.split(',') #split on ',' just in case multiple indices passed in as str
|
|
284
|
+
else:
|
|
285
|
+
aai_indices = [aai_indices]
|
|
286
|
+
|
|
287
|
+
#create zeros numpy array to store encoded sequence output
|
|
288
|
+
encoded_aai_ = np.zeros((self.num_seqs, self.sequence_length*len(aai_indices)))
|
|
289
|
+
|
|
290
|
+
#if multiple indices used then calculate AAI index encoding for each and concatenate after each calculation
|
|
291
|
+
for index in range(0, len(aai_indices)):
|
|
292
|
+
|
|
293
|
+
#get values from aaindex record using its accession number and the aaindex package
|
|
294
|
+
encoded_aai = aaindex1[aai_indices[index]].values
|
|
295
|
+
|
|
296
|
+
#initialise temp arrays to store encoded sequences
|
|
297
|
+
temp_seq_vals = []
|
|
298
|
+
temp_all_seqs = []
|
|
299
|
+
|
|
300
|
+
#iterate through each protein sequence and amino acid, getting the AAI index encoding value
|
|
301
|
+
for protein in range(0, len(self.sequences)):
|
|
302
|
+
for aa in self.sequences[protein]:
|
|
303
|
+
temp_seq_vals.append(encoded_aai[aa])
|
|
304
|
+
|
|
305
|
+
#append encoding and reset temp array
|
|
306
|
+
temp_all_seqs.append(temp_seq_vals)
|
|
307
|
+
temp_seq_vals = []
|
|
308
|
+
|
|
309
|
+
#zero-pad encoding list so that sequences are all the same length
|
|
310
|
+
temp_all_seqs = zero_padding(temp_all_seqs)
|
|
311
|
+
|
|
312
|
+
#convert list of lists into array
|
|
313
|
+
temp_all_seqs = np.array(temp_all_seqs, dtype="float32")
|
|
314
|
+
|
|
315
|
+
#in first iteration through aai_indices (index=0) set encoded_aai_ to zero-initialised
|
|
316
|
+
#numpy array, else concatenate to the array in previous iteration
|
|
317
|
+
if (index == 0):
|
|
318
|
+
encoded_aai_ = temp_all_seqs
|
|
319
|
+
else:
|
|
320
|
+
encoded_aai_ = np.concatenate((encoded_aai_, temp_all_seqs), axis=1)
|
|
321
|
+
|
|
322
|
+
return encoded_aai_
|
|
323
|
+
|
|
324
|
+
def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, output_folder=""):
|
|
325
|
+
"""
|
|
326
|
+
Full pipeline for encoding proteins sequences in dataset using the input AAI indices
|
|
327
|
+
from the AAI database. If multiple indices/accession numbers input then calculate each
|
|
328
|
+
and concatenate them. Build predictive regression ML model from encoded AAI feature data
|
|
329
|
+
for predicting the activity/fitness values of unseen sequences.
|
|
330
|
+
|
|
331
|
+
The resulting model assets and its results will be exported to the directory pointed to
|
|
332
|
+
by the global var OUTPUT_DIR. If use_dsp config parameter is true then pass AAI
|
|
333
|
+
Indices through a DSP transformation pipeline specified by the config's DSP parameters
|
|
334
|
+
(spectrum, window & filter) via the PyDSP module and class.
|
|
335
|
+
|
|
336
|
+
Parameters
|
|
337
|
+
==========
|
|
338
|
+
:aai_indices: str/list (default=None)
|
|
339
|
+
string or list of indices/accession numbers from the AAI.
|
|
340
|
+
:show_plot: bool (default=False)
|
|
341
|
+
display regression plot of best predictive model. If False then the plot
|
|
342
|
+
will just be saved to the output folder, else it'll be displayed & also saved.
|
|
343
|
+
:print_results: bool (default=True)
|
|
344
|
+
if true, output verbose output of results and parameters from encoding process.
|
|
345
|
+
:output_folder: str (default="")
|
|
346
|
+
output folder to store results csv to, if empty input it will be stored in
|
|
347
|
+
the OUTPUT_FOLDER global var.
|
|
348
|
+
|
|
349
|
+
Returns
|
|
350
|
+
=======
|
|
351
|
+
:aai_df: pd.Dataframe
|
|
352
|
+
pandas Dataframe storing metrics and results of encoding.
|
|
353
|
+
"""
|
|
354
|
+
#validate AAI indices are present in the input parameter
|
|
355
|
+
if (aai_indices == None or aai_indices == "" or aai_indices == []):
|
|
356
|
+
raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')
|
|
357
|
+
|
|
358
|
+
#check input indices is of correct type (str/list), if not raise type error
|
|
359
|
+
if ((not isinstance(aai_indices, str)) and (not isinstance(aai_indices, list))):
|
|
360
|
+
raise TypeError(f"Input indices parameter must be a string or list, got {type(aai_indices)}.")
|
|
361
|
+
|
|
362
|
+
self.aai_indices = aai_indices
|
|
363
|
+
|
|
364
|
+
#if list of one element with multiple indices, split them into list of individual elements
|
|
365
|
+
if isinstance(self.aai_indices, list) and len(self.aai_indices) == 1:
|
|
366
|
+
self.aai_indices = self.aai_indices[0].replace(' ', '').split(',')
|
|
367
|
+
|
|
368
|
+
#convert string indices into comma seperated list, remove whitespace
|
|
369
|
+
if isinstance(self.aai_indices, str):
|
|
370
|
+
self.aai_indices = self.aai_indices.replace(' ', '').split(',')
|
|
371
|
+
|
|
372
|
+
#sort list of indices into alphabetical order
|
|
373
|
+
self.aai_indices.sort()
|
|
374
|
+
|
|
375
|
+
#dataframe to store encoding of inputted aai indices
|
|
376
|
+
aai_encoding_df = pd.DataFrame()
|
|
377
|
+
|
|
378
|
+
#iterate over each index, calculate its encoding, apply DSP functionality if applicable, concat into one dataframe
|
|
379
|
+
for index in self.aai_indices:
|
|
380
|
+
|
|
381
|
+
#get AAI index encodings specified by indices input parameter
|
|
382
|
+
encoded_seqs = self.get_aai_encoding(index)
|
|
383
|
+
|
|
384
|
+
#if use_dsp true then get protein spectra from encoded sequences via the AAI indices using PyDSP class,
|
|
385
|
+
#else use the AAI indices encoding's themselves as the feature/training data (X)
|
|
386
|
+
if (self.use_dsp):
|
|
387
|
+
#if input spectrum is none or empty, raise error.
|
|
388
|
+
if (self.spectrum == None or self.spectrum == ""):
|
|
389
|
+
raise ValueError(f'Spectrum cannot be None or empty: {self.spectrum}.')
|
|
390
|
+
pyDSP = PyDSP(self.config_file, protein_seqs=encoded_seqs)
|
|
391
|
+
X = pd.DataFrame(pyDSP.spectrum_encoding) #set training data to FFT spectrum encoding
|
|
392
|
+
else:
|
|
393
|
+
X = pd.DataFrame(encoded_seqs) #no DSP applied to encoded sequences
|
|
394
|
+
|
|
395
|
+
#concat encoding of current aai index with other encodings for training data
|
|
396
|
+
aai_encoding_df = pd.concat([aai_encoding_df, X], axis=1)
|
|
397
|
+
|
|
398
|
+
#renaming columns in format aai_X, where X is the amino acid number in the sequence
|
|
399
|
+
columns = ["aai_" + str(x) for x in range(1, len(aai_encoding_df.columns) + 1)]
|
|
400
|
+
aai_encoding_df.columns = columns
|
|
401
|
+
|
|
402
|
+
#set class variable to the training data feature space
|
|
403
|
+
self.feature_space = aai_encoding_df.shape
|
|
404
|
+
|
|
405
|
+
#create instance of model class of type specified by algorithm parameter using X and Y data
|
|
406
|
+
self.model = Model(aai_encoding_df, self.activity, self.algorithm, parameters=self.model_parameters)
|
|
407
|
+
|
|
408
|
+
#updating algorithm attribute
|
|
409
|
+
self.algorithm = repr(self.model)
|
|
410
|
+
|
|
411
|
+
#get training and test dataset split from model class
|
|
412
|
+
X_train, X_test, Y_train, Y_test = self.model.train_test_split(test_split=self.test_split)
|
|
413
|
+
|
|
414
|
+
#fit predictive model
|
|
415
|
+
self.model.fit()
|
|
416
|
+
|
|
417
|
+
#predict activity values for test data
|
|
418
|
+
Y_pred = self.model.predict()
|
|
419
|
+
|
|
420
|
+
#create instance of Evaluate class which will get all the evaluation metrics
|
|
421
|
+
evaluation = Evaluate(Y_test, Y_pred)
|
|
422
|
+
|
|
423
|
+
#get categories for all indices in self.aai_indices
|
|
424
|
+
index_cat = []
|
|
425
|
+
if (isinstance(self.aai_indices, list)):
|
|
426
|
+
for i in range(0, len(self.aai_indices)):
|
|
427
|
+
index_cat.append(aaindex1[self.aai_indices[i]].category)
|
|
428
|
+
else:
|
|
429
|
+
index_cat = [aaindex1[self.aai_indices].category]
|
|
430
|
+
|
|
431
|
+
#create comma seperated list of categories
|
|
432
|
+
index_cat = ', '.join(index_cat)
|
|
433
|
+
|
|
434
|
+
#create output dataframe, set first row to attribute/metric values
|
|
435
|
+
aai_df = pd.DataFrame(columns=['Index', 'Category', 'R2', 'RMSE', 'MSE', 'MAE', 'RPD', 'Explained Variance'])
|
|
436
|
+
aai_df.loc[0] = [', '.join(self.aai_indices), str(index_cat).strip(), evaluation.r2, evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]
|
|
437
|
+
|
|
438
|
+
#convert index and category from default Object type -> String datatypes
|
|
439
|
+
aai_df['Index'] = aai_df['Index'].astype(pd.StringDtype())
|
|
440
|
+
aai_df['Category'] = aai_df['Category'].astype(pd.StringDtype())
|
|
441
|
+
|
|
442
|
+
#print out results from encoding
|
|
443
|
+
if (print_results):
|
|
444
|
+
self.output_results(aai_df)
|
|
445
|
+
|
|
446
|
+
#plot regression plot for predictive model
|
|
447
|
+
plot_reg(Y_test, Y_pred, evaluation.r2, output_folder, show_plot)
|
|
448
|
+
|
|
449
|
+
#save results of encoding to output folder specified by input param
|
|
450
|
+
save_results(aai_df, 'aai_results', output_folder=output_folder)
|
|
451
|
+
|
|
452
|
+
return aai_df
|
|
453
|
+
|
|
454
|
+
def get_descriptor_encoding(self, descriptors=None):
|
|
455
|
+
"""
|
|
456
|
+
Calculate inputted descriptor(s), using the Descriptors class and custom-built
|
|
457
|
+
protpy package, requried for the encoding process. Get closest match to user
|
|
458
|
+
inputted string or list of descriptors using difflib library. If a single
|
|
459
|
+
descriptor is input then calculate it and return, if list of descriptors input
|
|
460
|
+
then calculate each descriptor's value and concatenate.
|
|
461
|
+
|
|
462
|
+
Parameters
|
|
463
|
+
==========
|
|
464
|
+
:descriptors: str/list (default=None)
|
|
465
|
+
string or list of protein descriptor names.
|
|
466
|
+
|
|
467
|
+
Returns
|
|
468
|
+
=======
|
|
469
|
+
:encoded_desc: pd.DataFrame
|
|
470
|
+
pandas dataframe of calculated descriptor values according to user
|
|
471
|
+
inputted descriptor(s).
|
|
472
|
+
"""
|
|
473
|
+
#raise error if no descriptors specified in input
|
|
474
|
+
if (descriptors == None or descriptors == "" or descriptors == []):
|
|
475
|
+
raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')
|
|
476
|
+
|
|
477
|
+
#check input descriptor is of correct type str or list, if not raise type error
|
|
478
|
+
if (not isinstance(descriptors, str) and (not isinstance(descriptors, list))):
|
|
479
|
+
raise TypeError(f"Input descriptor parameter must be a str or list, got {type(descriptors)}.")
|
|
480
|
+
|
|
481
|
+
#cast descriptors parameter to a list if it is a str by creating comma seperated list
|
|
482
|
+
if (isinstance(descriptors, str)):
|
|
483
|
+
descriptors = descriptors.split(',')
|
|
484
|
+
|
|
485
|
+
#remove any leading or trailing whitespace from descriptors
|
|
486
|
+
descriptors = [de.strip() for de in descriptors]
|
|
487
|
+
|
|
488
|
+
#reuse cached Descriptors instance created in __init__
|
|
489
|
+
descr = self.descriptor
|
|
490
|
+
|
|
491
|
+
#store list of correct descriptor names from ones user input using the difflib library
|
|
492
|
+
temp_descriptors = []
|
|
493
|
+
|
|
494
|
+
#get closest valid available descriptor name from input descriptor parameter,
|
|
495
|
+
#if a list of descriptors passed in as the input parameter then get
|
|
496
|
+
#all valid descriptors in list
|
|
497
|
+
for de in range(0, len(descriptors)):
|
|
498
|
+
desc_matches = get_close_matches(descriptors[de],
|
|
499
|
+
descr.valid_descriptors, cutoff=0.6)
|
|
500
|
+
if (desc_matches == []):
|
|
501
|
+
raise ValueError(f'No approximate descriptor found from one input: {de}.')
|
|
502
|
+
descriptors[de] = desc_matches[0]
|
|
503
|
+
temp_descriptors.append(desc_matches[0])
|
|
504
|
+
|
|
505
|
+
#initialise temp lists and DF to store encoded descriptor values
|
|
506
|
+
encoded_desc_temp = []
|
|
507
|
+
encoded_desc_vals = []
|
|
508
|
+
encoded_desc_temp = pd.DataFrame()
|
|
509
|
+
|
|
510
|
+
#iterate and get each descriptors' values using Descriptor class and protpy package
|
|
511
|
+
for d in range(0, len(descriptors)):
|
|
512
|
+
encoded_desc_temp = descr.get_descriptor_encoding(descriptors[d])
|
|
513
|
+
#raise value error if descriptor is empty/None
|
|
514
|
+
if (encoded_desc_temp.empty):
|
|
515
|
+
raise ValueError(f'Descriptor cannot be empty or None: {descriptors[d]}.')
|
|
516
|
+
encoded_desc_vals.append(encoded_desc_temp) #append to array of all descriptor values
|
|
517
|
+
encoded_desc_temp = pd.DataFrame() #reset to empty dataframe for next iteration
|
|
518
|
+
|
|
519
|
+
#concatenate dataframes of descriptors
|
|
520
|
+
encoded_desc = pd.concat(encoded_desc_vals, axis=1)
|
|
521
|
+
|
|
522
|
+
return encoded_desc
|
|
523
|
+
|
|
524
|
+
def encode_descriptor(self, descriptors=None, show_plot=False, print_results=True, output_folder=""):
|
|
525
|
+
"""
|
|
526
|
+
|
|
527
|
+
Full pipeline for encoding the protein sequences in the dataset using protein
|
|
528
|
+
physicochemical, biochemical and or structural descriptors, using the Descriptors
|
|
529
|
+
class and custom-built protpy package, and build predictive ML regression model
|
|
530
|
+
from the descriptor feature/training data. This model is then used to calculate
|
|
531
|
+
the activity/fitness value of unseen test sequences. If multiple descriptors input
|
|
532
|
+
then calculate each and concatenate them. The resulting model assets and its metric's
|
|
533
|
+
results will be exported to the directory pointed to by the global variable OUTPUT_DIR.
|
|
534
|
+
|
|
535
|
+
Parameters
|
|
536
|
+
==========
|
|
537
|
+
:descriptors: str/list (default=None)
|
|
538
|
+
string or list of protein descriptor names.
|
|
539
|
+
:show_plot: bool (default=False)
|
|
540
|
+
display regression plot of best predictive model. If False then the plot
|
|
541
|
+
will just be saved to the output folder, else it'll be displayed & also saved.
|
|
542
|
+
:print_results: bool (default=True)
|
|
543
|
+
if true, output verbose output of results and parameters from encoding process.
|
|
544
|
+
:output_folder: str (default="")
|
|
545
|
+
output folder to store results csv to, if empty input it will be stored in
|
|
546
|
+
the OUTPUT_FOLDER global var.
|
|
547
|
+
|
|
548
|
+
Returns
|
|
549
|
+
=======
|
|
550
|
+
:desc_df: pd.DataFrame
|
|
551
|
+
pandas dataframe storing metrics and results of encoding.
|
|
552
|
+
"""
|
|
553
|
+
#raise error if no descriptor specified in input
|
|
554
|
+
if (descriptors == None or descriptors == ""):
|
|
555
|
+
raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')
|
|
556
|
+
|
|
557
|
+
#check input descriptor is of correct type (str or list), if not raise type error
|
|
558
|
+
if (not (isinstance(descriptors, str))) and (not (isinstance(descriptors, list))):
|
|
559
|
+
raise TypeError(f"Input descriptor parameter must be a string or list, got {type(descriptors)}.")
|
|
560
|
+
|
|
561
|
+
#set class attribute
|
|
562
|
+
self.descriptors = descriptors
|
|
563
|
+
|
|
564
|
+
#if multiple descriptors input as str, split into comma seperated list
|
|
565
|
+
if isinstance(self.descriptors, str):
|
|
566
|
+
self.descriptors = self.descriptors.replace(' ', '').split(',')
|
|
567
|
+
|
|
568
|
+
#if list of multiple descriptors input in one string, seperate into commas seperated list of individual elements
|
|
569
|
+
if isinstance(self.descriptors, list) and len(self.descriptors) == 1:
|
|
570
|
+
self.descriptors = self.descriptors[0].replace(' ', '').split(',')
|
|
571
|
+
|
|
572
|
+
#sort list of descriptors into alphabetical order
|
|
573
|
+
self.descriptors.sort()
|
|
574
|
+
|
|
575
|
+
#reuse cached Descriptors instance created in __init__
|
|
576
|
+
descr = self.descriptor
|
|
577
|
+
|
|
578
|
+
#pandas dataframe to store all output results
|
|
579
|
+
desc_df = pd.DataFrame(columns=['Descriptor', 'Group', 'R2', 'RMSE', 'MSE', 'MAE', 'RPD', 'Explained Variance'])
|
|
580
|
+
|
|
581
|
+
#object to store sequence encodings for each input descriptor
|
|
582
|
+
descriptor_encoding_df = pd.DataFrame()
|
|
583
|
+
|
|
584
|
+
#iterate over each input descriptor, calculate its encoding from its respective function, concatenate with main encoding object
|
|
585
|
+
for desc in range(0, len(self.descriptors)):
|
|
586
|
+
|
|
587
|
+
#get closest matching descriptor from descriptor input parameter using difflib library
|
|
588
|
+
desc_matches = get_close_matches(self.descriptors[desc], descr.valid_descriptors, cutoff=0.6)
|
|
589
|
+
if (desc_matches != []):
|
|
590
|
+
self.descriptors[desc] = desc_matches[0]
|
|
591
|
+
else:
|
|
592
|
+
raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n{}.'.
|
|
593
|
+
format(self.descriptors[desc], descr.valid_descriptors))
|
|
594
|
+
|
|
595
|
+
#concatenate encoding of current descriptor to main encodng object
|
|
596
|
+
descriptor_encoding_df = pd.concat([descriptor_encoding_df, self.get_descriptor_encoding(descriptors=self.descriptors[desc])], axis=1)
|
|
597
|
+
|
|
598
|
+
#set class variable to the training data feature space
|
|
599
|
+
self.feature_space = descriptor_encoding_df.shape
|
|
600
|
+
|
|
601
|
+
#create instance of model class of type specified by algorithm parameter using X and Y data
|
|
602
|
+
self.model = Model(descriptor_encoding_df, self.activity, self.algorithm, parameters=self.model_parameters)
|
|
603
|
+
|
|
604
|
+
#updating algorithm attribute
|
|
605
|
+
self.algorithm = repr(self.model)
|
|
606
|
+
|
|
607
|
+
#get training and test dataset split using Model class
|
|
608
|
+
X_train, X_test, Y_train, Y_test = self.model.train_test_split(test_split=self.test_split)
|
|
609
|
+
|
|
610
|
+
#fit predictive model
|
|
611
|
+
self.model.fit()
|
|
612
|
+
|
|
613
|
+
#predict activity values for test data
|
|
614
|
+
Y_pred = self.model.predict()
|
|
615
|
+
|
|
616
|
+
#create instance of Evaluate class which will get all the evaluation metrics
|
|
617
|
+
evaluation = Evaluate(Y_test, Y_pred)
|
|
618
|
+
|
|
619
|
+
#get groups for all descriptors in self.desciptors, put multiple descriptor groups into comma seperated list
|
|
620
|
+
if (isinstance(self.descriptors, list)):
|
|
621
|
+
desc_group = []
|
|
622
|
+
for desc_ in self.descriptors:
|
|
623
|
+
desc_group.append(descr.descriptor_groups[desc_])
|
|
624
|
+
desc_group = ', '.join(desc_group)
|
|
625
|
+
else:
|
|
626
|
+
desc_group = descr.descriptor_groups[self.descriptors]
|
|
627
|
+
|
|
628
|
+
#add metric values to output dataframe
|
|
629
|
+
desc_df.loc[0] = [', '.join(self.descriptors), desc_group, evaluation.r2, evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]
|
|
630
|
+
|
|
631
|
+
#convert Descriptor and Group from default Object type -> String datatypes
|
|
632
|
+
desc_df['Descriptor'] = desc_df['Descriptor'].astype(pd.StringDtype())
|
|
633
|
+
desc_df['Group'] = desc_df['Group'].astype(pd.StringDtype())
|
|
634
|
+
|
|
635
|
+
#ensure aai indices attribute doesn't show up in output results
|
|
636
|
+
if (self.aai_indices != None):
|
|
637
|
+
self.aai_indices = None
|
|
638
|
+
|
|
639
|
+
#print out results from encoding
|
|
640
|
+
if (print_results):
|
|
641
|
+
self.output_results(desc_df)
|
|
642
|
+
|
|
643
|
+
#plot regression plot for predictive model
|
|
644
|
+
plot_reg(Y_test, Y_pred, evaluation.r2, output_folder, show_plot)
|
|
645
|
+
|
|
646
|
+
#save results of encoding to output folder
|
|
647
|
+
save_results(desc_df, 'desc_results', output_folder=output_folder)
|
|
648
|
+
|
|
649
|
+
#reset descriptors instance variable
|
|
650
|
+
self.descriptors = None
|
|
651
|
+
|
|
652
|
+
return desc_df
|
|
653
|
+
|
|
654
|
+
def encode_aai_descriptor(self, aai_indices=None, descriptors=None, show_plot=False, print_results=True, output_folder=""):
    """
    Encode the protein sequences using both AAI indices and protein descriptors,
    via the get_aai_encoding() and get_descriptor_encoding() functions. The two
    encodings are concatenated column-wise and used to build and evaluate a
    predictive regression ML model. The regression plot and the results are
    exported through plot_reg() and save_results() using the output_folder
    parameter. If the config parameter use_dsp is true then the AAI Indices are
    passed through a DSP transformation pipeline specified by the DSP parameters
    (spectrum, window & filter) via the PyDSP class/module.

    The input lists are never modified: the method works on sorted copies, which
    are stored in the aai_indices and descriptors instance attributes.

    Parameters
    ==========
    :aai_indices: str/list (default=None)
        string or list of indices/accession numbers from the AAI database. A
        string, or a list holding a single string, may contain several comma
        separated indices.
    :descriptors: str/list (default=None)
        string or list of protein descriptors names. A string, or a list holding
        a single string, may contain several comma separated names. Each name is
        replaced by its closest match in the list of valid descriptors.
    :show_plot: bool (default=False)
        display regression plot of best predictive model. If false then the plot
        will just be saved to the output folder, else it'll be displayed & also saved.
    :print_results: bool (default=True)
        if true, output verbose output of results and parameters from encoding process.
    :output_folder: str (default="")
        output folder to store results csv to, if empty input it will be stored in
        the OUTPUT_FOLDER global var.

    Returns
    =======
    :aai_desc_df: pd.DataFrame
        pandas dataframe storing metrics and results of encoding.

    Raises
    ======
    ValueError: if either input is None or an empty string, if the AAI encoding
        is empty, or if a descriptor has no close match among the valid descriptors.
    TypeError: if either input is neither a string nor a list.
    """
    def _to_sorted_names(value):
        #a string, or a list holding one string, may carry several comma separated names;
        #sorted() always returns a new list so the caller's object is left untouched
        if isinstance(value, list) and len(value) == 1:
            return sorted(value[0].replace(' ', '').split(','))
        if isinstance(value, str):
            return sorted(value.replace(' ', '').split(','))
        return sorted(value)

    #validate AAI indices and Descriptors are present in the input parameters, raise error if either is None or empty
    if (descriptors is None or descriptors == "") or (aai_indices is None or aai_indices == ""):
        raise ValueError('AAI Indices and Descriptor input parameters must not be empty or None.')

    #check input descriptor & indices are of correct type (str/list), if not raise type error
    if not isinstance(aai_indices, (str, list)) or not isinstance(descriptors, (str, list)):
        raise TypeError("Input AAI indices and descriptors parameter must be of type string or list.")

    #set instance attributes to alphabetically sorted copies of the inputs
    self.aai_indices = _to_sorted_names(aai_indices)
    self.descriptors = _to_sorted_names(descriptors)

    #create output results Dataframe
    aai_desc_df = pd.DataFrame(columns=['Index', 'Category', 'Descriptor', 'Group', 'R2', 'RMSE',
        'MSE', 'MAE', 'RPD', 'Explained Variance'])

    #calculate the encoding of each index code and join them column-wise in a single concat
    aai_frames = [pd.DataFrame(self.get_aai_encoding(index)) for index in self.aai_indices]
    aai_encoding_df = pd.concat(aai_frames, axis=1) if aai_frames else pd.DataFrame()

    #renaming columns in format aai_X, where X is the 1-based position of the column
    aai_encoding_df.columns = ["aai_" + str(x) for x in range(1, len(aai_encoding_df.columns) + 1)]

    #if AAI indices encoding is empty, raise error
    if (aai_encoding_df.empty):
        raise ValueError(f'AAI Indices encoding cannot be empty or None: {aai_indices}.')

    #reuse cached Descriptors instance from __init__
    descr = self.descriptor

    #resolve each input descriptor to its closest valid name, then calculate its encoding
    descriptor_frames = []
    for position, name in enumerate(self.descriptors):

        #get closest matching descriptor from descriptor input parameter using difflib library
        desc_matches = get_close_matches(name, descr.valid_descriptors, cutoff=0.6)
        if not desc_matches:
            raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n{}.'.
                format(name, descr.valid_descriptors))
        self.descriptors[position] = desc_matches[0]

        descriptor_frames.append(self.get_descriptor_encoding(descriptors=desc_matches[0]))

    #join all descriptor encodings column-wise
    descriptor_encoding_df = pd.concat(descriptor_frames, axis=1) if descriptor_frames else pd.DataFrame()

    #reset index for aai indices and descriptors dataframes so their rows line up when concatenated
    aai_encoding_df.reset_index(inplace=True, drop=True)
    descriptor_encoding_df.reset_index(inplace=True, drop=True)

    #concatenate AAI index and Descriptor features to get training data (X)
    X = pd.concat([aai_encoding_df, descriptor_encoding_df], axis=1)

    #set class variable to the training data feature space
    self.feature_space = X.shape

    #create instance of model class of type specified by algorithm parameter using X and Y data
    self.model = Model(X, self.activity, self.algorithm, parameters=self.model_parameters)

    #updating algorithm attribute
    self.algorithm = repr(self.model)

    #get training and test dataset split using Model class, only the test labels are needed here
    _, _, _, Y_test = self.model.train_test_split(test_split=self.test_split)

    #fit predictive model
    self.model.fit()

    #predict activity values for test data
    Y_pred = self.model.predict()

    #create instance of Evaluate class which will get all the evaluation metrics
    evaluation = Evaluate(Y_test, Y_pred)

    #get categories for all indices in self.aai_indices as a comma separated string
    index_cat = ', '.join([aaindex1[index].category for index in self.aai_indices])

    #get groups for all descriptors in self.descriptors as a comma separated string
    desc_group = ', '.join([descr.descriptor_groups[name] for name in self.descriptors])

    #set output dataframe columns
    aai_desc_df.loc[0] = [', '.join(self.aai_indices), str(index_cat).strip(), ', '.join(self.descriptors), str(desc_group), evaluation.r2,
        evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]

    #convert Index, Category, Descriptor and Group from default Object type -> String datatypes
    for text_column in ('Index', 'Category', 'Descriptor', 'Group'):
        aai_desc_df[text_column] = aai_desc_df[text_column].astype(pd.StringDtype())

    #print out results from encoding
    if (print_results):
        self.output_results(aai_desc_df)

    #plot regression plot for predictive model
    plot_reg(Y_test, Y_pred, evaluation.r2, output_folder, show_plot)

    #save results of encoding to output folder
    save_results(aai_desc_df, 'aai_desc_results', output_folder=output_folder)

    return aai_desc_df
|
|
830
|
+
|
|
831
|
+
def output_results(self, results):
    """
    Print a report of the encoding/model parameters held on the instance,
    followed by the evaluation metrics of the predictive model.

    Parameters
    ==========
    :results: pd.DataFrame
        metrics and their associated values. Must support results[column].values
        for the columns 'R2', 'RMSE', 'MSE', 'MAE', 'RPD' and 'Explained Variance';
        only the first value of each column is printed.

    Returns
    =======
    None
    """
    #maximum line width used when wrapping the indices, descriptors and model parameters text
    line_length = 90

    print('##########################################################################################')
    print('###################################### Parameters ########################################\n')

    #AAI indices are only listed when they are set and there are at most 10 of them
    indices = self.aai_indices
    if indices is not None and len(indices) <= 10:
        print(textwrap.fill('# AAI Indices: {}'.format(', '.join(indices)), line_length))

    #DSP settings are only relevant when the DSP pipeline is enabled
    if self.use_dsp:
        dsp_template = ('# DSP Parameters:\n # Spectrum: {}\n # Window Function: {} '
            '\n # Filter Function: {}')
        print(dsp_template.format(self.spectrum, self.window_type, self.filter_type))

    if self.descriptors is not None:
        print(textwrap.fill('# Descriptors: {}'.format(', '.join(self.descriptors)), line_length))

    summary_template = ('# Configuration File: {}\n# Dataset: {}\n# Number of Sequences/Sequence Length: {} x {} '
        '\n# Target Activity: {}')
    print(summary_template.format(os.path.basename(self.config_file), self.dataset, self.num_seqs,
        self.sequence_length, self.activity_col))
    print(f"# Algorithm: {repr(self.model)}")

    #fall back to the fitted model's own parameters when none were supplied by the user
    params_missing = (self.model_parameters is None) or (self.model_parameters == "") or (self.model_parameters == {})
    shown_parameters = self.model.model.get_params() if params_missing else self.model_parameters
    print(textwrap.fill(f'# Model Parameters: {shown_parameters}', line_length))
    print(f'# Test Split: {self.test_split}\n# Feature Space: {self.feature_space}')

    print('\n##########################################################################################')
    print('######################################## Results #########################################\n')

    #each entry pairs the exact output template with the results column it reports
    metric_templates = (
        ('# R2: {}', 'R2'),
        ('# RMSE: {} ', 'RMSE'),
        ('# MSE: {} ', 'MSE'),
        ('# MAE: {}', 'MAE'),
        ('# RPD {}', 'RPD'),
        ('# Explained Variance {}\n', 'Explained Variance'),
    )
    for template, column in metric_templates:
        print(template.format(results[column].values[0]))
    print('##########################################################################################\n')
|
|
875
|
+
|
|
876
|
+
###################### Getters & Setters ######################
|
|
877
|
+
|
|
878
|
+
@property
def dataset(self):
    """Return the dataset value held in the private _dataset attribute."""
    return self._dataset

@dataset.setter
def dataset(self, val):
    """Store val in the private _dataset attribute; no validation is applied."""
    self._dataset = val
|
|
885
|
+
|
|
886
|
+
@property
def sequences(self):
    """Return the sequences value held in the private _sequences attribute."""
    return self._sequences

@sequences.setter
def sequences(self, val):
    """Store val in the private _sequences attribute; no validation is applied."""
    self._sequences = val
|
|
893
|
+
|
|
894
|
+
@property
def sequence_col(self):
    """Return the value held in the private _sequence_col attribute."""
    return self._sequence_col

@sequence_col.setter
def sequence_col(self, val):
    """Store val in the private _sequence_col attribute; no validation is applied."""
    self._sequence_col = val
|
|
901
|
+
|
|
902
|
+
@property
def activity_col(self):
    """Return the value held in the private _activity_col attribute (reported as the target activity in the results output)."""
    return self._activity_col

@activity_col.setter
def activity_col(self, val):
    """Store val in the private _activity_col attribute; no validation is applied."""
    self._activity_col = val
|
|
909
|
+
|
|
910
|
+
@property
def activity(self):
    """Return the value held in the private _activity attribute (passed to Model as the target data)."""
    return self._activity

@activity.setter
def activity(self, val):
    """Store val in the private _activity attribute; no validation is applied."""
    self._activity = val
|
|
917
|
+
|
|
918
|
+
@property
def algorithm(self):
    """Return the value held in the private _algorithm attribute (passed to Model when building the predictive model)."""
    return self._algorithm

@algorithm.setter
def algorithm(self, val):
    """Store val in the private _algorithm attribute; no validation is applied."""
    self._algorithm = val
|
|
925
|
+
|
|
926
|
+
@property
def model_parameters(self):
    """Return the value held in the private _model_parameters attribute (passed to Model as its parameters)."""
    return self._model_parameters

@model_parameters.setter
def model_parameters(self, val):
    """Store val in the private _model_parameters attribute; no validation is applied."""
    self._model_parameters = val
|
|
933
|
+
|
|
934
|
+
@property
def test_split(self):
    """Return the value held in the private _test_split attribute (passed to Model.train_test_split)."""
    return self._test_split

@test_split.setter
def test_split(self, val):
    """Store val in the private _test_split attribute; no validation is applied."""
    self._test_split = val
|
|
941
|
+
|
|
942
|
+
@property
def num_seqs(self):
    """Return the value held in the private _num_seqs attribute (reported as the number of sequences in the results output)."""
    return self._num_seqs

@num_seqs.setter
def num_seqs(self, val):
    """Store val in the private _num_seqs attribute; no validation is applied."""
    self._num_seqs = val
|
|
949
|
+
|
|
950
|
+
@property
def sequence_length(self):
    """Return the value held in the private _sequence_length attribute (reported as the sequence length in the results output)."""
    return self._sequence_length

@sequence_length.setter
def sequence_length(self, val):
    """Store val in the private _sequence_length attribute; no validation is applied."""
    self._sequence_length = val
|
|
957
|
+
|
|
958
|
+
# def __str__(self):
|
|
959
|
+
# return "Instance of PySAR class, using parameters: {}.".format(self.__dict__)
|
|
960
|
+
|
|
961
|
+
# def __repr__(self):
|
|
962
|
+
# return "<PySAR: {}>".format(self)
|