PySAR 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +53 -0
- pySAR/__init__.py +28 -0
- pySAR/descriptors.py +2893 -0
- pySAR/encoding.py +986 -0
- pySAR/evaluate.py +231 -0
- pySAR/globals_.py +21 -0
- pySAR/model.py +559 -0
- pySAR/plots.py +92 -0
- pySAR/py.typed +0 -0
- pySAR/pyDSP.py +582 -0
- pySAR/pySAR.py +962 -0
- pySAR/utils.py +283 -0
- pysar-2.5.0.dist-info/METADATA +740 -0
- pysar-2.5.0.dist-info/RECORD +17 -0
- pysar-2.5.0.dist-info/WHEEL +5 -0
- pysar-2.5.0.dist-info/licenses/LICENSE +21 -0
- pysar-2.5.0.dist-info/top_level.txt +2 -0
pySAR/encoding.py
ADDED
|
@@ -0,0 +1,986 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# Encoding #################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import os
|
|
7
|
+
import time
|
|
8
|
+
import itertools
|
|
9
|
+
import logging
|
|
10
|
+
import threading
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, Callable
|
|
15
|
+
from tqdm import tqdm
|
|
16
|
+
import textwrap
|
|
17
|
+
|
|
18
|
+
from aaindex import aaindex1
|
|
19
|
+
from .model import Model
|
|
20
|
+
from .pyDSP import PyDSP
|
|
21
|
+
from .evaluate import Evaluate
|
|
22
|
+
from .pySAR import PySAR
|
|
23
|
+
from .utils import save_results
|
|
24
|
+
from .descriptors import Descriptors
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MetricKey(str, Enum):
    """
    Canonical column names used in the encoding results dataframes.

    Every member is also a plain ``str`` (the class mixes in ``str``), so a
    member compares equal to its text value. The first four members label
    what was encoded, the remaining six are the evaluation metric columns.
    """
    # --- identifier / label columns ---
    INDEX = 'Index'
    CATEGORY = 'Category'
    DESCRIPTOR = 'Descriptor'
    GROUP = 'Group'

    # --- evaluation metric columns ---
    R2 = 'R2'
    RMSE = 'RMSE'
    MSE = 'MSE'
    MAE = 'MAE'
    RPD = 'RPD'
    EXPLAINED_VARIANCE = 'Explained Variance'
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SortKey(str, Enum):
    """
    Metric columns that a results dataframe may be sorted by.

    Each value is taken from the matching MetricKey member so that the sort
    options and the results column names cannot drift apart.
    """
    R2 = MetricKey.R2.value
    RMSE = MetricKey.RMSE.value
    MSE = MetricKey.MSE.value
    MAE = MetricKey.MAE.value
    RPD = MetricKey.RPD.value
    EXPLAINED_VARIANCE = MetricKey.EXPLAINED_VARIANCE.value
|
|
49
|
+
|
|
50
|
+
class Encoding(PySAR):
|
|
51
|
+
"""
|
|
52
|
+
The use-case of this class is when you have a dataset of protein sequences with
|
|
53
|
+
a sought-after protein activity/fitness value and you want to measure this activity
|
|
54
|
+
value for new and unseen sequences that have not had their activity value
|
|
55
|
+
experimentally measured. Prior to protein sequences being passed into ML models,
|
|
56
|
+
the amino acids have to be numerically encoded. The encoding class allows for
|
|
57
|
+
evaluation of a variety of potential techniques at which to numerically encode the
|
|
58
|
+
protein sequences, allowing for the building of predictive regression ML models
|
|
59
|
+
that can ultimately predict the activity value of an unseen protein sequence by
|
|
60
|
+
mapping a relationship between sequence and activity/function. The strategies each
|
|
61
|
+
generate a huge number of potential models built on a plethora of available features
|
|
62
|
+
that you can then assess for performance and predictability, selecting the
|
|
63
|
+
best-performing model out of all those evaluated. This best-performing model should
|
|
64
|
+
then be used when you want to predict the activity/fitness value for new sequences.
|
|
65
|
+
|
|
66
|
+
The encoding class inherits from the main PySAR module and allows for a
|
|
67
|
+
dataset of protein sequences to be encoded through 3 main strategies: AAI Indices,
|
|
68
|
+
protein Descriptors and AAI Indices + protein Descriptors. The encoding class
|
|
69
|
+
and its methods differ from the PySAR class by allowing for the encoding using
|
|
70
|
+
all available features, in comparison to the PySAR class which is mainly used for
|
|
71
|
+
accessing individual or a small subset of features.
|
|
72
|
+
|
|
73
|
+
To date, there are 566 indices supported in the AAI and pySAR supports 33 different
|
|
74
|
+
descriptors. The features can be encoded using different combinations, for example,
|
|
75
|
+
1, 2 or 3 descriptors can be used for the descriptor and AAI + Descriptor encoding
|
|
76
|
+
strategies. In total, this class supports over 410,000 possible ways at which to
|
|
77
|
+
numerically encode the protein sequences in the building of a predictive ML model
|
|
78
|
+
for mapping these sequences to a particular activity/function, known as a
|
|
79
|
+
Sequence-Activity-Relationship (SAR) or Sequence-Function-Relationship (SFR).
|
|
80
|
+
|
|
81
|
+
Parameters
|
|
82
|
+
==========
|
|
83
|
+
:config_file: (str)
|
|
84
|
+
path to configuration file with all required parameters for the pySAR encoding
|
|
85
|
+
pipeline.
|
|
86
|
+
**kwargs: dict
|
|
87
|
+
keyword arguments and values passed into constructor. The keywords should be
|
|
88
|
+
the same name and form of those in the configuration file. The keyword values
|
|
89
|
+
input take precedence over those in the config files.
|
|
90
|
+
|
|
91
|
+
Methods
|
|
92
|
+
=======
|
|
93
|
+
aai_encoding(aai_indices=None, sort_by='R2', output_folder=""):
|
|
94
|
+
encoding protein sequences using indices from the AAI and aaindex package.
|
|
95
|
+
descriptor_encoding(descriptors=None, desc_combo=1, sort_by='R2', output_folder=""):
|
|
96
|
+
encoding protein sequences using protein descriptors from descriptors module and protpy package.
|
|
97
|
+
aai_descriptor_encoding(aai_indices=None, descriptors=None, desc_combo=1, sort_by='R2', output_folder=""):
|
|
98
|
+
encoding protein sequences using indices from the AAI in concatenation with
|
|
99
|
+
the protein descriptors from the descriptors module and protpy package.
|
|
100
|
+
"""
|
|
101
|
+
def __init__(self,
             config_file: str = "",
             verbose: bool = True,
             logger: Optional[logging.Logger] = None,
             **kwargs: Any) -> None:
    """
    Store the encoding-specific settings, create the empty feature caches
    and then initialise the parent class.

    Parameters
    ==========
    :config_file: str (default="")
        path to the configuration file, stored on the instance and passed
        on to the parent constructor.
    :verbose: bool (default=True)
        stored on the instance as self.verbose.
    :logger: logging.Logger (default=None)
        optional logger, stored on the instance as self.logger.
    **kwargs: dict
        additional keyword arguments passed unchanged to the parent
        constructor.
    """
    #settings supplied by the caller
    self.config_file = config_file
    self.verbose = verbose
    self.logger = logger

    #per-instance feature caches, both start out empty
    self._aai_feature_cache: Dict[str, pd.DataFrame] = {}
    self._descriptor_feature_cache: Dict[str, pd.DataFrame] = {}

    #single lock that guards both caches for thread safety
    self._cache_lock: threading.Lock = threading.Lock()

    #hand config file and kwargs over to the parent pySAR class
    super().__init__(self.config_file, **kwargs)
|
|
116
|
+
|
|
117
|
+
def aai_encoding(self,
                 aai_indices: Optional[Union[str, List[str]]] = None,
                 sort_by: str = 'R2',
                 output_folder: str = "",
                 n_jobs: int = 1,
                 random_state: Optional[int] = None,
                 max_models: Optional[int] = None,
                 sample_mode: bool = False,
                 resume: bool = False,
                 resume_file: str = "") -> pd.DataFrame:
    """
    Build and evaluate one predictive regression model per AAI index.

    Every selected index from the AAI (via the aaindex package) is used in
    turn to numerically encode the protein sequences; the encoding is then
    used as the feature data for a model whose metrics are recorded. If
    use_dsp is true on the instance, the DSP settings (spectrum, window and
    filter) are reported in the run summary. With all 566 indices of the
    AAI this strategy produces 566 models. The collected metrics are
    formatted, saved and returned, sorted by R2 unless another metric
    (RMSE, MSE, MAE, RPD or Explained Variance) is given in sort_by.

    Parameters
    ==========
    :aai_indices: str/list (default=None)
        str/list of AAI indices to encode with, ALL available indices are
        used when this is left as None.
    :sort_by: str (default=R2)
        column/metric the output dataframe is sorted by.
    :output_folder: str (default="")
        folder the results csv is written to, if empty the OUTPUT_FOLDER
        global var is used.
    :n_jobs: int (default=1)
        passed on to the internal job runner; presumably the number of
        worker threads used to evaluate the indices.
    :random_state: int (default=None)
        passed on to run_model for every model that is built.
    :max_models: int (default=None)
        passed on, together with sample_mode, to the model-limit step that
        may reduce the list of indices before any model is built.
    :sample_mode: bool (default=False)
        passed on to the model-limit step, see max_models.
    :resume: bool (default=False)
        when true, resume_file is used to load earlier results, skip the
        indices already present in it and store a checkpoint afterwards.
        When false, resume_file is ignored.
    :resume_file: str (default="")
        path of the resume/checkpoint file, only used if resume is true.

    Returns
    =======
    :aaindex_metrics_df: pd.DataFrame
        dataframe of metric values for the models built with the AAI
        encoding strategy, of shape X x 8 where X is the number of indices
        used and 8 is the number of result columns. With no indices passed
        in the shape is 566 x 8.
    """
    #the checkpoint path is only honoured when resuming has been requested
    checkpoint = resume_file if resume else None

    #resolve the indices to encode with, optionally limited in number
    indices = self.validate_inputs(aai_indices, aaindex1.record_codes(), "AAI")
    indices = self._apply_model_limit(indices, sample_mode=sample_mode, max_models=max_models)
    rows, done_keys = self._load_resume(
        checkpoint,
        key_columns=[MetricKey.INDEX.value]
    )

    #maximum width used when wrapping the longer summary lines
    wrap_width = 90

    #throwaway Model object, only needed to display the algorithm and its parameters
    display_model = Model(
        X=[],
        Y=self.activity,
        algorithm=self.algorithm,
        parameters=self.model_parameters
    )

    #metric result columns paired with the attribute holding each value
    metric_fields = (
        (MetricKey.R2.value, 'r2'),
        (MetricKey.RMSE.value, 'rmse'),
        (MetricKey.MSE.value, 'mse'),
        (MetricKey.MAE.value, 'mae'),
        (MetricKey.RPD.value, 'rpd'),
        (MetricKey.EXPLAINED_VARIANCE.value, 'explained_var'),
    )
    result_columns = [MetricKey.INDEX.value, MetricKey.CATEGORY.value]
    result_columns += [column for column, _ in metric_fields]

    self._log('\n##########################################################################################\n')
    self._log(f'# Encoding using {len(indices)} AAI combination(s) with the parameters:\n')
    #individual indices are only listed when there are 10 or fewer of them
    if len(indices) > 10:
        self._log(f'# AAI Indices: {len(indices)}')
    else:
        self._log(textwrap.fill(f"# AAI Indices: {', '.join(indices)}", wrap_width))
    if self.use_dsp:
        self._log(
            f'# DSP Parameters:\n# Spectrum: {self.spectrum}\n# Window Function: {self.window_type}\n# Filter Function: {self.filter_type}'
        )
    run_summary = (
        f'# Configuration File: {os.path.basename(self.config_file)}\n'
        f'# Dataset: {os.path.basename(self.dataset)}\n'
        f'# Number of Sequences/Sequence Length: {self.num_seqs} x {self.sequence_length}\n'
        f'# Target Activity: {self.activity_col}\n'
        f'# Algorithm: {repr(display_model)}'
    )
    self._log(run_summary)
    #show the user-supplied parameters if there are any, otherwise the model's own
    if isinstance(self.model_parameters, dict) and self.model_parameters:
        shown_parameters = self.model_parameters
    else:
        shown_parameters = display_model.model.get_params()
    self._log(textwrap.fill(f'# Model Parameters: {shown_parameters}', wrap_width))
    self._log(f'# Test Split: {self.test_split}')
    self._log('\n##########################################################################################\n')

    # Per index:
    #   1.) build the feature data from the AAI index.
    #   2.) build, predict and evaluate a model on those features.
    #   3.) record the index, its category and the calculated metrics.
    # Afterwards all rows are formatted into a dataframe, saved and returned.

    #start time counter
    started_at = time.time()

    #indices still to be evaluated, anything found in the resume data is
    #left out; the progress bar is disabled for 5 or fewer indices
    todo = [code for code in indices if (code,) not in done_keys]
    hide_progress = len(todo) <= 5

    def _run_index(index: str) -> Dict[str, Any]:
        """ Encode with one AAI index, run the model and return its result row. """
        features = self.build_features(feature_type="aai", index=index)
        scores = self.run_model(features, self.activity, random_state=random_state)
        row = {
            MetricKey.INDEX.value: index,
            MetricKey.CATEGORY.value: aaindex1[index].category,
        }
        for column, attribute in metric_fields:
            row[column] = getattr(scores, attribute)
        return row

    #evaluate all outstanding indices and add their rows to the results
    rows.extend(
        self._execute_jobs(
            items=todo,
            task_fn=_run_index,
            n_jobs=n_jobs,
            tqdm_desc="AAI Indices",
            tqdm_unit="indices",
            tqdm_disable=hide_progress
        )
    )
    self._save_resume_checkpoint(rows, checkpoint)

    #stop time counter, calculate elapsed time
    elapsed = time.time() - started_at

    self._log(f'\nElapsed time for AAI Encoding: {elapsed:.2f} seconds.')
    self._log('\n##########################################################################################')

    #format the rows into the final dataframe, save it and return it
    return self.format_and_save_results(
        metrics_rows=rows,
        columns=result_columns,
        sort_by=sort_by,
        save_filename='aaindex_results',
        output_folder=output_folder,
        string_columns=[MetricKey.INDEX.value, MetricKey.CATEGORY.value],
        resume_file=checkpoint
    )
|
|
277
|
+
|
|
278
|
+
def descriptor_encoding(self,
                        descriptors: Optional[Union[str, List[str]]] = None,
                        desc_combo: int = 1,
                        sort_by: str = 'R2',
                        output_folder: str = "",
                        n_jobs: int = 1,
                        random_state: Optional[int] = None,
                        max_models: Optional[int] = None,
                        sample_mode: bool = False,
                        resume: bool = False,
                        resume_file: str = "") -> pd.DataFrame:
    """
    Build and evaluate one predictive regression model per descriptor, or
    per combination of descriptors.

    The protein sequences are encoded using the physicochemical, biochemical
    and structural descriptors of the protpy package. desc_combo selects
    whether single descriptors (1), pairs (2) or triples (3) are used. With
    the 33 supported descriptors this gives 33, 528 and 5456 models,
    respectively; the totals may vary with the meta-parameters of some
    descriptors, e.g. the lag or lambda of the autocorrelation and pseudo
    amino acid descriptors. The collected metrics are formatted, saved and
    returned, sorted by R2 unless sort_by says otherwise.

    Parameters
    ==========
    :descriptors: str/list (default=None)
        str/list of descriptors to encode with, all available descriptors
        are used when this is left as None.
    :desc_combo: int (default=1)
        number of descriptors combined per model.
    :sort_by: str (default=R2)
        column/metric the output dataframe is sorted by.
    :output_folder: str (default="")
        folder the results csv is written to, if not set the OUTPUT_FOLDER
        global var is used.
    :n_jobs: int (default=1)
        passed on to the internal job runner; presumably the number of
        worker threads used to evaluate the descriptors.
    :random_state: int (default=None)
        passed on to run_model for every model that is built.
    :max_models: int (default=None)
        passed on, together with sample_mode, to the model-limit step that
        may reduce the list of descriptor combinations before any model is
        built.
    :sample_mode: bool (default=False)
        passed on to the model-limit step, see max_models.
    :resume: bool (default=False)
        when true, resume_file is used to load earlier results, skip the
        descriptor combinations already present in it and store a
        checkpoint afterwards. When false, resume_file is ignored.
    :resume_file: str (default="")
        path of the resume/checkpoint file, only used if resume is true.

    Returns
    =======
    :desc_metrics_df_: pd.DataFrame
        dataframe of metric values for the models built with the descriptor
        encoding strategy, of shape X x 8 where X is the number of
        descriptor combinations and 8 is the number of result columns. By
        default the shape is 33 x 8, with a desc_combo of 2 and 3 it is
        528 x 8 and 5456 x 8, respectively.
    """
    #the checkpoint path is only honoured when resuming has been requested
    checkpoint = resume_file if resume else None

    def _names_of(entry: Union[str, Tuple[str, ...]]) -> List[str]:
        """ Return the descriptor name(s) of a single entry as a list. """
        return list(entry) if isinstance(entry, tuple) else [entry]

    #use the Descriptors instance already held by this object
    desc = self.descriptor
    self.validate_desc_combo(desc_combo)
    descriptor_pool = self.validate_inputs(descriptors, desc.valid_descriptors, "Descriptor")

    #single names for a combo of 1, otherwise tuples of desc_combo names
    candidates: List[Union[str, Tuple[str, ...]]] = (
        descriptor_pool if desc_combo == 1
        else list(itertools.combinations(descriptor_pool, desc_combo))
    )

    #optionally cut the number of models down, then pick up any earlier results
    candidates = self._apply_model_limit(candidates, sample_mode=sample_mode, max_models=max_models)
    rows, done_keys = self._load_resume(
        checkpoint,
        key_columns=[MetricKey.DESCRIPTOR.value]
    )

    #prime descriptor cache once; improves repeated and parallel usage
    for name in descriptor_pool:
        self._get_descriptor_features(name, desc)

    #maximum width used when wrapping the descriptor list
    wrap_width = 90

    #throwaway Model object, only needed to display the algorithm and its parameters
    display_model = Model(
        X=[],
        Y=self.activity,
        algorithm=self.algorithm,
        parameters=self.model_parameters
    )

    #metric result columns paired with the attribute holding each value
    metric_fields = (
        (MetricKey.R2.value, 'r2'),
        (MetricKey.RMSE.value, 'rmse'),
        (MetricKey.MSE.value, 'mse'),
        (MetricKey.MAE.value, 'mae'),
        (MetricKey.RPD.value, 'rpd'),
        (MetricKey.EXPLAINED_VARIANCE.value, 'explained_var'),
    )
    result_columns = [MetricKey.DESCRIPTOR.value, MetricKey.GROUP.value]
    result_columns += [column for column, _ in metric_fields]

    self._log('\n##########################################################################################\n')
    shown_descriptors = [
        '+'.join(entry) if isinstance(entry, tuple) else entry
        for entry in candidates
    ]
    self._log(f'# Encoding using {len(candidates)} descriptor combination(s) with the parameters:\n')
    self._log(textwrap.fill(f"# Descriptors: {', '.join(shown_descriptors)}", wrap_width))
    run_summary = (
        f'# Configuration File: {os.path.basename(self.config_file)}\n'
        f'# Dataset: {os.path.basename(self.dataset)}\n'
        f'# Number of Sequences/Sequence Length: {len(self.data)} x {self.data[self.sequence_col].str.len().max()}\n'
        f'# Target Activity: {self.activity_col}\n'
        f'# Algorithm: {repr(display_model)}'
    )
    self._log(run_summary)
    #show the user-supplied parameters if there are any, otherwise the model's own
    if isinstance(self.model_parameters, dict) and self.model_parameters:
        shown_parameters = self.model_parameters
    else:
        shown_parameters = display_model.model.get_params()
    self._log(f'# Model Parameters: {shown_parameters}')
    self._log(f'# Test Split: {self.test_split}')
    self._log('\n##########################################################################################')

    #start counter
    started_at = time.time()

    # Per descriptor entry:
    #   1.) build the feature data from the descriptor(s).
    #   2.) build, predict and evaluate a model on those features.
    #   3.) record the descriptor label, group label and calculated metrics.
    # Afterwards all rows are formatted into a dataframe, saved and returned.

    #entries still to be evaluated, anything found in the resume data is
    #left out; the progress bar is disabled for 3 or fewer entries
    todo = [
        entry for entry in candidates
        if ('+'.join(_names_of(entry)),) not in done_keys
    ]
    hide_progress = len(todo) <= 3

    def _run_descriptor(descriptor_entry: Union[str, Tuple[str, ...]]) -> Dict[str, Any]:
        """ Encode with one descriptor entry, run the model and return its result row. """
        features = self.build_features(
            feature_type="descriptor",
            descriptor_entry=descriptor_entry,
            desc_instance=desc
        )
        scores = self.run_model(features, self.activity, random_state=random_state)

        #names joined by '+' form the label, their groups joined by ',' the group label
        names = _names_of(descriptor_entry)
        row = {
            MetricKey.DESCRIPTOR.value: '+'.join(names),
            MetricKey.GROUP.value: ','.join([desc.descriptor_groups[name] for name in names]),
        }
        for column, attribute in metric_fields:
            row[column] = getattr(scores, attribute)
        return row

    #evaluate all outstanding entries and add their rows to the results
    rows.extend(
        self._execute_jobs(
            items=todo,
            task_fn=_run_descriptor,
            n_jobs=n_jobs,
            tqdm_desc="Descriptors",
            tqdm_unit="descriptor",
            tqdm_disable=hide_progress
        )
    )
    self._save_resume_checkpoint(rows, checkpoint)

    #stop counter and calculate elapsed time
    elapsed = time.time() - started_at

    self._log(f'\nElapsed time for Descriptor Encoding: {elapsed:.2f} seconds.\n')
    self._log('\n##########################################################################################')

    #results filename depends on the combination size, falling back to the
    #single-descriptor name for any other value
    filenames_by_combo = {
        2: 'desc_combo2_results',
        3: 'desc_combo3_results',
    }
    save_filename = filenames_by_combo.get(desc_combo, 'desc_results')

    #format the rows into the final dataframe, save it and return it
    return self.format_and_save_results(
        metrics_rows=rows,
        columns=result_columns,
        sort_by=sort_by,
        save_filename=save_filename,
        output_folder=output_folder,
        string_columns=[MetricKey.DESCRIPTOR.value, MetricKey.GROUP.value],
        resume_file=checkpoint
    )
|
|
473
|
+
|
|
474
|
+
def aai_descriptor_encoding(self,
|
|
475
|
+
aai_indices: Optional[Union[str, List[str]]] = None,
|
|
476
|
+
descriptors: Optional[Union[str, List[str]]] = None,
|
|
477
|
+
desc_combo: int = 1,
|
|
478
|
+
sort_by: str = 'R2',
|
|
479
|
+
output_folder: str = "",
|
|
480
|
+
n_jobs: int = 1,
|
|
481
|
+
random_state: Optional[int] = None,
|
|
482
|
+
max_models: Optional[int] = None,
|
|
483
|
+
sample_mode: bool = False,
|
|
484
|
+
resume: bool = False,
|
|
485
|
+
resume_file: str = "") -> pd.DataFrame:
|
|
486
|
+
"""
|
|
487
|
+
Encoding all protein sequences using each of the available indices in the AAI and
|
|
488
|
+
aaindex package in concatenation with the protein descriptors available via the
|
|
489
|
+
protpy package. The sequences can be encoded using 1 AAI + 1 Descriptor, 2
|
|
490
|
+
Descriptors or 3 Descriptors, dictated by the desc_combo input parameter: set
|
|
491
|
+
this to 1, 2 or 3 for what encoding combination to use, default is 1. The protein
|
|
492
|
+
spectra of the AAI indices will be generated if the config param use_dsp is true,
|
|
493
|
+
also utilised for the DSP transformation is the class attributes: spectrum, window
|
|
494
|
+
and filter.
|
|
495
|
+
|
|
496
|
+
Each numerical encoding will be used as the feature data to build the predictive
|
|
497
|
+
regression ML models. To date, there are 566 indices and pySAR/protpy supports
|
|
498
|
+
33 descriptors so the encoding process will generate 18678, ~298000 and ~3.1M
|
|
499
|
+
models, when using 1, 2 or 3 descriptors + AAI indices, respectively. These values
|
|
500
|
+
may vary depending on the meta-parameters on some of the descriptors such as the
|
|
501
|
+
lag or lambda for the autocorrelation and pseudo amino acid descriptors, respectively.
|
|
502
|
+
The metrics evaluated from the model, assessing its accuracy and predictability for
|
|
503
|
+
each AAI + Descriptor encoding combination will be collated into a dataframe and saved
|
|
504
|
+
and returned, sorted by the R2 score by default.
|
|
505
|
+
|
|
506
|
+
Parameters
|
|
507
|
+
==========
|
|
508
|
+
:aai_indices: str/list (default=None)
|
|
509
|
+
str/list of aai indices to use for encoding the predictive models, by default
|
|
510
|
+
ALL AAI indices will be used.
|
|
511
|
+
:descriptors: list (default=None)
|
|
512
|
+
str/list of descriptors to use for encoding, by default all available descriptors
|
|
513
|
+
in the protpy package will be used for the encoding.
|
|
514
|
+
:desc_combo: int (default=1)
|
|
515
|
+
combination of descriptors to use.
|
|
516
|
+
:sort_by: str (default=R2)
|
|
517
|
+
sort output dataframe by specified column/metric value, results sorted by R2
|
|
518
|
+
score by default.
|
|
519
|
+
:output_folder: str (default="")
|
|
520
|
+
output folder to store results csv to, if empty input it will be stored in
|
|
521
|
+
the OUTPUT_FOLDER global var.
|
|
522
|
+
|
|
523
|
+
Returns
|
|
524
|
+
=======
|
|
525
|
+
:aai_desc_metrics_df_: pd.DataFrame
|
|
526
|
+
dataframe of calculated metric values from generated predictive models
|
|
527
|
+
encoded using AAI indices + descriptors encoding strategy. The output will
|
|
528
|
+
be of shape (X * Y) x 10, where X is the number of AAI indices input, Y is
|
|
529
|
+
the number of descriptors input and 10 is the results/metrics columns of
|
|
530
|
+
the output dataframe. Using the default values and desc_combo of 1, 2 and
|
|
531
|
+
3, the output shapes will be (566 * 15) x 10, (566 * 105) x 10, or
|
|
532
|
+
(566 * 455) x 10.
|
|
533
|
+
"""
|
|
534
|
+
all_indices = self.validate_inputs(aai_indices, aaindex1.record_codes(), "AAI")
|
|
535
|
+
self.validate_desc_combo(desc_combo)
|
|
536
|
+
|
|
537
|
+
#reuse cached Descriptors instance from PySAR.__init__
|
|
538
|
+
desc = self.descriptor
|
|
539
|
+
|
|
540
|
+
#validate input descriptors and get list of descriptor names to use for encoding
|
|
541
|
+
all_descriptor_names = self.validate_inputs(descriptors, desc.valid_descriptors, "Descriptor")
|
|
542
|
+
if desc_combo == 1:
|
|
543
|
+
all_descriptors: List[Union[str, Tuple[str, ...]]] = all_descriptor_names
|
|
544
|
+
else:
|
|
545
|
+
all_descriptors = list(itertools.combinations(all_descriptor_names, desc_combo))
|
|
546
|
+
|
|
547
|
+
# create list of all possible pairs of AAI indices and descriptors, then apply model limit if specified in config file
|
|
548
|
+
all_pairs: List[Tuple[str, Union[str, Tuple[str, ...]]]] = list(itertools.product(all_indices, all_descriptors))
|
|
549
|
+
all_pairs = self._apply_model_limit(all_pairs, sample_mode=sample_mode, max_models=max_models)
|
|
550
|
+
|
|
551
|
+
# if resume is true then load resume file and get list of completed keys to skip already completed pairs of AAI indices and descriptors
|
|
552
|
+
metrics_rows, completed_keys = self._load_resume(
|
|
553
|
+
resume_file if resume else None,
|
|
554
|
+
key_columns=[MetricKey.INDEX.value, MetricKey.DESCRIPTOR.value]
|
|
555
|
+
)
|
|
556
|
+
|
|
557
|
+
# prime descriptor cache once; improves repeated and parallel usage
|
|
558
|
+
for descriptor_name in all_descriptor_names:
|
|
559
|
+
self._get_descriptor_features(descriptor_name, desc)
|
|
560
|
+
|
|
561
|
+
#create text wrapper for amino acid indices and descriptors text, split to newline if surpasses line length
|
|
562
|
+
line_length = 90
|
|
563
|
+
|
|
564
|
+
#create temp Model object to access the models' parameter values for use in display text below
|
|
565
|
+
temp_model_parameters = Model(
|
|
566
|
+
X=[],
|
|
567
|
+
Y=self.activity,
|
|
568
|
+
algorithm=self.algorithm,
|
|
569
|
+
parameters=self.model_parameters
|
|
570
|
+
)
|
|
571
|
+
|
|
572
|
+
self._log('\n###########################################################################\n')
|
|
573
|
+
self._log(
|
|
574
|
+
f'# Encoding using {len(all_indices)} AAI and {len(all_descriptors)} descriptor combination(s) with the parameters:\n'
|
|
575
|
+
)
|
|
576
|
+
#only output indices if there are 10 or less
|
|
577
|
+
if (len(all_indices) <= 10):
|
|
578
|
+
self._log(textwrap.fill(f"# AAI Indices: {', '.join(all_indices)}", line_length))
|
|
579
|
+
else:
|
|
580
|
+
self._log(f'# AAI Indices: {len(all_indices)}')
|
|
581
|
+
if (self.use_dsp):
|
|
582
|
+
self._log(
|
|
583
|
+
f'# DSP Parameters:\n# Spectrum: {self.spectrum}\n# Window Function: {self.window_type}\n# Filter Function: {self.filter_type}'
|
|
584
|
+
)
|
|
585
|
+
descriptor_display = [
|
|
586
|
+
'+'.join(descriptor_set) if isinstance(descriptor_set, tuple) else descriptor_set
|
|
587
|
+
for descriptor_set in all_descriptors
|
|
588
|
+
]
|
|
589
|
+
self._log(textwrap.fill(f"# Descriptors: {', '.join(descriptor_display)}", line_length))
|
|
590
|
+
self._log(
|
|
591
|
+
f'# Configuration File: {os.path.basename(self.config_file)}\n'
|
|
592
|
+
f'# Dataset: {os.path.basename(self.dataset)}\n'
|
|
593
|
+
f'# Number of Sequences/Sequence Length: {len(self.data)} x {self.data[self.sequence_col].str.len().max()}\n'
|
|
594
|
+
f'# Target Activity: {self.activity_col}\n'
|
|
595
|
+
f'# Algorithm: {repr(temp_model_parameters)}'
|
|
596
|
+
)
|
|
597
|
+
if not isinstance(self.model_parameters, dict) or not self.model_parameters:
|
|
598
|
+
self._log(f'# Model Parameters: {temp_model_parameters.model.get_params()}')
|
|
599
|
+
else:
|
|
600
|
+
self._log(f'# Model Parameters: {self.model_parameters}')
|
|
601
|
+
self._log(f'# Test Split : {self.test_split}')
|
|
602
|
+
self._log('\n###########################################################################')
|
|
603
|
+
|
|
604
|
+
#start counter
|
|
605
|
+
start = time.time()
|
|
606
|
+
|
|
607
|
+
'''
|
|
608
|
+
1.) Get AAI index encoding of protein sequences. If using DSP, create instance
|
|
609
|
+
of pyDSP class and generate protein spectra from the AAI indices, according to
|
|
610
|
+
instance parameters: spectrum, window and filter.
|
|
611
|
+
2.) Get all descriptor values and concatenate to AAI encoding features.
|
|
612
|
+
3.) Build model using concatenated AAI and Descriptor features as the training data.
|
|
613
|
+
4.) Predict and evaluate the model using the test data unseen protein sequences.
|
|
614
|
+
5.) Append index, descriptor and calculated metrics to lists.
|
|
615
|
+
6.) Repeat steps 1 - 5 for all indices in the AAI.
|
|
616
|
+
7.) Output results into a final dataframe, save it and return, sort by sort_by parameter.
|
|
617
|
+
'''
|
|
618
|
+
|
|
619
|
+
#create list of pending pairs of AAI indices and descriptors to process
|
|
620
|
+
pending_pairs = [
|
|
621
|
+
pair for pair in all_pairs
|
|
622
|
+
if (str(pair[0]), '+'.join(list(pair[1]) if isinstance(pair[1], tuple) else [pair[1]])) not in completed_keys
|
|
623
|
+
]
|
|
624
|
+
tqdm_disable = len(pending_pairs) <= 2
|
|
625
|
+
|
|
626
|
+
def _run_pair(pair: Tuple[str, Union[str, Tuple[str, ...]]]) -> Dict[str, Any]:
|
|
627
|
+
""" Helper function to run encoding, model building and evaluation for a single pair of AAI index and descriptor(s). """
|
|
628
|
+
index, descriptor_entry = pair
|
|
629
|
+
X = self.build_features(
|
|
630
|
+
feature_type="aai_descriptor",
|
|
631
|
+
index=index,
|
|
632
|
+
descriptor_entry=descriptor_entry,
|
|
633
|
+
desc_instance=desc
|
|
634
|
+
)
|
|
635
|
+
# run model and get evaluation metrics for current pair of AAI index and descriptor(s)
|
|
636
|
+
eval_metrics = self.run_model(X, self.activity, random_state=random_state)
|
|
637
|
+
|
|
638
|
+
# create descriptor label and group label for results display and dataframe output
|
|
639
|
+
descriptor_names = list(descriptor_entry) if isinstance(descriptor_entry, tuple) else [descriptor_entry]
|
|
640
|
+
descriptor_label = '+'.join(descriptor_names)
|
|
641
|
+
group_label = ','.join([desc.descriptor_groups[name] for name in descriptor_names])
|
|
642
|
+
return {
|
|
643
|
+
MetricKey.INDEX.value: index,
|
|
644
|
+
MetricKey.CATEGORY.value: aaindex1[index].category,
|
|
645
|
+
MetricKey.DESCRIPTOR.value: descriptor_label,
|
|
646
|
+
MetricKey.GROUP.value: group_label,
|
|
647
|
+
MetricKey.R2.value: eval_metrics.r2,
|
|
648
|
+
MetricKey.RMSE.value: eval_metrics.rmse,
|
|
649
|
+
MetricKey.MSE.value: eval_metrics.mse,
|
|
650
|
+
MetricKey.MAE.value: eval_metrics.mae,
|
|
651
|
+
MetricKey.RPD.value: eval_metrics.rpd,
|
|
652
|
+
MetricKey.EXPLAINED_VARIANCE.value: eval_metrics.explained_var,
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
#run encoding, model building and evaluation for each pair of AAI index and descriptor(s) using parallel processing with n_jobs threads
|
|
656
|
+
new_rows = self._execute_jobs(
|
|
657
|
+
items=pending_pairs,
|
|
658
|
+
task_fn=_run_pair,
|
|
659
|
+
n_jobs=n_jobs,
|
|
660
|
+
tqdm_desc="AAI+Descriptors",
|
|
661
|
+
tqdm_unit="pair",
|
|
662
|
+
tqdm_disable=tqdm_disable
|
|
663
|
+
)
|
|
664
|
+
#append new rows to metrics_rows and save checkpoint if resume is true
|
|
665
|
+
metrics_rows.extend(new_rows)
|
|
666
|
+
self._save_resume_checkpoint(metrics_rows, resume_file if resume else None)
|
|
667
|
+
|
|
668
|
+
#stop counter and calculate elapsed time
|
|
669
|
+
end = time.time()
|
|
670
|
+
elapsed = end - start
|
|
671
|
+
|
|
672
|
+
self._log(f'Elapsed time for AAI + Descriptor Encoding: {elapsed:.2f} seconds.')
|
|
673
|
+
self._log('\n###########################################################################')
|
|
674
|
+
|
|
675
|
+
if (desc_combo == 2):
|
|
676
|
+
save_filename = 'aai_desc_combo2_results'
|
|
677
|
+
elif (desc_combo == 3):
|
|
678
|
+
save_filename = 'aai_desc_combo3_results'
|
|
679
|
+
else:
|
|
680
|
+
save_filename = 'aai_desc_results'
|
|
681
|
+
|
|
682
|
+
return self.format_and_save_results(
|
|
683
|
+
metrics_rows=metrics_rows,
|
|
684
|
+
columns=[
|
|
685
|
+
MetricKey.INDEX.value,
|
|
686
|
+
MetricKey.CATEGORY.value,
|
|
687
|
+
MetricKey.DESCRIPTOR.value,
|
|
688
|
+
MetricKey.GROUP.value,
|
|
689
|
+
MetricKey.R2.value,
|
|
690
|
+
MetricKey.RMSE.value,
|
|
691
|
+
MetricKey.MSE.value,
|
|
692
|
+
MetricKey.MAE.value,
|
|
693
|
+
MetricKey.RPD.value,
|
|
694
|
+
MetricKey.EXPLAINED_VARIANCE.value,
|
|
695
|
+
],
|
|
696
|
+
sort_by=sort_by,
|
|
697
|
+
save_filename=save_filename,
|
|
698
|
+
output_folder=output_folder,
|
|
699
|
+
string_columns=[
|
|
700
|
+
MetricKey.INDEX.value,
|
|
701
|
+
MetricKey.CATEGORY.value,
|
|
702
|
+
MetricKey.DESCRIPTOR.value,
|
|
703
|
+
MetricKey.GROUP.value,
|
|
704
|
+
],
|
|
705
|
+
resume_file=resume_file if resume else None
|
|
706
|
+
)
|
|
707
|
+
|
|
708
|
+
def _log(self, message: str, level: int = logging.INFO) -> None:
    """
    Emit a status message through whichever output channel is configured.

    Args:
        message: text to emit.
        level: logging level handed to the logger (defaults to ``logging.INFO``).

    When ``self.logger`` is set the message goes to it and nothing is printed;
    otherwise the message is printed only if ``self.verbose`` is truthy.
    """
    sink = self.logger
    if sink is None:
        # no logger attached: fall back to stdout, but only in verbose mode
        if self.verbose:
            print(message)
    else:
        sink.log(level, message)
|
|
715
|
+
|
|
716
|
+
def validate_inputs(self,
        input_values: Optional[Union[str, List[str]]],
        valid_values: Sequence[str],
        input_name: str) -> List[str]:
    """
    Validate a str/list selection and normalise it to a sorted list of unique names.

    Args:
        input_values: ``None``, ``""`` or ``[]`` selects every entry of
            ``valid_values``; a string is a single name or a comma-separated
            list of names; a list must contain only strings.
        valid_values: the names that are allowed.
        input_name: label used in error messages (e.g. "AAI", "Descriptor").

    Returns:
        Sorted list of unique, whitespace-trimmed names.

    Raises:
        TypeError: if ``input_values`` is not None/str/list, or a list holds
            a non-string item.
        ValueError: if any requested name is not in ``valid_values``.
    """
    # Check the type before testing emptiness so array-like inputs reach the
    # TypeError below instead of failing on an ambiguous `==` comparison.
    if input_values is None or (isinstance(input_values, (str, list)) and not input_values):
        values = list(valid_values)
    elif isinstance(input_values, str):
        if ',' in input_values:
            # spaces are removed as before; strip() additionally drops tabs/newlines around tokens
            values = [token.strip() for token in input_values.replace(' ', '').split(',')]
        else:
            values = [input_values.strip()]
    elif isinstance(input_values, list):
        if not all(isinstance(item, str) for item in input_values):
            raise TypeError(f"Input {input_name} values must be strings.")
        # trim list items too, so [" X"] is treated the same as the string " X"
        values = [item.strip() for item in input_values]
    else:
        raise TypeError(f"Input {input_name} parameter is not of type list or str, got {type(input_values)}.")

    # Remove duplicates and sort values, then check for any invalid entries against valid_values
    values = sorted(set(values))
    valid_lookup = set(valid_values)  # O(1) membership instead of scanning the sequence per value
    invalid_values = [value for value in values if value not in valid_lookup]
    if invalid_values:
        raise ValueError(f"Invalid {input_name} value(s) found: {invalid_values}.")

    return values
|
|
742
|
+
|
|
743
|
+
def validate_desc_combo(self, desc_combo: int) -> None:
    """
    Check that the requested descriptor combination size is supported.

    Args:
        desc_combo: number of descriptors combined per model; must be 1, 2 or 3.

    Raises:
        ValueError: for any other value.
    """
    supported_sizes = {1, 2, 3}
    if desc_combo in supported_sizes:
        return
    raise ValueError(f"Invalid desc_combo value '{desc_combo}'. Expected one of: 1, 2, 3.")
|
|
747
|
+
|
|
748
|
+
def _apply_model_limit(self,
        entries: List[Union[str, Tuple[str, ...]]],
        sample_mode: bool,
        max_models: Optional[int]) -> List[Union[str, Tuple[str, ...]]]:
    """
    Cap the number of models to build, for smoke runs or explicit truncation.

    Args:
        entries: candidate model inputs, in the order they would be run.
        sample_mode: when True (and ``max_models`` is None) keep the first 10.
        max_models: when given, keep the first ``max_models`` entries; this
            takes precedence over ``sample_mode``.

    Returns:
        ``entries`` itself when empty or unrestricted, otherwise a leading slice.

    Raises:
        ValueError: if ``max_models`` is given, ``entries`` is non-empty and
            ``max_models`` is not positive.
    """
    # nothing to trim; note max_models is not validated in this case
    if not entries:
        return entries

    if max_models is None:
        if not sample_mode:
            return entries
        # quick-test mode: deterministic prefix of at most 10 entries
        sample_size = min(10, len(entries))
        return entries[:sample_size]

    if max_models <= 0:
        raise ValueError(f"max_models must be > 0, got {max_models}.")
    return entries[:max_models]
|
|
767
|
+
|
|
768
|
+
def _load_resume(self,
        resume_file: Optional[str],
        key_columns: Sequence[str]) -> Tuple[List[Dict[str, Any]], set]:
    """
    Load previously saved partial results so finished work can be skipped.

    Args:
        resume_file: path of the CSV checkpoint; falsy disables resuming.
        key_columns: column names that together identify one completed row.

    Returns:
        ``(rows, completed_keys)`` where ``rows`` are the checkpoint records as
        dicts and ``completed_keys`` is a set of tuples holding the
        stringified ``key_columns`` values of each row. Both are empty when
        there is no usable checkpoint (no path, missing file, blank file or
        no data rows). ``completed_keys`` is empty when the checkpoint lacks
        any of the key columns.
    """
    if not resume_file:
        return [], set()

    # Load the resume file if it exists
    resume_path = Path(resume_file)
    if not resume_path.exists():
        return [], set()

    try:
        existing_df = pd.read_csv(resume_path)
    except pd.errors.EmptyDataError:
        # a zero-byte/blank checkpoint (e.g. an interrupted or empty save) has
        # no header to parse; treat it as "nothing completed yet"
        return [], set()
    if existing_df.empty:
        return [], set()

    existing_rows = existing_df.to_dict(orient='records')

    # every record shares the dataframe's columns, so check the key columns once
    if not all(col in existing_df.columns for col in key_columns):
        return existing_rows, set()

    # keys are stringified so they compare equal to the str-based keys built by callers
    completed_keys = {
        tuple(str(row[col]) for col in key_columns)
        for row in existing_rows
    }
    return existing_rows, completed_keys
|
|
792
|
+
|
|
793
|
+
def _save_resume_checkpoint(self, metrics_rows: List[Dict[str, Any]], resume_file: Optional[str]) -> None:
    """
    Persist current progress to a resume checkpoint file.

    Args:
        metrics_rows: result rows collected so far (one dict per model).
        resume_file: destination CSV path; falsy means checkpointing is off
            and nothing is written.

    The checkpoint's parent folder is created when missing, so a run does not
    fail on a non-existent directory only after all models have been evaluated.
    """
    if not resume_file:
        return
    Path(resume_file).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(metrics_rows).to_csv(resume_file, index=False)
|
|
798
|
+
|
|
799
|
+
def _get_aai_features(self, index: str) -> pd.DataFrame:
    """
    Return the feature frame for one AAI index, computing and caching it on first use.

    Args:
        index: AAI index identifier passed to ``self.get_aai_encoding``.

    Returns:
        DataFrame whose columns are named ``aai_1 .. aai_N``. When
        ``self.use_dsp`` is truthy the values are the ``spectrum_encoding``
        produced by ``PyDSP`` from the encoded sequences; otherwise they are
        the encoded sequences themselves.

    The cache is only locked for the lookup and the store; the computation
    runs unlocked, so concurrent callers for the same index may each compute
    it and the last store wins.
    """
    with self._cache_lock:
        hit = index in self._aai_feature_cache
        cached_frame = self._aai_feature_cache[index] if hit else None
    if hit:
        return cached_frame

    # cache miss: build the encoding without holding the lock
    sequence_encoding = self.get_aai_encoding(index)
    if not self.use_dsp:
        raw_features = sequence_encoding
    else:
        dsp_encoder = PyDSP(
            self.config_file,
            protein_seqs=sequence_encoding,
            spectrum=self.spectrum,
            window_type=self.window_type,
            filter_type=self.filter_type
        )
        # spectrum_encoding is read after encode_sequences() has run
        dsp_encoder.encode_sequences()
        raw_features = dsp_encoder.spectrum_encoding
    features = pd.DataFrame(raw_features)

    # 1-based positional column names, assigned in one go
    column_count = len(features.columns)
    features.columns = ["aai_" + str(position) for position in range(1, column_count + 1)]

    with self._cache_lock:
        self._aai_feature_cache[index] = features
    return features
|
|
826
|
+
|
|
827
|
+
def _get_descriptor_features(self, descriptor_name: str, desc_instance: Descriptors) -> pd.DataFrame:
    """
    Return the encoding for one descriptor, computing and caching it on first use.

    Args:
        descriptor_name: name passed to ``desc_instance.get_descriptor_encoding``.
        desc_instance: object that calculates the descriptor on a cache miss.

    Returns:
        The cached frame if present, otherwise the newly computed one (which
        is stored in the cache before returning).

    As with the AAI cache, the lock guards only the lookup and the store, so
    different descriptors can be computed concurrently.
    """
    with self._cache_lock:
        hit = descriptor_name in self._descriptor_feature_cache
        cached_frame = self._descriptor_feature_cache[descriptor_name] if hit else None
    if hit:
        return cached_frame

    # cache miss: compute without holding the lock
    computed_frame = desc_instance.get_descriptor_encoding(descriptor_name)
    with self._cache_lock:
        self._descriptor_feature_cache[descriptor_name] = computed_frame
    return computed_frame
|
|
838
|
+
|
|
839
|
+
def build_features(self,
        feature_type: str,
        index: Optional[str] = None,
        descriptor_entry: Optional[Union[str, Tuple[str, ...]]] = None,
        desc_instance: Optional[Descriptors] = None) -> pd.DataFrame:
    """
    Assemble the model input matrix for one encoding strategy.

    Args:
        feature_type: ``"aai"``, ``"descriptor"`` or ``"aai_descriptor"``.
        index: AAI index identifier; required for ``"aai"`` and ``"aai_descriptor"``.
        descriptor_entry: one descriptor name or a tuple of names; required
            for ``"descriptor"`` (and therefore for ``"aai_descriptor"``).
        desc_instance: descriptor calculator; required whenever descriptors are built.

    Returns:
        ``"aai"``: the cached AAI feature frame as-is.
        ``"descriptor"``: the descriptor frame(s) with a fresh 0..n-1 row
        index, joined column-wise when several names are given.
        ``"aai_descriptor"``: descriptor columns followed by AAI columns.

    Raises:
        ValueError: if a required argument is missing or ``feature_type`` is unknown.
    """
    if feature_type == "aai":
        if index is not None:
            return self._get_aai_features(index)
        raise ValueError("index must be provided for AAI feature building.")

    if feature_type == "descriptor":
        if desc_instance is None:
            raise ValueError("desc_instance must be provided for descriptor feature building.")
        if descriptor_entry is None:
            raise ValueError("descriptor_entry must be provided for descriptor feature building.")

        # a tuple means "combine these descriptors"; anything else is one name
        names: List[str] = list(descriptor_entry) if isinstance(descriptor_entry, tuple) else [descriptor_entry]

        fetched: List[pd.DataFrame] = []
        for name in names:
            fetched.append(self._get_descriptor_features(name, desc_instance))

        if len(fetched) == 1:
            return fetched[0].reset_index(drop=True)
        # Rows are matched by position, not by label, so non-unique source
        # indexes cannot break the column-wise concat.
        positional = [frame.reset_index(drop=True) for frame in fetched]
        return pd.concat(positional, axis=1)

    if feature_type == "aai_descriptor":
        if index is None:
            raise ValueError("index must be provided for combined feature building.")
        # AAI part first, then descriptors, both re-indexed by row position
        aai_part = self.build_features(feature_type="aai", index=index)
        aai_part = aai_part.reset_index(drop=True)
        descriptor_part = self.build_features(
            feature_type="descriptor",
            descriptor_entry=descriptor_entry,
            desc_instance=desc_instance
        )
        descriptor_part = descriptor_part.reset_index(drop=True)
        # output column order: descriptors, then AAI
        return pd.concat([descriptor_part, aai_part], axis=1)

    raise ValueError(f"Unknown feature_type '{feature_type}'.")
|
|
884
|
+
|
|
885
|
+
def run_model(self,
        X: pd.DataFrame,
        Y: pd.Series,
        random_state: Optional[int] = None) -> Evaluate:
    """
    Fit the configured algorithm on one feature matrix and evaluate it.

    Args:
        X: feature matrix (must expose ``shape[1]``).
        Y: target values.
        random_state: forwarded to ``Model.train_test_split``.

    Returns:
        ``Evaluate`` built from the held-out targets and the model's predictions.
    """
    # non-dict parameters are treated as "no parameters supplied"
    params = self.model_parameters if isinstance(self.model_parameters, dict) else {}

    # with a single feature column, PLS is forced to one component; work on a
    # copy so the instance's own parameter dict is left untouched
    single_feature_pls = X.shape[1] == 1 and self.algorithm.lower() == "plsregression"
    if single_feature_pls:
        params = {**params, 'n_components': 1}

    # Hand plain numpy arrays to the model to avoid pandas dtype deprecation
    # warnings in sklearn validation.
    if isinstance(X, pd.DataFrame):
        feature_values = X.to_numpy(dtype=float, copy=False)
    else:
        feature_values = X
    if isinstance(Y, pd.Series):
        target_values = Y.to_numpy(copy=False)
    else:
        target_values = Y

    estimator = Model(
        X=feature_values,
        Y=target_values,
        algorithm=self.algorithm,
        parameters=params,
        test_split=self.test_split
    )
    # only the held-out targets are needed here
    _x_train, _x_test, _y_train, held_out_targets = estimator.train_test_split(
        test_split=self.test_split, random_state=random_state
    )
    estimator.fit()
    predictions = estimator.predict()
    return Evaluate(held_out_targets, predictions)
|
|
910
|
+
|
|
911
|
+
def collect_metrics(self, metrics_rows: List[Dict[str, Any]], row: Dict[str, Any]) -> None:
    """
    Add one result row to the running list of metric rows (mutates the list in place).

    Args:
        metrics_rows: list of result rows collected so far.
        row: the row to add at the end.
    """
    metrics_rows.extend([row])
|
|
914
|
+
|
|
915
|
+
def _execute_jobs(self,
        items: Sequence[Any],
        task_fn: Callable[[Any], Union[Dict[str, Any], List[Dict[str, Any]], None]],
        n_jobs: int,
        tqdm_desc: str,
        tqdm_unit: str,
        tqdm_disable: bool) -> List[Dict[str, Any]]:
    """
    Run ``task_fn`` over ``items`` sequentially or on a thread pool and collect the rows.

    Args:
        items: independent work items.
        task_fn: called once per item; may return one row (dict), several
            rows (list of dicts) or ``None`` for "nothing to record".
        n_jobs: ``<= 1`` runs in the calling thread, otherwise the number of
            worker threads.
        tqdm_desc: progress bar description.
        tqdm_unit: progress bar unit label.
        tqdm_disable: when True no progress bar is shown.

    Returns:
        Flat list of rows in the order of ``items`` for both execution modes,
        so results do not depend on thread scheduling.

    Raises:
        Whatever ``task_fn`` raises. In parallel mode, tasks that have not
        started yet are cancelled before the exception propagates.
    """
    def _flatten(results) -> List[Dict[str, Any]]:
        # normalise the three allowed task return shapes into one flat row list
        rows: List[Dict[str, Any]] = []
        for result in results:
            if result is None:
                continue
            if isinstance(result, list):
                rows.extend(result)
            else:
                rows.append(result)
        return rows

    if n_jobs <= 1:
        iterator = items if tqdm_disable else tqdm(items, desc=tqdm_desc, unit=tqdm_unit, ncols=90)
        return _flatten(task_fn(item) for item in iterator)

    slot_of: Dict[Any, int] = {}
    with ThreadPoolExecutor(max_workers=n_jobs) as executor:
        # remember each future's input position so output order matches input order
        for slot, item in enumerate(items):
            slot_of[executor.submit(task_fn, item)] = slot
        ordered: List[Any] = [None] * len(slot_of)

        finished = as_completed(slot_of)
        if not tqdm_disable:
            finished = tqdm(finished, total=len(slot_of), desc=tqdm_desc, unit=tqdm_unit, ncols=90)

        try:
            for future in finished:
                ordered[slot_of[future]] = future.result()
        except BaseException:
            # fail fast: do not start queued tasks just to discard their results
            for pending in slot_of:
                pending.cancel()
            raise

    return _flatten(ordered)
|
|
952
|
+
|
|
953
|
+
def format_and_save_results(self,
        metrics_rows: List[Dict[str, Any]],
        columns: List[str],
        sort_by: str,
        save_filename: str,
        output_folder: str,
        string_columns: List[str],
        resume_file: Optional[str] = None) -> pd.DataFrame:
    """
    Turn collected metric rows into a sorted dataframe, save it and return it.

    Args:
        metrics_rows: one dict per evaluated model.
        columns: columns (and their order) of the output dataframe.
        sort_by: metric to sort on; any value that is not a ``SortKey`` value
            falls back to ``SortKey.R2``.
        save_filename: name handed to ``save_results``.
        output_folder: folder handed to ``save_results``.
        string_columns: columns converted to pandas' string dtype when present.
        resume_file: when truthy, the sorted dataframe is also written there as CSV.

    Returns:
        The sorted dataframe. R2 and explained variance sort descending, every
        other sort key ascending.
    """
    results = pd.DataFrame(metrics_rows, columns=columns)

    # label-like columns get an explicit string dtype
    present_text_columns = [name for name in string_columns if name in results.columns]
    for name in present_text_columns:
        results[name] = results[name].astype(pd.StringDtype())

    # unknown sort keys silently fall back to R2
    allowed_sort_keys = [member.value for member in SortKey]
    sort_column = sort_by if sort_by in allowed_sort_keys else SortKey.R2.value

    sort_descending = sort_column in {SortKey.R2.value, SortKey.EXPLAINED_VARIANCE.value}
    results = results.sort_values(by=[sort_column], ascending=not sort_descending)

    save_results(results, save_filename, output_folder=output_folder)
    if resume_file:
        results.to_csv(resume_file, index=False)
    return results
|
|
977
|
+
|
|
978
|
+
def __str__(self) -> str:
    """
    Return a one-line, human-readable summary of the instance's configuration.

    The "Target Activity" field shows the activity column name
    (``self.activity_col``), the same value the encoding run banner prints
    under that label, rather than the activity values held in ``self.activity``.
    """
    return (
        f"Instance of Encoding Class with attribute values: Configuration File: {os.path.basename(self.config_file)}, "
        f"Dataset: {os.path.basename(self.dataset)}, Target Activity: {self.activity_col}, "
        f"Algorithm: {self.algorithm}, Model Parameters: {self.model_parameters}, Test Split: {self.test_split}."
    )
|
|
984
|
+
|
|
985
|
+
def __repr__(self) -> str:
    """ Return the ``str()`` summary of the instance wrapped in angle brackets. """
    return "<{}>".format(self)
|