nkululeko 0.95.1__py3-none-any.whl → 0.95.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/constants.py +1 -1
- nkululeko/feat_extract/feats_mld.py +13 -5
- nkululeko/feature_extractor.py +5 -0
- nkululeko/optim.py +931 -0
- nkululeko/reporting/reporter.py +9 -1
- nkululeko/tests/test_optim.py +200 -0
- nkululeko/utils/util.py +2 -2
- nkululeko-0.95.3.dist-info/METADATA +376 -0
- {nkululeko-0.95.1.dist-info → nkululeko-0.95.3.dist-info}/RECORD +13 -11
- nkululeko-0.95.1.dist-info/METADATA +0 -76
- {nkululeko-0.95.1.dist-info → nkululeko-0.95.3.dist-info}/WHEEL +0 -0
- {nkululeko-0.95.1.dist-info → nkululeko-0.95.3.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.95.1.dist-info → nkululeko-0.95.3.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.95.1.dist-info → nkululeko-0.95.3.dist-info}/top_level.txt +0 -0
nkululeko/optim.py
ADDED
@@ -0,0 +1,931 @@
#!/usr/bin/env python3

import argparse
import ast
import configparser
import itertools
import os
import random
import sys
import time

import numpy as np

from nkululeko.constants import VERSION
from nkululeko.utils.util import Util


class OptimizationRunner:
    """Hyperparameter optimization runner for nkululeko experiments."""

    def __init__(self, config):
        self.config = config
        self.util = Util("optim")
        self.results = []
        self.model_type = None  # Will be set when parsing OPTIM params
        # New: Optimization strategy configuration
        self.search_strategy = "grid"  # Default values
        self.n_iter = 50
        self.cv_folds = 3
        self.metric = "accuracy"
        self.random_state = 42  # Default random state for reproducibility

    def parse_optim_params(self):
        """Parse OPTIM section parameters into search spaces."""
        if "OPTIM" not in self.config:
            self.util.error("No [OPTIM] section found in configuration")

        optim_config = self.config["OPTIM"]
        self.model_type = optim_config.get("model", "mlp")

        # Parse optimization strategy settings
        self.search_strategy = optim_config.get("search_strategy", "grid")
        self.n_iter = int(optim_config.get("n_iter", "50"))
        self.cv_folds = int(optim_config.get("cv_folds", "3"))
        self.random_state = int(optim_config.get("random_state", "42"))

        # Set global random seeds for reproducibility
        random.seed(self.random_state)
        np.random.seed(self.random_state)

        self.util.debug(f"Using random state: {self.random_state} for reproducibility")

        self.metric = optim_config.get("metric", "accuracy").lower()

        self.util.debug(f"Parsed metric from config: '{self.metric}'")  # Debug line

        param_specs = {}
        for key, value in optim_config.items():
            if key in ["model", "search_strategy", "n_iter", "cv_folds", "metric", "random_state"]:
                continue
            param_specs[key] = self._parse_param_spec(key, value)

        return param_specs

    def _parse_param_spec(self, param_name, param_value):
        """Parse individual parameter specification."""
        try:
            parsed = ast.literal_eval(param_value)
        except (ValueError, SyntaxError) as e:
            self.util.debug(
                f"Could not parse parameter {param_name}={param_value} as literal, treating as string: {e}"
            )
            if isinstance(param_value, str):
                return [param_value]
            return param_value

        # Check for inefficient learning rate ranges and suggest better alternatives
        if param_name == "lr" and isinstance(parsed, tuple) and len(parsed) == 3:
            min_val, max_val, step = parsed
            if step <= 0.0001 and (max_val - min_val) / step > 20:
                self.util.debug(
                    f"WARNING: Learning rate range {param_value} will generate {int((max_val - min_val) / step)} values!"
                )
                self.util.debug(
                    "Consider using discrete values like [0.0001, 0.001, 0.01, 0.1] or range (0.0001, 0.1) for log-scale sampling"
                )

        if isinstance(parsed, tuple):
            if len(parsed) == 2:
                return self._generate_range(parsed[0], parsed[1], param_name)
            elif len(parsed) == 3:
                return self._generate_range_with_step(
                    parsed[0], parsed[1], parsed[2], param_name
                )
            else:
                self.util.error(
                    f"Invalid tuple format for parameter {param_name}: {param_value}. Expected (min, max) or (min, max, step)"
                )
                return [parsed[0]]  # Fallback to first value
        elif isinstance(parsed, list):
            return parsed
        else:
            return [parsed]

    def _generate_range(self, min_val, max_val, param_name):
        """Generate parameter range based on parameter type."""
        if param_name in ["nlayers"]:
            return list(range(min_val, max_val + 1))
        elif param_name in ["nnodes", "bs"]:
            result = []
            current = min_val
            while current <= max_val:
                result.append(current)
                current *= 2
            return result
        elif param_name in ["lr"]:
            # For learning rate, use logarithmic scale sampling (more practical)
            # Generate 5-8 values on log scale between min and max
            num_samples = min(8, max(5, int(np.log10(max_val / min_val) * 2)))
            log_min = np.log10(min_val)
            log_max = np.log10(max_val)
            log_values = np.linspace(log_min, log_max, num_samples)
            result = [round(10**log_val, 6) for log_val in log_values]
            return result
        elif param_name in ["do"]:
            # For dropout, generate reasonable steps
            num_steps = 5
            step = (max_val - min_val) / num_steps
            result = []
            current = min_val
            while current <= max_val + step / 2:
                result.append(round(current, 2))
                current += step
            return result
        else:
            return list(range(min_val, max_val + 1))

    def _generate_range_with_step(self, min_val, max_val, step, param_name):
        """Generate parameter range with explicit step."""
        if (
            isinstance(step, float)
            or isinstance(min_val, float)
            or isinstance(max_val, float)
        ):
            result = []
            current = float(min_val)
            step = float(step)
            max_val = float(max_val)
            while current <= max_val + step / 2:
                result.append(round(current, 6))  # More precision for floats
                current += step
            return result
        else:
            return list(range(min_val, max_val + 1, step))

    def generate_param_combinations(self, param_specs):
        """Generate all parameter combinations for grid search."""
        param_names = list(param_specs.keys())
        param_values = list(param_specs.values())

        combinations = []
        for combo in itertools.product(*param_values):
            param_dict = dict(zip(param_names, combo))
            combinations.append(param_dict)

        return combinations

    def run_optimization(self):
        """Run hyperparameter optimization using the most appropriate method."""
        self.util.debug(
            f"Starting optimization using {self.search_strategy} strategy with {self.metric} metric, nkululeko version {VERSION}"
        )

        param_specs = self.parse_optim_params()

        if not param_specs:
            self.util.error("No optimization parameters found in [OPTIM] section")
            return None, None, []

        # Always use manual optimization to ensure consistent evaluation pipeline
        # This prevents discrepancies between CV and final evaluation
        self.util.debug("Using manual optimization for consistent evaluation pipeline")
        return self._run_manual_optimization(param_specs)

    def _run_manual_optimization(self, param_specs):
        """Run manual grid search optimization with consistent evaluation pipeline."""
        combinations = self.generate_param_combinations(param_specs)

        if not combinations:
            self.util.error("No parameter combinations generated")
            return None, None, []

        self.util.debug(
            f"Starting manual optimization with {len(combinations)} parameter combinations"
        )

        # Check if we should use cross-validation or train-test split
        use_cv = self.search_strategy in ["grid_cv", "random_cv"] or (
            hasattr(self, 'use_cv_in_manual') and self.use_cv_in_manual
        )

        if use_cv:
            return self._run_manual_cv_optimization(combinations, param_specs)
        else:
            return self._run_manual_train_test_optimization(combinations)

    def _run_manual_train_test_optimization(self, combinations):
        """Run manual optimization using train-test split (matches final evaluation)."""
        best_result = None
        best_params = None
        best_score = -float("inf") if self.util.high_is_good() else float("inf")

        for i, params in enumerate(combinations):
            self.util.debug(f"Testing combination {i+1}/{len(combinations)}: {params}")

            self._update_config_with_params(params)

            try:
                result, last_epoch = self._run_single_experiment()
                score = result  # result.test is already a numeric value

                result_entry = {
                    "params": params.copy(),
                    "score": score,
                    "result": result,
                    "epoch": last_epoch,
                }
                self.results.append(result_entry)

                is_better = (self.util.high_is_good() and score > best_score) or (
                    not self.util.high_is_good() and score < best_score
                )

                if is_better:
                    best_score = score
                    best_result = result
                    best_params = params.copy()

                self.util.debug(f"Result: {result}, Score: {score}")

            except Exception as e:
                self.util.error(f"Failed with params {params}: {str(e)}")
                # Log the full traceback for debugging
                import traceback

                self.util.debug(f"Full traceback: {traceback.format_exc()}")
                continue

        self.util.debug("Optimization complete!")
        self.util.debug(f"Best parameters: {best_params}")
        self.util.debug(f"Best result: {best_result}")

        # Save results to file
        self.save_results()

        return best_params, best_result, self.results

    def _run_manual_cv_optimization(self, combinations, param_specs):
        """Run manual optimization using cross-validation."""
        import numpy as np
        from sklearn.model_selection import StratifiedKFold

        self.util.debug("Using cross-validation for optimization (may differ from final evaluation)")

        # Set up the experiment once to get the data
        import nkululeko.experiment as exp
        expr = exp.Experiment(self.config)
        expr.set_module("optim")
        expr.load_datasets()
        expr.fill_train_and_tests()
        expr.extract_feats()

        # Create stratified CV splits
        cv_splitter = StratifiedKFold(
            n_splits=self.cv_folds,
            shuffle=True,
            random_state=self.random_state
        )

        best_result = None
        best_params = None
        best_score = -float("inf") if self.util.high_is_good() else float("inf")

        for i, params in enumerate(combinations):
            self.util.debug(f"Testing combination {i+1}/{len(combinations)}: {params}")

            # Run cross-validation for this parameter combination
            cv_scores = []

            try:
                for fold, (train_idx, val_idx) in enumerate(cv_splitter.split(
                    expr.feats_train, expr.df_train[self.config["DATA"]["target"]]
                )):
                    self.util.debug(f" Fold {fold+1}/{self.cv_folds}")

                    # Create fold-specific data
                    fold_train_feats = expr.feats_train.iloc[train_idx]
                    fold_val_feats = expr.feats_train.iloc[val_idx]
                    fold_train_df = expr.df_train.iloc[train_idx]
                    fold_val_df = expr.df_train.iloc[val_idx]

                    # Update config with current parameters
                    self._update_config_with_params(params)

                    # Run experiment on this fold
                    fold_score = self._run_cv_fold(
                        fold_train_feats, fold_val_feats,
                        fold_train_df, fold_val_df, params
                    )
                    cv_scores.append(fold_score)

                # Calculate mean CV score
                mean_score = np.mean(cv_scores)
                std_score = np.std(cv_scores)

                result_entry = {
                    "params": params.copy(),
                    "score": mean_score,
                    "result": mean_score,
                    "cv_std": std_score,
                    "cv_scores": cv_scores,
                    "epoch": 0,
                }
                self.results.append(result_entry)

                is_better = (self.util.high_is_good() and mean_score > best_score) or (
                    not self.util.high_is_good() and mean_score < best_score
                )

                if is_better:
                    best_score = mean_score
                    best_result = mean_score
                    best_params = params.copy()

                self.util.debug(f"CV Score: {mean_score:.4f} ± {std_score:.4f}")

            except Exception as e:
                self.util.error(f"Failed with params {params}: {str(e)}")
                continue

        self.util.debug("Cross-validation optimization complete!")
        self.util.debug(f"Best parameters: {best_params}")
        self.util.debug(f"Best CV score: {best_result}")

        # Validate with final evaluation pipeline
        if best_params:
            validation_score = self._validate_best_params_standard_eval(best_params, expr)
            if validation_score is not None:
                self.util.debug(f"Cross-validation score: {best_result:.4f}")
                self.util.debug(f"Standard evaluation score: {validation_score:.4f}")
                score_diff = abs(best_result - validation_score)
                self.util.debug(f"Score difference: {score_diff:.4f}")

                if score_diff > 0.1:  # 10% difference threshold
                    self.util.debug("WARNING: Large discrepancy between CV and standard evaluation!")
                    self.util.debug("Consider using train-test optimization for more consistent results.")

        # Save results to file
        self.save_results()

        return best_params, best_result, self.results

    def _run_cv_fold(self, train_feats, val_feats, train_df, val_df, params):
        """Run a single cross-validation fold."""
        from nkululeko.modelrunner import Modelrunner

        # Create a temporary runner for this fold
        runner = Modelrunner(train_df, val_df, train_feats, val_feats, 0)
        runner._select_model(self.model_type)

        # Configure model with current parameters
        if self.model_type == "mlp":
            self._configure_mlp_model(runner.model, params)
        else:
            self._configure_traditional_model(runner.model, params)

        # Train and evaluate
        runner.model.train()
        reports = runner.model.predict()

        # Extract score based on metric
        return self._extract_score_from_report(reports)

    def _run_sklearn_optimization(self, param_specs):
        """Run optimization using scikit-learn's hyperparameter search methods with consistent data handling."""
        from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                             StratifiedKFold)

        # Import the actual experiment to get the model and data
        import nkululeko.experiment as exp

        # Set up the experiment
        expr = exp.Experiment(self.config)
        expr.set_module("optim")
        expr.load_datasets()
        expr.fill_train_and_tests()
        expr.extract_feats()

        # Apply the same balancing as the final evaluation
        original_train_feats = expr.feats_train.copy()
        original_train_df = expr.df_train.copy()

        if "FEATS" in self.config and "balancing" in self.config["FEATS"]:
            balancing_method = self.config["FEATS"]["balancing"]
            if balancing_method and balancing_method.lower() != "none":
                self.util.debug(f"Applying {balancing_method} balancing for optimization consistency")
                try:
                    from nkululeko.balance import DataBalancer
                    balancer = DataBalancer()
                    expr.feats_train, expr.df_train = balancer.balance_features(
                        expr.df_train, expr.feats_train, self.config["DATA"]["target"], balancing_method
                    )
                    self.util.debug(f"Balanced training data: {len(expr.feats_train)} samples")
                except Exception as e:
                    self.util.debug(f"Balancing failed: {e}, using original data")
                    expr.feats_train = original_train_feats
                    expr.df_train = original_train_df

        # Get the base model without hyperparameter tuning
        original_tuning_params = self.config.get(
            "MODEL", "tuning_params", fallback=None
        )
        if "MODEL" not in self.config:
            self.config.add_section("MODEL")

        # Temporarily disable tuning_params to get base model
        if original_tuning_params:
            self.config.remove_option("MODEL", "tuning_params")

        # Create a model instance using the modelrunner approach
        from nkululeko.modelrunner import Modelrunner

        runner = Modelrunner(
            expr.df_train, expr.df_test, expr.feats_train, expr.feats_test, 0
        )
        runner._select_model(self.model_type)
        base_clf = runner.model.clf

        # Restore original tuning_params if it existed
        if original_tuning_params:
            self.config.set("MODEL", "tuning_params", original_tuning_params)

        # Convert parameter specifications to sklearn format
        sklearn_params = self._convert_to_sklearn_params(param_specs)

        # Create stratified CV for consistent cross-validation
        cv = StratifiedKFold(
            n_splits=self.cv_folds,
            shuffle=True,
            random_state=self.random_state
        )

        # Choose search strategy
        if self.search_strategy == "random":
            search = RandomizedSearchCV(
                base_clf,
                sklearn_params,
                n_iter=self.n_iter,
                cv=cv,  # Use stratified CV
                scoring=self._get_scoring_metric(),
                random_state=self.random_state,
                n_jobs=-1,
                verbose=1,
            )
        elif self.search_strategy == "halving_random":
            try:
                from sklearn.model_selection import HalvingRandomSearchCV

                search = HalvingRandomSearchCV(
                    base_clf,
                    sklearn_params,
                    cv=cv,  # Use stratified CV
                    scoring=self._get_scoring_metric(),
                    random_state=self.random_state,
                    n_jobs=-1,
                    verbose=1,
                )
            except ImportError:
                self.util.debug(
                    "HalvingRandomSearchCV not available, falling back to RandomizedSearchCV"
                )
                search = RandomizedSearchCV(
                    base_clf,
                    sklearn_params,
                    n_iter=self.n_iter,
                    cv=cv,  # Use stratified CV
                    scoring=self._get_scoring_metric(),
                    random_state=self.random_state,
                    n_jobs=-1,
                    verbose=1,
                )
        elif self.search_strategy == "halving_grid":
            try:
                from sklearn.model_selection import HalvingGridSearchCV

                search = HalvingGridSearchCV(
                    base_clf,
                    sklearn_params,
                    cv=cv,  # Use stratified CV
                    scoring=self._get_scoring_metric(),
                    random_state=self.random_state,
                    n_jobs=-1,
                    verbose=1,
                )
            except ImportError:
                self.util.debug(
                    "HalvingGridSearchCV not available, falling back to GridSearchCV"
                )
                search = GridSearchCV(
                    base_clf,
                    sklearn_params,
                    cv=cv,  # Use stratified CV
                    scoring=self._get_scoring_metric(),
                    n_jobs=-1,
                    verbose=1,
                )
        else:  # grid search (default)
            search = GridSearchCV(
                base_clf,
                sklearn_params,
                cv=cv,  # Use stratified CV
                scoring=self._get_scoring_metric(),
                n_jobs=-1,
                verbose=1,
            )

        self.util.debug(
            f"Starting {self.search_strategy} search with {len(sklearn_params)} parameters"
        )
        self.util.debug(f"Using stratified {self.cv_folds}-fold cross-validation")

        # Fit the search
        search.fit(expr.feats_train, expr.df_train[self.config["DATA"]["target"]])

        # Extract results
        best_params = search.best_params_
        best_score = search.best_score_

        # Convert results back to our format
        all_results = []
        for i, (params, score) in enumerate(
            zip(search.cv_results_["params"], search.cv_results_["mean_test_score"])
        ):
            result_entry = {
                "params": params,
                "score": score,
                "result": score,
                "epoch": 0,
            }
            all_results.append(result_entry)

        self.results = all_results

        self.util.debug("Optimization complete!")
        self.util.debug(f"Best parameters: {best_params}")
        self.util.debug(f"Best score: {best_score}")

        # Save results
        self.save_results()

        # Validate best parameters using standard nkululeko evaluation for consistency
        validation_score = self._validate_best_params_standard_eval(best_params, expr)
        if validation_score is not None:
            self.util.debug(f"Cross-validation score: {best_score:.4f}")
            self.util.debug(f"Standard evaluation score: {validation_score:.4f}")
            score_diff = abs(best_score - validation_score)
            self.util.debug(f"Score difference: {score_diff:.4f}")

            if score_diff > 0.1:  # 10% difference threshold
                self.util.debug("WARNING: Large discrepancy between CV and standard evaluation!")
                self.util.debug("This may indicate overfitting to CV folds or inconsistent data handling.")
                self.util.debug("Consider using manual optimization for more consistent results.")

        return best_params, best_score, all_results

    def _convert_to_sklearn_params(self, param_specs):
        """Convert our parameter specifications to sklearn format."""
        # Parameter name mapping from nkululeko names to sklearn names
        param_mapping = {
            # SVM parameters
            "C_val": "C",  # SVM regularization parameter
            "c_val": "C",  # Alternative lowercase version
            # KNN parameters
            "K_val": "n_neighbors",  # KNN number of neighbors
            "k_val": "n_neighbors",  # Alternative lowercase version
            "KNN_weights": "weights",  # KNN weights (uniform/distance)
            "knn_weights": "weights",  # Alternative lowercase version
        }

        sklearn_params = {}
        for param_name, values in param_specs.items():
            # Map parameter names to sklearn equivalents
            sklearn_param_name = param_mapping.get(param_name, param_name)

            if isinstance(values, list):
                sklearn_params[sklearn_param_name] = values
            else:
                # Convert single values to lists
                sklearn_params[sklearn_param_name] = [values]
        return sklearn_params

    def _get_scoring_metric(self):
        """Get the appropriate scoring metric for sklearn optimization."""
        # Create custom scorer for specificity if needed
        if self.metric == "specificity":
            from sklearn.metrics import make_scorer

            def specificity_score(y_true, y_pred):
                import numpy as np
                from sklearn.metrics import confusion_matrix

                cm = confusion_matrix(y_true, y_pred)
                if cm.shape[0] == 2:  # Binary classification
                    tn = cm[0, 0]
                    fp = cm[0, 1]
                    return tn / (tn + fp) if (tn + fp) > 0 else 0.0
                else:  # Multi-class: average specificity
                    specificities = []
                    for i in range(cm.shape[0]):
                        tn = np.sum(cm) - (
                            np.sum(cm[i, :]) + np.sum(cm[:, i]) - cm[i, i]
                        )
                        fp = np.sum(cm[:, i]) - cm[i, i]
                        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
                        specificities.append(specificity)
                    return np.mean(specificities)

            return make_scorer(specificity_score)

        # Standard scikit-learn metrics
        metric_map = {
            "uar": "balanced_accuracy",  # Unweighted Average Recall
            "accuracy": "accuracy",  # Standard accuracy
            "f1": "f1_macro",  # Macro-averaged F1
            "precision": "precision_macro",  # Macro-averaged precision
            "recall": "recall_macro",  # Macro-averaged recall
            "sensitivity": "recall_macro",  # Sensitivity = recall
        }

        if self.util.exp_is_classification():
            return metric_map.get(self.metric or "accuracy", "accuracy")
        else:
            # For regression tasks
            if self.metric in [
                "accuracy",
                "uar",
                "f1",
                "precision",
                "recall",
                "sensitivity",
                "specificity",
            ]:
                self.util.debug(
                    f"Warning: {self.metric} is not suitable for regression, using RMSE"
                )
            return "neg_root_mean_squared_error"

    def _update_config_with_params(self, params):
        """Update configuration with current parameter set."""
        self._ensure_model_section()

        if self.model_type == "mlp":
            self._update_mlp_params(params)
        else:
            self._update_traditional_ml_params(params)

    def _ensure_model_section(self):
        """Ensure MODEL section exists with basic configuration."""
        if "MODEL" not in self.config:
            self.config.add_section("MODEL")

        if "type" not in self.config["MODEL"]:
            self.config["MODEL"]["type"] = self.model_type

    def _update_mlp_params(self, params):
        """Update MLP-specific parameters."""
        if "nlayers" in params and "nnodes" in params:
            nlayers = params["nlayers"]
            nnodes = params["nnodes"]
            layers = {f"l{i+1}": nnodes for i in range(nlayers)}
            self.config["MODEL"]["layers"] = str(layers)

        if "lr" in params:
            self.config["MODEL"]["learning_rate"] = str(params["lr"])

        if "bs" in params:
            self.config["MODEL"]["batch_size"] = str(params["bs"])

        if "do" in params:
            self.config["MODEL"]["drop"] = str(params["do"])

        if "loss" in params:
            self.config["MODEL"]["loss"] = params["loss"]

    def _update_traditional_ml_params(self, params):
        """Update traditional ML parameters using tuning_params approach."""
        # For optimization, we set the specific parameter values directly
        # rather than using the tuning mechanism
        for param_name, param_value in params.items():
            self.config["MODEL"][param_name] = str(param_value)

        # Add random_state to model configuration for consistency
        if self.model_type in ["xgb", "xgr", "svm", "svr", "knn", "knn_reg", "tree", "tree_reg"]:
            self.config["MODEL"]["random_state"] = str(self.random_state)

    def _run_single_experiment(self):
        """Run a single experiment with current configuration."""
        import nkululeko.experiment as exp

        if "MODEL" not in self.config:
            self.config.add_section("MODEL")
        if "type" not in self.config["MODEL"]:
            self.config["MODEL"]["type"] = self.model_type

        expr = exp.Experiment(self.config)
        expr.set_module("optim")

        expr.load_datasets()

        expr.fill_train_and_tests()

        expr.extract_feats()

        expr.init_runmanager()

        reports, last_epochs = expr.run()
        result = expr.get_best_report(reports).result.test

        return result, int(min(last_epochs))

    def save_results(self, filepath=None):
        """Save optimization results to CSV file."""
        if not self.results:
            self.util.debug("No results to save")
            return

        if filepath is None:
            # Save in the results directory instead of current directory
            results_dir = self.util.get_path("res_dir")
            filepath = os.path.join(
                results_dir, f"optimization_results_{self.model_type}.csv"
            )

        import csv

        try:
            with open(filepath, "w", newline="") as csvfile:
                # Get all unique parameter names from all results
                param_names = set()
                for result in self.results:
                    param_names.update(result["params"].keys())
                param_names = sorted(list(param_names))

                fieldnames = param_names + ["score", "result", "epoch"]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                writer.writeheader()
                for result in self.results:
                    row = result["params"].copy()
                    row["score"] = result["score"]
                    row["result"] = result["result"]
                    row["epoch"] = result["epoch"]
                    writer.writerow(row)

            self.util.debug(f"Optimization results saved to {filepath}")
        except Exception as e:
            self.util.error(f"Failed to save results: {e}")

    def get_best_params(self):
        """Get the best parameters found during optimization."""
        if not self.results:
            return None

        best_result = None
        best_score = -float("inf") if self.util.high_is_good() else float("inf")

        for result in self.results:
            score = result["score"]
            is_better = (self.util.high_is_good() and score > best_score) or (
                not self.util.high_is_good() and score < best_score
            )
            if is_better:
                best_score = score
                best_result = result

        return best_result

    def get_recommended_ranges(self, param_name):
        """Get recommended parameter ranges for common hyperparameters."""
        recommendations = {
            "lr": [0.0001, 0.001, 0.01, 0.1],  # Log-scale discrete values
            "do": [0.1, 0.3, 0.5, 0.7],  # Common dropout rates
            "C_val": [0.1, 1.0, 10.0, 100.0],  # SVM regularization
            "c_val": [0.1, 1.0, 10.0, 100.0],  # SVM regularization (alternative)
            "K_val": [3, 5, 7, 9, 11],  # KNN neighbors
            "k_val": [3, 5, 7, 9, 11],  # KNN neighbors (alternative)
            "KNN_weights": ["uniform", "distance"],  # KNN weights
            "knn_weights": ["uniform", "distance"],  # KNN weights (alternative)
            "n_estimators": [50, 100, 200],  # XGB trees
            "max_depth": [3, 6, 9, 12],  # Tree depth
            "subsample": [0.6, 0.8, 1.0],  # XGB subsample
            "learning_rate": [0.01, 0.1, 0.3],  # XGB learning rate
        }
        return recommendations.get(param_name, None)

    def _validate_best_params_standard_eval(self, best_params, expr):
        """Validate the best parameters using standard nkululeko train-test evaluation."""
        try:
            # Set the model parameters to the best found values
            self._update_config_with_params(best_params)

            # Run a single experiment with these parameters using the standard approach
            result, _ = self._run_single_experiment()

            return result
        except Exception as e:
            self.util.debug(f"Standard validation failed: {e}")
            return None

    def _configure_mlp_model(self, model, params):
        """Configure MLP model with current parameters."""
        # Set MLP-specific parameters
        if hasattr(model, 'clf') and hasattr(model.clf, 'set_params'):
            model_params = {}

            # Map optimization parameters to model parameters
            if "lr" in params:
                model_params["learning_rate"] = params["lr"]
            if "do" in params:
                model_params["dropout"] = params["do"]
            if "bs" in params:
                model_params["batch_size"] = params["bs"]

            model.clf.set_params(**model_params)

    def _configure_traditional_model(self, model, params):
        """Configure traditional ML model with current parameters."""
        if hasattr(model, 'clf') and hasattr(model.clf, 'set_params'):
            # Map parameter names for different models
            param_mapping = {
                "C_val": "C",
                "c_val": "C",
                "K_val": "n_neighbors",
                "k_val": "n_neighbors",
                "KNN_weights": "weights",
                "knn_weights": "weights",
            }

            model_params = {}
            for param_name, param_value in params.items():
                sklearn_param = param_mapping.get(param_name, param_name)
                model_params[sklearn_param] = param_value

            model.clf.set_params(**model_params)

    def _extract_score_from_report(self, reports):
        """Extract score from model prediction reports."""
        # This is a simplified version - you may need to adapt based on your report structure
        if isinstance(reports, dict):
            # Try to extract the metric we're optimizing for
            if self.metric in reports:
                return reports[self.metric]
            elif "test" in reports:
                return reports["test"]
            else:
                # Return the first numeric value found
                for key, value in reports.items():
                    if isinstance(value, (int, float)):
                        return value
        elif isinstance(reports, (int, float)):
            return reports
        else:
            # Fallback: assume it's a list and take the first element
            try:
                return reports[0] if hasattr(reports, '__getitem__') else 0.0
            except:
                return 0.0


def doit(config_file):
    """Run hyperparameter optimization experiment."""
    if not os.path.isfile(config_file):
        print(f"ERROR: no such file: {config_file}")
        sys.exit(1)

    config = configparser.ConfigParser()
    config.read(config_file)

    optimizer = OptimizationRunner(config)

    # Start timing the optimization
    start_time = time.time()

    # Run optimization using the unified approach
    try:
        best_params, best_result, all_results = optimizer.run_optimization()
    except Exception as e:
        print(f"Optimization failed: {e}")
        return None, None

    # Calculate optimization time
    end_time = time.time()
    optimization_time = end_time - start_time

    print("OPTIMIZATION COMPLETE")
    print(f"Best parameters: {best_params}")
    print(f"Best result: {best_result}")
    print(
        f"Optimization time: {optimization_time:.2f} seconds ({optimization_time/60:.2f} minutes)"
    )
    print("DONE")
    return best_params, best_result


def main():
    """Main entry point for optimization module."""
    parser = argparse.ArgumentParser(
        description="Run nkululeko hyperparameter optimization."
    )
    parser.add_argument("--version", action="version", version=f"Nkululeko {VERSION}")
    parser.add_argument(
        "--config", default="exp.ini", help="The optimization configuration file"
    )
    args = parser.parse_args()

    config_file = args.config
    doit(config_file)


if __name__ == "__main__":
    main()
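For orientation, a minimal, hypothetical sketch of how the new module could be driven from Python. It assumes an exp.ini that already defines the usual nkululeko sections ([EXP], [DATA], [FEATS], [MODEL]); the [OPTIM] keys shown (model, search_strategy, metric, plus one parameter search list) match what parse_optim_params() reads above, but the concrete values are illustrative only, not taken from the package documentation:

# Hypothetical usage sketch; values are illustrative.
import configparser

from nkululeko.optim import OptimizationRunner

config = configparser.ConfigParser()
config.read("exp.ini")  # assumed to already define [EXP], [DATA], [FEATS], [MODEL]
config["OPTIM"] = {
    "model": "svm",                # model type read by parse_optim_params()
    "search_strategy": "grid",     # "grid" is the default; grid_cv/random_cv switch to cross-validation
    "metric": "uar",               # mapped to balanced_accuracy for sklearn searches
    "c_val": "[0.1, 1.0, 10.0]",   # parameter lists are parsed with ast.literal_eval
}

runner = OptimizationRunner(config)
best_params, best_result, all_results = runner.run_optimization()
print(best_params, best_result)

Since the module defines main() with a --config argument and a __main__ guard, the same configuration file can presumably also be passed on the command line, e.g. python -m nkululeko.optim --config exp.ini.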