nkululeko 0.95.2__py3-none-any.whl → 0.95.4__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
nkululeko/optim.py CHANGED
@@ -1,880 +1,15 @@
  #!/usr/bin/env python3

  import argparse
- import ast
  import configparser
- import itertools
  import os
- import random
  import sys
  import time

- import numpy as np
-
  from nkululeko.constants import VERSION
- from nkululeko.utils.util import Util
-
-
- class OptimizationRunner:
-     """Hyperparameter optimization runner for nkululeko experiments."""
-
-     def __init__(self, config):
-         self.config = config
-         self.util = Util("optim")
-         self.results = []
-         self.model_type = None # Will be set when parsing OPTIM params
-         # New: Optimization strategy configuration
-         self.search_strategy = "grid" # Default values
-         self.n_iter = 50
-         self.cv_folds = 3
-         self.metric = "accuracy"
-         self.random_state = 42 # Default random state for reproducibility
-
-     def parse_optim_params(self):
-         """Parse OPTIM section parameters into search spaces."""
-         if "OPTIM" not in self.config:
-             self.util.error("No [OPTIM] section found in configuration")
-
-         optim_config = self.config["OPTIM"]
-         self.model_type = optim_config.get("model", "mlp")
-
-         # Parse optimization strategy settings
-         self.search_strategy = optim_config.get("search_strategy", "grid")
-         self.n_iter = int(optim_config.get("n_iter", "50"))
-         self.cv_folds = int(optim_config.get("cv_folds", "3"))
-         self.random_state = int(optim_config.get("random_state", "42"))
-
-         # Set global random seeds for reproducibility
-         random.seed(self.random_state)
-         np.random.seed(self.random_state)
-
-         self.util.debug(f"Using random state: {self.random_state} for reproducibility")
-
-         self.metric = optim_config.get("metric", "accuracy").lower()
-
-         self.util.debug(f"Parsed metric from config: '{self.metric}'") # Debug line
-
-         param_specs = {}
-         for key, value in optim_config.items():
-             if key in ["model", "search_strategy", "n_iter", "cv_folds", "metric", "random_state"]:
-                 continue
-             param_specs[key] = self._parse_param_spec(key, value)
-
-         return param_specs
-
-     def _parse_param_spec(self, param_name, param_value):
-         """Parse individual parameter specification."""
-         try:
-             parsed = ast.literal_eval(param_value)
-         except (ValueError, SyntaxError) as e:
-             self.util.debug(
-                 f"Could not parse parameter {param_name}={param_value} as literal, treating as string: {e}"
-             )
-             if isinstance(param_value, str):
-                 return [param_value]
-             return param_value
-
-         # Check for inefficient learning rate ranges and suggest better alternatives
-         if param_name == "lr" and isinstance(parsed, tuple) and len(parsed) == 3:
-             min_val, max_val, step = parsed
-             if step <= 0.0001 and (max_val - min_val) / step > 20:
-                 self.util.debug(
-                     f"WARNING: Learning rate range {param_value} will generate {int((max_val - min_val) / step)} values!"
-                 )
-                 self.util.debug(
-                     "Consider using discrete values like [0.0001, 0.001, 0.01, 0.1] or range (0.0001, 0.1) for log-scale sampling"
-                 )
-
-         if isinstance(parsed, tuple):
-             if len(parsed) == 2:
-                 return self._generate_range(parsed[0], parsed[1], param_name)
-             elif len(parsed) == 3:
-                 return self._generate_range_with_step(
-                     parsed[0], parsed[1], parsed[2], param_name
-                 )
-             else:
-                 self.util.error(
-                     f"Invalid tuple format for parameter {param_name}: {param_value}. Expected (min, max) or (min, max, step)"
-                 )
-                 return [parsed[0]] # Fallback to first value
-         elif isinstance(parsed, list):
-             return parsed
-         else:
-             return [parsed]
-
-     def _generate_range(self, min_val, max_val, param_name):
-         """Generate parameter range based on parameter type."""
-         if param_name in ["nlayers"]:
-             return list(range(min_val, max_val + 1))
-         elif param_name in ["nnodes", "bs"]:
-             result = []
-             current = min_val
-             while current <= max_val:
-                 result.append(current)
-                 current *= 2
-             return result
-         elif param_name in ["lr"]:
-             # For learning rate, use logarithmic scale sampling (more practical)
-             # Generate 5-8 values on log scale between min and max
-             num_samples = min(8, max(5, int(np.log10(max_val / min_val) * 2)))
-             log_min = np.log10(min_val)
-             log_max = np.log10(max_val)
-             log_values = np.linspace(log_min, log_max, num_samples)
-             result = [round(10**log_val, 6) for log_val in log_values]
-             return result
-         elif param_name in ["do"]:
-             # For dropout, generate reasonable steps
-             num_steps = 5
-             step = (max_val - min_val) / num_steps
-             result = []
-             current = min_val
-             while current <= max_val + step / 2:
-                 result.append(round(current, 2))
-                 current += step
-             return result
-         else:
-             return list(range(min_val, max_val + 1))
-
-     def _generate_range_with_step(self, min_val, max_val, step, param_name):
-         """Generate parameter range with explicit step."""
-         if (
-             isinstance(step, float)
-             or isinstance(min_val, float)
-             or isinstance(max_val, float)
-         ):
-             result = []
-             current = float(min_val)
-             step = float(step)
-             max_val = float(max_val)
-             while current <= max_val + step / 2:
-                 result.append(round(current, 6)) # More precision for floats
-                 current += step
-             return result
-         else:
-             return list(range(min_val, max_val + 1, step))
-
-     def generate_param_combinations(self, param_specs):
-         """Generate all parameter combinations for grid search."""
-         param_names = list(param_specs.keys())
-         param_values = list(param_specs.values())
-
-         combinations = []
-         for combo in itertools.product(*param_values):
-             param_dict = dict(zip(param_names, combo))
-             combinations.append(param_dict)
-
-         return combinations
-
-     def run_optimization(self):
-         """Run hyperparameter optimization using the most appropriate method."""
-         self.util.debug(
-             f"Starting optimization using {self.search_strategy} strategy with {self.metric} metric, nkululeko version {VERSION}"
-         )
-
-         param_specs = self.parse_optim_params()
-
-         if not param_specs:
-             self.util.error("No optimization parameters found in [OPTIM] section")
-             return None, None, []
-
-         # Always use manual optimization to ensure consistent evaluation pipeline
-         # This prevents discrepancies between CV and final evaluation
-         self.util.debug("Using manual optimization for consistent evaluation pipeline")
-         return self._run_manual_optimization(param_specs)
-
-     def _run_manual_optimization(self, param_specs):
-         """Run manual grid search optimization with consistent evaluation pipeline."""
-         combinations = self.generate_param_combinations(param_specs)
-
-         if not combinations:
-             self.util.error("No parameter combinations generated")
-             return None, None, []
-
-         self.util.debug(
-             f"Starting manual optimization with {len(combinations)} parameter combinations"
-         )
-
-         # Check if we should use cross-validation or train-test split
-         use_cv = self.search_strategy in ["grid_cv", "random_cv"] or (
-             hasattr(self, 'use_cv_in_manual') and self.use_cv_in_manual
-         )
-
-         if use_cv:
-             return self._run_manual_cv_optimization(combinations, param_specs)
-         else:
-             return self._run_manual_train_test_optimization(combinations)
-
-     def _run_manual_train_test_optimization(self, combinations):
-         """Run manual optimization using train-test split (matches final evaluation)."""
-         best_result = None
-         best_params = None
-         best_score = -float("inf") if self.util.high_is_good() else float("inf")
-
-         for i, params in enumerate(combinations):
-             self.util.debug(f"Testing combination {i+1}/{len(combinations)}: {params}")
-
-             self._update_config_with_params(params)
-
-             try:
-                 result, last_epoch = self._run_single_experiment()
-                 score = result # result.test is already a numeric value
-
-                 result_entry = {
-                     "params": params.copy(),
-                     "score": score,
-                     "result": result,
-                     "epoch": last_epoch,
-                 }
-                 self.results.append(result_entry)
-
-                 is_better = (self.util.high_is_good() and score > best_score) or (
-                     not self.util.high_is_good() and score < best_score
-                 )
-
-                 if is_better:
-                     best_score = score
-                     best_result = result
-                     best_params = params.copy()
-
-                 self.util.debug(f"Result: {result}, Score: {score}")
-
-             except Exception as e:
-                 self.util.error(f"Failed with params {params}: {str(e)}")
-                 # Log the full traceback for debugging
-                 import traceback
-
-                 self.util.debug(f"Full traceback: {traceback.format_exc()}")
-                 continue
-
-         self.util.debug("Optimization complete!")
-         self.util.debug(f"Best parameters: {best_params}")
-         self.util.debug(f"Best result: {best_result}")
-
-         # Save results to file
-         self.save_results()
-
-         return best_params, best_result, self.results
-
-     def _run_manual_cv_optimization(self, combinations, param_specs):
-         """Run manual optimization using cross-validation."""
-         import numpy as np
-         from sklearn.model_selection import StratifiedKFold
-
-         self.util.debug("Using cross-validation for optimization (may differ from final evaluation)")
-
-         # Set up the experiment once to get the data
-         import nkululeko.experiment as exp
-         expr = exp.Experiment(self.config)
-         expr.set_module("optim")
-         expr.load_datasets()
-         expr.fill_train_and_tests()
-         expr.extract_feats()
-
-         # Create stratified CV splits
-         cv_splitter = StratifiedKFold(
-             n_splits=self.cv_folds,
-             shuffle=True,
-             random_state=self.random_state
-         )
-
-         best_result = None
-         best_params = None
-         best_score = -float("inf") if self.util.high_is_good() else float("inf")
-
-         for i, params in enumerate(combinations):
-             self.util.debug(f"Testing combination {i+1}/{len(combinations)}: {params}")
-
-             # Run cross-validation for this parameter combination
-             cv_scores = []
-
-             try:
-                 for fold, (train_idx, val_idx) in enumerate(cv_splitter.split(
-                     expr.feats_train, expr.df_train[self.config["DATA"]["target"]]
-                 )):
-                     self.util.debug(f" Fold {fold+1}/{self.cv_folds}")
-
-                     # Create fold-specific data
-                     fold_train_feats = expr.feats_train.iloc[train_idx]
-                     fold_val_feats = expr.feats_train.iloc[val_idx]
-                     fold_train_df = expr.df_train.iloc[train_idx]
-                     fold_val_df = expr.df_train.iloc[val_idx]
-
-                     # Update config with current parameters
-                     self._update_config_with_params(params)
-
-                     # Run experiment on this fold
-                     fold_score = self._run_cv_fold(
-                         fold_train_feats, fold_val_feats,
-                         fold_train_df, fold_val_df, params
-                     )
-                     cv_scores.append(fold_score)
-
-                 # Calculate mean CV score
-                 mean_score = np.mean(cv_scores)
-                 std_score = np.std(cv_scores)
-
-                 result_entry = {
-                     "params": params.copy(),
-                     "score": mean_score,
-                     "result": mean_score,
-                     "cv_std": std_score,
-                     "cv_scores": cv_scores,
-                     "epoch": 0,
-                 }
-                 self.results.append(result_entry)
-
-                 is_better = (self.util.high_is_good() and mean_score > best_score) or (
-                     not self.util.high_is_good() and mean_score < best_score
-                 )
-
-                 if is_better:
-                     best_score = mean_score
-                     best_result = mean_score
-                     best_params = params.copy()
-
-                 self.util.debug(f"CV Score: {mean_score:.4f} ± {std_score:.4f}")
-
-             except Exception as e:
-                 self.util.error(f"Failed with params {params}: {str(e)}")
-                 continue
-
-         self.util.debug("Cross-validation optimization complete!")
-         self.util.debug(f"Best parameters: {best_params}")
-         self.util.debug(f"Best CV score: {best_result}")
-
-         # Validate with final evaluation pipeline
-         if best_params:
-             validation_score = self._validate_best_params_standard_eval(best_params, expr)
-             if validation_score is not None:
-                 self.util.debug(f"Cross-validation score: {best_result:.4f}")
-                 self.util.debug(f"Standard evaluation score: {validation_score:.4f}")
-                 score_diff = abs(best_result - validation_score)
-                 self.util.debug(f"Score difference: {score_diff:.4f}")
-
-                 if score_diff > 0.1: # 10% difference threshold
-                     self.util.debug("WARNING: Large discrepancy between CV and standard evaluation!")
-                     self.util.debug("Consider using train-test optimization for more consistent results.")
-
-         # Save results to file
-         self.save_results()
-
-         return best_params, best_result, self.results
-
-     def _run_cv_fold(self, train_feats, val_feats, train_df, val_df, params):
-         """Run a single cross-validation fold."""
-         from nkululeko.modelrunner import Modelrunner
-
-         # Create a temporary runner for this fold
-         runner = Modelrunner(train_df, val_df, train_feats, val_feats, 0)
-         runner._select_model(self.model_type)
-
-         # Configure model with current parameters
-         if self.model_type == "mlp":
-             self._configure_mlp_model(runner.model, params)
-         else:
-             self._configure_traditional_model(runner.model, params)
-
-         # Train and evaluate
-         runner.model.train()
-         reports = runner.model.predict()
-
-         # Extract score based on metric
-         return self._extract_score_from_report(reports)
-
-     def _run_sklearn_optimization(self, param_specs):
-         """Run optimization using scikit-learn's hyperparameter search methods with consistent data handling."""
-         from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
-                                              StratifiedKFold)
-
-         # Import the actual experiment to get the model and data
-         import nkululeko.experiment as exp
-
-         # Set up the experiment
-         expr = exp.Experiment(self.config)
-         expr.set_module("optim")
-         expr.load_datasets()
-         expr.fill_train_and_tests()
-         expr.extract_feats()
-
-         # Apply the same balancing as the final evaluation
-         original_train_feats = expr.feats_train.copy()
-         original_train_df = expr.df_train.copy()
-
-         if "FEATS" in self.config and "balancing" in self.config["FEATS"]:
-             balancing_method = self.config["FEATS"]["balancing"]
-             if balancing_method and balancing_method.lower() != "none":
-                 self.util.debug(f"Applying {balancing_method} balancing for optimization consistency")
-                 try:
-                     from nkululeko.balance import DataBalancer
-                     balancer = DataBalancer()
-                     expr.feats_train, expr.df_train = balancer.balance_features(
-                         expr.df_train, expr.feats_train, self.config["DATA"]["target"], balancing_method
-                     )
-                     self.util.debug(f"Balanced training data: {len(expr.feats_train)} samples")
-                 except Exception as e:
-                     self.util.debug(f"Balancing failed: {e}, using original data")
-                     expr.feats_train = original_train_feats
-                     expr.df_train = original_train_df
-
-         # Get the base model without hyperparameter tuning
-         original_tuning_params = self.config.get(
-             "MODEL", "tuning_params", fallback=None
-         )
-         if "MODEL" not in self.config:
-             self.config.add_section("MODEL")
-
-         # Temporarily disable tuning_params to get base model
-         if original_tuning_params:
-             self.config.remove_option("MODEL", "tuning_params")
-
-         # Create a model instance using the modelrunner approach
-         from nkululeko.modelrunner import Modelrunner
-
-         runner = Modelrunner(
-             expr.df_train, expr.df_test, expr.feats_train, expr.feats_test, 0
-         )
-         runner._select_model(self.model_type)
-         base_clf = runner.model.clf
-
-         # Restore original tuning_params if it existed
-         if original_tuning_params:
-             self.config.set("MODEL", "tuning_params", original_tuning_params)
-
-         # Convert parameter specifications to sklearn format
-         sklearn_params = self._convert_to_sklearn_params(param_specs)
-
-         # Create stratified CV for consistent cross-validation
-         cv = StratifiedKFold(
-             n_splits=self.cv_folds,
-             shuffle=True,
-             random_state=self.random_state
-         )
-
-         # Choose search strategy
-         if self.search_strategy == "random":
-             search = RandomizedSearchCV(
-                 base_clf,
-                 sklearn_params,
-                 n_iter=self.n_iter,
-                 cv=cv, # Use stratified CV
-                 scoring=self._get_scoring_metric(),
-                 random_state=self.random_state,
-                 n_jobs=-1,
-                 verbose=1,
-             )
-         elif self.search_strategy == "halving_random":
-             try:
-                 from sklearn.model_selection import HalvingRandomSearchCV
-
-                 search = HalvingRandomSearchCV(
-                     base_clf,
-                     sklearn_params,
-                     cv=cv, # Use stratified CV
-                     scoring=self._get_scoring_metric(),
-                     random_state=self.random_state,
-                     n_jobs=-1,
-                     verbose=1,
-                 )
-             except ImportError:
-                 self.util.debug(
-                     "HalvingRandomSearchCV not available, falling back to RandomizedSearchCV"
-                 )
-                 search = RandomizedSearchCV(
-                     base_clf,
-                     sklearn_params,
-                     n_iter=self.n_iter,
-                     cv=cv, # Use stratified CV
-                     scoring=self._get_scoring_metric(),
-                     random_state=self.random_state,
-                     n_jobs=-1,
-                     verbose=1,
-                 )
-         elif self.search_strategy == "halving_grid":
-             try:
-                 from sklearn.model_selection import HalvingGridSearchCV
-
-                 search = HalvingGridSearchCV(
-                     base_clf,
-                     sklearn_params,
-                     cv=cv, # Use stratified CV
-                     scoring=self._get_scoring_metric(),
-                     random_state=self.random_state,
-                     n_jobs=-1,
-                     verbose=1,
-                 )
-             except ImportError:
-                 self.util.debug(
-                     "HalvingGridSearchCV not available, falling back to GridSearchCV"
-                 )
-                 search = GridSearchCV(
-                     base_clf,
-                     sklearn_params,
-                     cv=cv, # Use stratified CV
-                     scoring=self._get_scoring_metric(),
-                     n_jobs=-1,
-                     verbose=1,
-                 )
-         else: # grid search (default)
-             search = GridSearchCV(
-                 base_clf,
-                 sklearn_params,
-                 cv=cv, # Use stratified CV
-                 scoring=self._get_scoring_metric(),
-                 n_jobs=-1,
-                 verbose=1,
-             )
-
-         self.util.debug(
-             f"Starting {self.search_strategy} search with {len(sklearn_params)} parameters"
-         )
-         self.util.debug(f"Using stratified {self.cv_folds}-fold cross-validation")
-
-         # Fit the search
-         search.fit(expr.feats_train, expr.df_train[self.config["DATA"]["target"]])
-
-         # Extract results
-         best_params = search.best_params_
-         best_score = search.best_score_
-
-         # Convert results back to our format
-         all_results = []
-         for i, (params, score) in enumerate(
-             zip(search.cv_results_["params"], search.cv_results_["mean_test_score"])
-         ):
-             result_entry = {
-                 "params": params,
-                 "score": score,
-                 "result": score,
-                 "epoch": 0,
-             }
-             all_results.append(result_entry)
-
-         self.results = all_results
-
-         self.util.debug("Optimization complete!")
-         self.util.debug(f"Best parameters: {best_params}")
-         self.util.debug(f"Best score: {best_score}")
-
-         # Save results
-         self.save_results()
-
-         # Validate best parameters using standard nkululeko evaluation for consistency
-         validation_score = self._validate_best_params_standard_eval(best_params, expr)
-         if validation_score is not None:
-             self.util.debug(f"Cross-validation score: {best_score:.4f}")
-             self.util.debug(f"Standard evaluation score: {validation_score:.4f}")
-             score_diff = abs(best_score - validation_score)
-             self.util.debug(f"Score difference: {score_diff:.4f}")
-
-             if score_diff > 0.1: # 10% difference threshold
-                 self.util.debug("WARNING: Large discrepancy between CV and standard evaluation!")
-                 self.util.debug("This may indicate overfitting to CV folds or inconsistent data handling.")
-                 self.util.debug("Consider using manual optimization for more consistent results.")
-
-         return best_params, best_score, all_results
-
-     def _convert_to_sklearn_params(self, param_specs):
-         """Convert our parameter specifications to sklearn format."""
-         # Parameter name mapping from nkululeko names to sklearn names
-         param_mapping = {
-             # SVM parameters
-             "C_val": "C", # SVM regularization parameter
-             "c_val": "C", # Alternative lowercase version
-             # KNN parameters
-             "K_val": "n_neighbors", # KNN number of neighbors
-             "k_val": "n_neighbors", # Alternative lowercase version
-             "KNN_weights": "weights", # KNN weights (uniform/distance)
-             "knn_weights": "weights", # Alternative lowercase version
-         }
-
-         sklearn_params = {}
-         for param_name, values in param_specs.items():
-             # Map parameter names to sklearn equivalents
-             sklearn_param_name = param_mapping.get(param_name, param_name)
-
-             if isinstance(values, list):
-                 sklearn_params[sklearn_param_name] = values
-             else:
-                 # Convert single values to lists
-                 sklearn_params[sklearn_param_name] = [values]
-         return sklearn_params
-
-     def _get_scoring_metric(self):
-         """Get the appropriate scoring metric for sklearn optimization."""
-         # Create custom scorer for specificity if needed
-         if self.metric == "specificity":
-             from sklearn.metrics import make_scorer
-
-             def specificity_score(y_true, y_pred):
-                 import numpy as np
-                 from sklearn.metrics import confusion_matrix
-
-                 cm = confusion_matrix(y_true, y_pred)
-                 if cm.shape[0] == 2: # Binary classification
-                     tn = cm[0, 0]
-                     fp = cm[0, 1]
-                     return tn / (tn + fp) if (tn + fp) > 0 else 0.0
-                 else: # Multi-class: average specificity
-                     specificities = []
-                     for i in range(cm.shape[0]):
-                         tn = np.sum(cm) - (
-                             np.sum(cm[i, :]) + np.sum(cm[:, i]) - cm[i, i]
-                         )
-                         fp = np.sum(cm[:, i]) - cm[i, i]
-                         specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
-                         specificities.append(specificity)
-                     return np.mean(specificities)
-
-             return make_scorer(specificity_score)
-
-         # Standard scikit-learn metrics
-         metric_map = {
-             "uar": "balanced_accuracy", # Unweighted Average Recall
-             "accuracy": "accuracy", # Standard accuracy
-             "f1": "f1_macro", # Macro-averaged F1
-             "precision": "precision_macro", # Macro-averaged precision
-             "recall": "recall_macro", # Macro-averaged recall
-             "sensitivity": "recall_macro", # Sensitivity = recall
-         }
-
-         if self.util.exp_is_classification():
-             return metric_map.get(self.metric or "accuracy", "accuracy")
-         else:
-             # For regression tasks
-             if self.metric in [
-                 "accuracy",
-                 "uar",
-                 "f1",
-                 "precision",
-                 "recall",
-                 "sensitivity",
-                 "specificity",
-             ]:
-                 self.util.debug(
-                     f"Warning: {self.metric} is not suitable for regression, using RMSE"
-                 )
-             return "neg_root_mean_squared_error"
-
-     def _update_config_with_params(self, params):
-         """Update configuration with current parameter set."""
-         self._ensure_model_section()
-
-         if self.model_type == "mlp":
-             self._update_mlp_params(params)
-         else:
-             self._update_traditional_ml_params(params)
-
-     def _ensure_model_section(self):
-         """Ensure MODEL section exists with basic configuration."""
-         if "MODEL" not in self.config:
-             self.config.add_section("MODEL")
-
-         if "type" not in self.config["MODEL"]:
-             self.config["MODEL"]["type"] = self.model_type
-
-     def _update_mlp_params(self, params):
-         """Update MLP-specific parameters."""
-         if "nlayers" in params and "nnodes" in params:
-             nlayers = params["nlayers"]
-             nnodes = params["nnodes"]
-             layers = {f"l{i+1}": nnodes for i in range(nlayers)}
-             self.config["MODEL"]["layers"] = str(layers)
-
-         if "lr" in params:
-             self.config["MODEL"]["learning_rate"] = str(params["lr"])
-
-         if "bs" in params:
-             self.config["MODEL"]["batch_size"] = str(params["bs"])
-
-         if "do" in params:
-             self.config["MODEL"]["drop"] = str(params["do"])
-
-         if "loss" in params:
-             self.config["MODEL"]["loss"] = params["loss"]
-
-     def _update_traditional_ml_params(self, params):
-         """Update traditional ML parameters using tuning_params approach."""
-         # For optimization, we set the specific parameter values directly
-         # rather than using the tuning mechanism
-         for param_name, param_value in params.items():
-             self.config["MODEL"][param_name] = str(param_value)
-
-         # Add random_state to model configuration for consistency
-         if self.model_type in ["xgb", "xgr", "svm", "svr", "knn", "knn_reg", "tree", "tree_reg"]:
-             self.config["MODEL"]["random_state"] = str(self.random_state)
-
-     def _run_single_experiment(self):
-         """Run a single experiment with current configuration."""
-         import nkululeko.experiment as exp
-
-         if "MODEL" not in self.config:
-             self.config.add_section("MODEL")
-         if "type" not in self.config["MODEL"]:
-             self.config["MODEL"]["type"] = self.model_type
-
-         expr = exp.Experiment(self.config)
-         expr.set_module("optim")
-
-         expr.load_datasets()
-
-         expr.fill_train_and_tests()
-
-         expr.extract_feats()
-
-         expr.init_runmanager()
-
-         reports, last_epochs = expr.run()
-         result = expr.get_best_report(reports).result.test
-
-         return result, int(min(last_epochs))
-
-     def save_results(self, filepath=None):
-         """Save optimization results to CSV file."""
-         if not self.results:
-             self.util.debug("No results to save")
-             return
-
-         if filepath is None:
-             # Save in the results directory instead of current directory
-             results_dir = self.util.get_path("res_dir")
-             filepath = os.path.join(
-                 results_dir, f"optimization_results_{self.model_type}.csv"
-             )
-
-         import csv
-
-         try:
-             with open(filepath, "w", newline="") as csvfile:
-                 # Get all unique parameter names from all results
-                 param_names = set()
-                 for result in self.results:
-                     param_names.update(result["params"].keys())
-                 param_names = sorted(list(param_names))
-
-                 fieldnames = param_names + ["score", "result", "epoch"]
-                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-
-                 writer.writeheader()
-                 for result in self.results:
-                     row = result["params"].copy()
-                     row["score"] = result["score"]
-                     row["result"] = result["result"]
-                     row["epoch"] = result["epoch"]
-                     writer.writerow(row)
-
-             self.util.debug(f"Optimization results saved to {filepath}")
-         except Exception as e:
-             self.util.error(f"Failed to save results: {e}")
-
-     def get_best_params(self):
-         """Get the best parameters found during optimization."""
-         if not self.results:
-             return None
-
-         best_result = None
-         best_score = -float("inf") if self.util.high_is_good() else float("inf")
-
-         for result in self.results:
-             score = result["score"]
-             is_better = (self.util.high_is_good() and score > best_score) or (
-                 not self.util.high_is_good() and score < best_score
-             )
-             if is_better:
-                 best_score = score
-                 best_result = result
-
-         return best_result
-
-     def get_recommended_ranges(self, param_name):
-         """Get recommended parameter ranges for common hyperparameters."""
-         recommendations = {
-             "lr": [0.0001, 0.001, 0.01, 0.1], # Log-scale discrete values
-             "do": [0.1, 0.3, 0.5, 0.7], # Common dropout rates
-             "C_val": [0.1, 1.0, 10.0, 100.0], # SVM regularization
-             "c_val": [0.1, 1.0, 10.0, 100.0], # SVM regularization (alternative)
-             "K_val": [3, 5, 7, 9, 11], # KNN neighbors
-             "k_val": [3, 5, 7, 9, 11], # KNN neighbors (alternative)
-             "KNN_weights": ["uniform", "distance"], # KNN weights
-             "knn_weights": ["uniform", "distance"], # KNN weights (alternative)
-             "n_estimators": [50, 100, 200], # XGB trees
-             "max_depth": [3, 6, 9, 12], # Tree depth
-             "subsample": [0.6, 0.8, 1.0], # XGB subsample
-             "learning_rate": [0.01, 0.1, 0.3], # XGB learning rate
-         }
-         return recommendations.get(param_name, None)
-
-     def _validate_best_params_standard_eval(self, best_params, expr):
-         """Validate the best parameters using standard nkululeko train-test evaluation."""
-         try:
-             # Set the model parameters to the best found values
-             self._update_config_with_params(best_params)
-
-             # Run a single experiment with these parameters using the standard approach
-             result, _ = self._run_single_experiment()
-
-             return result
-         except Exception as e:
-             self.util.debug(f"Standard validation failed: {e}")
-             return None
-
-     def _configure_mlp_model(self, model, params):
-         """Configure MLP model with current parameters."""
-         # Set MLP-specific parameters
-         if hasattr(model, 'clf') and hasattr(model.clf, 'set_params'):
-             model_params = {}
-
-             # Map optimization parameters to model parameters
-             if "lr" in params:
-                 model_params["learning_rate"] = params["lr"]
-             if "do" in params:
-                 model_params["dropout"] = params["do"]
-             if "bs" in params:
-                 model_params["batch_size"] = params["bs"]
-
-             model.clf.set_params(**model_params)
-
-     def _configure_traditional_model(self, model, params):
-         """Configure traditional ML model with current parameters."""
-         if hasattr(model, 'clf') and hasattr(model.clf, 'set_params'):
-             # Map parameter names for different models
-             param_mapping = {
-                 "C_val": "C",
-                 "c_val": "C",
-                 "K_val": "n_neighbors",
-                 "k_val": "n_neighbors",
-                 "KNN_weights": "weights",
-                 "knn_weights": "weights",
-             }
-
-             model_params = {}
-             for param_name, param_value in params.items():
-                 sklearn_param = param_mapping.get(param_name, param_name)
-                 model_params[sklearn_param] = param_value
-
-             model.clf.set_params(**model_params)

-     def _extract_score_from_report(self, reports):
-         """Extract score from model prediction reports."""
-         # This is a simplified version - you may need to adapt based on your report structure
-         if isinstance(reports, dict):
-             # Try to extract the metric we're optimizing for
-             if self.metric in reports:
-                 return reports[self.metric]
-             elif "test" in reports:
-                 return reports["test"]
-             else:
-                 # Return the first numeric value found
-                 for key, value in reports.items():
-                     if isinstance(value, (int, float)):
-                         return value
-         elif isinstance(reports, (int, float)):
-             return reports
-         else:
-             # Fallback: assume it's a list and take the first element
-             try:
-                 return reports[0] if hasattr(reports, '__getitem__') else 0.0
-             except:
-                 return 0.0
+ # Import the OptimizationRunner class from the dedicated module
+ from nkululeko.optimizationrunner import OptimizationRunner


  def doit(config_file):
@@ -902,10 +37,7 @@ def doit(config_file):
      end_time = time.time()
      optimization_time = end_time - start_time

-     print("OPTIMIZATION COMPLETE")
-     print(f"Best parameters: {best_params}")
-     print(f"Best result: {best_result}")
-     print(
+     optimizer.util.debug(
          f"Optimization time: {optimization_time:.2f} seconds ({optimization_time/60:.2f} minutes)"
      )
      print("DONE")