nkululeko 0.95.2__py3-none-any.whl → 0.95.4__py3-none-any.whl

This diff represents the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
@@ -0,0 +1,1118 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import ast
4
+ import itertools
5
+ import os
6
+ import random
7
+
8
+ import numpy as np
9
+
10
+ from nkululeko.constants import VERSION
11
+ from nkululeko.utils.util import Util
12
+
13
+
14
+ class OptimizationRunner:
15
+ """Hyperparameter optimization runner for nkululeko experiments."""
16
+
17
+ def __init__(self, config):
18
+ self.config = config
19
+ self.util = Util("optim")
20
+ self.results = []
21
+ self.model_type = None # Will be set when parsing OPTIM params
22
+ # New: Optimization strategy configuration
23
+ self.search_strategy = "grid" # Default values
24
+ self.n_iter = 50
25
+ self.cv_folds = None
26
+ self.metric = "accuracy"
27
+ self.random_state = 42 # Default random state for reproducibility
28
+
29
+ def parse_optim_params(self):
30
+ """Parse OPTIM section parameters into search spaces."""
31
+ if "OPTIM" not in self.config:
32
+ self.util.error("No [OPTIM] section found in configuration")
33
+
34
+ optim_config = self.config["OPTIM"]
35
+ self.model_type = optim_config.get("model", "mlp")
36
+
37
+ # Parse optimization strategy settings
38
+ self.search_strategy = optim_config.get("search_strategy", "grid")
39
+ self.n_iter = int(optim_config.get("n_iter", "50"))
40
+ self.cv_folds = int(optim_config.get("cv_folds", "3"))
41
+ self.random_state = int(optim_config.get("random_state", "42"))
42
+
43
+ # Set global random seeds for reproducibility
44
+ random.seed(self.random_state)
45
+ np.random.seed(self.random_state)
46
+
47
+ self.util.debug(f"Using random state: {self.random_state} for reproducibility")
48
+
49
+ self.metric = optim_config.get("metric", "accuracy").lower()
50
+
51
+ self.util.debug(f"Parsed metric from config: '{self.metric}'") # Debug line
52
+
53
+ param_specs = {}
54
+ for key, value in optim_config.items():
55
+ if key in [
56
+ "model",
57
+ "search_strategy",
58
+ "n_iter",
59
+ "cv_folds",
60
+ "metric",
61
+ "random_state",
62
+ ]:
63
+ continue
64
+ param_specs[key] = self._parse_param_spec(key, value)
65
+
66
+ return param_specs
67
+
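For illustration, a minimal configuration this parser would accept could look like the sketch below (the parameter names and values are only examples; any key other than the reserved strategy options is treated as a search dimension, and configparser lower-cases option names by default):

    import configparser
    import textwrap

    # Hypothetical [OPTIM] section; parameter names and values are illustrative only.
    cfg = configparser.ConfigParser()
    cfg.read_string(textwrap.dedent("""\
        [OPTIM]
        model = svm
        search_strategy = grid
        cv_folds = 3
        metric = uar
        random_state = 42
        C_val = [0.1, 1.0, 10.0]
        kernel = ["linear", "rbf"]
        """))

    # parse_optim_params() would skip the reserved strategy keys and expand the
    # rest, yielding roughly:
    #   {"c_val": [0.1, 1.0, 10.0], "kernel": ["linear", "rbf"]}
    # (configparser lower-cases option names by default, hence "c_val".)
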
68
+ def _parse_param_spec(self, param_name, param_value):
69
+ """Parse individual parameter specification."""
70
+ try:
71
+ parsed = ast.literal_eval(param_value)
72
+ except (ValueError, SyntaxError) as e:
73
+ self.util.debug(
74
+ f"Could not parse parameter {param_name}={param_value} as literal, treating as string: {e}"
75
+ )
76
+ if isinstance(param_value, str):
77
+ return [param_value]
78
+ return param_value
79
+
80
+ # Check for inefficient learning rate ranges and suggest better alternatives
81
+ if param_name == "lr" and isinstance(parsed, tuple) and len(parsed) == 3:
82
+ min_val, max_val, step = parsed
83
+ if step <= 0.0001 and (max_val - min_val) / step > 20:
84
+ self.util.debug(
85
+ f"WARNING: Learning rate range {param_value} will generate {int((max_val - min_val) / step)} values!"
86
+ )
87
+ self.util.debug(
88
+ "Consider using discrete values like [0.0001, 0.001, 0.01, 0.1] or range (0.0001, 0.1) for log-scale sampling"
89
+ )
90
+
91
+ if isinstance(parsed, tuple):
92
+ if len(parsed) == 2:
93
+ return self._generate_range(parsed[0], parsed[1], param_name)
94
+ elif len(parsed) == 3:
95
+ return self._generate_range_with_step(
96
+ parsed[0], parsed[1], parsed[2], param_name
97
+ )
98
+ else:
99
+ self.util.error(
100
+ f"Invalid tuple format for parameter {param_name}: {param_value}. Expected (min, max) or (min, max, step)"
101
+ )
102
+ return [parsed[0]] # Fallback to first value
103
+ elif isinstance(parsed, list):
104
+ return parsed
105
+ else:
106
+ return [parsed]
107
+
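As a rough guide to the three value formats accepted here, assuming the range helpers defined just below (parameter names and numbers are illustrative):

    import ast

    # lr = [0.0001, 0.001, 0.01]  -> literal list, used as given
    # nlayers = (1, 3)            -> (min, max) tuple, expanded to [1, 2, 3]
    # do = (0.2, 0.5, 0.1)        -> (min, max, step) tuple, expanded to
    #                                [0.2, 0.3, 0.4, 0.5]
    # loss = cross                -> not a Python literal, kept as ["cross"]
    print(ast.literal_eval("(1, 3)"))           # tuple -> handed to a range generator
    print(ast.literal_eval("[0.0001, 0.001]"))  # list  -> returned unchanged
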
108
+ def _generate_range(self, min_val, max_val, param_name):
109
+ """Generate parameter range based on parameter type."""
110
+ if param_name in ["nlayers"]:
111
+ return list(range(min_val, max_val + 1))
112
+ elif param_name in ["nnodes", "bs"]:
113
+ result = []
114
+ current = min_val
115
+ while current <= max_val:
116
+ result.append(current)
117
+ current *= 2
118
+ return result
119
+ elif param_name in ["lr"]:
120
+ # For learning rate, use logarithmic scale sampling (more practical)
121
+ # Generate 5-8 values on log scale between min and max
122
+ num_samples = min(8, max(5, int(np.log10(max_val / min_val) * 2)))
123
+ log_min = np.log10(min_val)
124
+ log_max = np.log10(max_val)
125
+ log_values = np.linspace(log_min, log_max, num_samples)
126
+ result = [round(10**log_val, 6) for log_val in log_values]
127
+ return result
128
+ elif param_name in ["do"]:
129
+ # For dropout, generate reasonable steps
130
+ num_steps = 5
131
+ step = (max_val - min_val) / num_steps
132
+ result = []
133
+ current = min_val
134
+ while current <= max_val + step / 2:
135
+ result.append(round(current, 2))
136
+ current += step
137
+ return result
138
+ else:
139
+ return list(range(min_val, max_val + 1))
140
+
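A short illustration of what these heuristics produce for the two-value (min, max) form (parameter names as used above, numbers chosen arbitrarily):

    import numpy as np

    # nlayers: inclusive integer range      (1, 3)        -> [1, 2, 3]
    # nnodes / bs: repeated doubling        (16, 128)     -> [16, 32, 64, 128]
    # lr: log-scale sampling, 5-8 points    (0.0001, 0.1) -> 6 values
    num = min(8, max(5, int(np.log10(0.1 / 0.0001) * 2)))          # 6 samples
    lr_values = [round(10 ** v, 6)
                 for v in np.linspace(np.log10(0.0001), np.log10(0.1), num)]
    print(lr_values)  # [0.0001, 0.000398, 0.001585, 0.00631, 0.025119, 0.1]
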
141
+ def _generate_range_with_step(self, min_val, max_val, step, param_name):
142
+ """Generate parameter range with explicit step."""
143
+ if (
144
+ isinstance(step, float)
145
+ or isinstance(min_val, float)
146
+ or isinstance(max_val, float)
147
+ ):
148
+ result = []
149
+ current = float(min_val)
150
+ step = float(step)
151
+ max_val = float(max_val)
152
+ while current <= max_val + step / 2:
153
+ result.append(round(current, 6)) # More precision for floats
154
+ current += step
155
+ return result
156
+ else:
157
+ return list(range(min_val, max_val + 1, step))
158
+
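For purely integer specs the explicit step is passed straight to range(), so for example (values are arbitrary):

    # K_val = (3, 11, 2) -> [3, 5, 7, 9, 11]; the maximum is included only
    # when the step lands on it exactly.
    print(list(range(3, 11 + 1, 2)))
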
159
+ def generate_param_combinations(self, param_specs):
160
+ """Generate all parameter combinations for grid search."""
161
+ param_names = list(param_specs.keys())
162
+ param_values = list(param_specs.values())
163
+
164
+ combinations = []
165
+ for combo in itertools.product(*param_values):
166
+ param_dict = dict(zip(param_names, combo))
167
+ combinations.append(param_dict)
168
+
169
+ return combinations
170
+
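In other words, the search dimensions are expanded into their full Cartesian product. A standalone sketch with two illustrative dimensions:

    import itertools

    param_specs = {"c_val": [0.1, 1.0], "kernel": ["linear", "rbf"]}  # example values
    combos = [dict(zip(param_specs, combo))
              for combo in itertools.product(*param_specs.values())]
    # 2 x 2 = 4 combinations:
    # [{'c_val': 0.1, 'kernel': 'linear'}, {'c_val': 0.1, 'kernel': 'rbf'},
    #  {'c_val': 1.0, 'kernel': 'linear'}, {'c_val': 1.0, 'kernel': 'rbf'}]
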
171
+ def run_optimization(self):
172
+ """Run hyperparameter optimization using the most appropriate method."""
173
+ param_specs = self.parse_optim_params()
174
+
175
+ self.util.debug(
176
+ f"Starting optimization using {self.search_strategy} strategy with {self.metric.upper()} metric, nkululeko version {VERSION}"
177
+ )
178
+
179
+ # Set comprehensive random state for reproducibility
180
+ self._set_comprehensive_random_state()
181
+
182
+ if not param_specs:
183
+ self.util.error("No optimization parameters found in [OPTIM] section")
184
+ return None, None, []
185
+
186
+ # Always use manual optimization to ensure consistent evaluation pipeline
187
+ # This prevents discrepancies between CV and final evaluation
188
+ self.util.debug("Using manual optimization for consistent evaluation pipeline")
189
+ return self._run_manual_optimization(param_specs)
190
+
191
+ def _run_manual_optimization(self, param_specs):
192
+ """Run manual grid search optimization with consistent evaluation pipeline."""
193
+ combinations = self.generate_param_combinations(param_specs)
194
+
195
+ if not combinations:
196
+ self.util.error("No parameter combinations generated")
197
+ return None, None, []
198
+
199
+ self.util.debug(
200
+ f"Starting manual optimization with {len(combinations)} parameter combinations"
201
+ )
202
+
203
+ # Check if we should use cross-validation or train-test split
204
+ use_cv = self.search_strategy in ["grid_cv", "random_cv"] or (
205
+ hasattr(self, "use_cv_in_manual") and self.use_cv_in_manual
206
+ )
207
+
208
+ if use_cv:
209
+ return self._run_manual_cv_optimization(combinations, param_specs)
210
+ else:
211
+ return self._run_manual_train_test_optimization(combinations)
212
+
213
+ def _run_manual_train_test_optimization(self, combinations):
214
+ """Run manual optimization using train-test split (matches final evaluation)."""
215
+ best_result = None
216
+ best_params = None
217
+ best_score = -float("inf") if self.util.high_is_good() else float("inf")
218
+
219
+ for i, params in enumerate(combinations):
220
+ self.util.debug(f"Testing combination {i+1}/{len(combinations)}: {params}")
221
+
222
+ self._update_config_with_params(params)
223
+
224
+ try:
225
+ result, last_epoch = self._run_single_experiment()
226
+ score = result # result.test is already a numeric value
227
+
228
+ result_entry = {
229
+ "params": params.copy(),
230
+ "score": score,
231
+ "epoch": last_epoch,
232
+ }
233
+ self.results.append(result_entry)
234
+
235
+ is_better = (self.util.high_is_good() and score > best_score) or (
236
+ not self.util.high_is_good() and score < best_score
237
+ )
238
+
239
+ if is_better:
240
+ best_score = score
241
+ best_result = score
242
+ best_params = params.copy()
243
+
244
+ self.util.debug(f"Score: {score} {self.metric.upper()}")
245
+
246
+ except Exception as e:
247
+ self.util.error(f"Failed with params {params}: {str(e)}")
248
+ # Log the full traceback for debugging
249
+ import traceback
250
+
251
+ self.util.debug(f"Full traceback: {traceback.format_exc()}")
252
+ continue
253
+
254
+ self.util.debug("Optimization complete!")
255
+ self.util.debug(f"Best parameters: {best_params}")
256
+ if best_result is not None:
257
+ self.util.debug(f"Best score: {best_result} {self.metric.upper()}")
258
+ self.util.debug("=" * 60)
259
+ self.util.debug("REPRODUCIBILITY INSTRUCTIONS:")
260
+ self.util.debug(
261
+ f"1. Set random_state = {self.random_state} in [MODEL], [FEATS], and [OPTIM] sections"
262
+ )
263
+ self.util.debug(
264
+ "2. Use the best parameters shown above in your [MODEL] section"
265
+ )
266
+ self.util.debug(
267
+ "3. Ensure identical data preprocessing and feature extraction"
268
+ )
269
+ self.util.debug("4. Use the same dependency versions and environment")
270
+ self.util.debug("=" * 60)
271
+ else:
272
+ self.util.debug("No successful optimization runs completed")
273
+
274
+ # Save results to file
275
+ self.save_results()
276
+
277
+ return best_params, best_result, self.results
278
+
279
+ def _run_manual_cv_optimization(self, combinations, param_specs):
280
+ """Run manual optimization using cross-validation."""
281
+ import numpy as np
282
+ from sklearn.model_selection import StratifiedKFold
283
+
284
+ self.util.debug(
285
+ "Using cross-validation for optimization (may differ from final evaluation)"
286
+ )
287
+
288
+ # Set up the experiment once to get the data
289
+ import nkululeko.experiment as exp
290
+
291
+ expr = exp.Experiment(self.config)
292
+ expr.set_module("optim")
293
+ expr.load_datasets()
294
+ expr.fill_train_and_tests()
295
+ expr.extract_feats()
296
+
297
+ # Create stratified CV splits
298
+ cv_splitter = StratifiedKFold(
299
+ n_splits=self.cv_folds, shuffle=True, random_state=self.random_state
300
+ )
301
+
302
+ best_result = None
303
+ best_params = None
304
+ best_score = -float("inf") if self.util.high_is_good() else float("inf")
305
+
306
+ for i, params in enumerate(combinations):
307
+ self.util.debug(f"Testing combination {i+1}/{len(combinations)}: {params}")
308
+
309
+ # Run cross-validation for this parameter combination
310
+ cv_scores = []
311
+
312
+ try:
313
+ for fold, (train_idx, val_idx) in enumerate(
314
+ cv_splitter.split(
315
+ expr.feats_train, expr.df_train[self.config["DATA"]["target"]]
316
+ )
317
+ ):
318
+ self.util.debug(f" Fold {fold+1}/{self.cv_folds}")
319
+
320
+ # Create fold-specific data
321
+ fold_train_feats = expr.feats_train.iloc[train_idx]
322
+ fold_val_feats = expr.feats_train.iloc[val_idx]
323
+ fold_train_df = expr.df_train.iloc[train_idx]
324
+ fold_val_df = expr.df_train.iloc[val_idx]
325
+
326
+ # Update config with current parameters
327
+ self._update_config_with_params(params)
328
+
329
+ # Run experiment on this fold
330
+ fold_score = self._run_cv_fold(
331
+ fold_train_feats,
332
+ fold_val_feats,
333
+ fold_train_df,
334
+ fold_val_df,
335
+ params,
336
+ )
337
+ cv_scores.append(fold_score)
338
+
339
+ # Calculate mean CV score
340
+ mean_score = np.mean(cv_scores)
341
+ std_score = np.std(cv_scores)
342
+
343
+ result_entry = {
344
+ "params": params.copy(),
345
+ "score": mean_score,
346
+ "cv_std": std_score,
347
+ "cv_scores": cv_scores,
348
+ "epoch": 0,
349
+ }
350
+ self.results.append(result_entry)
351
+
352
+ is_better = (self.util.high_is_good() and mean_score > best_score) or (
353
+ not self.util.high_is_good() and mean_score < best_score
354
+ )
355
+
356
+ if is_better:
357
+ best_score = mean_score
358
+ best_result = mean_score
359
+ best_params = params.copy()
360
+
361
+ self.util.debug(
362
+ f"CV Score: {mean_score:.4f} ± {std_score:.4f} {self.metric.upper()}"
363
+ )
364
+
365
+ except Exception as e:
366
+ self.util.error(f"Failed with params {params}: {str(e)}")
367
+ continue
368
+
369
+ self.util.debug("Cross-validation optimization complete!")
370
+ self.util.debug(f"Best parameters: {best_params}")
371
+ if best_result is not None:
372
+ self.util.debug(f"Best CV score: {best_result} {self.metric.upper()}")
373
+ else:
374
+ self.util.debug("No successful CV runs completed")
375
+
376
+ # Validate with final evaluation pipeline
377
+ if best_params and best_result is not None:
378
+ validation_score = self._validate_best_params_standard_eval(
379
+ best_params, expr
380
+ )
381
+ if validation_score is not None:
382
+ self.util.debug(
383
+ f"Cross-validation score: {best_result:.4f} {self.metric.upper()}"
384
+ )
385
+ self.util.debug(
386
+ f"Standard evaluation score: {validation_score:.4f} {self.metric.upper()}"
387
+ )
388
+ score_diff = abs(best_result - validation_score)
389
+ self.util.debug(f"Score difference: {score_diff:.4f}")
390
+
391
+ if score_diff > 0.1: # 10% difference threshold
392
+ self.util.debug(
393
+ "WARNING: Large discrepancy between CV and standard evaluation!"
394
+ )
395
+ self.util.debug(
396
+ "Consider using train-test optimization for more consistent results."
397
+ )
398
+
399
+ # Save results to file
400
+ self.save_results()
401
+
402
+ return best_params, best_result, self.results
403
+
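The fold loop above follows the standard stratified k-fold pattern: StratifiedKFold yields positional indices, rows are selected with .iloc, and the per-fold scores are reported as mean ± standard deviation. A minimal self-contained sketch of that pattern, with synthetic data standing in for the extracted features and a dummy score in place of a trained model:

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import StratifiedKFold

    feats = pd.DataFrame(np.random.rand(60, 4))   # stand-in for feats_train
    labels = pd.Series(["a", "b", "c"] * 20)      # stand-in for the target column

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in cv.split(feats, labels):
        train_feats, val_feats = feats.iloc[train_idx], feats.iloc[val_idx]
        scores.append(np.random.rand())           # dummy score; train/evaluate here

    print(f"{np.mean(scores):.4f} ± {np.std(scores):.4f}")
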
404
+ def _run_cv_fold(self, train_feats, val_feats, train_df, val_df, params):
405
+ """Run a single cross-validation fold."""
406
+ from nkululeko.modelrunner import Modelrunner
407
+
408
+ # Create a temporary runner for this fold
409
+ runner = Modelrunner(train_df, val_df, train_feats, val_feats, 0)
410
+ runner._select_model(self.model_type)
411
+
412
+ # Configure model with current parameters
413
+ if self.model_type == "mlp":
414
+ self._configure_mlp_model(runner.model, params)
415
+ else:
416
+ self._configure_traditional_model(runner.model, params)
417
+
418
+ # Train and evaluate
419
+ runner.model.train()
420
+ reports = runner.model.predict()
421
+
422
+ # Extract score based on metric
423
+ return self._extract_score_from_report(reports)
424
+
425
+ def _run_sklearn_optimization(self, param_specs):
426
+ """Run optimization using scikit-learn's hyperparameter search methods with consistent data handling."""
427
+ from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
428
+ StratifiedKFold)
429
+
430
+ # Import the actual experiment to get the model and data
431
+ import nkululeko.experiment as exp
432
+
433
+ # Set up the experiment
434
+ expr = exp.Experiment(self.config)
435
+ expr.set_module("optim")
436
+ expr.load_datasets()
437
+ expr.fill_train_and_tests()
438
+ expr.extract_feats()
439
+
440
+ # Apply the same balancing as the final evaluation
441
+ original_train_feats = expr.feats_train.copy()
442
+ original_train_df = expr.df_train.copy()
443
+
444
+ if "FEATS" in self.config and "balancing" in self.config["FEATS"]:
445
+ balancing_method = self.config["FEATS"]["balancing"]
446
+ if balancing_method and balancing_method.lower() != "none":
447
+ self.util.debug(
448
+ f"Applying {balancing_method} balancing for optimization consistency"
449
+ )
450
+ try:
451
+ from nkululeko.balance import DataBalancer
452
+
453
+ balancer = DataBalancer()
454
+ expr.feats_train, expr.df_train = balancer.balance_features(
455
+ expr.df_train,
456
+ expr.feats_train,
457
+ self.config["DATA"]["target"],
458
+ balancing_method,
459
+ )
460
+ self.util.debug(
461
+ f"Balanced training data: {len(expr.feats_train)} samples"
462
+ )
463
+ except Exception as e:
464
+ self.util.debug(f"Balancing failed: {e}, using original data")
465
+ expr.feats_train = original_train_feats
466
+ expr.df_train = original_train_df
467
+
468
+ # Get the base model without hyperparameter tuning
469
+ original_tuning_params = self.config.get(
470
+ "MODEL", "tuning_params", fallback=None
471
+ )
472
+ if "MODEL" not in self.config:
473
+ self.config.add_section("MODEL")
474
+
475
+ # Temporarily disable tuning_params to get base model
476
+ if original_tuning_params:
477
+ self.config.remove_option("MODEL", "tuning_params")
478
+
479
+ # Create a model instance using the modelrunner approach
480
+ from nkululeko.modelrunner import Modelrunner
481
+
482
+ runner = Modelrunner(
483
+ expr.df_train, expr.df_test, expr.feats_train, expr.feats_test, 0
484
+ )
485
+ runner._select_model(self.model_type)
486
+ base_clf = runner.model.clf
487
+
488
+ # Restore original tuning_params if it existed
489
+ if original_tuning_params:
490
+ self.config.set("MODEL", "tuning_params", original_tuning_params)
491
+
492
+ # Convert parameter specifications to sklearn format
493
+ sklearn_params = self._convert_to_sklearn_params(param_specs)
494
+
495
+ # Create stratified CV for consistent cross-validation
496
+ cv = StratifiedKFold(
497
+ n_splits=self.cv_folds, shuffle=True, random_state=self.random_state
498
+ )
499
+
500
+ # Choose search strategy
501
+ if self.search_strategy == "random":
502
+ search = RandomizedSearchCV(
503
+ base_clf,
504
+ sklearn_params,
505
+ n_iter=self.n_iter,
506
+ cv=cv, # Use stratified CV
507
+ scoring=self._get_scoring_metric(),
508
+ random_state=self.random_state,
509
+ n_jobs=-1,
510
+ verbose=1,
511
+ )
512
+ elif self.search_strategy == "halving_random":
513
+ try:
514
+ from sklearn.model_selection import HalvingRandomSearchCV
515
+
516
+ search = HalvingRandomSearchCV(
517
+ base_clf,
518
+ sklearn_params,
519
+ cv=cv, # Use stratified CV
520
+ scoring=self._get_scoring_metric(),
521
+ random_state=self.random_state,
522
+ n_jobs=-1,
523
+ verbose=1,
524
+ )
525
+ except ImportError:
526
+ self.util.debug(
527
+ "HalvingRandomSearchCV not available, falling back to RandomizedSearchCV"
528
+ )
529
+ search = RandomizedSearchCV(
530
+ base_clf,
531
+ sklearn_params,
532
+ n_iter=self.n_iter,
533
+ cv=cv, # Use stratified CV
534
+ scoring=self._get_scoring_metric(),
535
+ random_state=self.random_state,
536
+ n_jobs=-1,
537
+ verbose=1,
538
+ )
539
+ elif self.search_strategy == "halving_grid":
540
+ try:
541
+ from sklearn.model_selection import HalvingGridSearchCV
542
+
543
+ search = HalvingGridSearchCV(
544
+ base_clf,
545
+ sklearn_params,
546
+ cv=cv, # Use stratified CV
547
+ scoring=self._get_scoring_metric(),
548
+ random_state=self.random_state,
549
+ n_jobs=-1,
550
+ verbose=1,
551
+ )
552
+ except ImportError:
553
+ self.util.debug(
554
+ "HalvingGridSearchCV not available, falling back to GridSearchCV"
555
+ )
556
+ search = GridSearchCV(
557
+ base_clf,
558
+ sklearn_params,
559
+ cv=cv, # Use stratified CV
560
+ scoring=self._get_scoring_metric(),
561
+ n_jobs=-1,
562
+ verbose=1,
563
+ )
564
+ else: # grid search (default)
565
+ search = GridSearchCV(
566
+ base_clf,
567
+ sklearn_params,
568
+ cv=cv, # Use stratified CV
569
+ scoring=self._get_scoring_metric(),
570
+ n_jobs=-1,
571
+ verbose=1,
572
+ )
573
+
574
+ self.util.debug(
575
+ f"Starting {self.search_strategy} search with {len(sklearn_params)} parameters"
576
+ )
577
+ self.util.debug(f"Using stratified {self.cv_folds}-fold cross-validation")
578
+
579
+ # Fit the search
580
+ search.fit(expr.feats_train, expr.df_train[self.config["DATA"]["target"]])
581
+
582
+ # Extract results
583
+ best_params = search.best_params_
584
+ best_score = search.best_score_
585
+
586
+ # Convert results back to our format
587
+ all_results = []
588
+ for i, (params, score) in enumerate(
589
+ zip(search.cv_results_["params"], search.cv_results_["mean_test_score"])
590
+ ):
591
+ result_entry = {
592
+ "params": params,
593
+ "score": score,
594
+ "epoch": 0,
595
+ }
596
+ all_results.append(result_entry)
597
+
598
+ self.results = all_results
599
+
600
+ self.util.debug("Optimization complete!")
601
+ self.util.debug(f"Best parameters: {best_params}")
602
+ self.util.debug(f"Best score: {best_score} {self.metric.upper()}")
603
+
604
+ # Save results
605
+ self.save_results()
606
+
607
+ # Validate best parameters using standard nkululeko evaluation for consistency
608
+ validation_score = self._validate_best_params_standard_eval(best_params, expr)
609
+ if validation_score is not None:
610
+ self.util.debug(
611
+ f"Cross-validation score: {best_score:.4f} {self.metric.upper()}"
612
+ )
613
+ self.util.debug(
614
+ f"Standard evaluation score: {validation_score:.4f} {self.metric.upper()}"
615
+ )
616
+ score_diff = abs(best_score - validation_score)
617
+ self.util.debug(f"Score difference: {score_diff:.4f}")
618
+
619
+ if score_diff > 0.1: # 10% difference threshold
620
+ self.util.debug(
621
+ "WARNING: Large discrepancy between CV and standard evaluation!"
622
+ )
623
+ self.util.debug(
624
+ "This may indicate overfitting to CV folds or inconsistent data handling."
625
+ )
626
+ self.util.debug(
627
+ "Consider using manual optimization for more consistent results."
628
+ )
629
+
630
+ return best_params, best_score, all_results
631
+
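One caveat for the two halving strategies: in current scikit-learn releases HalvingGridSearchCV and HalvingRandomSearchCV are still marked experimental and only become importable after an explicit opt-in, so without the line below the ImportError fallbacks above are always taken:

    # Required opt-in before the halving estimators become importable
    # (scikit-learn >= 0.24).
    from sklearn.experimental import enable_halving_search_cv  # noqa: F401
    from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
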
632
+ def _convert_to_sklearn_params(self, param_specs):
633
+ """Convert our parameter specifications to sklearn format."""
634
+ # Parameter name mapping from nkululeko names to sklearn names
635
+ param_mapping = {
636
+ # SVM parameters
637
+ "C_val": "C", # SVM regularization parameter
638
+ "c_val": "C", # Alternative lowercase version
639
+ # KNN parameters
640
+ "K_val": "n_neighbors", # KNN number of neighbors
641
+ "k_val": "n_neighbors", # Alternative lowercase version
642
+ "KNN_weights": "weights", # KNN weights (uniform/distance)
643
+ "knn_weights": "weights", # Alternative lowercase version
644
+ }
645
+
646
+ sklearn_params = {}
647
+ for param_name, values in param_specs.items():
648
+ # Map parameter names to sklearn equivalents
649
+ sklearn_param_name = param_mapping.get(param_name, param_name)
650
+
651
+ if isinstance(values, list):
652
+ sklearn_params[sklearn_param_name] = values
653
+ else:
654
+ # Convert single values to lists
655
+ sklearn_params[sklearn_param_name] = [values]
656
+ return sklearn_params
657
+
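So a parsed spec written with nkululeko-style option names ends up using the keyword names the underlying scikit-learn estimator expects, for example:

    param_mapping = {"c_val": "C", "k_val": "n_neighbors", "knn_weights": "weights"}
    spec = {"c_val": [0.1, 1.0, 10.0], "kernel": ["linear", "rbf"]}  # example input
    print({param_mapping.get(k, k): v for k, v in spec.items()})
    # {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
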
658
+ def _get_scoring_metric(self):
659
+ """Get the appropriate scoring metric for sklearn optimization."""
660
+ # Create custom scorer for specificity if needed
661
+ if self.metric == "specificity":
662
+ from sklearn.metrics import make_scorer
663
+
664
+ def specificity_score(y_true, y_pred):
665
+ import numpy as np
666
+ from sklearn.metrics import confusion_matrix
667
+
668
+ cm = confusion_matrix(y_true, y_pred)
669
+ if cm.shape[0] == 2: # Binary classification
670
+ tn = cm[0, 0]
671
+ fp = cm[0, 1]
672
+ return tn / (tn + fp) if (tn + fp) > 0 else 0.0
673
+ else: # Multi-class: average specificity
674
+ specificities = []
675
+ for i in range(cm.shape[0]):
676
+ tn = np.sum(cm) - (
677
+ np.sum(cm[i, :]) + np.sum(cm[:, i]) - cm[i, i]
678
+ )
679
+ fp = np.sum(cm[:, i]) - cm[i, i]
680
+ specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
681
+ specificities.append(specificity)
682
+ return np.mean(specificities)
683
+
684
+ return make_scorer(specificity_score)
685
+
686
+ # Standard scikit-learn metrics
687
+ metric_map = {
688
+ "uar": "balanced_accuracy", # Unweighted Average Recall
689
+ "accuracy": "accuracy", # Standard accuracy
690
+ "f1": "f1_macro", # Macro-averaged F1
691
+ "precision": "precision_macro", # Macro-averaged precision
692
+ "recall": "recall_macro", # Macro-averaged recall
693
+ "sensitivity": "recall_macro", # Sensitivity = recall
694
+ }
695
+
696
+ if self.util.exp_is_classification():
697
+ return metric_map.get(self.metric or "accuracy", "accuracy")
698
+ else:
699
+ # For regression tasks
700
+ if self.metric in [
701
+ "accuracy",
702
+ "uar",
703
+ "f1",
704
+ "precision",
705
+ "recall",
706
+ "sensitivity",
707
+ "specificity",
708
+ ]:
709
+ self.util.debug(
710
+ f"Warning: {self.metric} is not suitable for regression, using RMSE"
711
+ )
712
+ return "neg_root_mean_squared_error"
713
+
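A small worked example of the specificity computed above, on a binary confusion matrix with arbitrary counts:

    from sklearn.metrics import confusion_matrix

    y_true = [0, 0, 0, 0, 1, 1, 1, 1]
    y_pred = [0, 0, 0, 1, 1, 1, 0, 1]
    cm = confusion_matrix(y_true, y_pred)   # [[3, 1], [1, 3]]
    tn, fp = cm[0, 0], cm[0, 1]
    print(tn / (tn + fp))                   # specificity = 3 / 4 = 0.75
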
714
+ def _update_config_with_params(self, params):
715
+ """Update configuration with current parameter set."""
716
+ self._ensure_model_section()
717
+
718
+ if self.model_type == "mlp":
719
+ self._update_mlp_params(params)
720
+ else:
721
+ self._update_traditional_ml_params(params)
722
+
723
+ # Ensure random state is propagated to all components for reproducibility
724
+ self._ensure_global_random_state()
725
+
726
+ def _ensure_model_section(self):
727
+ """Ensure MODEL section exists with basic configuration."""
728
+ if "MODEL" not in self.config:
729
+ self.config.add_section("MODEL")
730
+
731
+ if "type" not in self.config["MODEL"]:
732
+ self.config["MODEL"]["type"] = self.model_type
733
+
734
+ def _update_mlp_params(self, params):
735
+ """Update MLP-specific parameters."""
736
+ if "nlayers" in params and "nnodes" in params:
737
+ nlayers = params["nlayers"]
738
+ nnodes = params["nnodes"]
739
+ layers = {f"l{i+1}": nnodes for i in range(nlayers)}
740
+ self.config["MODEL"]["layers"] = str(layers)
741
+
742
+ if "lr" in params:
743
+ self.config["MODEL"]["learning_rate"] = str(params["lr"])
744
+
745
+ if "bs" in params:
746
+ self.config["MODEL"]["batch_size"] = str(params["bs"])
747
+
748
+ if "do" in params:
749
+ self.config["MODEL"]["drop"] = str(params["do"])
750
+
751
+ if "loss" in params:
752
+ self.config["MODEL"]["loss"] = params["loss"]
753
+
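For example, searching over nlayers and nnodes produces a uniform-width layer specification that is written into [MODEL] as a string (hypothetical values):

    nlayers, nnodes = 2, 64
    layers = {f"l{i + 1}": nnodes for i in range(nlayers)}
    print(str(layers))   # "{'l1': 64, 'l2': 64}"
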
754
+ def _update_traditional_ml_params(self, params):
755
+ """Update traditional ML parameters using tuning_params approach."""
756
+ # For optimization, we set the specific parameter values directly
757
+ # rather than using the tuning mechanism
758
+ for param_name, param_value in params.items():
759
+ self.config["MODEL"][param_name] = str(param_value)
760
+
761
+ # Always add random_state to model configuration for consistency and reproducibility
762
+ self.config["MODEL"]["random_state"] = str(self.random_state)
763
+
764
+ # For XGBoost specifically, also set additional reproducibility parameters
765
+ if self.model_type in ["xgb", "xgr"]:
766
+ # Ensure deterministic behavior
767
+ self.config["MODEL"]["n_jobs"] = (
768
+ "1" # Force single-threaded for reproducibility
769
+ )
770
+ if "tree_method" not in params:
771
+ self.config["MODEL"]["tree_method"] = (
772
+ "exact" # Deterministic tree construction
773
+ )
774
+
775
+ def _run_single_experiment(self):
776
+ """Run a single experiment with current configuration."""
777
+ import nkululeko.experiment as exp
778
+
779
+ if "MODEL" not in self.config:
780
+ self.config.add_section("MODEL")
781
+ if "type" not in self.config["MODEL"]:
782
+ self.config["MODEL"]["type"] = self.model_type
783
+
784
+ # Ensure random state is set for all components before creating experiment
785
+ self._ensure_global_random_state()
786
+
787
+ expr = exp.Experiment(self.config)
788
+ expr.set_module("optim")
789
+
790
+ expr.load_datasets()
791
+
792
+ expr.fill_train_and_tests()
793
+
794
+ expr.extract_feats()
795
+
796
+ expr.init_runmanager()
797
+
798
+ reports, last_epochs = expr.run()
799
+ result = expr.get_best_report(reports).result.test
800
+
801
+ return result, int(min(last_epochs))
802
+
803
+ def save_results(self, filepath=None):
804
+ """Save optimization results to CSV file with reproducibility information."""
805
+ if not self.results:
806
+ self.util.debug("No results to save")
807
+ return
808
+
809
+ if filepath is None:
810
+ # Save in the results directory instead of current directory
811
+ results_dir = self.util.get_path("res_dir")
812
+ filepath = os.path.join(
813
+ results_dir, f"optimization_results_{self.model_type}.csv"
814
+ )
815
+
816
+ import csv
817
+
818
+ try:
819
+ with open(filepath, "w", newline="") as csvfile:
820
+ # Get all unique parameter names from all results
821
+ param_names = set()
822
+ for result in self.results:
823
+ param_names.update(result["params"].keys())
824
+ param_names = sorted(list(param_names))
825
+
826
+ # Add reproducibility information to field names
827
+ fieldnames = param_names + [
828
+ "score",
829
+ "epoch",
830
+ "random_state",
831
+ "search_strategy",
832
+ "metric",
833
+ ]
834
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
835
+
836
+ writer.writeheader()
837
+ # Note: 'score' contains the metric value (e.g., UAR, accuracy, etc.)
838
+ for result in self.results:
839
+ row = result["params"].copy()
840
+ row["score"] = result["score"]
841
+ row["epoch"] = result["epoch"]
842
+ # Add reproducibility information
843
+ row["random_state"] = self.random_state
844
+ row["search_strategy"] = self.search_strategy
845
+ row["metric"] = self.metric
846
+ writer.writerow(row)
847
+
848
+ self.util.debug(f"Optimization results saved to {filepath}")
849
+
850
+ # Save a separate reproducibility guide
851
+ self._save_reproducibility_guide(filepath)
852
+
853
+ except Exception as e:
854
+ self.util.error(f"Failed to save results: {e}")
855
+
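The resulting CSV holds one row per tested combination: the (sorted) parameter columns, then score, epoch, random_state, search_strategy and metric. A quick way to inspect it afterwards, assuming pandas is available (it is not required by this module) and using a hypothetical file name:

    import pandas as pd

    results = pd.read_csv("optimization_results_svm.csv")  # hypothetical path
    # Sort best-first; use ascending=True for metrics where lower is better.
    print(results.sort_values("score", ascending=False).head())
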
856
+ def _save_reproducibility_guide(self, optimization_filepath):
857
+ """Save a detailed reproducibility guide with instructions."""
858
+ guide_filepath = optimization_filepath.replace(
859
+ ".csv", "_reproducibility_guide.txt"
860
+ )
861
+
862
+ best_result = self.get_best_params()
863
+ if not best_result:
864
+ return
865
+
866
+ try:
867
+ with open(guide_filepath, "w") as f:
868
+ f.write("NKULULEKO OPTIMIZATION REPRODUCIBILITY GUIDE\n")
869
+ f.write("=" * 50 + "\n\n")
870
+
871
+ f.write("Optimization Details:\n")
872
+ f.write(f"- Model type: {self.model_type}\n")
873
+ f.write(f"- Search strategy: {self.search_strategy}\n")
874
+ f.write(f"- Metric optimized: {self.metric}\n")
875
+ f.write(f"- Random state used: {self.random_state}\n")
876
+ f.write(f"- Total combinations tested: {len(self.results)}\n\n")
877
+
878
+ f.write("Best Parameters Found:\n")
879
+ for param, value in best_result["params"].items():
880
+ f.write(f"- {param}: {value}\n")
881
+ f.write(
882
+ f"- Best score: {best_result['score']:.6f} {self.metric.upper()}\n\n"
883
+ )
884
+
885
+ f.write("To reproduce these results:\n")
886
+ f.write("1. Use the exact same random_state in your configuration:\n")
887
+ f.write(" [MODEL]\n")
888
+ f.write(f" random_state = {self.random_state}\n\n")
889
+ f.write(" [FEATS]\n")
890
+ f.write(f" balancing_random_state = {self.random_state}\n\n")
891
+ f.write(" [OPTIM]\n")
892
+ f.write(f" random_state = {self.random_state}\n\n")
893
+
894
+ f.write("2. Set the best parameters in your [MODEL] section:\n")
895
+ for param, value in best_result["params"].items():
896
+ f.write(f" {param} = {value}\n")
897
+ f.write("\n")
898
+
899
+ f.write(
900
+ "3. Run the experiment with the same data and feature configuration.\n\n"
901
+ )
902
+
903
+ f.write("Important notes for reproducibility:\n")
904
+ f.write("- Use the same versions of all dependencies\n")
905
+ f.write("- Use the same data files and preprocessing\n")
906
+ f.write("- Ensure consistent environment (CPU/GPU, OS)\n")
907
+ f.write("- Set random seeds before any random operations\n")
908
+
909
+ self.util.debug(f"Reproducibility guide saved to {guide_filepath}")
910
+
911
+ except Exception as e:
912
+ self.util.debug(f"Failed to save reproducibility guide: {e}")
913
+
914
+ def get_best_params(self):
915
+ """Get the best parameters found during optimization."""
916
+ if not self.results:
917
+ return None
918
+
919
+ best_result = None
920
+ best_score = -float("inf") if self.util.high_is_good() else float("inf")
921
+
922
+ for result in self.results:
923
+ score = result["score"]
924
+ is_better = (self.util.high_is_good() and score > best_score) or (
925
+ not self.util.high_is_good() and score < best_score
926
+ )
927
+ if is_better:
928
+ best_score = score
929
+ best_result = result
930
+
931
+ return best_result
932
+
933
+ def get_recommended_ranges(self, param_name):
934
+ """Get recommended parameter ranges for common hyperparameters."""
935
+ recommendations = {
936
+ "lr": [0.0001, 0.001, 0.01, 0.1], # Log-scale discrete values
937
+ "do": [0.1, 0.3, 0.5, 0.7], # Common dropout rates
938
+ "C_val": [0.1, 1.0, 10.0, 100.0], # SVM regularization
939
+ "c_val": [0.1, 1.0, 10.0, 100.0], # SVM regularization (alternative)
940
+ "K_val": [3, 5, 7, 9, 11], # KNN neighbors
941
+ "k_val": [3, 5, 7, 9, 11], # KNN neighbors (alternative)
942
+ "KNN_weights": ["uniform", "distance"], # KNN weights
943
+ "knn_weights": ["uniform", "distance"], # KNN weights (alternative)
944
+ "n_estimators": [50, 100, 200], # XGB trees
945
+ "max_depth": [3, 6, 9, 12], # Tree depth
946
+ "subsample": [0.6, 0.8, 1.0], # XGB subsample
947
+ "learning_rate": [0.01, 0.1, 0.3], # XGB learning rate
948
+ }
949
+ return recommendations.get(param_name, None)
950
+
951
+ def _validate_best_params_standard_eval(self, best_params, expr):
952
+ """Validate the best parameters using standard nkululeko train-test evaluation."""
953
+ try:
954
+ # Set the model parameters to the best found values
955
+ self._update_config_with_params(best_params)
956
+
957
+ # Run a single experiment with these parameters using the standard approach
958
+ result, _ = self._run_single_experiment()
959
+
960
+ return result
961
+ except Exception as e:
962
+ self.util.debug(f"Standard validation failed: {e}")
963
+ return None
964
+
965
+ def _configure_mlp_model(self, model, params):
966
+ """Configure MLP model with current parameters."""
967
+ # Set MLP-specific parameters
968
+ if hasattr(model, "clf") and hasattr(model.clf, "set_params"):
969
+ model_params = {}
970
+
971
+ # Map optimization parameters to model parameters
972
+ if "lr" in params:
973
+ model_params["learning_rate"] = params["lr"]
974
+ if "do" in params:
975
+ model_params["dropout"] = params["do"]
976
+ if "bs" in params:
977
+ model_params["batch_size"] = params["bs"]
978
+
979
+ model.clf.set_params(**model_params)
980
+
981
+ def _configure_traditional_model(self, model, params):
982
+ """Configure traditional ML model with current parameters."""
983
+ if hasattr(model, "clf") and hasattr(model.clf, "set_params"):
984
+ # Map parameter names for different models
985
+ param_mapping = {
986
+ "C_val": "C",
987
+ "c_val": "C",
988
+ "K_val": "n_neighbors",
989
+ "k_val": "n_neighbors",
990
+ "KNN_weights": "weights",
991
+ "knn_weights": "weights",
992
+ }
993
+
994
+ model_params = {}
995
+ for param_name, param_value in params.items():
996
+ sklearn_param = param_mapping.get(param_name, param_name)
997
+ model_params[sklearn_param] = param_value
998
+
999
+ model.clf.set_params(**model_params)
1000
+
1001
+ def _extract_score_from_report(self, reports):
1002
+ """Extract score from model prediction reports."""
1003
+ # This is a simplified version - you may need to adapt based on your report structure
1004
+ if isinstance(reports, dict):
1005
+ # Try to extract the metric we're optimizing for
1006
+ if self.metric in reports:
1007
+ return reports[self.metric]
1008
+ elif "test" in reports:
1009
+ return reports["test"]
1010
+ else:
1011
+ # Return the first numeric value found
1012
+ for key, value in reports.items():
1013
+ if isinstance(value, (int, float)):
1014
+ return value
1015
+ elif isinstance(reports, (int, float)):
1016
+ return reports
1017
+ else:
1018
+ # Fallback: assume it's a list and take the first element
1019
+ try:
1020
+ return reports[0] if hasattr(reports, "__getitem__") else 0.0
1021
+ except (IndexError, TypeError):
1022
+ return 0.0
1023
+
1024
+ def _ensure_global_random_state(self):
1025
+ """Ensure the random state is properly propagated to all components for reproducibility."""
1026
+ # Set global random state for the optimization module
1027
+ if "OPTIM" not in self.config:
1028
+ self.config.add_section("OPTIM")
1029
+ self.config["OPTIM"]["random_state"] = str(self.random_state)
1030
+
1031
+ # Set random state for model
1032
+ if "MODEL" not in self.config:
1033
+ self.config.add_section("MODEL")
1034
+ self.config["MODEL"]["random_state"] = str(self.random_state)
1035
+
1036
+ # Set random state for feature balancing
1037
+ if "FEATS" not in self.config:
1038
+ self.config.add_section("FEATS")
1039
+ self.config["FEATS"]["balancing_random_state"] = str(self.random_state)
1040
+
1041
+ # Set random state for experiment
1042
+ if "EXP" not in self.config:
1043
+ self.config.add_section("EXP")
1044
+ self.config["EXP"]["random_state"] = str(self.random_state)
1045
+
1046
+ def _set_comprehensive_random_state(self):
1047
+ """Set comprehensive random state across all libraries and components for full reproducibility."""
1048
+ import os
1049
+ import random
1050
+
1051
+ import numpy as np
1052
+
1053
+ # Set Python's built-in random seed
1054
+ random.seed(self.random_state)
1055
+
1056
+ # Set NumPy random seed
1057
+ np.random.seed(self.random_state)
1058
+
1059
+ # Set environment variables for additional reproducibility
1060
+ os.environ["PYTHONHASHSEED"] = str(self.random_state)
1061
+
1062
+ # Set TensorFlow/Keras random seed if available
1063
+ try:
1064
+ import tensorflow as tf
1065
+
1066
+ tf.random.set_seed(self.random_state)
1067
+ except ImportError:
1068
+ pass
1069
+
1070
+ # Set PyTorch random seed if available
1071
+ try:
1072
+ import torch
1073
+
1074
+ torch.manual_seed(self.random_state)
1075
+ if torch.cuda.is_available():
1076
+ torch.cuda.manual_seed(self.random_state)
1077
+ torch.cuda.manual_seed_all(self.random_state)
1078
+ torch.backends.cudnn.deterministic = True
1079
+ torch.backends.cudnn.benchmark = False
1080
+ except ImportError:
1081
+ pass
1082
+
1083
+ # Set XGBoost specific random state
1084
+ try:
1085
+ # XGBoost random state will be handled in model configuration
1086
+ pass
1087
+ except ImportError:
1088
+ pass
1089
+
1090
+ self.util.debug(
1091
+ f"Set comprehensive random state: {self.random_state} for full reproducibility"
1092
+ )
1093
+
1094
+ def _display_reproduction_config(self, best_params, best_result):
1095
+ """Display complete configuration needed for result reproduction."""
1096
+ self.util.debug("")
1097
+ self.util.debug("COMPLETE CONFIGURATION FOR REPRODUCTION:")
1098
+ self.util.debug("=" * 50)
1099
+ self.util.debug("[MODEL]")
1100
+ self.util.debug("type = " + str(self.model_type))
1101
+ self.util.debug("random_state = " + str(self.random_state))
1102
+ for param, value in best_params.items():
1103
+ self.util.debug(f"{param} = {value}")
1104
+
1105
+ self.util.debug("")
1106
+ self.util.debug("[FEATS]")
1107
+ if "FEATS" in self.config:
1108
+ for key, value in self.config["FEATS"].items():
1109
+ self.util.debug(f"{key} = {value}")
1110
+ self.util.debug("balancing_random_state = " + str(self.random_state))
1111
+
1112
+ self.util.debug("")
1113
+ self.util.debug("[OPTIM]")
1114
+ self.util.debug("random_state = " + str(self.random_state))
1115
+
1116
+ self.util.debug("")
1117
+ self.util.debug(f"Expected result: {best_result:.6f} {self.metric.upper()}")
1118
+ self.util.debug("=" * 50)
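
Taken together, a minimal programmatic use of the class added in this release might look like the sketch below. The import path and the configuration file name are assumptions (the diff does not show where the file lives inside the package); in practice the class would normally be driven by nkululeko's command-line tooling rather than instantiated directly.

    import configparser

    from nkululeko.optim import OptimizationRunner  # hypothetical import path

    config = configparser.ConfigParser()
    config.read("my_experiment.ini")  # hypothetical config with an [OPTIM] section

    runner = OptimizationRunner(config)
    best_params, best_score, all_results = runner.run_optimization()
    print(best_params, best_score)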