lecrapaud-0.19.3-py3-none-any.whl → lecrapaud-0.20.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -15,7 +15,7 @@ from pydantic import BaseModel
  import ast

  # ML models
- from sklearn.model_selection import TimeSeriesSplit
+ from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
  from sklearn.calibration import CalibratedClassifierCV
  from sklearn.metrics import (
  mean_absolute_percentage_error,
@@ -63,23 +63,38 @@ from ray.tune.logger import TBXLoggerCallback
  from ray.tune.schedulers import ASHAScheduler
  from ray.air import session

+ # HyperOpt standalone
+ from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
+
  # Internal library
  from lecrapaud.search_space import all_models
  from lecrapaud.directories import clean_directory
  from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
- from lecrapaud.config import PYTHON_ENV
+ from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
  from lecrapaud.feature_selection import load_train_data
  from lecrapaud.db import (
  Model,
  ModelSelection,
- ModelTraining,
- Score,
+ ModelSelectionScore,
  Target,
  Experiment,
  )

  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

+ # Suppress XGBoost and LightGBM logging
+ import logging
+
+ logging.getLogger("lightgbm").setLevel(logging.ERROR)
+ logging.getLogger("xgboost").setLevel(logging.ERROR)
+
+ # Set global verbosity for XGBoost
+ xgb.set_config(verbosity=0)
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
  # Reproducible result
  keras.utils.set_random_seed(42)
  np.random.seed(42)
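
The new LECRAPAUD_OPTIMIZATION_BACKEND value imported above decides which tuning backend hyperoptimize dispatches to (see the hunk further down). How lecrapaud.config resolves that value is not part of this diff; the environment-variable mechanism and the "ray" fallback in the sketch below are assumptions for illustration only.

# Sketch (assumption): the config value likely comes from the environment; only
# the "hyperopt" comparison is visible in this diff, "ray" as default is a guess.
import os

LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "ray")

if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
    print("tuning will run through HyperOpt standalone (fmin/TPE, Celery-friendly)")
else:
    print("tuning will run through Ray Tune")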
@@ -410,6 +425,7 @@ class ModelEngine:
  "metric": eval_metric,
  "num_class": num_class,
  "verbose": -1,
+ "verbose_eval": False,
  },
  num_boost_round=params["num_boost_round"],
  train_set=train_data,
@@ -421,6 +437,7 @@ class ModelEngine:
  ),
  lgb.record_evaluation(evals_result),
  tensorboard_callback,
+ lgb.log_evaluation(period=0), # Disable evaluation logging
  ],
  )
  else:
@@ -462,7 +479,7 @@ class ModelEngine:
  if self.target_type == "regression"
  else ("logloss" if num_class <= 2 else "mlogloss")
  )
- xgb.set_config(verbosity=0)
+ # XGBoost verbosity already set globally
  model = xgb.train(
  params={
  **params["model_params"],
@@ -477,11 +494,11 @@ class ModelEngine:
  xgb.callback.EarlyStopping(
  rounds=params["early_stopping_rounds"], save_best=True
  ),
- xgb.callback.EvaluationMonitor(), # This shows evaluation results at each iteration
+ # Removed EvaluationMonitor to suppress logs
  tensorboard_callback,
  ],
  evals_result=evals_result, # Record evaluation result
- verbose_eval=10000,
+ verbose_eval=False, # Disable evaluation logging
  )

  model.model_name = self.create_model
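
The hunks above all serve the same purpose: silencing per-iteration evaluation output from LightGBM and XGBoost. A minimal standalone sketch of that pattern follows; the data and parameters are placeholders, not lecrapaud's own training setup.

# Minimal sketch of the log-silencing pattern, assuming lightgbm and xgboost are installed.
import numpy as np
import lightgbm as lgb
import xgboost as xgb

X, y = np.random.rand(200, 5), np.random.rand(200)

xgb.set_config(verbosity=0)  # global XGBoost verbosity, as in the module header

train_data = lgb.Dataset(X, label=y)
lgb_model = lgb.train(
    {"objective": "regression", "verbose": -1},
    train_data,
    num_boost_round=20,
    valid_sets=[train_data],
    callbacks=[lgb.log_evaluation(period=0)],  # period=0 disables eval logging
)

xgb_model = xgb.train(
    {"objective": "reg:squarederror"},
    xgb.DMatrix(X, label=y),
    num_boost_round=20,
    evals=[(xgb.DMatrix(X, label=y), "train")],
    verbose_eval=False,  # replaces the old verbose_eval=10000
)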
@@ -746,6 +763,171 @@ class ModelEngine:
  )


+ def trainable_cv(
+ params,
+ x_train,
+ y_train,
+ x_val,
+ y_val,
+ model_name,
+ target_type,
+ experiment_name,
+ target_number,
+ create_model,
+ n_splits=3,
+ plot=False,
+ log_dir=None,
+ target_clf_thresholds: dict = None,
+ time_series=True,
+ recurrent=False,
+ ):
+ """Cross-validation version of trainable for hyperopt.
+
+ Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
+ Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
+ """
+ # Combine train and validation data for cross-validation
+ if recurrent:
+ x_train_val = np.concatenate([x_train, x_val], axis=0)
+ y_train_val = np.concatenate([y_train, y_val], axis=0)
+ else:
+ x_train_val = pd.concat([x_train, x_val], axis=0)
+ y_train_val = pd.concat([y_train, y_val], axis=0)
+ # Store original index for later use if needed
+ original_index = x_train_val.index.copy()
+ # Reset index for proper iloc indexing with CV splits
+ x_train_val = x_train_val.reset_index(drop=True)
+ y_train_val = y_train_val.reset_index(drop=True)
+
+ # Choose appropriate cross-validation splitter
+ if time_series:
+ # Time series split for temporal data
+ n_samples = len(x_train_val)
+ test_size = int(n_samples / (n_splits + 1)) # Ensure reasonable test size
+ cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+ else:
+ # Stratified or regular K-fold for i.i.d. data
+ if target_type == "classification":
+ cv_splitter = StratifiedKFold(
+ n_splits=n_splits, shuffle=True, random_state=42
+ )
+ else:
+ cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+ # Store all predictions and true values for pooled metrics
+ all_predictions = []
+ all_y_true = []
+ fold_times = []
+
+ # Get splits based on the CV strategy
+ if time_series or target_type == "regression":
+ splits = cv_splitter.split(x_train_val)
+ else:
+ # For stratified split, we need to pass y
+ if recurrent:
+ # Extract the target from the 2D array (first column is target)
+ y_for_split = y_train_val[:, 0]
+ else:
+ y_for_split = y_train_val
+ splits = cv_splitter.split(x_train_val, y_for_split)
+
+ for fold_idx, (train_idx, val_idx) in enumerate(splits):
+ # Extract fold data
+ if recurrent:
+ x_fold_train = x_train_val[train_idx]
+ y_fold_train = y_train_val[train_idx]
+ x_fold_val = x_train_val[val_idx]
+ y_fold_val = y_train_val[val_idx]
+ else:
+ x_fold_train = x_train_val.iloc[train_idx]
+ y_fold_train = y_train_val.iloc[train_idx]
+ x_fold_val = x_train_val.iloc[val_idx]
+ y_fold_val = y_train_val.iloc[val_idx]
+
+ # Train model for this fold
+ model = ModelEngine(
+ model_name=model_name,
+ target_type=target_type,
+ target_number=target_number,
+ create_model=create_model,
+ plot=False, # Disable individual fold plots
+ log_dir=log_dir,
+ )
+
+ if recurrent:
+ timesteps = params["timesteps"]
+ x_fold_train = x_fold_train[:, -timesteps:, :]
+ x_fold_val = x_fold_val[:, -timesteps:, :]
+
+ # Fit model
+ model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
+
+ # Get predictions
+ y_pred = model.predict(x_fold_val)
+
+ # Handle recurrent model indexing
+ if recurrent:
+ y_fold_val = pd.DataFrame(
+ y_fold_val, columns=["TARGET", "index"]
+ ).set_index("index")
+ y_pred.index = y_fold_val.index
+
+ # Store predictions and true values
+ all_predictions.append(y_pred)
+ all_y_true.append(y_fold_val)
+
+ # Concatenate all fold predictions
+ if target_type == "classification":
+ # For classification, we need to handle probability columns
+ all_pred_df = pd.concat(all_predictions, axis=0)
+ all_y_series = pd.concat(all_y_true, axis=0)
+ # Ensure we have a DataFrame with TARGET column
+ if isinstance(all_y_series, pd.Series):
+ all_y_df = pd.DataFrame({"TARGET": all_y_series})
+ else:
+ all_y_df = all_y_series
+ else:
+ # For regression, just concatenate the predictions
+ all_pred_series = pd.concat(all_predictions, axis=0)
+ all_y_series = pd.concat(all_y_true, axis=0)
+ all_pred_df = pd.DataFrame({"PRED": all_pred_series})
+ all_y_df = pd.DataFrame({"TARGET": all_y_series})
+
+ # Create combined prediction DataFrame
+ prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
+
+ # Calculate pooled metrics
+ score = {
+ "DATE": datetime.now(),
+ "MODEL_NAME": model_name,
+ "EVAL_DATA_STD": prediction["TARGET"].std(),
+ }
+
+ # Unscale if needed (for regression with scaling)
+ if (
+ model.need_scaling
+ and target_type == "regression"
+ and model.scaler_y is not None
+ ):
+ prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+ prediction[["TARGET"]].values
+ )
+ prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+ prediction[["PRED"]].values
+ )
+
+ # Evaluate with pooled predictions
+ score.update(evaluate(prediction, target_type, target_clf_thresholds))
+
+ metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+ logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
+
+ # Report to Ray if in Ray context
+ if session.get_session():
+ session.report(metrics=score)
+ return score
+
+
  def trainable(
  params,
  x_train,
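
The key design choice in trainable_cv is to score the concatenation of all out-of-fold predictions rather than average per-fold scores. Below is a self-contained illustration of that pooled-metric idea using plain scikit-learn objects, independent of lecrapaud's ModelEngine; the estimator and dataset are arbitrary stand-ins.

# Pooled cross-validation metric: one logloss over all out-of-fold predictions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=300, random_state=42)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

pooled_proba, pooled_true = [], []
for train_idx, val_idx in cv.split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    pooled_proba.append(clf.predict_proba(X[val_idx])[:, 1])
    pooled_true.append(y[val_idx])

# Single metric computed on the concatenated predictions, as in trainable_cv
pooled_logloss = log_loss(np.concatenate(pooled_true), np.concatenate(pooled_proba))
print(f"CV pooled LOGLOSS: {pooled_logloss:.4f}")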
@@ -757,7 +939,6 @@ def trainable(
  experiment_name,
  target_number,
  create_model,
- type_name="hyperopts",
  plot=False,
  log_dir=None,
  target_clf_thresholds: dict = None,
@@ -783,9 +964,7 @@
  x_val = x_val[:, -timesteps:, :]

  # Compile and fit model on train set
- start = time.time()
  model.fit(x_train, y_train, x_val, y_val, params)
- stop = time.time()

  # Prediction on val set
  y_pred = model.predict(x_val)
@@ -815,8 +994,6 @@
  score = {
  "DATE": datetime.now(),
  "MODEL_NAME": model.model_name,
- "TYPE": type_name,
- "TRAINING_TIME": stop - start,
  "EVAL_DATA_STD": prediction["TARGET"].std(),
  }

@@ -825,7 +1002,8 @@
  metric = "RMSE" if target_type == "regression" else "LOGLOSS"
  logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")

- if type_name == "hyperopts":
+ # Report to Ray if in Ray context
+ if session.get_session():
  session.report(metrics=score)
  return score

@@ -883,7 +1061,7 @@ class ModelSelectionEngine:
  experiment_name,
  perform_hyperopt=True,
  number_of_trials=20,
- perform_crossval=False,
+ perform_crossval=False, # This controls CV during hyperopt, not after
  plot=True,
  clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
  preserve_model=True,
@@ -896,6 +1074,7 @@ class ModelSelectionEngine:
  self.experiment_name = experiment_name
  self.plot = plot
  self.number_of_trials = number_of_trials
+ self.perform_crossval = perform_crossval

  if self.experiment_id is None:
  raise ValueError("Please provide a experiment.")
@@ -945,12 +1124,11 @@ class ModelSelectionEngine:
  # create model selection in db
  target = Target.find_by(name=f"TARGET_{self.target_number}")
  model_selection = ModelSelection.upsert(
- match_fields=["target_id", "experiment_id"],
  target_id=target.id,
  experiment_id=self.experiment_id,
  )

- # recurrent models starts at 9 # len(list_models)
+ # STEP 1 : TRAINING MODELS
  for i in self.models_idx:
  config = all_models[i]
  recurrent = config["recurrent"]
@@ -968,19 +1146,11 @@ class ModelSelectionEngine:
  elif perform_hyperopt:
  clean_directory(self.results_dir)

- logger.info(f"Training a {model_name}")
- model = Model.upsert(
- match_fields=["name", "type"],
- name=model_name,
- type=self.target_type,
- )
- model_training = ModelTraining.upsert(
- match_fields=["model_id", "model_selection_id"],
- model_id=model.id,
- model_selection_id=model_selection.id,
+ logger.info(
+ f"{experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
  )

- # getting data
+ # Getting data
  if recurrent:
  # Clear cluster from previous Keras session graphs.
  K.clear_session()
@@ -990,7 +1160,7 @@ class ModelSelectionEngine:
  for i, e in enumerate(self.all_features)
  if e in set(self.features)
  ]
- # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
+ # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
  x_train = x_train_reshaped[:, :, features_idx]
  y_train = y_train_reshaped[:, [self.target_number, 0]]
  x_val = x_val_reshaped[:, :, features_idx]
@@ -1020,7 +1190,8 @@ class ModelSelectionEngine:
  y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")

  log_dir = get_log_dir(self.target_dir, model_name)
- # instantiate model
+
+ # Instantiate model
  model = ModelEngine(
  target_number=self.target_number,
  model_name=model_name,
@@ -1031,8 +1202,8 @@ class ModelSelectionEngine:
  log_dir=log_dir,
  )

- start = time.time()
  # Tuning hyperparameters
+ start = time.time()
  if perform_hyperopt:
  model_best_params = self.hyperoptimize(
  x_train, y_train, x_val, y_val, model
@@ -1049,7 +1220,7 @@ class ModelSelectionEngine:
  f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
  )

- # save best params
+ # Save best params
  best_params_file = f"{self.target_dir}/best_params.json"
  try:
  with open(best_params_file, "r") as f:
@@ -1061,114 +1232,25 @@ class ModelSelectionEngine:
  with open(best_params_file, "w") as f:
  json.dump(json_dict, f, indent=4)

- # Perform cross-validation of the best model on k-folds of train + val set
- if perform_crossval:
- x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
- y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
- n_splits = 4
- n_samples = len(x_train_val)
- test_size = int(n_samples / (n_splits + 4))
- tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
-
- # Store the scores
- cv_scores = []
-
- for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
- self.type_name = f"crossval_fold_{i}"
-
- if self.time_series:
- date_series = pd.concat(
- [
- train[self.date_column],
- val[self.date_column],
- test[self.date_column],
- ],
- axis=0,
- ).reset_index(drop=True)
-
- date_series = date_series.map(pd.Timestamp.fromordinal)
-
- # Now you can use the actual train/val indices to extract ranges
- train_start = date_series.iloc[train_index[0]]
- train_end = date_series.iloc[train_index[-1]]
- val_start = date_series.iloc[val_index[0]]
- val_end = date_series.iloc[val_index[-1]]
-
- logger.info(
- f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
- f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
- )
- else:
- logger.info(
- f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
- )
-
- # Train the model and get the score
- if recurrent:
- cv_score, _, _ = self.train_model(
- params=model_best_params,
- x_train=x_train_val[train_index],
- y_train=y_train_val[train_index],
- x_val=x_train_val[val_index],
- y_val=y_train_val[val_index],
- model=model,
- )
- else:
- cv_score, _, _ = self.train_model(
- params=model_best_params,
- x_train=x_train_val.iloc[train_index],
- y_train=y_train_val.iloc[train_index],
- x_val=x_train_val.iloc[val_index],
- y_val=y_train_val.iloc[val_index],
- model=model,
- )
-
- # Append score to the list
- cv_scores.append(cv_score)
-
- # Calculate mean of all numerical metrics across all cross-validation folds
- cv_scores_df = pd.DataFrame(cv_scores)
- # Get mean of all numeric columns
- cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+ # Always evaluate on test set (no cross-validation here)
+ # The hyperopt already did CV if needed to find best params
+ best_score, best_model, best_pred = self.train_model(
+ params=model_best_params,
+ x_train=pd.concat([x_train, x_val], axis=0),
+ y_train=pd.concat([y_train, y_val], axis=0),
+ x_val=x_test,
+ y_val=y_test,
+ model=model,
+ )
+ stop = time.time()
+ training_time = stop - start

- logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
- for metric, value in cv_means.items():
+ logger.info(f"Model training finished in {training_time:.2f} seconds")
+ logger.info(f"👉 {model.model_name} scores on test set:")
+ for metric, value in best_score.items():
+ if isinstance(value, (int, float)):
  logger.info(f" {metric}: {value:.4f}")

- # Retrain on entire training set, but keep score on cross-validation folds
- # Get the test score using the best model
- test_score, best_model, best_pred = self.train_model(
- params=model_best_params,
- x_train=pd.concat([x_train, x_val], axis=0),
- y_train=pd.concat([y_train, y_val], axis=0),
- x_val=x_test,
- y_val=y_test,
- model=model,
- )
-
- # Update all metrics with cross-validation means
- for metric, value in cv_means.items():
- if metric in test_score: # Only update existing metrics
- test_score[metric] = value
- best_score = test_score
- best_score["TYPE"] = "crossval"
- else:
- # Evaluate on test set
- self.type_name = "testset"
- best_score, best_model, best_pred = self.train_model(
- params=model_best_params,
- x_train=pd.concat([x_train, x_val], axis=0),
- y_train=pd.concat([y_train, y_val], axis=0),
- x_val=x_test,
- y_val=y_test,
- model=model,
- )
-
- logger.info(f"👉 {model.model_name} scores on test set:")
- for metric, value in best_score.items():
- if isinstance(value, (int, float)):
- logger.info(f" {metric}: {value:.4f}")
-
  # Save predictions
  best_pred.to_csv(
  f"{self.results_dir}/prediction.csv",
@@ -1179,7 +1261,6 @@ class ModelSelectionEngine:

  # Save best model
  model_path = best_model.save(self.results_dir)
-
  model_path = Path(model_path).resolve()
  best_score["MODEL_PATH"] = model_path

@@ -1202,32 +1283,26 @@ class ModelSelectionEngine:
  scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
  scores_tracking.to_csv(scores_tracking_path, index=False)

- # Save model training metadata
- stop = time.time()
- training_time = stop - start
- model_training.best_params = model_best_params
- model_training.model_path = model_path
- model_training.training_time = training_time
- model_training.save()
-
- # Store metrics in DB
+ # Save in db
  drop_cols = [
  "DATE",
  "MODEL_NAME",
- "MODEL_PATH",
  ]
  best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
  score_data = {k.lower(): v for k, v in best_score.items()}
-
- Score.upsert(
- match_fields=["model_training_id"],
- model_training_id=model_training.id,
+ model = Model.upsert(
+ name=model_name,
+ type=self.target_type,
+ )
+ ModelSelectionScore.upsert(
+ model_id=model.id,
+ model_selection_id=model_selection.id,
+ best_params=serialize_for_json(model_best_params),
+ training_time=training_time,
  **score_data,
  )

- logger.info(f"Model training finished in {training_time:.2f} seconds")
-
- # find best model type
+ # STEP 2 :FINDING BEST MODEL OVERALL
  scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
  scores_tracking = pd.read_csv(scores_tracking_path)
  best_score_overall = scores_tracking.iloc[0, :]
@@ -1238,12 +1313,11 @@ class ModelSelectionEngine:
  else:
  best_thresholds = None

- # Remove any .best or .keras files
+ # Remove any .best or .keras files, and save best model in target_dir
  for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
  os.path.join(self.target_dir, "*.keras")
  ):
  os.remove(file_path)
- # Copy the best model in root training folder for this target
  best_model_path = Path(
  f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
  ).resolve()
@@ -1255,13 +1329,13 @@ class ModelSelectionEngine:
  with open(f"{self.target_dir}/best_params.json", "r") as f:
  best_model_params = json.load(f)[best_model_name]

- # Save model_selection results to db
-
+ # Save to db
  model_selection = ModelSelection.get(model_selection.id)
- model_selection.best_model_id = Model.find_by(
+ model = Model.find_by(
  name=best_score_overall["MODEL_NAME"], type=self.target_type
- ).id
- model_selection.best_model_params = best_model_params
+ )
+ model_selection.best_model_id = model.id
+ model_selection.best_model_params = serialize_for_json(best_model_params)
  model_selection.best_thresholds = best_thresholds
  model_selection.best_model_path = best_model_path

@@ -1286,7 +1360,169 @@ class ModelSelectionEngine:
  return best_model

  def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
- self.type_name = "hyperopts"
+ """Choose between Ray Tune and HyperOpt standalone based on configuration."""
+ if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
+ return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
+ else:
+ return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+
+ def hyperoptimize_hyperopt(
+ self, x_train, y_train, x_val, y_val, model: ModelEngine
+ ):
+ """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
+
+ logger.info("Start tuning hyperparameters with HyperOpt standalone...")
+
+ # Convert Ray search space to HyperOpt search space
+ def convert_search_space(ray_space):
+ """Convert Ray Tune search space to HyperOpt format."""
+ from ray.tune.search.sample import Categorical, Float, Integer
+
+ hp_space = {}
+ for key, value in ray_space.items():
+ if isinstance(value, Float):
+ if (
+ hasattr(value, "sampler")
+ and value.sampler.__class__.__name__ == "LogUniform"
+ ):
+ # LogUniform distribution
+ hp_space[key] = hp.loguniform(
+ key, np.log(value.lower), np.log(value.upper)
+ )
+ else:
+ # Uniform distribution
+ hp_space[key] = hp.uniform(key, value.lower, value.upper)
+ elif isinstance(value, Integer):
+ # Integer uniform distribution
+ hp_space[key] = hp.randint(key, value.lower, value.upper)
+ elif isinstance(value, Categorical):
+ # Categorical/choice distribution
+ hp_space[key] = hp.choice(key, value.categories)
+ elif isinstance(value, dict):
+ # Nested dict, recurse
+ hp_space[key] = convert_search_space(value)
+ else:
+ # Static value or unknown type
+ hp_space[key] = value
+ return hp_space
+
+ # Create objective function for HyperOpt
+ def objective(params):
+ """Objective function to minimize."""
+ try:
+ # Convert numpy types to native Python types
+ params = serialize_for_json(params)
+
+ # Use existing trainable function based on perform_crossval
+ if self.perform_crossval:
+ score = trainable_cv(
+ params,
+ x_train,
+ y_train,
+ x_val,
+ y_val,
+ model.model_name,
+ self.target_type,
+ self.experiment_name,
+ self.target_number,
+ model.create_model,
+ n_splits=3,
+ plot=model.plot,
+ log_dir=model.log_dir,
+ target_clf_thresholds=self.target_clf_thresholds,
+ time_series=self.time_series,
+ recurrent=model.recurrent,
+ )
+ else:
+ score, _, _ = trainable(
+ params,
+ x_train,
+ y_train,
+ x_val,
+ y_val,
+ model.model_name,
+ self.target_type,
+ self.experiment_name,
+ self.target_number,
+ model.create_model,
+ plot=model.plot,
+ log_dir=model.log_dir,
+ target_clf_thresholds=self.target_clf_thresholds,
+ )
+
+ # HyperOpt minimizes, so return the metric directly
+ loss = score[self.metric]
+
+ # Log trial info
+ logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
+
+ return {
+ "loss": loss,
+ "status": STATUS_OK,
+ "score": score, # Keep full score dict for analysis
+ }
+
+ except Exception as e:
+ logger.error(f"Trial failed: {str(e)}")
+ return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
+
+ # Convert search space
+ hp_search_space = convert_search_space(model.search_params)
+
+ # Run optimization
+ trials = Trials()
+ best_params = fmin(
+ fn=objective,
+ space=hp_search_space,
+ algo=tpe.suggest,
+ max_evals=self.number_of_trials,
+ trials=trials,
+ verbose=True,
+ show_progressbar=True,
+ )
+
+ # Get the actual parameter values (not just indices for hp.choice)
+ best_params = space_eval(hp_search_space, best_params)
+
+ # Convert numpy types to native Python types
+ best_params = serialize_for_json(best_params)
+
+ # Get best score from trials
+ best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
+ best_score = trials.trials[best_trial_idx]["result"].get("score", {})
+
+ # Log results
+ logger.info(f"Best hyperparameters found were:\n{best_params}")
+ logger.info(f"Best Scores found were:\n{best_score}")
+
+ # Create summary DataFrame for consistency with Ray version
+ results_df = pd.DataFrame(
+ [
+ {
+ "trial_id": i,
+ self.metric: t["result"]["loss"],
+ **{
+ k: v
+ for k, v in t["result"].get("score", {}).items()
+ if isinstance(v, (int, float))
+ },
+ }
+ for i, t in enumerate(trials.trials)
+ if t["result"]["status"] == STATUS_OK
+ ]
+ )
+
+ if not results_df.empty:
+ logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
+
+ # Save trial history for analysis
+ trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
+ with open(trials_path, "wb") as f:
+ pickle.dump(trials, f)
+
+ return best_params
+
+ def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: ModelEngine):

  def collect_error_logs(target_dir: int, storage_path: str):
  output_error_file = f"{target_dir}/errors.log"
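
The standalone backend above converts the Ray search-space definitions into a HyperOpt space and minimizes it with TPE. Below is a toy end-to-end run of the same fmin/tpe/Trials/space_eval machinery (the same names imported at the top of the module); the search space and objective are made up and simply stand in for trainable/trainable_cv.

# Toy HyperOpt run, assuming the hyperopt package is installed.
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval

space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-1)),
    "num_boost_round": hp.randint("num_boost_round", 50, 500),
    "booster": hp.choice("booster", ["gbtree", "dart"]),
}

def objective(params):
    # Toy loss standing in for a real validation metric.
    loss = params["learning_rate"] + 100.0 / params["num_boost_round"]
    return {"loss": loss, "status": STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
print(space_eval(space, best))  # resolves hp.choice indices to the actual values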
@@ -1329,9 +1565,22 @@ class ModelSelectionEngine:
  }
  )

+ # Choose between regular trainable or CV version based on perform_crossval flag
+ # perform_crossval controls whether to use CV during hyperopt
+ if self.perform_crossval:
+ trainable_fn = trainable_cv
+ additional_params = {
+ "n_splits": 3, # Can be made configurable
+ "time_series": self.time_series, # Controls whether to use TimeSeriesSplit or StratifiedKFold
+ "recurrent": model.recurrent,
+ }
+ else:
+ trainable_fn = trainable
+ additional_params = {}
+
  tuner = Tuner(
  trainable=with_parameters(
- trainable,
+ trainable_fn,
  x_train=x_train,
  y_train=y_train,
  x_val=x_val,
@@ -1341,10 +1590,10 @@ class ModelSelectionEngine:
  experiment_name=self.experiment_name,
  target_number=self.target_number,
  create_model=model.create_model,
- type_name="hyperopts",
  plot=model.plot,
  log_dir=model.log_dir,
  target_clf_thresholds=self.target_clf_thresholds,
+ **additional_params,
  ),
  param_space=model.search_params,
  tune_config=TuneConfig(
@@ -1398,7 +1647,6 @@ class ModelSelectionEngine:
  self.experiment_name,
  self.target_number,
  model.create_model,
- self.type_name,
  model.plot,
  log_dir=model.log_dir,
  target_clf_thresholds=self.target_clf_thresholds,