lecrapaud 0.19.3__py3-none-any.whl → 0.20.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud might be problematic.
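The headline change in this diff is a second hyperparameter-optimization backend: hyperoptimize now dispatches to a standalone HyperOpt implementation when LECRAPAUD_OPTIMIZATION_BACKEND is "hyperopt" and falls back to Ray Tune otherwise. A minimal sketch of that dispatch follows, for orientation before the hunks below; reading the setting from an environment variable and the "ray" default are assumptions for illustration, not something the diff shows.

    # Illustrative sketch only; mirrors the dispatch added in this release.
    # Assumption: the setting comes from an environment variable and defaults to "ray";
    # the diff only shows `from lecrapaud.config import LECRAPAUD_OPTIMIZATION_BACKEND`.
    import os

    LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "ray")


    def hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model):
        # Stand-in for the new HyperOpt standalone path (hp / fmin / tpe based).
        return {"backend": "hyperopt"}


    def hyperoptimize_ray(x_train, y_train, x_val, y_val, model):
        # Stand-in for the existing Ray Tune path (Tuner / with_parameters based).
        return {"backend": "ray"}


    def hyperoptimize(x_train, y_train, x_val, y_val, model):
        """Route tuning to HyperOpt standalone or Ray Tune, as the new method does."""
        if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
            return hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
        return hyperoptimize_ray(x_train, y_train, x_val, y_val, model)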

@@ -15,7 +15,7 @@ from pydantic import BaseModel
 import ast
 
 # ML models
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import (
     mean_absolute_percentage_error,
@@ -63,23 +63,38 @@ from ray.tune.logger import TBXLoggerCallback
 from ray.tune.schedulers import ASHAScheduler
 from ray.air import session
 
+# HyperOpt standalone
+from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
+
 # Internal library
 from lecrapaud.search_space import all_models
 from lecrapaud.directories import clean_directory
 from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
-from lecrapaud.config import PYTHON_ENV
+from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
 from lecrapaud.feature_selection import load_train_data
 from lecrapaud.db import (
     Model,
     ModelSelection,
-    ModelTraining,
-    Score,
+    ModelSelectionScore,
     Target,
     Experiment,
 )
 
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
+# Suppress XGBoost and LightGBM logging
+import logging
+
+logging.getLogger("lightgbm").setLevel(logging.ERROR)
+logging.getLogger("xgboost").setLevel(logging.ERROR)
+
+# Set global verbosity for XGBoost
+xgb.set_config(verbosity=0)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
 # Reproducible result
 keras.utils.set_random_seed(42)
 np.random.seed(42)
@@ -410,6 +425,7 @@ class ModelEngine:
                     "metric": eval_metric,
                     "num_class": num_class,
                     "verbose": -1,
+                    "verbose_eval": False,
                 },
                 num_boost_round=params["num_boost_round"],
                 train_set=train_data,
@@ -421,6 +437,7 @@ class ModelEngine:
                     ),
                     lgb.record_evaluation(evals_result),
                     tensorboard_callback,
+                    lgb.log_evaluation(period=0),  # Disable evaluation logging
                 ],
             )
         else:
@@ -462,7 +479,7 @@ class ModelEngine:
                 if self.target_type == "regression"
                 else ("logloss" if num_class <= 2 else "mlogloss")
             )
-            xgb.set_config(verbosity=0)
+            # XGBoost verbosity already set globally
             model = xgb.train(
                 params={
                     **params["model_params"],
@@ -477,11 +494,11 @@ class ModelEngine:
                     xgb.callback.EarlyStopping(
                         rounds=params["early_stopping_rounds"], save_best=True
                     ),
-                    xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
+                    # Removed EvaluationMonitor to suppress logs
                     tensorboard_callback,
                 ],
                 evals_result=evals_result,  # Record evaluation result
-                verbose_eval=10000,
+                verbose_eval=False,  # Disable evaluation logging
             )
 
             model.model_name = self.create_model
@@ -746,6 +763,171 @@ class ModelEngine:
         )
 
 
+def trainable_cv(
+    params,
+    x_train,
+    y_train,
+    x_val,
+    y_val,
+    model_name,
+    target_type,
+    experiment_name,
+    target_number,
+    create_model,
+    n_splits=3,
+    plot=False,
+    log_dir=None,
+    target_clf_thresholds: dict = None,
+    time_series=True,
+    recurrent=False,
+):
+    """Cross-validation version of trainable for hyperopt.
+
+    Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
+    Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
+    """
+    # Combine train and validation data for cross-validation
+    if recurrent:
+        x_train_val = np.concatenate([x_train, x_val], axis=0)
+        y_train_val = np.concatenate([y_train, y_val], axis=0)
+    else:
+        x_train_val = pd.concat([x_train, x_val], axis=0)
+        y_train_val = pd.concat([y_train, y_val], axis=0)
+        # Store original index for later use if needed
+        original_index = x_train_val.index.copy()
+        # Reset index for proper iloc indexing with CV splits
+        x_train_val = x_train_val.reset_index(drop=True)
+        y_train_val = y_train_val.reset_index(drop=True)
+
+    # Choose appropriate cross-validation splitter
+    if time_series:
+        # Time series split for temporal data
+        n_samples = len(x_train_val)
+        test_size = int(n_samples / (n_splits + 1))  # Ensure reasonable test size
+        cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+    else:
+        # Stratified or regular K-fold for i.i.d. data
+        if target_type == "classification":
+            cv_splitter = StratifiedKFold(
+                n_splits=n_splits, shuffle=True, random_state=42
+            )
+        else:
+            cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+    # Store all predictions and true values for pooled metrics
+    all_predictions = []
+    all_y_true = []
+    fold_times = []
+
+    # Get splits based on the CV strategy
+    if time_series or target_type == "regression":
+        splits = cv_splitter.split(x_train_val)
+    else:
+        # For stratified split, we need to pass y
+        if recurrent:
+            # Extract the target from the 2D array (first column is target)
+            y_for_split = y_train_val[:, 0]
+        else:
+            y_for_split = y_train_val
+        splits = cv_splitter.split(x_train_val, y_for_split)
+
+    for fold_idx, (train_idx, val_idx) in enumerate(splits):
+        # Extract fold data
+        if recurrent:
+            x_fold_train = x_train_val[train_idx]
+            y_fold_train = y_train_val[train_idx]
+            x_fold_val = x_train_val[val_idx]
+            y_fold_val = y_train_val[val_idx]
+        else:
+            x_fold_train = x_train_val.iloc[train_idx]
+            y_fold_train = y_train_val.iloc[train_idx]
+            x_fold_val = x_train_val.iloc[val_idx]
+            y_fold_val = y_train_val.iloc[val_idx]
+
+        # Train model for this fold
+        model = ModelEngine(
+            model_name=model_name,
+            target_type=target_type,
+            target_number=target_number,
+            create_model=create_model,
+            plot=False,  # Disable individual fold plots
+            log_dir=log_dir,
+        )
+
+        if recurrent:
+            timesteps = params["timesteps"]
+            x_fold_train = x_fold_train[:, -timesteps:, :]
+            x_fold_val = x_fold_val[:, -timesteps:, :]
+
+        # Fit model
+        model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
+
+        # Get predictions
+        y_pred = model.predict(x_fold_val)
+
+        # Handle recurrent model indexing
+        if recurrent:
+            y_fold_val = pd.DataFrame(
+                y_fold_val, columns=["TARGET", "index"]
+            ).set_index("index")
+            y_pred.index = y_fold_val.index
+
+        # Store predictions and true values
+        all_predictions.append(y_pred)
+        all_y_true.append(y_fold_val)
+
+    # Concatenate all fold predictions
+    if target_type == "classification":
+        # For classification, we need to handle probability columns
+        all_pred_df = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        # Ensure we have a DataFrame with TARGET column
+        if isinstance(all_y_series, pd.Series):
+            all_y_df = pd.DataFrame({"TARGET": all_y_series})
+        else:
+            all_y_df = all_y_series
+    else:
+        # For regression, just concatenate the predictions
+        all_pred_series = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        all_pred_df = pd.DataFrame({"PRED": all_pred_series})
+        all_y_df = pd.DataFrame({"TARGET": all_y_series})
+
+    # Create combined prediction DataFrame
+    prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
+
+    # Calculate pooled metrics
+    score = {
+        "DATE": datetime.now(),
+        "MODEL_NAME": model_name,
+        "EVAL_DATA_STD": prediction["TARGET"].std(),
+    }
+
+    # Unscale if needed (for regression with scaling)
+    if (
+        model.need_scaling
+        and target_type == "regression"
+        and model.scaler_y is not None
+    ):
+        prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+            prediction[["TARGET"]].values
+        )
+        prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+            prediction[["PRED"]].values
+        )
+
+    # Evaluate with pooled predictions
+    score.update(evaluate(prediction, target_type, target_clf_thresholds))
+
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
+
+    # Report to Ray if in Ray context
+    if session.get_session():
+        session.report(metrics=score)
+    return score
+
+
 def trainable(
     params,
     x_train,
@@ -757,7 +939,6 @@ def trainable(
     experiment_name,
     target_number,
     create_model,
-    type_name="hyperopts",
     plot=False,
     log_dir=None,
     target_clf_thresholds: dict = None,
@@ -783,9 +964,7 @@ def trainable(
         x_val = x_val[:, -timesteps:, :]
 
     # Compile and fit model on train set
-    start = time.time()
    model.fit(x_train, y_train, x_val, y_val, params)
-    stop = time.time()
 
    # Prediction on val set
    y_pred = model.predict(x_val)
@@ -815,8 +994,6 @@ def trainable(
     score = {
         "DATE": datetime.now(),
         "MODEL_NAME": model.model_name,
-        "TYPE": type_name,
-        "TRAINING_TIME": stop - start,
         "EVAL_DATA_STD": prediction["TARGET"].std(),
     }
 
@@ -825,7 +1002,8 @@ def trainable(
     metric = "RMSE" if target_type == "regression" else "LOGLOSS"
     logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
 
-    if type_name == "hyperopts":
+    # Report to Ray if in Ray context
+    if session.get_session():
         session.report(metrics=score)
     return score
 
@@ -839,24 +1017,24 @@ class ModelSelectionEngine:
         data,
         reshaped_data,
         target_number,
-        target_clf,
         experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
         **kwargs,
     ):
         self.data = data
         self.reshaped_data = reshaped_data
         self.target_number = target_number
         self.experiment = experiment
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
+
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.target_clf = context.get("target_clf", [])
+        self.models_idx = context.get("models_idx", [])
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+
+        # Handle target_clf_thresholds
+        target_clf_thresholds = context.get("target_clf_thresholds", {})
         self.target_clf_thresholds = (
             target_clf_thresholds[target_number]
             if target_number in target_clf_thresholds.keys()
@@ -878,24 +1056,19 @@ class ModelSelectionEngine:
         )
 
     # Main training function
-    def run(
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def run(self, best_params=None):
         """
         Selects the best models based on a target variable, optionally performing hyperparameter optimization
         and cross-validation, and manages outputs in a session-specific directory.
         """
-        self.experiment_name = experiment_name
-        self.plot = plot
-        self.number_of_trials = number_of_trials
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
 
        if self.experiment_id is None:
            raise ValueError("Please provide a experiment.")
@@ -945,12 +1118,11 @@ class ModelSelectionEngine:
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
         model_selection = ModelSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
 
-        # recurrent models starts at 9 # len(list_models)
+        # STEP 1 : TRAINING MODELS
         for i in self.models_idx:
             config = all_models[i]
             recurrent = config["recurrent"]
@@ -963,24 +1135,16 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
 
-            logger.info(f"Training a {model_name}")
-            model = Model.upsert(
-                match_fields=["name", "type"],
-                name=model_name,
-                type=self.target_type,
-            )
-            model_training = ModelTraining.upsert(
-                match_fields=["model_id", "model_selection_id"],
-                model_id=model.id,
-                model_selection_id=model_selection.id,
+            logger.info(
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
-            # getting data
+            # Getting data
            if recurrent:
                # Clear cluster from previous Keras session graphs.
                K.clear_session()
@@ -990,7 +1154,7 @@ class ModelSelectionEngine:
                     for i, e in enumerate(self.all_features)
                     if e in set(self.features)
                 ]
-                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
+                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
                 x_train = x_train_reshaped[:, :, features_idx]
                 y_train = y_train_reshaped[:, [self.target_number, 0]]
                 x_val = x_val_reshaped[:, :, features_idx]
@@ -1020,7 +1184,8 @@ class ModelSelectionEngine:
                 y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
 
             log_dir = get_log_dir(self.target_dir, model_name)
-            # instantiate model
+
+            # Instantiate model
            model = ModelEngine(
                target_number=self.target_number,
                model_name=model_name,
@@ -1031,9 +1196,9 @@ class ModelSelectionEngine:
                 log_dir=log_dir,
             )
 
-            start = time.time()
             # Tuning hyperparameters
-            if perform_hyperopt:
+            start = time.time()
+            if self.perform_hyperopt:
                model_best_params = self.hyperoptimize(
                    x_train, y_train, x_val, y_val, model
                )
@@ -1049,7 +1214,7 @@ class ModelSelectionEngine:
                     f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
                 )
 
-            # save best params
+            # Save best params
            best_params_file = f"{self.target_dir}/best_params.json"
            try:
                with open(best_params_file, "r") as f:
@@ -1061,114 +1226,25 @@ class ModelSelectionEngine:
             with open(best_params_file, "w") as f:
                 json.dump(json_dict, f, indent=4)
 
-            # Perform cross-validation of the best model on k-folds of train + val set
-            if perform_crossval:
-                x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
-                y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
-                n_splits = 4
-                n_samples = len(x_train_val)
-                test_size = int(n_samples / (n_splits + 4))
-                tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
-
-                # Store the scores
-                cv_scores = []
-
-                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                    self.type_name = f"crossval_fold_{i}"
-
-                    if self.time_series:
-                        date_series = pd.concat(
-                            [
-                                train[self.date_column],
-                                val[self.date_column],
-                                test[self.date_column],
-                            ],
-                            axis=0,
-                        ).reset_index(drop=True)
-
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
-
-                        # Now you can use the actual train/val indices to extract ranges
-                        train_start = date_series.iloc[train_index[0]]
-                        train_end = date_series.iloc[train_index[-1]]
-                        val_start = date_series.iloc[val_index[0]]
-                        val_end = date_series.iloc[val_index[-1]]
-
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                        )
-                    else:
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                        )
-
-                    # Train the model and get the score
-                    if recurrent:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val[train_index],
-                            y_train=y_train_val[train_index],
-                            x_val=x_train_val[val_index],
-                            y_val=y_train_val[val_index],
-                            model=model,
-                        )
-                    else:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val.iloc[train_index],
-                            y_train=y_train_val.iloc[train_index],
-                            x_val=x_train_val.iloc[val_index],
-                            y_val=y_train_val.iloc[val_index],
-                            model=model,
-                        )
-
-                    # Append score to the list
-                    cv_scores.append(cv_score)
-
-                # Calculate mean of all numerical metrics across all cross-validation folds
-                cv_scores_df = pd.DataFrame(cv_scores)
-                # Get mean of all numeric columns
-                cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+            # Always evaluate on test set (no cross-validation here)
+            # The hyperopt already did CV if needed to find best params
+            best_score, best_model, best_pred = self.train_model(
+                params=model_best_params,
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
+                model=model,
+            )
+            stop = time.time()
+            training_time = stop - start
 
-                logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
-                for metric, value in cv_means.items():
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+            logger.info(f"👉 {model.model_name} scores on test set:")
+            for metric, value in best_score.items():
+                if isinstance(value, (int, float)):
                     logger.info(f" {metric}: {value:.4f}")
 
-                # Retrain on entire training set, but keep score on cross-validation folds
-                # Get the test score using the best model
-                test_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                # Update all metrics with cross-validation means
-                for metric, value in cv_means.items():
-                    if metric in test_score:  # Only update existing metrics
-                        test_score[metric] = value
-                best_score = test_score
-                best_score["TYPE"] = "crossval"
-            else:
-                # Evaluate on test set
-                self.type_name = "testset"
-                best_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-            logger.info(f"👉 {model.model_name} scores on test set:")
-            for metric, value in best_score.items():
-                if isinstance(value, (int, float)):
-                    logger.info(f" {metric}: {value:.4f}")
-
             # Save predictions
             best_pred.to_csv(
                 f"{self.results_dir}/prediction.csv",
@@ -1179,7 +1255,6 @@ class ModelSelectionEngine:
 
             # Save best model
             model_path = best_model.save(self.results_dir)
-
             model_path = Path(model_path).resolve()
             best_score["MODEL_PATH"] = model_path
 
@@ -1202,32 +1277,26 @@ class ModelSelectionEngine:
             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
             scores_tracking.to_csv(scores_tracking_path, index=False)
 
-            # Save model training metadata
-            stop = time.time()
-            training_time = stop - start
-            model_training.best_params = model_best_params
-            model_training.model_path = model_path
-            model_training.training_time = training_time
-            model_training.save()
-
-            # Store metrics in DB
+            # Save in db
             drop_cols = [
                 "DATE",
                 "MODEL_NAME",
-                "MODEL_PATH",
             ]
             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
             score_data = {k.lower(): v for k, v in best_score.items()}
-
-            Score.upsert(
-                match_fields=["model_training_id"],
-                model_training_id=model_training.id,
+            model = Model.upsert(
+                name=model_name,
+                type=self.target_type,
+            )
+            ModelSelectionScore.upsert(
+                model_id=model.id,
+                model_selection_id=model_selection.id,
+                best_params=serialize_for_json(model_best_params),
+                training_time=training_time,
                 **score_data,
             )
 
-            logger.info(f"Model training finished in {training_time:.2f} seconds")
-
-        # find best model type
+        # STEP 2 :FINDING BEST MODEL OVERALL
        scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
        scores_tracking = pd.read_csv(scores_tracking_path)
        best_score_overall = scores_tracking.iloc[0, :]
@@ -1238,12 +1307,11 @@ class ModelSelectionEngine:
         else:
             best_thresholds = None
 
-        # Remove any .best or .keras files
+        # Remove any .best or .keras files, and save best model in target_dir
        for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
            os.path.join(self.target_dir, "*.keras")
        ):
            os.remove(file_path)
-        # Copy the best model in root training folder for this target
        best_model_path = Path(
            f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
        ).resolve()
@@ -1255,13 +1323,13 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # Save model_selection results to db
-
+        # Save to db
         model_selection = ModelSelection.get(model_selection.id)
-        model_selection.best_model_id = Model.find_by(
+        model = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
-        ).id
-        model_selection.best_model_params = best_model_params
+        )
+        model_selection.best_model_id = model.id
+        model_selection.best_model_params = serialize_for_json(best_model_params)
         model_selection.best_thresholds = best_thresholds
         model_selection.best_model_path = best_model_path
 
@@ -1286,7 +1354,169 @@ class ModelSelectionEngine:
         return best_model
 
     def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
-        self.type_name = "hyperopts"
+        """Choose between Ray Tune and HyperOpt standalone based on configuration."""
+        if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
+            return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
+        else:
+            return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+
+    def hyperoptimize_hyperopt(
+        self, x_train, y_train, x_val, y_val, model: ModelEngine
+    ):
+        """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
+
+        logger.info("Start tuning hyperparameters with HyperOpt standalone...")
+
+        # Convert Ray search space to HyperOpt search space
+        def convert_search_space(ray_space):
+            """Convert Ray Tune search space to HyperOpt format."""
+            from ray.tune.search.sample import Categorical, Float, Integer
+
+            hp_space = {}
+            for key, value in ray_space.items():
+                if isinstance(value, Float):
+                    if (
+                        hasattr(value, "sampler")
+                        and value.sampler.__class__.__name__ == "LogUniform"
+                    ):
+                        # LogUniform distribution
+                        hp_space[key] = hp.loguniform(
+                            key, np.log(value.lower), np.log(value.upper)
+                        )
+                    else:
+                        # Uniform distribution
+                        hp_space[key] = hp.uniform(key, value.lower, value.upper)
+                elif isinstance(value, Integer):
+                    # Integer uniform distribution
+                    hp_space[key] = hp.randint(key, value.lower, value.upper)
+                elif isinstance(value, Categorical):
+                    # Categorical/choice distribution
+                    hp_space[key] = hp.choice(key, value.categories)
+                elif isinstance(value, dict):
+                    # Nested dict, recurse
+                    hp_space[key] = convert_search_space(value)
+                else:
+                    # Static value or unknown type
+                    hp_space[key] = value
+            return hp_space
+
+        # Create objective function for HyperOpt
+        def objective(params):
+            """Objective function to minimize."""
+            try:
+                # Convert numpy types to native Python types
+                params = serialize_for_json(params)
+
+                # Use existing trainable function based on perform_crossval
+                if self.perform_crossval:
+                    score = trainable_cv(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        n_splits=3,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                        time_series=self.time_series,
+                        recurrent=model.recurrent,
+                    )
+                else:
+                    score, _, _ = trainable(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                    )
+
+                # HyperOpt minimizes, so return the metric directly
+                loss = score[self.metric]
+
+                # Log trial info
+                logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
+
+                return {
+                    "loss": loss,
+                    "status": STATUS_OK,
+                    "score": score,  # Keep full score dict for analysis
+                }
+
+            except Exception as e:
+                logger.error(f"Trial failed: {str(e)}")
+                return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
+
+        # Convert search space
+        hp_search_space = convert_search_space(model.search_params)
+
+        # Run optimization
+        trials = Trials()
+        best_params = fmin(
+            fn=objective,
+            space=hp_search_space,
+            algo=tpe.suggest,
+            max_evals=self.number_of_trials,
+            trials=trials,
+            verbose=True,
+            show_progressbar=True,
+        )
+
+        # Get the actual parameter values (not just indices for hp.choice)
+        best_params = space_eval(hp_search_space, best_params)
+
+        # Convert numpy types to native Python types
+        best_params = serialize_for_json(best_params)
+
+        # Get best score from trials
+        best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
+        best_score = trials.trials[best_trial_idx]["result"].get("score", {})
+
+        # Log results
+        logger.info(f"Best hyperparameters found were:\n{best_params}")
+        logger.info(f"Best Scores found were:\n{best_score}")
+
+        # Create summary DataFrame for consistency with Ray version
+        results_df = pd.DataFrame(
+            [
+                {
+                    "trial_id": i,
+                    self.metric: t["result"]["loss"],
+                    **{
+                        k: v
+                        for k, v in t["result"].get("score", {}).items()
+                        if isinstance(v, (int, float))
+                    },
+                }
+                for i, t in enumerate(trials.trials)
+                if t["result"]["status"] == STATUS_OK
+            ]
+        )
+
+        if not results_df.empty:
+            logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
+
+        # Save trial history for analysis
+        trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
+        with open(trials_path, "wb") as f:
+            pickle.dump(trials, f)
+
+        return best_params
+
+    def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: ModelEngine):
 
         def collect_error_logs(target_dir: int, storage_path: str):
             output_error_file = f"{target_dir}/errors.log"
@@ -1329,9 +1559,22 @@ class ModelSelectionEngine:
             }
         )
 
+        # Choose between regular trainable or CV version based on perform_crossval flag
+        # perform_crossval controls whether to use CV during hyperopt
+        if self.perform_crossval:
+            trainable_fn = trainable_cv
+            additional_params = {
+                "n_splits": 3,  # Can be made configurable
+                "time_series": self.time_series,  # Controls whether to use TimeSeriesSplit or StratifiedKFold
+                "recurrent": model.recurrent,
+            }
+        else:
+            trainable_fn = trainable
+            additional_params = {}
+
         tuner = Tuner(
             trainable=with_parameters(
-                trainable,
+                trainable_fn,
                 x_train=x_train,
                 y_train=y_train,
                 x_val=x_val,
@@ -1341,10 +1584,10 @@ class ModelSelectionEngine:
                 experiment_name=self.experiment_name,
                 target_number=self.target_number,
                 create_model=model.create_model,
-                type_name="hyperopts",
                 plot=model.plot,
                 log_dir=model.log_dir,
                 target_clf_thresholds=self.target_clf_thresholds,
+                **additional_params,
             ),
             param_space=model.search_params,
             tune_config=TuneConfig(
@@ -1398,7 +1641,6 @@ class ModelSelectionEngine:
                 self.experiment_name,
                 self.target_number,
                 model.create_model,
-                self.type_name,
                 model.plot,
                 log_dir=model.log_dir,
                 target_clf_thresholds=self.target_clf_thresholds,
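Elsewhere in the diff, ModelSelectionEngine stops taking its options as constructor and run() arguments and instead pulls them from experiment.context. The keys and defaults below are collected from the context.get(...) calls shown above; the example values themselves are hypothetical, and the surrounding Experiment API is assumed rather than shown here.

    # Sketch of the keys the refactored ModelSelectionEngine reads from experiment.context
    # (defaults taken from the context.get(...) calls in this diff; values are hypothetical).
    experiment_context = {
        "experiment_name": "demo-experiment",  # hypothetical name
        "plot": True,
        "number_of_trials": 20,
        "perform_hyperopt": True,
        "perform_crossval": False,
        "preserve_model": True,
        "target_clf": [],
        "models_idx": [],
        "time_series": False,
        "date_column": None,
        "group_column": None,
        "target_clf_thresholds": {},
    }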