lecrapaud 0.19.2__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud has been flagged as potentially problematic.

@@ -15,7 +15,7 @@ from pydantic import BaseModel
 import ast
 
 # ML models
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import (
     mean_absolute_percentage_error,
@@ -63,23 +63,38 @@ from ray.tune.logger import TBXLoggerCallback
 from ray.tune.schedulers import ASHAScheduler
 from ray.air import session
 
+# HyperOpt standalone
+from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
+
 # Internal library
 from lecrapaud.search_space import all_models
 from lecrapaud.directories import clean_directory
 from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
-from lecrapaud.config import PYTHON_ENV
+from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
 from lecrapaud.feature_selection import load_train_data
 from lecrapaud.db import (
     Model,
     ModelSelection,
-    ModelTraining,
-    Score,
+    ModelSelectionScore,
     Target,
     Experiment,
 )
 
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
+# Suppress XGBoost and LightGBM logging
+import logging
+
+logging.getLogger("lightgbm").setLevel(logging.ERROR)
+logging.getLogger("xgboost").setLevel(logging.ERROR)
+
+# Set global verbosity for XGBoost
+xgb.set_config(verbosity=0)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
 # Reproducible result
 keras.utils.set_random_seed(42)
 np.random.seed(42)
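The new `LECRAPAUD_OPTIMIZATION_BACKEND` flag imported above drives the backend dispatch added to `hyperoptimize` further down. Its definition is not part of this diff; a minimal sketch of how such a switch is plausibly wired (an assumption, not the actual `lecrapaud.config` source):

```python
# Hypothetical sketch only; the real lecrapaud.config is not shown in this diff.
import os

# "hyperopt" selects the standalone HyperOpt path; any other value falls back to Ray Tune.
LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "ray")
```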
@@ -110,6 +125,63 @@ def test_hardware():
 warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 
 
+class CatBoostWrapper:
+    """
+    Transparent proxy for a CatBoost model that accepts arbitrary keyword arguments
+    as direct attributes, while forwarding all method calls and properties.
+    """
+
+    __slots__ = ("_model", "_extra_attrs")
+
+    def __init__(self, model, **kwargs):
+        object.__setattr__(self, "_model", model)
+        object.__setattr__(self, "_extra_attrs", {})
+        # Register kwargs as direct attributes
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    # ---- Transparent access ----
+    def __getattr__(self, name):
+        """Forward attribute access to the underlying model if not found."""
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            return getattr(model, name)
+        extra_attrs = object.__getattribute__(self, "_extra_attrs")
+        if name in extra_attrs:
+            return extra_attrs[name]
+        raise AttributeError(f"{type(self).__name__!r} has no attribute {name!r}")
+
+    def __setattr__(self, name, value):
+        """Set to wrapper or forward to model when appropriate."""
+        if name in CatBoostWrapper.__slots__:
+            object.__setattr__(self, name, value)
+            return
+
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            setattr(model, name, value)
+        else:
+            extra_attrs = object.__getattribute__(self, "_extra_attrs")
+            extra_attrs[name] = value
+
+    def __dir__(self):
+        """Merge dir() from wrapper, model, and custom attributes."""
+        base = set(super().__dir__())
+        model_attrs = set(dir(object.__getattribute__(self, "_model")))
+        extra_attrs = set(object.__getattribute__(self, "_extra_attrs").keys())
+        return sorted(base | model_attrs | extra_attrs)
+
+    def __repr__(self):
+        model = object.__getattribute__(self, "_model")
+        extras = object.__getattribute__(self, "_extra_attrs")
+        return f"CatBoostWrapper(model={model.__class__.__name__}, extras={extras})"
+
+    @property
+    def model(self):
+        """Access the raw CatBoost model."""
+        return object.__getattribute__(self, "_model")
+
+
 class ModelEngine:
 
     def __init__(
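`CatBoostWrapper` lets the engine attach metadata such as `model_name` without touching the CatBoost object itself: reads and writes fall through to the wrapped model when it has the attribute, and land in `_extra_attrs` otherwise. A minimal usage sketch (assuming `catboost` is installed and that `CatBoostClassifier` defines neither `model_name` nor `target_type`):

```python
# Usage sketch for the CatBoostWrapper proxy defined above.
from catboost import CatBoostClassifier

wrapped = CatBoostWrapper(
    CatBoostClassifier(iterations=10, verbose=0),
    model_name="catboost",         # unknown to CatBoost, so stored in _extra_attrs
    target_type="classification",
)
print(wrapped.model_name)    # "catboost", served by __getattr__ from _extra_attrs
print(wrapped.get_params())  # forwarded to the underlying CatBoost model
print(wrapped.model)         # the raw CatBoost model, via the `model` property
```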
@@ -296,12 +368,15 @@ class ModelEngine:
         )
 
         # Attach metadata for consistency with sklearn path
-        model.model_name = self.model_name
-        model.target_type = self.target_type
-        logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+        model_wrapped = CatBoostWrapper(
+            model, model_name=self.model_name, target_type=self.target_type
+        )
+        logger.info(
+            f"Successfully created a {model_wrapped.model_name} at {datetime.now()}"
+        )
 
-        self._model = model
-        return model
+        self._model = model_wrapped
+        return model_wrapped
 
     def fit_boosting(self, x_train, y_train, x_val, y_val, params):
         """
@@ -350,6 +425,7 @@ class ModelEngine:
                     "metric": eval_metric,
                     "num_class": num_class,
                     "verbose": -1,
+                    "verbose_eval": False,
                 },
                 num_boost_round=params["num_boost_round"],
                 train_set=train_data,
@@ -361,6 +437,7 @@ class ModelEngine:
                     ),
                     lgb.record_evaluation(evals_result),
                     tensorboard_callback,
+                    lgb.log_evaluation(period=0),  # Disable evaluation logging
                 ],
             )
         else:
@@ -402,7 +479,7 @@ class ModelEngine:
                 if self.target_type == "regression"
                 else ("logloss" if num_class <= 2 else "mlogloss")
             )
-            xgb.set_config(verbosity=0)
+            # XGBoost verbosity already set globally
             model = xgb.train(
                 params={
                     **params["model_params"],
@@ -417,11 +494,11 @@ class ModelEngine:
                     xgb.callback.EarlyStopping(
                         rounds=params["early_stopping_rounds"], save_best=True
                     ),
-                    xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
+                    # Removed EvaluationMonitor to suppress logs
                     tensorboard_callback,
                 ],
                 evals_result=evals_result,  # Record evaluation result
-                verbose_eval=10000,
+                verbose_eval=False,  # Disable evaluation logging
             )
 
             model.model_name = self.create_model
@@ -686,6 +763,171 @@ class ModelEngine:
         )
 
 
+def trainable_cv(
+    params,
+    x_train,
+    y_train,
+    x_val,
+    y_val,
+    model_name,
+    target_type,
+    experiment_name,
+    target_number,
+    create_model,
+    n_splits=3,
+    plot=False,
+    log_dir=None,
+    target_clf_thresholds: dict = None,
+    time_series=True,
+    recurrent=False,
+):
+    """Cross-validation version of trainable for hyperopt.
+
+    Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
+    Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
+    """
+    # Combine train and validation data for cross-validation
+    if recurrent:
+        x_train_val = np.concatenate([x_train, x_val], axis=0)
+        y_train_val = np.concatenate([y_train, y_val], axis=0)
+    else:
+        x_train_val = pd.concat([x_train, x_val], axis=0)
+        y_train_val = pd.concat([y_train, y_val], axis=0)
+        # Store original index for later use if needed
+        original_index = x_train_val.index.copy()
+        # Reset index for proper iloc indexing with CV splits
+        x_train_val = x_train_val.reset_index(drop=True)
+        y_train_val = y_train_val.reset_index(drop=True)
+
+    # Choose appropriate cross-validation splitter
+    if time_series:
+        # Time series split for temporal data
+        n_samples = len(x_train_val)
+        test_size = int(n_samples / (n_splits + 1))  # Ensure reasonable test size
+        cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+    else:
+        # Stratified or regular K-fold for i.i.d. data
+        if target_type == "classification":
+            cv_splitter = StratifiedKFold(
+                n_splits=n_splits, shuffle=True, random_state=42
+            )
+        else:
+            cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+    # Store all predictions and true values for pooled metrics
+    all_predictions = []
+    all_y_true = []
+    fold_times = []
+
+    # Get splits based on the CV strategy
+    if time_series or target_type == "regression":
+        splits = cv_splitter.split(x_train_val)
+    else:
+        # For stratified split, we need to pass y
+        if recurrent:
+            # Extract the target from the 2D array (first column is target)
+            y_for_split = y_train_val[:, 0]
+        else:
+            y_for_split = y_train_val
+        splits = cv_splitter.split(x_train_val, y_for_split)
+
+    for fold_idx, (train_idx, val_idx) in enumerate(splits):
+        # Extract fold data
+        if recurrent:
+            x_fold_train = x_train_val[train_idx]
+            y_fold_train = y_train_val[train_idx]
+            x_fold_val = x_train_val[val_idx]
+            y_fold_val = y_train_val[val_idx]
+        else:
+            x_fold_train = x_train_val.iloc[train_idx]
+            y_fold_train = y_train_val.iloc[train_idx]
+            x_fold_val = x_train_val.iloc[val_idx]
+            y_fold_val = y_train_val.iloc[val_idx]
+
+        # Train model for this fold
+        model = ModelEngine(
+            model_name=model_name,
+            target_type=target_type,
+            target_number=target_number,
+            create_model=create_model,
+            plot=False,  # Disable individual fold plots
+            log_dir=log_dir,
+        )
+
+        if recurrent:
+            timesteps = params["timesteps"]
+            x_fold_train = x_fold_train[:, -timesteps:, :]
+            x_fold_val = x_fold_val[:, -timesteps:, :]
+
+        # Fit model
+        model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
+
+        # Get predictions
+        y_pred = model.predict(x_fold_val)
+
+        # Handle recurrent model indexing
+        if recurrent:
+            y_fold_val = pd.DataFrame(
+                y_fold_val, columns=["TARGET", "index"]
+            ).set_index("index")
+            y_pred.index = y_fold_val.index
+
+        # Store predictions and true values
+        all_predictions.append(y_pred)
+        all_y_true.append(y_fold_val)
+
+    # Concatenate all fold predictions
+    if target_type == "classification":
+        # For classification, we need to handle probability columns
+        all_pred_df = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        # Ensure we have a DataFrame with TARGET column
+        if isinstance(all_y_series, pd.Series):
+            all_y_df = pd.DataFrame({"TARGET": all_y_series})
+        else:
+            all_y_df = all_y_series
+    else:
+        # For regression, just concatenate the predictions
+        all_pred_series = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        all_pred_df = pd.DataFrame({"PRED": all_pred_series})
+        all_y_df = pd.DataFrame({"TARGET": all_y_series})
+
+    # Create combined prediction DataFrame
+    prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
+
+    # Calculate pooled metrics
+    score = {
+        "DATE": datetime.now(),
+        "MODEL_NAME": model_name,
+        "EVAL_DATA_STD": prediction["TARGET"].std(),
+    }
+
+    # Unscale if needed (for regression with scaling)
+    if (
+        model.need_scaling
+        and target_type == "regression"
+        and model.scaler_y is not None
+    ):
+        prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+            prediction[["TARGET"]].values
+        )
+        prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+            prediction[["PRED"]].values
+        )
+
+    # Evaluate with pooled predictions
+    score.update(evaluate(prediction, target_type, target_clf_thresholds))
+
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
+
+    # Report to Ray if in Ray context
+    if session.get_session():
+        session.report(metrics=score)
+    return score
+
+
 def trainable(
     params,
     x_train,
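Note the scoring strategy in `trainable_cv`: instead of averaging one metric per fold, it concatenates every fold's out-of-fold predictions and computes a single pooled metric. A self-contained sketch of that pooling idea on toy data (scikit-learn only, not lecrapaud code):

```python
# Pooled out-of-fold metric: one logloss over all concatenated fold predictions.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

rng = np.random.default_rng(42)
X = rng.random((200, 5))
y = (X[:, 0] > 0.5).astype(int)

oof = np.zeros(len(y))  # out-of-fold probabilities, filled fold by fold
for train_idx, val_idx in KFold(n_splits=3, shuffle=True, random_state=42).split(X):
    clf = LogisticRegression().fit(X[train_idx], y[train_idx])
    oof[val_idx] = clf.predict_proba(X[val_idx])[:, 1]

print("pooled logloss:", log_loss(y, oof))  # single metric, not a mean of fold metrics
```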
@@ -697,7 +939,6 @@ def trainable(
     experiment_name,
     target_number,
     create_model,
-    type_name="hyperopts",
     plot=False,
     log_dir=None,
     target_clf_thresholds: dict = None,
@@ -723,9 +964,7 @@ def trainable(
         x_val = x_val[:, -timesteps:, :]
 
     # Compile and fit model on train set
-    start = time.time()
     model.fit(x_train, y_train, x_val, y_val, params)
-    stop = time.time()
 
     # Prediction on val set
     y_pred = model.predict(x_val)
@@ -755,8 +994,6 @@ def trainable(
     score = {
         "DATE": datetime.now(),
         "MODEL_NAME": model.model_name,
-        "TYPE": type_name,
-        "TRAINING_TIME": stop - start,
         "EVAL_DATA_STD": prediction["TARGET"].std(),
     }
 
@@ -765,7 +1002,8 @@ def trainable(
     metric = "RMSE" if target_type == "regression" else "LOGLOSS"
     logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
 
-    if type_name == "hyperopts":
+    # Report to Ray if in Ray context
+    if session.get_session():
         session.report(metrics=score)
     return score
 
@@ -823,7 +1061,7 @@ class ModelSelectionEngine:
         experiment_name,
         perform_hyperopt=True,
         number_of_trials=20,
-        perform_crossval=False,
+        perform_crossval=False,  # This controls CV during hyperopt, not after
         plot=True,
         clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
         preserve_model=True,
@@ -836,6 +1074,7 @@ class ModelSelectionEngine:
        self.experiment_name = experiment_name
        self.plot = plot
        self.number_of_trials = number_of_trials
+       self.perform_crossval = perform_crossval
 
        if self.experiment_id is None:
            raise ValueError("Please provide a experiment.")
@@ -885,12 +1124,11 @@ class ModelSelectionEngine:
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
         model_selection = ModelSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
 
-        # recurrent models starts at 9 # len(list_models)
+        # STEP 1 : TRAINING MODELS
         for i in self.models_idx:
             config = all_models[i]
             recurrent = config["recurrent"]
@@ -908,19 +1146,11 @@ class ModelSelectionEngine:
             elif perform_hyperopt:
                 clean_directory(self.results_dir)
 
-            logger.info(f"Training a {model_name}")
-            model = Model.upsert(
-                match_fields=["name", "type"],
-                name=model_name,
-                type=self.target_type,
-            )
-            model_training = ModelTraining.upsert(
-                match_fields=["model_id", "model_selection_id"],
-                model_id=model.id,
-                model_selection_id=model_selection.id,
+            logger.info(
+                f"{experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
-            # getting data
+            # Getting data
             if recurrent:
                 # Clear cluster from previous Keras session graphs.
                 K.clear_session()
@@ -930,7 +1160,7 @@ class ModelSelectionEngine:
                     for i, e in enumerate(self.all_features)
                     if e in set(self.features)
                 ]
-                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
+                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
                 x_train = x_train_reshaped[:, :, features_idx]
                 y_train = y_train_reshaped[:, [self.target_number, 0]]
                 x_val = x_val_reshaped[:, :, features_idx]
@@ -960,7 +1190,8 @@ class ModelSelectionEngine:
             y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
 
             log_dir = get_log_dir(self.target_dir, model_name)
-            # instantiate model
+
+            # Instantiate model
             model = ModelEngine(
                 target_number=self.target_number,
                 model_name=model_name,
@@ -971,8 +1202,8 @@ class ModelSelectionEngine:
                 log_dir=log_dir,
             )
 
-            start = time.time()
             # Tuning hyperparameters
+            start = time.time()
             if perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
@@ -989,7 +1220,7 @@ class ModelSelectionEngine:
                     f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
                 )
 
-            # save best params
+            # Save best params
             best_params_file = f"{self.target_dir}/best_params.json"
             try:
                 with open(best_params_file, "r") as f:
@@ -1001,114 +1232,25 @@ class ModelSelectionEngine:
             with open(best_params_file, "w") as f:
                 json.dump(json_dict, f, indent=4)
 
-            # Perform cross-validation of the best model on k-folds of train + val set
-            if perform_crossval:
-                x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
-                y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
-                n_splits = 4
-                n_samples = len(x_train_val)
-                test_size = int(n_samples / (n_splits + 4))
-                tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
-
-                # Store the scores
-                cv_scores = []
-
-                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                    self.type_name = f"crossval_fold_{i}"
-
-                    if self.time_series:
-                        date_series = pd.concat(
-                            [
-                                train[self.date_column],
-                                val[self.date_column],
-                                test[self.date_column],
-                            ],
-                            axis=0,
-                        ).reset_index(drop=True)
-
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
-
-                        # Now you can use the actual train/val indices to extract ranges
-                        train_start = date_series.iloc[train_index[0]]
-                        train_end = date_series.iloc[train_index[-1]]
-                        val_start = date_series.iloc[val_index[0]]
-                        val_end = date_series.iloc[val_index[-1]]
-
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                        )
-                    else:
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                        )
-
-                    # Train the model and get the score
-                    if recurrent:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val[train_index],
-                            y_train=y_train_val[train_index],
-                            x_val=x_train_val[val_index],
-                            y_val=y_train_val[val_index],
-                            model=model,
-                        )
-                    else:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val.iloc[train_index],
-                            y_train=y_train_val.iloc[train_index],
-                            x_val=x_train_val.iloc[val_index],
-                            y_val=y_train_val.iloc[val_index],
-                            model=model,
-                        )
-
-                    # Append score to the list
-                    cv_scores.append(cv_score)
-
-                # Calculate mean of all numerical metrics across all cross-validation folds
-                cv_scores_df = pd.DataFrame(cv_scores)
-                # Get mean of all numeric columns
-                cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+            # Always evaluate on test set (no cross-validation here)
+            # The hyperopt already did CV if needed to find best params
+            best_score, best_model, best_pred = self.train_model(
+                params=model_best_params,
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
+                model=model,
+            )
+            stop = time.time()
+            training_time = stop - start
 
-                logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
-                for metric, value in cv_means.items():
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+            logger.info(f"👉 {model.model_name} scores on test set:")
+            for metric, value in best_score.items():
+                if isinstance(value, (int, float)):
                     logger.info(f" {metric}: {value:.4f}")
 
-                # Retrain on entire training set, but keep score on cross-validation folds
-                # Get the test score using the best model
-                test_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                # Update all metrics with cross-validation means
-                for metric, value in cv_means.items():
-                    if metric in test_score:  # Only update existing metrics
-                        test_score[metric] = value
-                best_score = test_score
-                best_score["TYPE"] = "crossval"
-            else:
-                # Evaluate on test set
-                self.type_name = "testset"
-                best_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                logger.info(f"👉 {model.model_name} scores on test set:")
-                for metric, value in best_score.items():
-                    if isinstance(value, (int, float)):
-                        logger.info(f" {metric}: {value:.4f}")
-
             # Save predictions
             best_pred.to_csv(
                 f"{self.results_dir}/prediction.csv",
@@ -1119,7 +1261,6 @@ class ModelSelectionEngine:
 
             # Save best model
             model_path = best_model.save(self.results_dir)
-
             model_path = Path(model_path).resolve()
             best_score["MODEL_PATH"] = model_path
 
@@ -1142,32 +1283,26 @@ class ModelSelectionEngine:
             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
             scores_tracking.to_csv(scores_tracking_path, index=False)
 
-            # Save model training metadata
-            stop = time.time()
-            training_time = stop - start
-            model_training.best_params = model_best_params
-            model_training.model_path = model_path
-            model_training.training_time = training_time
-            model_training.save()
-
-            # Store metrics in DB
+            # Save in db
             drop_cols = [
                 "DATE",
                 "MODEL_NAME",
-                "MODEL_PATH",
             ]
             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
             score_data = {k.lower(): v for k, v in best_score.items()}
-
-            Score.upsert(
-                match_fields=["model_training_id"],
-                model_training_id=model_training.id,
+            model = Model.upsert(
+                name=model_name,
+                type=self.target_type,
+            )
+            ModelSelectionScore.upsert(
+                model_id=model.id,
+                model_selection_id=model_selection.id,
+                best_params=serialize_for_json(model_best_params),
+                training_time=training_time,
                 **score_data,
             )
 
-            logger.info(f"Model training finished in {training_time:.2f} seconds")
-
-        # find best model type
+        # STEP 2 :FINDING BEST MODEL OVERALL
         scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         scores_tracking = pd.read_csv(scores_tracking_path)
         best_score_overall = scores_tracking.iloc[0, :]
@@ -1178,12 +1313,11 @@ class ModelSelectionEngine:
         else:
             best_thresholds = None
 
-        # Remove any .best or .keras files
+        # Remove any .best or .keras files, and save best model in target_dir
        for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
            os.path.join(self.target_dir, "*.keras")
        ):
            os.remove(file_path)
-        # Copy the best model in root training folder for this target
        best_model_path = Path(
            f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
        ).resolve()
@@ -1195,13 +1329,13 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # Save model_selection results to db
-
+        # Save to db
         model_selection = ModelSelection.get(model_selection.id)
-        model_selection.best_model_id = Model.find_by(
+        model = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
-        ).id
-        model_selection.best_model_params = best_model_params
+        )
+        model_selection.best_model_id = model.id
+        model_selection.best_model_params = serialize_for_json(best_model_params)
         model_selection.best_thresholds = best_thresholds
         model_selection.best_model_path = best_model_path
 
@@ -1226,7 +1360,169 @@ class ModelSelectionEngine:
         return best_model
 
     def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
-        self.type_name = "hyperopts"
+        """Choose between Ray Tune and HyperOpt standalone based on configuration."""
+        if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
+            return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
+        else:
+            return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+
+    def hyperoptimize_hyperopt(
+        self, x_train, y_train, x_val, y_val, model: ModelEngine
+    ):
+        """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
+
+        logger.info("Start tuning hyperparameters with HyperOpt standalone...")
+
+        # Convert Ray search space to HyperOpt search space
+        def convert_search_space(ray_space):
+            """Convert Ray Tune search space to HyperOpt format."""
+            from ray.tune.search.sample import Categorical, Float, Integer
+
+            hp_space = {}
+            for key, value in ray_space.items():
+                if isinstance(value, Float):
+                    if (
+                        hasattr(value, "sampler")
+                        and value.sampler.__class__.__name__ == "LogUniform"
+                    ):
+                        # LogUniform distribution
+                        hp_space[key] = hp.loguniform(
+                            key, np.log(value.lower), np.log(value.upper)
+                        )
+                    else:
+                        # Uniform distribution
+                        hp_space[key] = hp.uniform(key, value.lower, value.upper)
+                elif isinstance(value, Integer):
+                    # Integer uniform distribution
+                    hp_space[key] = hp.randint(key, value.lower, value.upper)
+                elif isinstance(value, Categorical):
+                    # Categorical/choice distribution
+                    hp_space[key] = hp.choice(key, value.categories)
+                elif isinstance(value, dict):
+                    # Nested dict, recurse
+                    hp_space[key] = convert_search_space(value)
+                else:
+                    # Static value or unknown type
+                    hp_space[key] = value
+            return hp_space
+
+        # Create objective function for HyperOpt
+        def objective(params):
+            """Objective function to minimize."""
+            try:
+                # Convert numpy types to native Python types
+                params = serialize_for_json(params)
+
+                # Use existing trainable function based on perform_crossval
+                if self.perform_crossval:
+                    score = trainable_cv(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        n_splits=3,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                        time_series=self.time_series,
+                        recurrent=model.recurrent,
+                    )
+                else:
+                    score, _, _ = trainable(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                    )
+
+                # HyperOpt minimizes, so return the metric directly
+                loss = score[self.metric]
+
+                # Log trial info
+                logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
+
+                return {
+                    "loss": loss,
+                    "status": STATUS_OK,
+                    "score": score,  # Keep full score dict for analysis
+                }
+
+            except Exception as e:
+                logger.error(f"Trial failed: {str(e)}")
+                return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
+
+        # Convert search space
+        hp_search_space = convert_search_space(model.search_params)
+
+        # Run optimization
+        trials = Trials()
+        best_params = fmin(
+            fn=objective,
+            space=hp_search_space,
+            algo=tpe.suggest,
+            max_evals=self.number_of_trials,
+            trials=trials,
+            verbose=True,
+            show_progressbar=True,
+        )
+
+        # Get the actual parameter values (not just indices for hp.choice)
+        best_params = space_eval(hp_search_space, best_params)
+
+        # Convert numpy types to native Python types
+        best_params = serialize_for_json(best_params)
+
+        # Get best score from trials
+        best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
+        best_score = trials.trials[best_trial_idx]["result"].get("score", {})
+
+        # Log results
+        logger.info(f"Best hyperparameters found were:\n{best_params}")
+        logger.info(f"Best Scores found were:\n{best_score}")
+
+        # Create summary DataFrame for consistency with Ray version
+        results_df = pd.DataFrame(
+            [
+                {
+                    "trial_id": i,
+                    self.metric: t["result"]["loss"],
+                    **{
+                        k: v
+                        for k, v in t["result"].get("score", {}).items()
+                        if isinstance(v, (int, float))
+                    },
+                }
+                for i, t in enumerate(trials.trials)
+                if t["result"]["status"] == STATUS_OK
+            ]
+        )
+
+        if not results_df.empty:
+            logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
+
+        # Save trial history for analysis
+        trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
+        with open(trials_path, "wb") as f:
+            pickle.dump(trials, f)
+
+        return best_params
+
+    def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: ModelEngine):
 
         def collect_error_logs(target_dir: int, storage_path: str):
             output_error_file = f"{target_dir}/errors.log"
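For the new HyperOpt path, the essentials are `fmin` with TPE over the converted space and `space_eval` to turn `hp.choice` indices back into values, mirroring the code above. A standalone sketch with an illustrative search space and a stand-in objective:

```python
# Standalone HyperOpt sketch; search-space keys and the objective are illustrative.
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe

space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-1)),
    "max_depth": hp.randint("max_depth", 3, 12),
    "booster": hp.choice("booster", ["gbtree", "dart"]),
}

def objective(params):
    # Stand-in loss; the real objective trains a model and returns its metric.
    return {"loss": params["learning_rate"] * params["max_depth"], "status": STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
print(space_eval(space, best))  # resolves hp.choice indices to the chosen values
```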
@@ -1269,9 +1565,22 @@ class ModelSelectionEngine:
             }
         )
 
+        # Choose between regular trainable or CV version based on perform_crossval flag
+        # perform_crossval controls whether to use CV during hyperopt
+        if self.perform_crossval:
+            trainable_fn = trainable_cv
+            additional_params = {
+                "n_splits": 3,  # Can be made configurable
+                "time_series": self.time_series,  # Controls whether to use TimeSeriesSplit or StratifiedKFold
+                "recurrent": model.recurrent,
+            }
+        else:
+            trainable_fn = trainable
+            additional_params = {}
+
         tuner = Tuner(
             trainable=with_parameters(
-                trainable,
+                trainable_fn,
                 x_train=x_train,
                 y_train=y_train,
                 x_val=x_val,
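These extra keyword arguments reach `trainable_cv` through Ray's `tune.with_parameters`, which pins fixed arguments onto the trainable while the sampled hyperparameters arrive as `config`. A minimal sketch of that mechanism (illustrative names, not lecrapaud code):

```python
# Sketch of tune.with_parameters: `data` is pinned, `config` is sampled per trial.
from ray import tune
from ray.air import session

def my_trainable(config, data=None):
    # Stand-in objective; a real trainable would fit a model here.
    session.report(metrics={"score": config["lr"] * len(data)})

tuner = tune.Tuner(
    tune.with_parameters(my_trainable, data=list(range(100))),
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(metric="score", mode="min", num_samples=5),
)
tuner.fit()
```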
@@ -1281,10 +1590,10 @@ class ModelSelectionEngine:
                 experiment_name=self.experiment_name,
                 target_number=self.target_number,
                 create_model=model.create_model,
-                type_name="hyperopts",
                 plot=model.plot,
                 log_dir=model.log_dir,
                 target_clf_thresholds=self.target_clf_thresholds,
+                **additional_params,
             ),
             param_space=model.search_params,
             tune_config=TuneConfig(
@@ -1338,7 +1647,6 @@ class ModelSelectionEngine:
                     self.experiment_name,
                     self.target_number,
                     model.create_model,
-                    self.type_name,
                     model.plot,
                     log_dir=model.log_dir,
                     target_clf_thresholds=self.target_clf_thresholds,