lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
  5. lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
  6. lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
  7. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  8. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  9. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  10. lecrapaud/db/models/__init__.py +2 -4
  11. lecrapaud/db/models/base.py +122 -67
  12. lecrapaud/db/models/experiment.py +196 -183
  13. lecrapaud/db/models/feature_selection.py +0 -3
  14. lecrapaud/db/models/feature_selection_rank.py +0 -18
  15. lecrapaud/db/models/model_selection.py +2 -2
  16. lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
  17. lecrapaud/db/session.py +33 -4
  18. lecrapaud/experiment.py +44 -17
  19. lecrapaud/feature_engineering.py +45 -674
  20. lecrapaud/feature_preprocessing.py +1202 -0
  21. lecrapaud/feature_selection.py +145 -332
  22. lecrapaud/integrations/sentry_integration.py +46 -0
  23. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  24. lecrapaud/mixins.py +247 -0
  25. lecrapaud/model_preprocessing.py +295 -0
  26. lecrapaud/model_selection.py +725 -249
  27. lecrapaud/pipeline.py +548 -0
  28. lecrapaud/search_space.py +38 -1
  29. lecrapaud/utils.py +36 -3
  30. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  31. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  32. {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  33. {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  34. lecrapaud/db/models/model_training.py +0 -64
  35. lecrapaud/jobs/__init__.py +0 -13
  36. lecrapaud/jobs/config.py +0 -17
  37. lecrapaud/jobs/scheduler.py +0 -30
  38. lecrapaud/jobs/tasks.py +0 -17
  39. lecrapaud-0.18.7.dist-info/METADATA +0 -248
  40. lecrapaud-0.18.7.dist-info/RECORD +0 -46
@@ -14,10 +14,8 @@ import pickle
14
14
  from pydantic import BaseModel
15
15
  import ast
16
16
 
17
- os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
18
-
19
17
  # ML models
20
- from sklearn.model_selection import TimeSeriesSplit
18
+ from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
21
19
  from sklearn.calibration import CalibratedClassifierCV
22
20
  from sklearn.metrics import (
23
21
  mean_absolute_percentage_error,
@@ -57,28 +55,45 @@ from tensorboardX import SummaryWriter
57
55
 
58
56
  # Optimization
59
57
  import ray
60
- from ray.tune import Tuner, TuneConfig, with_parameters
61
- from ray.train import RunConfig
58
+ from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
62
59
  from ray.tune.search.hyperopt import HyperOptSearch
63
60
  from ray.tune.search.bayesopt import BayesOptSearch
64
61
  from ray.tune.logger import TBXLoggerCallback
65
62
  from ray.tune.schedulers import ASHAScheduler
66
63
  from ray.air import session
67
64
 
65
+ # HyperOpt standalone
66
+ from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
67
+
68
68
  # Internal library
69
69
  from lecrapaud.search_space import all_models
70
70
  from lecrapaud.directories import clean_directory
71
71
  from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
72
- from lecrapaud.config import PYTHON_ENV
72
+ from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
73
73
  from lecrapaud.feature_selection import load_train_data
74
74
  from lecrapaud.db import (
75
75
  Model,
76
76
  ModelSelection,
77
- ModelTraining,
78
- Score,
77
+ ModelSelectionScore,
79
78
  Target,
80
79
  Experiment,
81
80
  )
81
+ from lecrapaud.mixins import LeCrapaudEstimatorMixin
82
+
83
+ os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
84
+
85
+ # Suppress XGBoost and LightGBM logging
86
+ import logging
87
+
88
+ logging.getLogger("lightgbm").setLevel(logging.ERROR)
89
+ logging.getLogger("xgboost").setLevel(logging.ERROR)
90
+
91
+ # Set global verbosity for XGBoost
92
+ xgb.set_config(verbosity=0)
93
+
94
+ # Suppress warnings
95
+ warnings.filterwarnings("ignore", category=UserWarning)
96
+ warnings.filterwarnings("ignore", category=FutureWarning)
82
97
 
83
98
  # Reproducible result
84
99
  keras.utils.set_random_seed(42)
@@ -110,7 +125,64 @@ def test_hardware():
110
125
  warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
111
126
 
112
127
 
113
- class ModelEngine:
128
+ class CatBoostWrapper:
129
+ """
130
+ Transparent proxy for a CatBoost model that accepts arbitrary keyword arguments
131
+ as direct attributes, while forwarding all method calls and properties.
132
+ """
133
+
134
+ __slots__ = ("_model", "_extra_attrs")
135
+
136
+ def __init__(self, model, **kwargs):
137
+ object.__setattr__(self, "_model", model)
138
+ object.__setattr__(self, "_extra_attrs", {})
139
+ # Register kwargs as direct attributes
140
+ for key, value in kwargs.items():
141
+ setattr(self, key, value)
142
+
143
+ # ---- Transparent access ----
144
+ def __getattr__(self, name):
145
+ """Forward attribute access to the underlying model if not found."""
146
+ model = object.__getattribute__(self, "_model")
147
+ if hasattr(model, name):
148
+ return getattr(model, name)
149
+ extra_attrs = object.__getattribute__(self, "_extra_attrs")
150
+ if name in extra_attrs:
151
+ return extra_attrs[name]
152
+ raise AttributeError(f"{type(self).__name__!r} has no attribute {name!r}")
153
+
154
+ def __setattr__(self, name, value):
155
+ """Set to wrapper or forward to model when appropriate."""
156
+ if name in CatBoostWrapper.__slots__:
157
+ object.__setattr__(self, name, value)
158
+ return
159
+
160
+ model = object.__getattribute__(self, "_model")
161
+ if hasattr(model, name):
162
+ setattr(model, name, value)
163
+ else:
164
+ extra_attrs = object.__getattribute__(self, "_extra_attrs")
165
+ extra_attrs[name] = value
166
+
167
+ def __dir__(self):
168
+ """Merge dir() from wrapper, model, and custom attributes."""
169
+ base = set(super().__dir__())
170
+ model_attrs = set(dir(object.__getattribute__(self, "_model")))
171
+ extra_attrs = set(object.__getattribute__(self, "_extra_attrs").keys())
172
+ return sorted(base | model_attrs | extra_attrs)
173
+
174
+ def __repr__(self):
175
+ model = object.__getattribute__(self, "_model")
176
+ extras = object.__getattribute__(self, "_extra_attrs")
177
+ return f"CatBoostWrapper(model={model.__class__.__name__}, extras={extras})"
178
+
179
+ @property
180
+ def model(self):
181
+ """Access the raw CatBoost model."""
182
+ return object.__getattribute__(self, "_model")
183
+
184
+
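
Illustrative sketch (not part of the diff): how a transparent proxy like the CatBoostWrapper added above behaves in practice. The import path assumes these hunks come from lecrapaud/model_selection.py; the toy data and parameter values are assumptions, not part of the released package's documented API.

# Sketch: wrapping a CatBoost model so extra metadata travels with it.
import numpy as np
from catboost import CatBoostClassifier
from lecrapaud.model_selection import CatBoostWrapper  # assumed module path

raw = CatBoostClassifier(iterations=10, verbose=False)
raw.fit(np.random.rand(50, 3), np.random.randint(0, 2, 50))

wrapped = CatBoostWrapper(raw, model_name="catboost", target_type="classification")

wrapped.predict(np.random.rand(5, 3))  # forwarded to the underlying CatBoost model
print(wrapped.model_name)              # "catboost" -- the kwarg registered by the wrapper
print(wrapped.model)                   # the raw CatBoost estimator via the .model property

The kwargs passed to the constructor end up either on the underlying model (if it already has such an attribute) or in the wrapper's _extra_attrs dict, so attribute access stays transparent in both directions.
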
185
+ class BaseModel:
114
186
 
115
187
  def __init__(
116
188
  self,
@@ -157,8 +229,10 @@ class ModelEngine:
157
229
  def fit(self, *args):
158
230
  if self.recurrent:
159
231
  fit = self.fit_recurrent
160
- elif (self.create_model == "lgb") or (self.create_model == "xgb"):
232
+ elif (self.model_name == "lgb") or (self.model_name == "xgb"):
161
233
  fit = self.fit_boosting
234
+ elif self.model_name == "catboost":
235
+ fit = self.fit_catboost
162
236
  else:
163
237
  fit = self.fit_sklearn
164
238
  model = fit(*args)
@@ -201,17 +275,113 @@ class ModelEngine:
201
275
 
202
276
  return model
203
277
 
204
- def fit_boosting(self, x_train, y_train, x_val, y_val, params):
278
+ def fit_catboost(self, x_train, y_train, x_val, y_val, params):
205
279
  """
206
- This is using lightGBM or XGboost C++ librairies
280
+ Train CatBoost models with native early stopping and log metrics to TensorBoard.
281
+ Also supports plotting of the primary eval metric if self.plot is True.
207
282
  """
208
- lightGBM = self.create_model == "lgb"
283
+ # Prepare constructor parameters
284
+ ctor_params = dict(params) if params else {}
285
+ early_stopping_rounds = ctor_params.pop("early_stopping_rounds", None)
286
+ # Alias support: num_boost_round -> iterations
287
+ num_boost_round = ctor_params.pop("num_boost_round", None)
288
+ if num_boost_round is not None and "iterations" not in ctor_params:
289
+ ctor_params["iterations"] = num_boost_round
290
+
291
+ # Determine classification/regression setup
292
+ labels = np.unique(y_train)
293
+ num_class = (
294
+ labels.size
295
+ if self.target_type == "classification" and labels.size > 2
296
+ else 1
297
+ )
298
+
299
+ if self.target_type == "regression":
300
+ ctor_params.setdefault("loss_function", "RMSE")
301
+ eval_metric = ctor_params.get("eval_metric", "RMSE")
302
+ else:
303
+ if num_class <= 2:
304
+ ctor_params.setdefault("loss_function", "Logloss")
305
+ eval_metric = ctor_params.get("eval_metric", "Logloss")
306
+ else:
307
+ ctor_params.setdefault("loss_function", "MultiClass")
308
+ eval_metric = ctor_params.get("eval_metric", "MultiClass")
309
+ ctor_params.setdefault("eval_metric", eval_metric)
310
+
311
+ # Instantiate CatBoost model from provided constructor
312
+ model = self.create_model(**ctor_params, allow_writing_files=False)
313
+
314
+ # Train with eval_set and early stopping
315
+ logger.info(f"Fitting the model {self.model_name}...")
316
+ logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
317
+ logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
318
+
319
+ model.fit(
320
+ x_train,
321
+ y_train,
322
+ eval_set=[(x_val, y_val)],
323
+ use_best_model=True,
324
+ early_stopping_rounds=early_stopping_rounds,
325
+ verbose=False,
326
+ )
209
327
 
210
- # Experiments
211
- boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
212
- train_data = boosting_dataset(x_train, label=y_train)
213
- val_data = boosting_dataset(x_val, label=y_val)
328
+ # Retrieve evaluation results
329
+ evals_result = model.get_evals_result()
330
+ # CatBoost commonly uses 'learn' and 'validation' (or 'validation_0')
331
+ learn_key = "learn"
332
+ val_key = None
333
+ for k in evals_result.keys():
334
+ if k != learn_key:
335
+ val_key = k
336
+ break
337
+
338
+ # Ensure eval_metric exists; otherwise fallback to first available metric
339
+ if eval_metric not in evals_result.get(learn_key, {}):
340
+ if evals_result.get(learn_key):
341
+ eval_metric = next(iter(evals_result[learn_key].keys()))
342
+
343
+ # TensorBoard logging
344
+ writer = SummaryWriter(self.log_dir)
345
+ try:
346
+ # learn_scores = evals_result.get(learn_key, {}).get(eval_metric, [])
347
+ val_scores = (
348
+ evals_result.get(val_key, {}).get(eval_metric, []) if val_key else []
349
+ )
350
+ # for i, v in enumerate(learn_scores):
351
+ # writer.add_scalar(f"CatBoost/train/{eval_metric}", v, i)
352
+ for i, v in enumerate(val_scores):
353
+ writer.add_scalar(f"CatBoost/{eval_metric}", v, i)
354
+ finally:
355
+ writer.close()
214
356
 
357
+ # Optional plotting of training progress
358
+ if self.plot and eval_metric and learn_key in evals_result and val_key:
359
+ logs = {
360
+ "train": evals_result[learn_key].get(eval_metric, []),
361
+ "val": evals_result[val_key].get(eval_metric, []),
362
+ }
363
+ plot_training_progress(
364
+ logs=logs,
365
+ model_name=self.model_name,
366
+ target_number=self.target_number,
367
+ title_suffix=f"Training Progress - {eval_metric}",
368
+ )
369
+
370
+ # Attach metadata for consistency with sklearn path
371
+ model_wrapped = CatBoostWrapper(
372
+ model, model_name=self.model_name, target_type=self.target_type
373
+ )
374
+ logger.info(
375
+ f"Successfully created a {model_wrapped.model_name} at {datetime.now()}"
376
+ )
377
+
378
+ self._model = model_wrapped
379
+ return model_wrapped
380
+
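
Illustrative sketch (not part of the diff): the evals_result structure that fit_catboost above iterates over. With a single eval_set, CatBoost typically keys the dict with "learn" and "validation"; the toy data and parameter values below are assumptions.

# Sketch of the dict shape consumed by fit_catboost's TensorBoard loop.
import numpy as np
from catboost import CatBoostClassifier

x = np.random.rand(200, 4)
y = np.random.randint(0, 2, 200)

clf = CatBoostClassifier(
    iterations=50,
    loss_function="Logloss",
    eval_metric="Logloss",
    allow_writing_files=False,
    verbose=False,
)
clf.fit(
    x[:150], y[:150],
    eval_set=[(x[150:], y[150:])],
    use_best_model=True,
    early_stopping_rounds=10,
)

evals = clf.get_evals_result()
# Typically: {"learn": {"Logloss": [...]}, "validation": {"Logloss": [...]}}
val_key = next(k for k in evals if k != "learn")     # same lookup as in the hunk above
val_curve = evals[val_key]["Logloss"]                # per-iteration values logged to TensorBoard
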
381
+ def fit_boosting(self, x_train, y_train, x_val, y_val, params):
382
+ """
383
+ This uses the LightGBM or XGBoost C++ libraries
384
+ """
215
385
  # Create a TensorBoardX writer
216
386
  writer = SummaryWriter(self.log_dir)
217
387
  evals_result = {}
@@ -223,11 +393,13 @@ class ModelEngine:
223
393
  if self.target_type == "classification" and labels.size > 2
224
394
  else 1
225
395
  )
226
- logger.info("Fitting the model...")
396
+ logger.info(f"Fitting the model {self.model_name}...")
227
397
  logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
228
398
  logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
229
399
 
230
- if lightGBM:
400
+ if self.model_name == "lgb":
401
+ train_data = lgb.Dataset(x_train, label=y_train)
402
+ val_data = lgb.Dataset(x_val, label=y_val)
231
403
 
232
404
  def tensorboard_callback(env):
233
405
  for i, metric in enumerate(env.evaluation_result_list):
@@ -252,18 +424,25 @@ class ModelEngine:
252
424
  "objective": loss,
253
425
  "metric": eval_metric,
254
426
  "num_class": num_class,
427
+ "verbose": -1,
428
+ "verbose_eval": False,
255
429
  },
256
430
  num_boost_round=params["num_boost_round"],
257
431
  train_set=train_data,
258
432
  valid_sets=[train_data, val_data],
259
433
  valid_names=["train", "val"],
260
434
  callbacks=[
261
- lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
435
+ lgb.early_stopping(
436
+ stopping_rounds=params["early_stopping_rounds"], verbose=False
437
+ ),
262
438
  lgb.record_evaluation(evals_result),
263
439
  tensorboard_callback,
440
+ lgb.log_evaluation(period=0), # Disable evaluation logging
264
441
  ],
265
442
  )
266
443
  else:
444
+ train_data = xgb.DMatrix(x_train, label=y_train)
445
+ val_data = xgb.DMatrix(x_val, label=y_val)
267
446
 
268
447
  class TensorBoardCallback(xgb.callback.TrainingCallback):
269
448
 
@@ -300,6 +479,7 @@ class ModelEngine:
300
479
  if self.target_type == "regression"
301
480
  else ("logloss" if num_class <= 2 else "mlogloss")
302
481
  )
482
+ # XGBoost verbosity already set globally
303
483
  model = xgb.train(
304
484
  params={
305
485
  **params["model_params"],
@@ -314,11 +494,11 @@ class ModelEngine:
314
494
  xgb.callback.EarlyStopping(
315
495
  rounds=params["early_stopping_rounds"], save_best=True
316
496
  ),
317
- xgb.callback.EvaluationMonitor(), # This shows evaluation results at each iteration
497
+ # Removed EvaluationMonitor to suppress logs
318
498
  tensorboard_callback,
319
499
  ],
320
500
  evals_result=evals_result, # Record evaluation result
321
- verbose_eval=0,
501
+ verbose_eval=False, # Disable evaluation logging
322
502
  )
323
503
 
324
504
  model.model_name = self.create_model
@@ -583,6 +763,171 @@ class ModelEngine:
583
763
  )
584
764
 
585
765
 
766
+ def trainable_cv(
767
+ params,
768
+ x_train,
769
+ y_train,
770
+ x_val,
771
+ y_val,
772
+ model_name,
773
+ target_type,
774
+ experiment_name,
775
+ target_number,
776
+ create_model,
777
+ n_splits=3,
778
+ plot=False,
779
+ log_dir=None,
780
+ target_clf_thresholds: dict = None,
781
+ time_series=True,
782
+ recurrent=False,
783
+ ):
784
+ """Cross-validation version of trainable for hyperopt.
785
+
786
+ Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
787
+ Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
788
+ """
789
+ # Combine train and validation data for cross-validation
790
+ if recurrent:
791
+ x_train_val = np.concatenate([x_train, x_val], axis=0)
792
+ y_train_val = np.concatenate([y_train, y_val], axis=0)
793
+ else:
794
+ x_train_val = pd.concat([x_train, x_val], axis=0)
795
+ y_train_val = pd.concat([y_train, y_val], axis=0)
796
+ # Store original index for later use if needed
797
+ original_index = x_train_val.index.copy()
798
+ # Reset index for proper iloc indexing with CV splits
799
+ x_train_val = x_train_val.reset_index(drop=True)
800
+ y_train_val = y_train_val.reset_index(drop=True)
801
+
802
+ # Choose appropriate cross-validation splitter
803
+ if time_series:
804
+ # Time series split for temporal data
805
+ n_samples = len(x_train_val)
806
+ test_size = int(n_samples / (n_splits + 1)) # Ensure reasonable test size
807
+ cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
808
+ else:
809
+ # Stratified or regular K-fold for i.i.d. data
810
+ if target_type == "classification":
811
+ cv_splitter = StratifiedKFold(
812
+ n_splits=n_splits, shuffle=True, random_state=42
813
+ )
814
+ else:
815
+ cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
816
+
817
+ # Store all predictions and true values for pooled metrics
818
+ all_predictions = []
819
+ all_y_true = []
820
+ fold_times = []
821
+
822
+ # Get splits based on the CV strategy
823
+ if time_series or target_type == "regression":
824
+ splits = cv_splitter.split(x_train_val)
825
+ else:
826
+ # For stratified split, we need to pass y
827
+ if recurrent:
828
+ # Extract the target from the 2D array (first column is target)
829
+ y_for_split = y_train_val[:, 0]
830
+ else:
831
+ y_for_split = y_train_val
832
+ splits = cv_splitter.split(x_train_val, y_for_split)
833
+
834
+ for fold_idx, (train_idx, val_idx) in enumerate(splits):
835
+ # Extract fold data
836
+ if recurrent:
837
+ x_fold_train = x_train_val[train_idx]
838
+ y_fold_train = y_train_val[train_idx]
839
+ x_fold_val = x_train_val[val_idx]
840
+ y_fold_val = y_train_val[val_idx]
841
+ else:
842
+ x_fold_train = x_train_val.iloc[train_idx]
843
+ y_fold_train = y_train_val.iloc[train_idx]
844
+ x_fold_val = x_train_val.iloc[val_idx]
845
+ y_fold_val = y_train_val.iloc[val_idx]
846
+
847
+ # Train model for this fold
848
+ model = BaseModel(
849
+ model_name=model_name,
850
+ target_type=target_type,
851
+ target_number=target_number,
852
+ create_model=create_model,
853
+ plot=False, # Disable individual fold plots
854
+ log_dir=log_dir,
855
+ )
856
+
857
+ if recurrent:
858
+ timesteps = params["timesteps"]
859
+ x_fold_train = x_fold_train[:, -timesteps:, :]
860
+ x_fold_val = x_fold_val[:, -timesteps:, :]
861
+
862
+ # Fit model
863
+ model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
864
+
865
+ # Get predictions
866
+ y_pred = model.predict(x_fold_val)
867
+
868
+ # Handle recurrent model indexing
869
+ if recurrent:
870
+ y_fold_val = pd.DataFrame(
871
+ y_fold_val, columns=["TARGET", "index"]
872
+ ).set_index("index")
873
+ y_pred.index = y_fold_val.index
874
+
875
+ # Store predictions and true values
876
+ all_predictions.append(y_pred)
877
+ all_y_true.append(y_fold_val)
878
+
879
+ # Concatenate all fold predictions
880
+ if target_type == "classification":
881
+ # For classification, we need to handle probability columns
882
+ all_pred_df = pd.concat(all_predictions, axis=0)
883
+ all_y_series = pd.concat(all_y_true, axis=0)
884
+ # Ensure we have a DataFrame with TARGET column
885
+ if isinstance(all_y_series, pd.Series):
886
+ all_y_df = pd.DataFrame({"TARGET": all_y_series})
887
+ else:
888
+ all_y_df = all_y_series
889
+ else:
890
+ # For regression, just concatenate the predictions
891
+ all_pred_series = pd.concat(all_predictions, axis=0)
892
+ all_y_series = pd.concat(all_y_true, axis=0)
893
+ all_pred_df = pd.DataFrame({"PRED": all_pred_series})
894
+ all_y_df = pd.DataFrame({"TARGET": all_y_series})
895
+
896
+ # Create combined prediction DataFrame
897
+ prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
898
+
899
+ # Calculate pooled metrics
900
+ score = {
901
+ "DATE": datetime.now(),
902
+ "MODEL_NAME": model_name,
903
+ "EVAL_DATA_STD": prediction["TARGET"].std(),
904
+ }
905
+
906
+ # Unscale if needed (for regression with scaling)
907
+ if (
908
+ model.need_scaling
909
+ and target_type == "regression"
910
+ and model.scaler_y is not None
911
+ ):
912
+ prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
913
+ prediction[["TARGET"]].values
914
+ )
915
+ prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
916
+ prediction[["PRED"]].values
917
+ )
918
+
919
+ # Evaluate with pooled predictions
920
+ score.update(evaluate(prediction, target_type, target_clf_thresholds))
921
+
922
+ metric = "RMSE" if target_type == "regression" else "LOGLOSS"
923
+ logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
924
+
925
+ # Report to Ray if in Ray context
926
+ if session.get_session():
927
+ session.report(metrics=score)
928
+ return score
929
+
930
+
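
Illustrative sketch (not part of the diff): the pooled-metric idea used by trainable_cv above, i.e. a single logloss computed on all out-of-fold predictions rather than an average of per-fold scores. It uses plain scikit-learn objects; the estimator and data are assumptions.

# Pooled cross-validation metric: concatenate out-of-fold predictions, score once.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

X, y = make_classification(n_samples=300, n_features=8, random_state=42)

time_series = False  # True switches to TimeSeriesSplit, as trainable_cv does for temporal data
splitter = (
    TimeSeriesSplit(n_splits=3, test_size=len(X) // 4)
    if time_series
    else StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
)

oof_proba, oof_true = [], []
for train_idx, val_idx in splitter.split(X, y):
    model = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    oof_proba.append(model.predict_proba(X[val_idx])[:, 1])
    oof_true.append(y[val_idx])

pooled_logloss = log_loss(np.concatenate(oof_true), np.concatenate(oof_proba))
print(f"pooled LOGLOSS: {pooled_logloss:.4f}")
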
586
931
  def trainable(
587
932
  params,
588
933
  x_train,
@@ -594,14 +939,13 @@ def trainable(
594
939
  experiment_name,
595
940
  target_number,
596
941
  create_model,
597
- type_name="hyperopts",
598
942
  plot=False,
599
943
  log_dir=None,
600
944
  target_clf_thresholds: dict = None,
601
945
  ):
602
946
  """Standalone version of train_model that doesn't depend on self"""
603
947
  # Create model engine
604
- model = ModelEngine(
948
+ model = BaseModel(
605
949
  model_name=model_name,
606
950
  target_type=target_type,
607
951
  target_number=target_number,
@@ -620,9 +964,7 @@ def trainable(
620
964
  x_val = x_val[:, -timesteps:, :]
621
965
 
622
966
  # Compile and fit model on train set
623
- start = time.time()
624
967
  model.fit(x_train, y_train, x_val, y_val, params)
625
- stop = time.time()
626
968
 
627
969
  # Prediction on val set
628
970
  y_pred = model.predict(x_val)
@@ -652,8 +994,6 @@ def trainable(
652
994
  score = {
653
995
  "DATE": datetime.now(),
654
996
  "MODEL_NAME": model.model_name,
655
- "TYPE": type_name,
656
- "TRAINING_TIME": stop - start,
657
997
  "EVAL_DATA_STD": prediction["TARGET"].std(),
658
998
  }
659
999
 
@@ -662,77 +1002,107 @@ def trainable(
662
1002
  metric = "RMSE" if target_type == "regression" else "LOGLOSS"
663
1003
  logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
664
1004
 
665
- if type_name == "hyperopts":
1005
+ # Report to Ray if in Ray context
1006
+ if session.get_session():
666
1007
  session.report(metrics=score)
667
1008
  return score
668
1009
 
669
1010
  return score, model, prediction
670
1011
 
671
1012
 
672
- class ModelSelectionEngine:
1013
+ class ModelSelector(LeCrapaudEstimatorMixin):
673
1014
 
674
1015
  def __init__(
675
1016
  self,
676
- data,
677
- reshaped_data,
678
- target_number,
679
- target_clf,
680
- experiment,
681
- models_idx,
682
- time_series,
683
- date_column,
684
- group_column,
685
- target_clf_thresholds,
1017
+ experiment: Experiment = None,
1018
+ target_number: int = None,
686
1019
  **kwargs,
687
1020
  ):
688
- self.data = data
689
- self.reshaped_data = reshaped_data
1021
+ # The mixin will automatically set all experiment.context parameters as attributes
1022
+ super().__init__(experiment=experiment, target_number=target_number, **kwargs)
1023
+
1024
+ # Set defaults for required parameters if not provided
1025
+ if not hasattr(self, "target_clf"):
1026
+ self.target_clf = []
1027
+ if not hasattr(self, "models_idx"):
1028
+ self.models_idx = []
1029
+ if not hasattr(self, "time_series"):
1030
+ self.time_series = False
1031
+ if not hasattr(self, "date_column"):
1032
+ self.date_column = None
1033
+ if not hasattr(self, "group_column"):
1034
+ self.group_column = None
1035
+ if not hasattr(self, "target_clf_thresholds"):
1036
+ self.target_clf_thresholds = {}
690
1037
  self.target_number = target_number
691
- self.experiment = experiment
692
- self.target_clf = target_clf
693
- self.models_idx = models_idx
694
- self.time_series = time_series
695
- self.date_column = date_column
696
- self.group_column = group_column
697
- self.target_clf_thresholds = (
698
- target_clf_thresholds[target_number]
699
- if target_number in target_clf_thresholds.keys()
700
- else None
701
- )
702
1038
 
703
- self.target_type = (
704
- "classification" if self.target_number in self.target_clf else "regression"
705
- )
706
- self.experiment_dir = self.experiment.path
707
- self.experiment_id = self.experiment.id
708
- self.data_dir = f"{self.experiment_dir}/data"
709
- self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
710
- self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
711
- self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
712
- self.features = self.experiment.get_features(self.target_number)
713
- self.all_features = self.experiment.get_all_features(
714
- date_column=self.date_column, group_column=self.group_column
715
- )
1039
+ # Handle target_clf_thresholds for specific target
1040
+ # Handle both string and integer keys for backward compatibility
1041
+ if self.target_number and self.target_clf_thresholds:
1042
+ # Try both integer and string versions of the target number
1043
+ if self.target_number in self.target_clf_thresholds:
1044
+ self.target_clf_thresholds = self.target_clf_thresholds[
1045
+ self.target_number
1046
+ ]
1047
+ elif str(self.target_number) in self.target_clf_thresholds:
1048
+ self.target_clf_thresholds = self.target_clf_thresholds[
1049
+ str(self.target_number)
1050
+ ]
1051
+
1052
+ # Derived attributes
1053
+ if self.target_number is not None:
1054
+ self.target_type = (
1055
+ "classification"
1056
+ if self.target_number in self.target_clf
1057
+ else "regression"
1058
+ )
1059
+ self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
1060
+
1061
+ # Set paths and features if experiment is available
1062
+ if self.experiment:
1063
+ self.experiment_dir = self.experiment.path
1064
+ self.experiment_id = self.experiment.id
1065
+ self.data_dir = f"{self.experiment_dir}/data"
1066
+ self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
1067
+
1068
+ if self.target_number is not None:
1069
+ self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
1070
+ self.features = self.experiment.get_features(self.target_number)
1071
+
1072
+ self.all_features = self.experiment.get_all_features(
1073
+ date_column=self.date_column, group_column=self.group_column
1074
+ )
716
1075
 
717
1076
  # Main training function
718
- def run(
719
- self,
720
- experiment_name,
721
- perform_hyperopt=True,
722
- number_of_trials=20,
723
- perform_crossval=False,
724
- plot=True,
725
- clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
726
- preserve_model=True,
727
- best_params=None,
728
- ):
1077
+ def fit(self, X, y=None, reshaped_data=None, best_params=None):
729
1078
  """
730
- Selects the best models based on a target variable, optionally performing hyperparameter optimization
731
- and cross-validation, and manages outputs in a session-specific directory.
1079
+ Fit the model selector (train and select best model).
1080
+
1081
+ Args:
1082
+ X: Either a DataFrame or a dict with train/val/test data
1083
+ y: Target values (ignored, uses TARGET columns)
1084
+ reshaped_data: Optional reshaped data for recurrent models
1085
+ best_params: Optional pre-defined best parameters
1086
+
1087
+ Returns:
1088
+ self: Returns self for chaining
732
1089
  """
733
- self.experiment_name = experiment_name
734
- self.plot = plot
735
- self.number_of_trials = number_of_trials
1090
+ # Handle both DataFrame and dict inputs
1091
+ if isinstance(X, dict):
1092
+ self.data = X
1093
+ self.reshaped_data = reshaped_data
1094
+ else:
1095
+ # For simple DataFrame input, we expect it to be just training data
1096
+ # This is less common for ModelSelector which typically needs train/val/test
1097
+ raise ValueError("ModelSelector requires a dict with train/val/test data")
1098
+ # Get all parameters from experiment context
1099
+ context = self.experiment.context
1100
+ self.experiment_name = context.get("experiment_name", "")
1101
+ self.plot = context.get("plot", True)
1102
+ self.number_of_trials = context.get("number_of_trials", 20)
1103
+ self.perform_crossval = context.get("perform_crossval", False)
1104
+ self.preserve_model = context.get("preserve_model", True)
1105
+ self.perform_hyperopt = context.get("perform_hyperopt", True)
736
1106
 
737
1107
  if self.experiment_id is None:
738
1108
  raise ValueError("Please provide a experiment.")
@@ -782,12 +1152,11 @@ class ModelSelectionEngine:
782
1152
  # create model selection in db
783
1153
  target = Target.find_by(name=f"TARGET_{self.target_number}")
784
1154
  model_selection = ModelSelection.upsert(
785
- match_fields=["target_id", "experiment_id"],
786
1155
  target_id=target.id,
787
1156
  experiment_id=self.experiment_id,
788
1157
  )
789
1158
 
790
- # recurrent models starts at 9 # len(list_models)
1159
+ # STEP 1: TRAINING MODELS
791
1160
  for i in self.models_idx:
792
1161
  config = all_models[i]
793
1162
  recurrent = config["recurrent"]
@@ -800,24 +1169,16 @@ class ModelSelectionEngine:
800
1169
  self.results_dir = f"{self.target_dir}/{model_name}"
801
1170
  if not os.path.exists(f"{self.results_dir}"):
802
1171
  os.makedirs(f"{self.results_dir}")
803
- elif preserve_model and contains_best(self.results_dir):
1172
+ elif self.preserve_model and contains_best(self.results_dir):
804
1173
  continue
805
- elif perform_hyperopt:
1174
+ elif self.perform_hyperopt:
806
1175
  clean_directory(self.results_dir)
807
1176
 
808
- logger.info(f"Training a {model_name}")
809
- model = Model.upsert(
810
- match_fields=["name", "type"],
811
- name=model_name,
812
- type=self.target_type,
813
- )
814
- model_training = ModelTraining.upsert(
815
- match_fields=["model_id", "model_selection_id"],
816
- model_id=model.id,
817
- model_selection_id=model_selection.id,
1177
+ logger.info(
1178
+ f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
818
1179
  )
819
1180
 
820
- # getting data
1181
+ # Getting data
821
1182
  if recurrent:
822
1183
  # Clear cluster from previous Keras session graphs.
823
1184
  K.clear_session()
@@ -827,7 +1188,7 @@ class ModelSelectionEngine:
827
1188
  for i, e in enumerate(self.all_features)
828
1189
  if e in set(self.features)
829
1190
  ]
830
- # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
1191
+ # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
831
1192
  x_train = x_train_reshaped[:, :, features_idx]
832
1193
  y_train = y_train_reshaped[:, [self.target_number, 0]]
833
1194
  x_val = x_val_reshaped[:, :, features_idx]
@@ -857,8 +1218,9 @@ class ModelSelectionEngine:
857
1218
  y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
858
1219
 
859
1220
  log_dir = get_log_dir(self.target_dir, model_name)
860
- # instantiate model
861
- model = ModelEngine(
1221
+
1222
+ # Instantiate model
1223
+ model = BaseModel(
862
1224
  target_number=self.target_number,
863
1225
  model_name=model_name,
864
1226
  search_params=config["search_params"],
@@ -868,9 +1230,9 @@ class ModelSelectionEngine:
868
1230
  log_dir=log_dir,
869
1231
  )
870
1232
 
871
- start = time.time()
872
1233
  # Tuning hyperparameters
873
- if perform_hyperopt:
1234
+ start = time.time()
1235
+ if self.perform_hyperopt:
874
1236
  model_best_params = self.hyperoptimize(
875
1237
  x_train, y_train, x_val, y_val, model
876
1238
  )
@@ -886,7 +1248,7 @@ class ModelSelectionEngine:
886
1248
  f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
887
1249
  )
888
1250
 
889
- # save best params
1251
+ # Save best params
890
1252
  best_params_file = f"{self.target_dir}/best_params.json"
891
1253
  try:
892
1254
  with open(best_params_file, "r") as f:
@@ -898,114 +1260,25 @@ class ModelSelectionEngine:
898
1260
  with open(best_params_file, "w") as f:
899
1261
  json.dump(json_dict, f, indent=4)
900
1262
 
901
- # Perform cross-validation of the best model on k-folds of train + val set
902
- if perform_crossval:
903
- x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
904
- y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
905
- n_splits = 4
906
- n_samples = len(x_train_val)
907
- test_size = int(n_samples / (n_splits + 4))
908
- tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
909
-
910
- # Store the scores
911
- cv_scores = []
912
-
913
- for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
914
- self.type_name = f"crossval_fold_{i}"
915
-
916
- if self.time_series:
917
- date_series = pd.concat(
918
- [
919
- train[self.date_column],
920
- val[self.date_column],
921
- test[self.date_column],
922
- ],
923
- axis=0,
924
- ).reset_index(drop=True)
925
-
926
- date_series = date_series.map(pd.Timestamp.fromordinal)
927
-
928
- # Now you can use the actual train/val indices to extract ranges
929
- train_start = date_series.iloc[train_index[0]]
930
- train_end = date_series.iloc[train_index[-1]]
931
- val_start = date_series.iloc[val_index[0]]
932
- val_end = date_series.iloc[val_index[-1]]
933
-
934
- logger.info(
935
- f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
936
- f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
937
- )
938
- else:
939
- logger.info(
940
- f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
941
- )
942
-
943
- # Train the model and get the score
944
- if recurrent:
945
- cv_score, _, _ = self.train_model(
946
- params=model_best_params,
947
- x_train=x_train_val[train_index],
948
- y_train=y_train_val[train_index],
949
- x_val=x_train_val[val_index],
950
- y_val=y_train_val[val_index],
951
- model=model,
952
- )
953
- else:
954
- cv_score, _, _ = self.train_model(
955
- params=model_best_params,
956
- x_train=x_train_val.iloc[train_index],
957
- y_train=y_train_val.iloc[train_index],
958
- x_val=x_train_val.iloc[val_index],
959
- y_val=y_train_val.iloc[val_index],
960
- model=model,
961
- )
962
-
963
- # Append score to the list
964
- cv_scores.append(cv_score)
965
-
966
- # Calculate mean of all numerical metrics across all cross-validation folds
967
- cv_scores_df = pd.DataFrame(cv_scores)
968
- # Get mean of all numeric columns
969
- cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
1263
+ # Always evaluate on test set (no cross-validation here)
1264
+ # The hyperopt already did CV if needed to find best params
1265
+ best_score, best_model, best_pred = self.train_model(
1266
+ params=model_best_params,
1267
+ x_train=pd.concat([x_train, x_val], axis=0),
1268
+ y_train=pd.concat([y_train, y_val], axis=0),
1269
+ x_val=x_test,
1270
+ y_val=y_test,
1271
+ model=model,
1272
+ )
1273
+ stop = time.time()
1274
+ training_time = stop - start
970
1275
 
971
- logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
972
- for metric, value in cv_means.items():
1276
+ logger.info(f"Model training finished in {training_time:.2f} seconds")
1277
+ logger.info(f"👉 {model.model_name} scores on test set:")
1278
+ for metric, value in best_score.items():
1279
+ if isinstance(value, (int, float)):
973
1280
  logger.info(f" {metric}: {value:.4f}")
974
1281
 
975
- # Retrain on entire training set, but keep score on cross-validation folds
976
- # Get the test score using the best model
977
- test_score, best_model, best_pred = self.train_model(
978
- params=model_best_params,
979
- x_train=pd.concat([x_train, x_val], axis=0),
980
- y_train=pd.concat([y_train, y_val], axis=0),
981
- x_val=x_test,
982
- y_val=y_test,
983
- model=model,
984
- )
985
-
986
- # Update all metrics with cross-validation means
987
- for metric, value in cv_means.items():
988
- if metric in test_score: # Only update existing metrics
989
- test_score[metric] = value
990
- best_score = test_score
991
- best_score["TYPE"] = "crossval"
992
- else:
993
- # Evaluate on test set
994
- self.type_name = "testset"
995
- best_score, best_model, best_pred = self.train_model(
996
- params=model_best_params,
997
- x_train=pd.concat([x_train, x_val], axis=0),
998
- y_train=pd.concat([y_train, y_val], axis=0),
999
- x_val=x_test,
1000
- y_val=y_test,
1001
- model=model,
1002
- )
1003
-
1004
- logger.info(f"👉 {model.model_name} scores on test set:")
1005
- for metric, value in best_score.items():
1006
- if isinstance(value, (int, float)):
1007
- logger.info(f" {metric}: {value:.4f}")
1008
-
1009
1282
  # Save predictions
1010
1283
  best_pred.to_csv(
1011
1284
  f"{self.results_dir}/prediction.csv",
@@ -1016,7 +1289,6 @@ class ModelSelectionEngine:
1016
1289
 
1017
1290
  # Save best model
1018
1291
  model_path = best_model.save(self.results_dir)
1019
-
1020
1292
  model_path = Path(model_path).resolve()
1021
1293
  best_score["MODEL_PATH"] = model_path
1022
1294
 
@@ -1039,32 +1311,26 @@ class ModelSelectionEngine:
1039
1311
  scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
1040
1312
  scores_tracking.to_csv(scores_tracking_path, index=False)
1041
1313
 
1042
- # Save model training metadata
1043
- stop = time.time()
1044
- training_time = stop - start
1045
- model_training.best_params = model_best_params
1046
- model_training.model_path = model_path
1047
- model_training.training_time = training_time
1048
- model_training.save()
1049
-
1050
- # Store metrics in DB
1314
+ # Save in db
1051
1315
  drop_cols = [
1052
1316
  "DATE",
1053
1317
  "MODEL_NAME",
1054
- "MODEL_PATH",
1055
1318
  ]
1056
1319
  best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
1057
1320
  score_data = {k.lower(): v for k, v in best_score.items()}
1058
-
1059
- Score.upsert(
1060
- match_fields=["model_training_id"],
1061
- model_training_id=model_training.id,
1321
+ model = Model.upsert(
1322
+ name=model_name,
1323
+ type=self.target_type,
1324
+ )
1325
+ ModelSelectionScore.upsert(
1326
+ model_id=model.id,
1327
+ model_selection_id=model_selection.id,
1328
+ best_params=serialize_for_json(model_best_params),
1329
+ training_time=training_time,
1062
1330
  **score_data,
1063
1331
  )
1064
1332
 
1065
- logger.info(f"Model training finished in {training_time:.2f} seconds")
1066
-
1067
- # find best model type
1333
+ # STEP 2: FINDING BEST MODEL OVERALL
1068
1334
  scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
1069
1335
  scores_tracking = pd.read_csv(scores_tracking_path)
1070
1336
  best_score_overall = scores_tracking.iloc[0, :]
@@ -1075,12 +1341,11 @@ class ModelSelectionEngine:
1075
1341
  else:
1076
1342
  best_thresholds = None
1077
1343
 
1078
- # Remove any .best or .keras files
1344
+ # Remove any .best or .keras files, and save best model in target_dir
1079
1345
  for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
1080
1346
  os.path.join(self.target_dir, "*.keras")
1081
1347
  ):
1082
1348
  os.remove(file_path)
1083
- # Copy the best model in root training folder for this target
1084
1349
  best_model_path = Path(
1085
1350
  f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
1086
1351
  ).resolve()
@@ -1092,13 +1357,13 @@ class ModelSelectionEngine:
1092
1357
  with open(f"{self.target_dir}/best_params.json", "r") as f:
1093
1358
  best_model_params = json.load(f)[best_model_name]
1094
1359
 
1095
- # Save model_selection results to db
1096
-
1360
+ # Save to db
1097
1361
  model_selection = ModelSelection.get(model_selection.id)
1098
- model_selection.best_model_id = Model.find_by(
1362
+ model = Model.find_by(
1099
1363
  name=best_score_overall["MODEL_NAME"], type=self.target_type
1100
- ).id
1101
- model_selection.best_model_params = best_model_params
1364
+ )
1365
+ model_selection.best_model_id = model.id
1366
+ model_selection.best_model_params = serialize_for_json(best_model_params)
1102
1367
  model_selection.best_thresholds = best_thresholds
1103
1368
  model_selection.best_model_path = best_model_path
1104
1369
 
@@ -1111,7 +1376,7 @@ class ModelSelectionEngine:
1111
1376
  k: v for k, v in best_score_overall.items() if k not in drop_cols
1112
1377
  }
1113
1378
  score_data = {k.lower(): v for k, v in best_score_overall.items()}
1114
- model_selection.best_score = score_data
1379
+ model_selection.best_score = serialize_for_json(score_data)
1115
1380
  model_selection.save()
1116
1381
 
1117
1382
  logger.info(f"Best model overall is : {best_score_overall}")
@@ -1119,11 +1384,188 @@ class ModelSelectionEngine:
1119
1384
  # Consolidate best parameters from all targets into a single file
1120
1385
  self.consolidate_best_params()
1121
1386
 
1122
- best_model = joblib.load(best_model_path)
1123
- return best_model
1387
+ self.best_model_ = BaseModel(
1388
+ path=self.target_dir, target_number=self.target_number
1389
+ )
1390
+ self._set_fitted()
1391
+ return self
1392
+
1393
+ def get_best_model(self):
1394
+ """
1395
+ Get the best trained model.
1396
+
1397
+ Returns:
1398
+ The best model found during training
1399
+ """
1400
+ self._check_is_fitted()
1401
+ return self.best_model_
1402
+
1403
+ def hyperoptimize(self, x_train, y_train, x_val, y_val, model: BaseModel):
1404
+ """Choose between Ray Tune and HyperOpt standalone based on configuration."""
1405
+ if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
1406
+ return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
1407
+ elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
1408
+ return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
1409
+ else:
1410
+ raise ValueError(
1411
+ f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
1412
+ )
1413
+
1414
+ def hyperoptimize_hyperopt(self, x_train, y_train, x_val, y_val, model: BaseModel):
1415
+ """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
1416
+
1417
+ logger.info("Start tuning hyperparameters with HyperOpt standalone...")
1418
+
1419
+ # Convert Ray search space to HyperOpt search space
1420
+ def convert_search_space(ray_space):
1421
+ """Convert Ray Tune search space to HyperOpt format."""
1422
+ from ray.tune.search.sample import Categorical, Float, Integer
1423
+
1424
+ hp_space = {}
1425
+ for key, value in ray_space.items():
1426
+ if isinstance(value, Float):
1427
+ if (
1428
+ hasattr(value, "sampler")
1429
+ and value.sampler.__class__.__name__ == "LogUniform"
1430
+ ):
1431
+ # LogUniform distribution
1432
+ hp_space[key] = hp.loguniform(
1433
+ key, np.log(value.lower), np.log(value.upper)
1434
+ )
1435
+ else:
1436
+ # Uniform distribution
1437
+ hp_space[key] = hp.uniform(key, value.lower, value.upper)
1438
+ elif isinstance(value, Integer):
1439
+ # Integer uniform distribution
1440
+ hp_space[key] = hp.randint(key, value.lower, value.upper)
1441
+ elif isinstance(value, Categorical):
1442
+ # Categorical/choice distribution
1443
+ hp_space[key] = hp.choice(key, value.categories)
1444
+ elif isinstance(value, dict):
1445
+ # Nested dict, recurse
1446
+ hp_space[key] = convert_search_space(value)
1447
+ else:
1448
+ # Static value or unknown type
1449
+ hp_space[key] = value
1450
+ return hp_space
1451
+
1452
+ # Create objective function for HyperOpt
1453
+ def objective(params):
1454
+ """Objective function to minimize."""
1455
+ try:
1456
+ # Convert numpy types to native Python types
1457
+ params = serialize_for_json(params)
1458
+
1459
+ # Use existing trainable function based on perform_crossval
1460
+ if self.perform_crossval:
1461
+ score = trainable_cv(
1462
+ params,
1463
+ x_train,
1464
+ y_train,
1465
+ x_val,
1466
+ y_val,
1467
+ model.model_name,
1468
+ self.target_type,
1469
+ self.experiment_name,
1470
+ self.target_number,
1471
+ model.create_model,
1472
+ n_splits=3,
1473
+ plot=model.plot,
1474
+ log_dir=model.log_dir,
1475
+ target_clf_thresholds=self.target_clf_thresholds,
1476
+ time_series=self.time_series,
1477
+ recurrent=model.recurrent,
1478
+ )
1479
+ else:
1480
+ score, _, _ = trainable(
1481
+ params,
1482
+ x_train,
1483
+ y_train,
1484
+ x_val,
1485
+ y_val,
1486
+ model.model_name,
1487
+ self.target_type,
1488
+ self.experiment_name,
1489
+ self.target_number,
1490
+ model.create_model,
1491
+ plot=model.plot,
1492
+ log_dir=model.log_dir,
1493
+ target_clf_thresholds=self.target_clf_thresholds,
1494
+ )
1495
+
1496
+ # HyperOpt minimizes, so return the metric directly
1497
+ loss = score[self.metric]
1498
+
1499
+ # Log trial info
1500
+ logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
1501
+
1502
+ return {
1503
+ "loss": loss,
1504
+ "status": STATUS_OK,
1505
+ "score": score, # Keep full score dict for analysis
1506
+ }
1507
+
1508
+ except Exception as e:
1509
+ logger.error(f"Trial failed: {str(e)}")
1510
+ return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
1511
+
1512
+ # Convert search space
1513
+ hp_search_space = convert_search_space(model.search_params)
1514
+
1515
+ # Run optimization
1516
+ trials = Trials()
1517
+ best_params = fmin(
1518
+ fn=objective,
1519
+ space=hp_search_space,
1520
+ algo=tpe.suggest,
1521
+ max_evals=self.number_of_trials,
1522
+ trials=trials,
1523
+ verbose=True,
1524
+ show_progressbar=True,
1525
+ )
1526
+
1527
+ # Get the actual parameter values (not just indices for hp.choice)
1528
+ best_params = space_eval(hp_search_space, best_params)
1529
+
1530
+ # Convert numpy types to native Python types
1531
+ best_params = serialize_for_json(best_params)
1532
+
1533
+ # Get best score from trials
1534
+ best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
1535
+ best_score = trials.trials[best_trial_idx]["result"].get("score", {})
1536
+
1537
+ # Log results
1538
+ logger.info(f"Best hyperparameters found were:\n{best_params}")
1539
+ logger.info(f"Best Scores found were:\n{best_score}")
1540
+
1541
+ # Create summary DataFrame for consistency with Ray version
1542
+ results_df = pd.DataFrame(
1543
+ [
1544
+ {
1545
+ "trial_id": i,
1546
+ self.metric: t["result"]["loss"],
1547
+ **{
1548
+ k: v
1549
+ for k, v in t["result"].get("score", {}).items()
1550
+ if isinstance(v, (int, float))
1551
+ },
1552
+ }
1553
+ for i, t in enumerate(trials.trials)
1554
+ if t["result"]["status"] == STATUS_OK
1555
+ ]
1556
+ )
1124
1557
 
1125
- def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
1126
- self.type_name = "hyperopts"
1558
+ if not results_df.empty:
1559
+ logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
1560
+
1561
+ # Save trial history for analysis
1562
+ trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
1563
+ with open(trials_path, "wb") as f:
1564
+ pickle.dump(trials, f)
1565
+
1566
+ return best_params
1567
+
1568
+ def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: BaseModel):
1127
1569
 
1128
1570
  def collect_error_logs(target_dir: int, storage_path: str):
1129
1571
  output_error_file = f"{target_dir}/errors.log"
@@ -1166,9 +1608,22 @@ class ModelSelectionEngine:
1166
1608
  }
1167
1609
  )
1168
1610
 
1611
+ # Choose between regular trainable or CV version based on perform_crossval flag
1612
+ # perform_crossval controls whether to use CV during hyperopt
1613
+ if self.perform_crossval:
1614
+ trainable_fn = trainable_cv
1615
+ additional_params = {
1616
+ "n_splits": 3, # Can be made configurable
1617
+ "time_series": self.time_series, # Controls whether to use TimeSeriesSplit or StratifiedKFold
1618
+ "recurrent": model.recurrent,
1619
+ }
1620
+ else:
1621
+ trainable_fn = trainable
1622
+ additional_params = {}
1623
+
1169
1624
  tuner = Tuner(
1170
1625
  trainable=with_parameters(
1171
- trainable,
1626
+ trainable_fn,
1172
1627
  x_train=x_train,
1173
1628
  y_train=y_train,
1174
1629
  x_val=x_val,
@@ -1178,10 +1633,10 @@ class ModelSelectionEngine:
1178
1633
  experiment_name=self.experiment_name,
1179
1634
  target_number=self.target_number,
1180
1635
  create_model=model.create_model,
1181
- type_name="hyperopts",
1182
1636
  plot=model.plot,
1183
1637
  log_dir=model.log_dir,
1184
1638
  target_clf_thresholds=self.target_clf_thresholds,
1639
+ **additional_params,
1185
1640
  ),
1186
1641
  param_space=model.search_params,
1187
1642
  tune_config=TuneConfig(
@@ -1221,7 +1676,7 @@ class ModelSelectionEngine:
1221
1676
 
1222
1677
  return best_params
1223
1678
 
1224
- def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
1679
+ def train_model(self, params, x_train, y_train, x_val, y_val, model: BaseModel):
1225
1680
  # Use the standalone training function to avoid duplication
1226
1681
  # For train_model, we pass the data directly (not as Ray references)
1227
1682
  return trainable(
@@ -1235,7 +1690,6 @@ class ModelSelectionEngine:
1235
1690
  self.experiment_name,
1236
1691
  self.target_number,
1237
1692
  model.create_model,
1238
- self.type_name,
1239
1693
  model.plot,
1240
1694
  log_dir=model.log_dir,
1241
1695
  target_clf_thresholds=self.target_clf_thresholds,
@@ -1341,11 +1795,11 @@ def evaluate(
1341
1795
  y_pred_proba = (
1342
1796
  prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
1343
1797
  )
1344
- if num_classes > 2:
1345
- lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
1346
- lb.fit(labels)
1347
- y_true_onhot = lb.transform(y_true)
1348
- y_pred_onehot = lb.transform(y_pred)
1798
+ # if num_classes > 2:
1799
+ # lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
1800
+ # lb.fit(labels)
1801
+ # y_true_onhot = lb.transform(y_true)
1802
+ # y_pred_onehot = lb.transform(y_pred)
1349
1803
 
1350
1804
  score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
1351
1805
  score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1365,6 +1819,9 @@ def evaluate(
1365
1819
  average=("binary" if num_classes == 2 else "macro"),
1366
1820
  )
1367
1821
  score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
1822
+ score["AVG_PRECISION"] = average_precision_score(
1823
+ y_true, y_pred_proba, average="macro"
1824
+ )
1368
1825
 
1369
1826
  # Store the complete thresholds dictionary
1370
1827
  if len(target_clf_thresholds.keys()) > 1:
@@ -1719,6 +2176,20 @@ class Thresholds(BaseModel):
1719
2176
  def find_best_threshold(
1720
2177
  prediction: pd.DataFrame, metric: str = "recall", target_value: float | None = None
1721
2178
  ) -> Thresholds:
2179
+ def _normalize_class_label(cls):
2180
+ if isinstance(cls, (np.integer, int)):
2181
+ return int(cls)
2182
+ if isinstance(cls, (float, np.floating)) and cls.is_integer():
2183
+ return int(cls)
2184
+ if isinstance(cls, str):
2185
+ try:
2186
+ as_float = float(cls)
2187
+ if as_float.is_integer():
2188
+ return int(as_float)
2189
+ except ValueError:
2190
+ pass
2191
+ return cls
2192
+
1722
2193
  """
1723
2194
  General function to find best threshold optimizing recall, precision, or f1.
1724
2195
 
@@ -1737,10 +2208,15 @@ def find_best_threshold(
1737
2208
  pred_cols = [
1738
2209
  col for col in prediction.columns if col not in ["ID", "TARGET", "PRED"]
1739
2210
  ]
1740
- classes = [1] if len(pred_cols) <= 2 else sorted(y_true.unique())
2211
+ classes = (
2212
+ [1]
2213
+ if len(pred_cols) <= 2
2214
+ else sorted({_normalize_class_label(cls) for cls in y_true.unique()}, key=str)
2215
+ )
1741
2216
 
1742
2217
  results = {}
1743
- for cls in classes:
2218
+ for raw_cls in classes:
2219
+ cls = _normalize_class_label(raw_cls)
1744
2220
  cls_str = str(cls)
1745
2221
  if cls_str not in prediction.columns and cls not in prediction.columns:
1746
2222
  logger.warning(f"Missing predicted probabilities for class '{cls}'")