lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
@@ -15,7 +15,7 @@ from pydantic import BaseModel
15
15
  import ast
16
16
 
17
17
  # ML models
18
- from sklearn.model_selection import TimeSeriesSplit
18
+ from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
19
19
  from sklearn.calibration import CalibratedClassifierCV
20
20
  from sklearn.metrics import (
21
21
  mean_absolute_percentage_error,
@@ -55,31 +55,46 @@ from tensorboardX import SummaryWriter
55
55
 
56
56
  # Optimization
57
57
  import ray
58
- from ray.tune import Tuner, TuneConfig, with_parameters
59
- from ray.train import RunConfig
58
+ from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
60
59
  from ray.tune.search.hyperopt import HyperOptSearch
61
60
  from ray.tune.search.bayesopt import BayesOptSearch
62
61
  from ray.tune.logger import TBXLoggerCallback
63
62
  from ray.tune.schedulers import ASHAScheduler
64
63
  from ray.air import session
65
64
 
65
+ # HyperOpt standalone
66
+ from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
67
+
66
68
  # Internal library
67
69
  from lecrapaud.search_space import all_models
68
70
  from lecrapaud.directories import clean_directory
69
71
  from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
70
- from lecrapaud.config import PYTHON_ENV
72
+ from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
71
73
  from lecrapaud.feature_selection import load_train_data
72
74
  from lecrapaud.db import (
73
75
  Model,
74
76
  ModelSelection,
75
- ModelTraining,
76
- Score,
77
+ ModelSelectionScore,
77
78
  Target,
78
79
  Experiment,
79
80
  )
81
+ from lecrapaud.mixins import LeCrapaudEstimatorMixin
80
82
 
81
83
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
82
84
 
85
+ # Suppress XGBoost and LightGBM logging
86
+ import logging
87
+
88
+ logging.getLogger("lightgbm").setLevel(logging.ERROR)
89
+ logging.getLogger("xgboost").setLevel(logging.ERROR)
90
+
91
+ # Set global verbosity for XGBoost
92
+ xgb.set_config(verbosity=0)
93
+
94
+ # Suppress warnings
95
+ warnings.filterwarnings("ignore", category=UserWarning)
96
+ warnings.filterwarnings("ignore", category=FutureWarning)
97
+
83
98
  # Reproducible result
84
99
  keras.utils.set_random_seed(42)
85
100
  np.random.seed(42)
@@ -110,7 +125,64 @@ def test_hardware():
110
125
  warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
111
126
 
112
127
 
113
- class ModelEngine:
128
+ class CatBoostWrapper:
129
+ """
130
+ Transparent proxy for a CatBoost model that accepts arbitrary keyword arguments
131
+ as direct attributes, while forwarding all method calls and properties.
132
+ """
133
+
134
+ __slots__ = ("_model", "_extra_attrs")
135
+
136
+ def __init__(self, model, **kwargs):
137
+ object.__setattr__(self, "_model", model)
138
+ object.__setattr__(self, "_extra_attrs", {})
139
+ # Register kwargs as direct attributes
140
+ for key, value in kwargs.items():
141
+ setattr(self, key, value)
142
+
143
+ # ---- Transparent access ----
144
+ def __getattr__(self, name):
145
+ """Forward attribute access to the underlying model if not found."""
146
+ model = object.__getattribute__(self, "_model")
147
+ if hasattr(model, name):
148
+ return getattr(model, name)
149
+ extra_attrs = object.__getattribute__(self, "_extra_attrs")
150
+ if name in extra_attrs:
151
+ return extra_attrs[name]
152
+ raise AttributeError(f"{type(self).__name__!r} has no attribute {name!r}")
153
+
154
+ def __setattr__(self, name, value):
155
+ """Set to wrapper or forward to model when appropriate."""
156
+ if name in CatBoostWrapper.__slots__:
157
+ object.__setattr__(self, name, value)
158
+ return
159
+
160
+ model = object.__getattribute__(self, "_model")
161
+ if hasattr(model, name):
162
+ setattr(model, name, value)
163
+ else:
164
+ extra_attrs = object.__getattribute__(self, "_extra_attrs")
165
+ extra_attrs[name] = value
166
+
167
+ def __dir__(self):
168
+ """Merge dir() from wrapper, model, and custom attributes."""
169
+ base = set(super().__dir__())
170
+ model_attrs = set(dir(object.__getattribute__(self, "_model")))
171
+ extra_attrs = set(object.__getattribute__(self, "_extra_attrs").keys())
172
+ return sorted(base | model_attrs | extra_attrs)
173
+
174
+ def __repr__(self):
175
+ model = object.__getattribute__(self, "_model")
176
+ extras = object.__getattribute__(self, "_extra_attrs")
177
+ return f"CatBoostWrapper(model={model.__class__.__name__}, extras={extras})"
178
+
179
+ @property
180
+ def model(self):
181
+ """Access the raw CatBoost model."""
182
+ return object.__getattribute__(self, "_model")
183
+
184
+
185
+ class BaseModel:
114
186
 
115
187
  def __init__(
116
188
  self,
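The CatBoostWrapper added above is a transparent proxy: keyword arguments become wrapper-level attributes, while everything else is forwarded to the wrapped CatBoost model. A minimal stand-alone sketch of that forwarding behaviour, using a plain stub object in place of a real CatBoost model so it runs without catboost installed:

    # Sketch of the proxy behaviour: reads/writes fall through to the wrapped model
    # when it has the attribute, otherwise they live in a side dict on the wrapper.
    class _StubModel:
        def __init__(self):
            self.learning_rate = 0.1          # attribute the "model" already owns

        def predict(self, rows):
            return [0.0 for _ in rows]        # dummy predictions


    class TransparentProxy:
        __slots__ = ("_model", "_extra_attrs")

        def __init__(self, model, **kwargs):
            object.__setattr__(self, "_model", model)
            object.__setattr__(self, "_extra_attrs", {})
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __getattr__(self, name):
            model = object.__getattribute__(self, "_model")
            if hasattr(model, name):
                return getattr(model, name)
            extra = object.__getattribute__(self, "_extra_attrs")
            if name in extra:
                return extra[name]
            raise AttributeError(name)

        def __setattr__(self, name, value):
            model = object.__getattribute__(self, "_model")
            if hasattr(model, name):
                setattr(model, name, value)                     # forward to the model
            else:
                object.__getattribute__(self, "_extra_attrs")[name] = value


    proxy = TransparentProxy(_StubModel(), model_name="catboost", target_type="regression")
    print(proxy.model_name)         # 'catboost' -> stored on the wrapper
    print(proxy.learning_rate)      # 0.1       -> forwarded to the wrapped model
    print(proxy.predict([[1, 2]]))  # [0.0]     -> method call forwarded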
@@ -296,12 +368,15 @@ class ModelEngine:
296
368
  )
297
369
 
298
370
  # Attach metadata for consistency with sklearn path
299
- model.model_name = self.model_name
300
- model.target_type = self.target_type
301
- logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
371
+ model_wrapped = CatBoostWrapper(
372
+ model, model_name=self.model_name, target_type=self.target_type
373
+ )
374
+ logger.info(
375
+ f"Successfully created a {model_wrapped.model_name} at {datetime.now()}"
376
+ )
302
377
 
303
- self._model = model
304
- return model
378
+ self._model = model_wrapped
379
+ return model_wrapped
305
380
 
306
381
  def fit_boosting(self, x_train, y_train, x_val, y_val, params):
307
382
  """
@@ -350,6 +425,7 @@ class ModelEngine:
350
425
  "metric": eval_metric,
351
426
  "num_class": num_class,
352
427
  "verbose": -1,
428
+ "verbose_eval": False,
353
429
  },
354
430
  num_boost_round=params["num_boost_round"],
355
431
  train_set=train_data,
@@ -361,6 +437,7 @@ class ModelEngine:
361
437
  ),
362
438
  lgb.record_evaluation(evals_result),
363
439
  tensorboard_callback,
440
+ lgb.log_evaluation(period=0), # Disable evaluation logging
364
441
  ],
365
442
  )
366
443
  else:
@@ -402,7 +479,7 @@ class ModelEngine:
402
479
  if self.target_type == "regression"
403
480
  else ("logloss" if num_class <= 2 else "mlogloss")
404
481
  )
405
- xgb.set_config(verbosity=0)
482
+ # XGBoost verbosity already set globally
406
483
  model = xgb.train(
407
484
  params={
408
485
  **params["model_params"],
@@ -417,11 +494,11 @@ class ModelEngine:
417
494
  xgb.callback.EarlyStopping(
418
495
  rounds=params["early_stopping_rounds"], save_best=True
419
496
  ),
420
- xgb.callback.EvaluationMonitor(), # This shows evaluation results at each iteration
497
+ # Removed EvaluationMonitor to suppress logs
421
498
  tensorboard_callback,
422
499
  ],
423
500
  evals_result=evals_result, # Record evaluation result
424
- verbose_eval=10000,
501
+ verbose_eval=False, # Disable evaluation logging
425
502
  )
426
503
 
427
504
  model.model_name = self.create_model
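The boosting hunks above silence per-iteration evaluation output: LightGBM via "verbose": -1 plus log_evaluation(period=0), XGBoost via verbosity 0, verbose_eval=False and dropping EvaluationMonitor. A small sketch of equivalent quiet training calls, assuming lightgbm and xgboost are installed and using random data:

    import numpy as np
    import lightgbm as lgb
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X, y = rng.normal(size=(200, 5)), rng.normal(size=200)

    # LightGBM: silence the booster and disable the per-round evaluation callback
    train_data = lgb.Dataset(X, label=y)
    lgb_model = lgb.train(
        {"objective": "regression", "verbose": -1},
        train_data,
        num_boost_round=20,
        valid_sets=[train_data],
        callbacks=[lgb.log_evaluation(period=0)],   # period=0 turns eval logging off
    )

    # XGBoost: verbosity=0 in params plus verbose_eval=False keeps training silent
    dtrain = xgb.DMatrix(X, label=y)
    xgb_model = xgb.train(
        {"objective": "reg:squarederror", "verbosity": 0},
        dtrain,
        num_boost_round=20,
        evals=[(dtrain, "train")],
        verbose_eval=False,
    )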
@@ -686,6 +763,171 @@ class ModelEngine:
686
763
  )
687
764
 
688
765
 
766
+ def trainable_cv(
767
+ params,
768
+ x_train,
769
+ y_train,
770
+ x_val,
771
+ y_val,
772
+ model_name,
773
+ target_type,
774
+ experiment_name,
775
+ target_number,
776
+ create_model,
777
+ n_splits=3,
778
+ plot=False,
779
+ log_dir=None,
780
+ target_clf_thresholds: dict = None,
781
+ time_series=True,
782
+ recurrent=False,
783
+ ):
784
+ """Cross-validation version of trainable for hyperopt.
785
+
786
+ Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
787
+ Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
788
+ """
789
+ # Combine train and validation data for cross-validation
790
+ if recurrent:
791
+ x_train_val = np.concatenate([x_train, x_val], axis=0)
792
+ y_train_val = np.concatenate([y_train, y_val], axis=0)
793
+ else:
794
+ x_train_val = pd.concat([x_train, x_val], axis=0)
795
+ y_train_val = pd.concat([y_train, y_val], axis=0)
796
+ # Store original index for later use if needed
797
+ original_index = x_train_val.index.copy()
798
+ # Reset index for proper iloc indexing with CV splits
799
+ x_train_val = x_train_val.reset_index(drop=True)
800
+ y_train_val = y_train_val.reset_index(drop=True)
801
+
802
+ # Choose appropriate cross-validation splitter
803
+ if time_series:
804
+ # Time series split for temporal data
805
+ n_samples = len(x_train_val)
806
+ test_size = int(n_samples / (n_splits + 1)) # Ensure reasonable test size
807
+ cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
808
+ else:
809
+ # Stratified or regular K-fold for i.i.d. data
810
+ if target_type == "classification":
811
+ cv_splitter = StratifiedKFold(
812
+ n_splits=n_splits, shuffle=True, random_state=42
813
+ )
814
+ else:
815
+ cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
816
+
817
+ # Store all predictions and true values for pooled metrics
818
+ all_predictions = []
819
+ all_y_true = []
820
+ fold_times = []
821
+
822
+ # Get splits based on the CV strategy
823
+ if time_series or target_type == "regression":
824
+ splits = cv_splitter.split(x_train_val)
825
+ else:
826
+ # For stratified split, we need to pass y
827
+ if recurrent:
828
+ # Extract the target from the 2D array (first column is target)
829
+ y_for_split = y_train_val[:, 0]
830
+ else:
831
+ y_for_split = y_train_val
832
+ splits = cv_splitter.split(x_train_val, y_for_split)
833
+
834
+ for fold_idx, (train_idx, val_idx) in enumerate(splits):
835
+ # Extract fold data
836
+ if recurrent:
837
+ x_fold_train = x_train_val[train_idx]
838
+ y_fold_train = y_train_val[train_idx]
839
+ x_fold_val = x_train_val[val_idx]
840
+ y_fold_val = y_train_val[val_idx]
841
+ else:
842
+ x_fold_train = x_train_val.iloc[train_idx]
843
+ y_fold_train = y_train_val.iloc[train_idx]
844
+ x_fold_val = x_train_val.iloc[val_idx]
845
+ y_fold_val = y_train_val.iloc[val_idx]
846
+
847
+ # Train model for this fold
848
+ model = BaseModel(
849
+ model_name=model_name,
850
+ target_type=target_type,
851
+ target_number=target_number,
852
+ create_model=create_model,
853
+ plot=False, # Disable individual fold plots
854
+ log_dir=log_dir,
855
+ )
856
+
857
+ if recurrent:
858
+ timesteps = params["timesteps"]
859
+ x_fold_train = x_fold_train[:, -timesteps:, :]
860
+ x_fold_val = x_fold_val[:, -timesteps:, :]
861
+
862
+ # Fit model
863
+ model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
864
+
865
+ # Get predictions
866
+ y_pred = model.predict(x_fold_val)
867
+
868
+ # Handle recurrent model indexing
869
+ if recurrent:
870
+ y_fold_val = pd.DataFrame(
871
+ y_fold_val, columns=["TARGET", "index"]
872
+ ).set_index("index")
873
+ y_pred.index = y_fold_val.index
874
+
875
+ # Store predictions and true values
876
+ all_predictions.append(y_pred)
877
+ all_y_true.append(y_fold_val)
878
+
879
+ # Concatenate all fold predictions
880
+ if target_type == "classification":
881
+ # For classification, we need to handle probability columns
882
+ all_pred_df = pd.concat(all_predictions, axis=0)
883
+ all_y_series = pd.concat(all_y_true, axis=0)
884
+ # Ensure we have a DataFrame with TARGET column
885
+ if isinstance(all_y_series, pd.Series):
886
+ all_y_df = pd.DataFrame({"TARGET": all_y_series})
887
+ else:
888
+ all_y_df = all_y_series
889
+ else:
890
+ # For regression, just concatenate the predictions
891
+ all_pred_series = pd.concat(all_predictions, axis=0)
892
+ all_y_series = pd.concat(all_y_true, axis=0)
893
+ all_pred_df = pd.DataFrame({"PRED": all_pred_series})
894
+ all_y_df = pd.DataFrame({"TARGET": all_y_series})
895
+
896
+ # Create combined prediction DataFrame
897
+ prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
898
+
899
+ # Calculate pooled metrics
900
+ score = {
901
+ "DATE": datetime.now(),
902
+ "MODEL_NAME": model_name,
903
+ "EVAL_DATA_STD": prediction["TARGET"].std(),
904
+ }
905
+
906
+ # Unscale if needed (for regression with scaling)
907
+ if (
908
+ model.need_scaling
909
+ and target_type == "regression"
910
+ and model.scaler_y is not None
911
+ ):
912
+ prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
913
+ prediction[["TARGET"]].values
914
+ )
915
+ prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
916
+ prediction[["PRED"]].values
917
+ )
918
+
919
+ # Evaluate with pooled predictions
920
+ score.update(evaluate(prediction, target_type, target_clf_thresholds))
921
+
922
+ metric = "RMSE" if target_type == "regression" else "LOGLOSS"
923
+ logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
924
+
925
+ # Report to Ray if in Ray context
926
+ if session.get_session():
927
+ session.report(metrics=score)
928
+ return score
929
+
930
+
689
931
  def trainable(
690
932
  params,
691
933
  x_train,
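The new trainable_cv above returns pooled metrics: out-of-fold predictions from every split are concatenated and a single log-loss or RMSE is computed on the pooled vector, rather than averaging per-fold scores. A short scikit-learn-only sketch of that pooling idea (an illustration, not the library function):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    X, y = make_regression(n_samples=300, n_features=10, noise=5.0, random_state=42)

    # Collect out-of-fold predictions from every split
    oof_pred = np.empty_like(y)
    for train_idx, val_idx in KFold(n_splits=3, shuffle=True, random_state=42).split(X):
        model = Ridge().fit(X[train_idx], y[train_idx])
        oof_pred[val_idx] = model.predict(X[val_idx])

    # One metric on the pooled predictions instead of a mean of per-fold scores
    pooled_rmse = float(np.sqrt(mean_squared_error(y, oof_pred)))
    print(f"pooled RMSE over all folds: {pooled_rmse:.3f}")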
@@ -697,14 +939,13 @@ def trainable(
697
939
  experiment_name,
698
940
  target_number,
699
941
  create_model,
700
- type_name="hyperopts",
701
942
  plot=False,
702
943
  log_dir=None,
703
944
  target_clf_thresholds: dict = None,
704
945
  ):
705
946
  """Standalone version of train_model that doesn't depend on self"""
706
947
  # Create model engine
707
- model = ModelEngine(
948
+ model = BaseModel(
708
949
  model_name=model_name,
709
950
  target_type=target_type,
710
951
  target_number=target_number,
@@ -723,9 +964,7 @@ def trainable(
723
964
  x_val = x_val[:, -timesteps:, :]
724
965
 
725
966
  # Compile and fit model on train set
726
- start = time.time()
727
967
  model.fit(x_train, y_train, x_val, y_val, params)
728
- stop = time.time()
729
968
 
730
969
  # Prediction on val set
731
970
  y_pred = model.predict(x_val)
@@ -755,8 +994,6 @@ def trainable(
755
994
  score = {
756
995
  "DATE": datetime.now(),
757
996
  "MODEL_NAME": model.model_name,
758
- "TYPE": type_name,
759
- "TRAINING_TIME": stop - start,
760
997
  "EVAL_DATA_STD": prediction["TARGET"].std(),
761
998
  }
762
999
 
@@ -765,77 +1002,107 @@ def trainable(
765
1002
  metric = "RMSE" if target_type == "regression" else "LOGLOSS"
766
1003
  logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
767
1004
 
768
- if type_name == "hyperopts":
1005
+ # Report to Ray if in Ray context
1006
+ if session.get_session():
769
1007
  session.report(metrics=score)
770
1008
  return score
771
1009
 
772
1010
  return score, model, prediction
773
1011
 
774
1012
 
775
- class ModelSelectionEngine:
1013
+ class ModelSelector(LeCrapaudEstimatorMixin):
776
1014
 
777
1015
  def __init__(
778
1016
  self,
779
- data,
780
- reshaped_data,
781
- target_number,
782
- target_clf,
783
- experiment,
784
- models_idx,
785
- time_series,
786
- date_column,
787
- group_column,
788
- target_clf_thresholds,
1017
+ experiment: Experiment = None,
1018
+ target_number: int = None,
789
1019
  **kwargs,
790
1020
  ):
791
- self.data = data
792
- self.reshaped_data = reshaped_data
1021
+ # The mixin will automatically set all experiment.context parameters as attributes
1022
+ super().__init__(experiment=experiment, target_number=target_number, **kwargs)
1023
+
1024
+ # Set defaults for required parameters if not provided
1025
+ if not hasattr(self, "target_clf"):
1026
+ self.target_clf = []
1027
+ if not hasattr(self, "models_idx"):
1028
+ self.models_idx = []
1029
+ if not hasattr(self, "time_series"):
1030
+ self.time_series = False
1031
+ if not hasattr(self, "date_column"):
1032
+ self.date_column = None
1033
+ if not hasattr(self, "group_column"):
1034
+ self.group_column = None
1035
+ if not hasattr(self, "target_clf_thresholds"):
1036
+ self.target_clf_thresholds = {}
793
1037
  self.target_number = target_number
794
- self.experiment = experiment
795
- self.target_clf = target_clf
796
- self.models_idx = models_idx
797
- self.time_series = time_series
798
- self.date_column = date_column
799
- self.group_column = group_column
800
- self.target_clf_thresholds = (
801
- target_clf_thresholds[target_number]
802
- if target_number in target_clf_thresholds.keys()
803
- else None
804
- )
805
1038
 
806
- self.target_type = (
807
- "classification" if self.target_number in self.target_clf else "regression"
808
- )
809
- self.experiment_dir = self.experiment.path
810
- self.experiment_id = self.experiment.id
811
- self.data_dir = f"{self.experiment_dir}/data"
812
- self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
813
- self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
814
- self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
815
- self.features = self.experiment.get_features(self.target_number)
816
- self.all_features = self.experiment.get_all_features(
817
- date_column=self.date_column, group_column=self.group_column
818
- )
1039
+ # Handle target_clf_thresholds for specific target
1040
+ # Handle both string and integer keys for backward compatibility
1041
+ if self.target_number and self.target_clf_thresholds:
1042
+ # Try both integer and string versions of the target number
1043
+ if self.target_number in self.target_clf_thresholds:
1044
+ self.target_clf_thresholds = self.target_clf_thresholds[
1045
+ self.target_number
1046
+ ]
1047
+ elif str(self.target_number) in self.target_clf_thresholds:
1048
+ self.target_clf_thresholds = self.target_clf_thresholds[
1049
+ str(self.target_number)
1050
+ ]
1051
+
1052
+ # Derived attributes
1053
+ if self.target_number is not None:
1054
+ self.target_type = (
1055
+ "classification"
1056
+ if self.target_number in self.target_clf
1057
+ else "regression"
1058
+ )
1059
+ self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
1060
+
1061
+ # Set paths and features if experiment is available
1062
+ if self.experiment:
1063
+ self.experiment_dir = self.experiment.path
1064
+ self.experiment_id = self.experiment.id
1065
+ self.data_dir = f"{self.experiment_dir}/data"
1066
+ self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
1067
+
1068
+ if self.target_number is not None:
1069
+ self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
1070
+ self.features = self.experiment.get_features(self.target_number)
1071
+
1072
+ self.all_features = self.experiment.get_all_features(
1073
+ date_column=self.date_column, group_column=self.group_column
1074
+ )
819
1075
 
820
1076
  # Main training function
821
- def run(
822
- self,
823
- experiment_name,
824
- perform_hyperopt=True,
825
- number_of_trials=20,
826
- perform_crossval=False,
827
- plot=True,
828
- clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
829
- preserve_model=True,
830
- best_params=None,
831
- ):
1077
+ def fit(self, X, y=None, reshaped_data=None, best_params=None):
832
1078
  """
833
- Selects the best models based on a target variable, optionally performing hyperparameter optimization
834
- and cross-validation, and manages outputs in a session-specific directory.
1079
+ Fit the model selector (train and select best model).
1080
+
1081
+ Args:
1082
+ X: Either a DataFrame or a dict with train/val/test data
1083
+ y: Target values (ignored, uses TARGET columns)
1084
+ reshaped_data: Optional reshaped data for recurrent models
1085
+ best_params: Optional pre-defined best parameters
1086
+
1087
+ Returns:
1088
+ self: Returns self for chaining
835
1089
  """
836
- self.experiment_name = experiment_name
837
- self.plot = plot
838
- self.number_of_trials = number_of_trials
1090
+ # Handle both DataFrame and dict inputs
1091
+ if isinstance(X, dict):
1092
+ self.data = X
1093
+ self.reshaped_data = reshaped_data
1094
+ else:
1095
+ # For simple DataFrame input, we expect it to be just training data
1096
+ # This is less common for ModelSelector which typically needs train/val/test
1097
+ raise ValueError("ModelSelector requires a dict with train/val/test data")
1098
+ # Get all parameters from experiment context
1099
+ context = self.experiment.context
1100
+ self.experiment_name = context.get("experiment_name", "")
1101
+ self.plot = context.get("plot", True)
1102
+ self.number_of_trials = context.get("number_of_trials", 20)
1103
+ self.perform_crossval = context.get("perform_crossval", False)
1104
+ self.preserve_model = context.get("preserve_model", True)
1105
+ self.perform_hyperopt = context.get("perform_hyperopt", True)
839
1106
 
840
1107
  if self.experiment_id is None:
841
1108
  raise ValueError("Please provide a experiment.")
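ModelSelector now relies on LeCrapaudEstimatorMixin to copy experiment.context entries onto the instance, with the hasattr checks above only filling defaults for parameters the context did not set. The mixin itself is not shown in this diff, so the following is an assumed, simplified stand-in illustrating that pattern (a hypothetical ContextMixin, not the real implementation in lecrapaud/mixins.py):

    class ContextMixin:
        def __init__(self, experiment=None, **kwargs):
            self.experiment = experiment
            context = getattr(experiment, "context", None) or {}
            for key, value in context.items():      # experiment.context -> attributes
                setattr(self, key, value)
            for key, value in kwargs.items():       # explicit kwargs win over context
                setattr(self, key, value)


    class Selector(ContextMixin):
        def __init__(self, experiment=None, target_number=None, **kwargs):
            super().__init__(experiment=experiment, target_number=target_number, **kwargs)
            # Defaults only for parameters the context did not provide
            if not hasattr(self, "time_series"):
                self.time_series = False
            if not hasattr(self, "target_clf"):
                self.target_clf = []


    class FakeExperiment:
        context = {"time_series": True, "target_clf": [1, 2]}


    s = Selector(experiment=FakeExperiment(), target_number=1)
    print(s.time_series, s.target_clf, s.target_number)   # True [1, 2] 1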
@@ -885,12 +1152,11 @@ class ModelSelectionEngine:
885
1152
  # create model selection in db
886
1153
  target = Target.find_by(name=f"TARGET_{self.target_number}")
887
1154
  model_selection = ModelSelection.upsert(
888
- match_fields=["target_id", "experiment_id"],
889
1155
  target_id=target.id,
890
1156
  experiment_id=self.experiment_id,
891
1157
  )
892
1158
 
893
- # recurrent models starts at 9 # len(list_models)
1159
+ # STEP 1 : TRAINING MODELS
894
1160
  for i in self.models_idx:
895
1161
  config = all_models[i]
896
1162
  recurrent = config["recurrent"]
@@ -903,24 +1169,16 @@ class ModelSelectionEngine:
903
1169
  self.results_dir = f"{self.target_dir}/{model_name}"
904
1170
  if not os.path.exists(f"{self.results_dir}"):
905
1171
  os.makedirs(f"{self.results_dir}")
906
- elif preserve_model and contains_best(self.results_dir):
1172
+ elif self.preserve_model and contains_best(self.results_dir):
907
1173
  continue
908
- elif perform_hyperopt:
1174
+ elif self.perform_hyperopt:
909
1175
  clean_directory(self.results_dir)
910
1176
 
911
- logger.info(f"Training a {model_name}")
912
- model = Model.upsert(
913
- match_fields=["name", "type"],
914
- name=model_name,
915
- type=self.target_type,
916
- )
917
- model_training = ModelTraining.upsert(
918
- match_fields=["model_id", "model_selection_id"],
919
- model_id=model.id,
920
- model_selection_id=model_selection.id,
1177
+ logger.info(
1178
+ f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
921
1179
  )
922
1180
 
923
- # getting data
1181
+ # Getting data
924
1182
  if recurrent:
925
1183
  # Clear cluster from previous Keras session graphs.
926
1184
  K.clear_session()
@@ -930,7 +1188,7 @@ class ModelSelectionEngine:
930
1188
  for i, e in enumerate(self.all_features)
931
1189
  if e in set(self.features)
932
1190
  ]
933
- # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
1191
+ # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
934
1192
  x_train = x_train_reshaped[:, :, features_idx]
935
1193
  y_train = y_train_reshaped[:, [self.target_number, 0]]
936
1194
  x_val = x_val_reshaped[:, :, features_idx]
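For recurrent models the reshaped arrays are 3-D (samples, timesteps, features), and features_idx selects the kept feature columns on the last axis. A tiny numpy illustration with made-up shapes:

    import numpy as np

    x_reshaped = np.arange(2 * 3 * 5).reshape(2, 3, 5)   # (samples=2, timesteps=3, features=5)
    features_idx = [0, 3, 4]                             # positions of the selected features

    x_selected = x_reshaped[:, :, features_idx]
    print(x_selected.shape)   # (2, 3, 3): same samples and timesteps, 3 features kept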
@@ -960,8 +1218,9 @@ class ModelSelectionEngine:
960
1218
  y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
961
1219
 
962
1220
  log_dir = get_log_dir(self.target_dir, model_name)
963
- # instantiate model
964
- model = ModelEngine(
1221
+
1222
+ # Instantiate model
1223
+ model = BaseModel(
965
1224
  target_number=self.target_number,
966
1225
  model_name=model_name,
967
1226
  search_params=config["search_params"],
@@ -971,9 +1230,9 @@ class ModelSelectionEngine:
971
1230
  log_dir=log_dir,
972
1231
  )
973
1232
 
974
- start = time.time()
975
1233
  # Tuning hyperparameters
976
- if perform_hyperopt:
1234
+ start = time.time()
1235
+ if self.perform_hyperopt:
977
1236
  model_best_params = self.hyperoptimize(
978
1237
  x_train, y_train, x_val, y_val, model
979
1238
  )
@@ -989,7 +1248,7 @@ class ModelSelectionEngine:
989
1248
  f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
990
1249
  )
991
1250
 
992
- # save best params
1251
+ # Save best params
993
1252
  best_params_file = f"{self.target_dir}/best_params.json"
994
1253
  try:
995
1254
  with open(best_params_file, "r") as f:
@@ -1001,114 +1260,25 @@ class ModelSelectionEngine:
1001
1260
  with open(best_params_file, "w") as f:
1002
1261
  json.dump(json_dict, f, indent=4)
1003
1262
 
1004
- # Perform cross-validation of the best model on k-folds of train + val set
1005
- if perform_crossval:
1006
- x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
1007
- y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
1008
- n_splits = 4
1009
- n_samples = len(x_train_val)
1010
- test_size = int(n_samples / (n_splits + 4))
1011
- tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
1012
-
1013
- # Store the scores
1014
- cv_scores = []
1015
-
1016
- for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
1017
- self.type_name = f"crossval_fold_{i}"
1018
-
1019
- if self.time_series:
1020
- date_series = pd.concat(
1021
- [
1022
- train[self.date_column],
1023
- val[self.date_column],
1024
- test[self.date_column],
1025
- ],
1026
- axis=0,
1027
- ).reset_index(drop=True)
1028
-
1029
- date_series = date_series.map(pd.Timestamp.fromordinal)
1030
-
1031
- # Now you can use the actual train/val indices to extract ranges
1032
- train_start = date_series.iloc[train_index[0]]
1033
- train_end = date_series.iloc[train_index[-1]]
1034
- val_start = date_series.iloc[val_index[0]]
1035
- val_end = date_series.iloc[val_index[-1]]
1036
-
1037
- logger.info(
1038
- f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
1039
- f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
1040
- )
1041
- else:
1042
- logger.info(
1043
- f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
1044
- )
1045
-
1046
- # Train the model and get the score
1047
- if recurrent:
1048
- cv_score, _, _ = self.train_model(
1049
- params=model_best_params,
1050
- x_train=x_train_val[train_index],
1051
- y_train=y_train_val[train_index],
1052
- x_val=x_train_val[val_index],
1053
- y_val=y_train_val[val_index],
1054
- model=model,
1055
- )
1056
- else:
1057
- cv_score, _, _ = self.train_model(
1058
- params=model_best_params,
1059
- x_train=x_train_val.iloc[train_index],
1060
- y_train=y_train_val.iloc[train_index],
1061
- x_val=x_train_val.iloc[val_index],
1062
- y_val=y_train_val.iloc[val_index],
1063
- model=model,
1064
- )
1065
-
1066
- # Append score to the list
1067
- cv_scores.append(cv_score)
1068
-
1069
- # Calculate mean of all numerical metrics across all cross-validation folds
1070
- cv_scores_df = pd.DataFrame(cv_scores)
1071
- # Get mean of all numeric columns
1072
- cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
1263
+ # Always evaluate on test set (no cross-validation here)
1264
+ # The hyperopt already did CV if needed to find best params
1265
+ best_score, best_model, best_pred = self.train_model(
1266
+ params=model_best_params,
1267
+ x_train=pd.concat([x_train, x_val], axis=0),
1268
+ y_train=pd.concat([y_train, y_val], axis=0),
1269
+ x_val=x_test,
1270
+ y_val=y_test,
1271
+ model=model,
1272
+ )
1273
+ stop = time.time()
1274
+ training_time = stop - start
1073
1275
 
1074
- logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
1075
- for metric, value in cv_means.items():
1276
+ logger.info(f"Model training finished in {training_time:.2f} seconds")
1277
+ logger.info(f"👉 {model.model_name} scores on test set:")
1278
+ for metric, value in best_score.items():
1279
+ if isinstance(value, (int, float)):
1076
1280
  logger.info(f" {metric}: {value:.4f}")
1077
1281
 
1078
- # Retrain on entire training set, but keep score on cross-validation folds
1079
- # Get the test score using the best model
1080
- test_score, best_model, best_pred = self.train_model(
1081
- params=model_best_params,
1082
- x_train=pd.concat([x_train, x_val], axis=0),
1083
- y_train=pd.concat([y_train, y_val], axis=0),
1084
- x_val=x_test,
1085
- y_val=y_test,
1086
- model=model,
1087
- )
1088
-
1089
- # Update all metrics with cross-validation means
1090
- for metric, value in cv_means.items():
1091
- if metric in test_score: # Only update existing metrics
1092
- test_score[metric] = value
1093
- best_score = test_score
1094
- best_score["TYPE"] = "crossval"
1095
- else:
1096
- # Evaluate on test set
1097
- self.type_name = "testset"
1098
- best_score, best_model, best_pred = self.train_model(
1099
- params=model_best_params,
1100
- x_train=pd.concat([x_train, x_val], axis=0),
1101
- y_train=pd.concat([y_train, y_val], axis=0),
1102
- x_val=x_test,
1103
- y_val=y_test,
1104
- model=model,
1105
- )
1106
-
1107
- logger.info(f"👉 {model.model_name} scores on test set:")
1108
- for metric, value in best_score.items():
1109
- if isinstance(value, (int, float)):
1110
- logger.info(f" {metric}: {value:.4f}")
1111
-
1112
1282
  # Save predictions
1113
1283
  best_pred.to_csv(
1114
1284
  f"{self.results_dir}/prediction.csv",
@@ -1119,7 +1289,6 @@ class ModelSelectionEngine:
1119
1289
 
1120
1290
  # Save best model
1121
1291
  model_path = best_model.save(self.results_dir)
1122
-
1123
1292
  model_path = Path(model_path).resolve()
1124
1293
  best_score["MODEL_PATH"] = model_path
1125
1294
 
@@ -1142,32 +1311,26 @@ class ModelSelectionEngine:
1142
1311
  scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
1143
1312
  scores_tracking.to_csv(scores_tracking_path, index=False)
1144
1313
 
1145
- # Save model training metadata
1146
- stop = time.time()
1147
- training_time = stop - start
1148
- model_training.best_params = model_best_params
1149
- model_training.model_path = model_path
1150
- model_training.training_time = training_time
1151
- model_training.save()
1152
-
1153
- # Store metrics in DB
1314
+ # Save in db
1154
1315
  drop_cols = [
1155
1316
  "DATE",
1156
1317
  "MODEL_NAME",
1157
- "MODEL_PATH",
1158
1318
  ]
1159
1319
  best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
1160
1320
  score_data = {k.lower(): v for k, v in best_score.items()}
1161
-
1162
- Score.upsert(
1163
- match_fields=["model_training_id"],
1164
- model_training_id=model_training.id,
1321
+ model = Model.upsert(
1322
+ name=model_name,
1323
+ type=self.target_type,
1324
+ )
1325
+ ModelSelectionScore.upsert(
1326
+ model_id=model.id,
1327
+ model_selection_id=model_selection.id,
1328
+ best_params=serialize_for_json(model_best_params),
1329
+ training_time=training_time,
1165
1330
  **score_data,
1166
1331
  )
1167
1332
 
1168
- logger.info(f"Model training finished in {training_time:.2f} seconds")
1169
-
1170
- # find best model type
1333
+ # STEP 2 :FINDING BEST MODEL OVERALL
1171
1334
  scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
1172
1335
  scores_tracking = pd.read_csv(scores_tracking_path)
1173
1336
  best_score_overall = scores_tracking.iloc[0, :]
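Each trained model appends its test-set score to scores_tracking.csv, the file is sorted ascending on the target metric, and the first row becomes the best model overall. A small pandas sketch of that bookkeeping, with illustrative column names and values:

    import pandas as pd

    scores_tracking = pd.DataFrame(
        [
            {"MODEL_NAME": "xgboost", "RMSE": 3.1, "MODEL_PATH": "TARGET_1/xgboost.best"},
            {"MODEL_NAME": "lightgbm", "RMSE": 2.7, "MODEL_PATH": "TARGET_1/lightgbm.best"},
        ]
    )

    # Append the latest model's score, then keep the file sorted (lower RMSE is better)
    new_row = {"MODEL_NAME": "catboost", "RMSE": 2.9, "MODEL_PATH": "TARGET_1/catboost.best"}
    scores_tracking = pd.concat([scores_tracking, pd.DataFrame([new_row])], ignore_index=True)
    scores_tracking.sort_values("RMSE", ascending=True, inplace=True)

    best_score_overall = scores_tracking.iloc[0, :]
    print(best_score_overall["MODEL_NAME"])   # lightgbm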
@@ -1178,12 +1341,11 @@ class ModelSelectionEngine:
1178
1341
  else:
1179
1342
  best_thresholds = None
1180
1343
 
1181
- # Remove any .best or .keras files
1344
+ # Remove any .best or .keras files, and save best model in target_dir
1182
1345
  for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
1183
1346
  os.path.join(self.target_dir, "*.keras")
1184
1347
  ):
1185
1348
  os.remove(file_path)
1186
- # Copy the best model in root training folder for this target
1187
1349
  best_model_path = Path(
1188
1350
  f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
1189
1351
  ).resolve()
@@ -1195,13 +1357,13 @@ class ModelSelectionEngine:
1195
1357
  with open(f"{self.target_dir}/best_params.json", "r") as f:
1196
1358
  best_model_params = json.load(f)[best_model_name]
1197
1359
 
1198
- # Save model_selection results to db
1199
-
1360
+ # Save to db
1200
1361
  model_selection = ModelSelection.get(model_selection.id)
1201
- model_selection.best_model_id = Model.find_by(
1362
+ model = Model.find_by(
1202
1363
  name=best_score_overall["MODEL_NAME"], type=self.target_type
1203
- ).id
1204
- model_selection.best_model_params = best_model_params
1364
+ )
1365
+ model_selection.best_model_id = model.id
1366
+ model_selection.best_model_params = serialize_for_json(best_model_params)
1205
1367
  model_selection.best_thresholds = best_thresholds
1206
1368
  model_selection.best_model_path = best_model_path
1207
1369
 
@@ -1214,7 +1376,7 @@ class ModelSelectionEngine:
1214
1376
  k: v for k, v in best_score_overall.items() if k not in drop_cols
1215
1377
  }
1216
1378
  score_data = {k.lower(): v for k, v in best_score_overall.items()}
1217
- model_selection.best_score = score_data
1379
+ model_selection.best_score = serialize_for_json(score_data)
1218
1380
  model_selection.save()
1219
1381
 
1220
1382
  logger.info(f"Best model overall is : {best_score_overall}")
@@ -1222,11 +1384,188 @@ class ModelSelectionEngine:
1222
1384
  # Consolidate best parameters from all targets into a single file
1223
1385
  self.consolidate_best_params()
1224
1386
 
1225
- best_model = joblib.load(best_model_path)
1226
- return best_model
1387
+ self.best_model_ = BaseModel(
1388
+ path=self.target_dir, target_number=self.target_number
1389
+ )
1390
+ self._set_fitted()
1391
+ return self
1392
+
1393
+ def get_best_model(self):
1394
+ """
1395
+ Get the best trained model.
1396
+
1397
+ Returns:
1398
+ The best model found during training
1399
+ """
1400
+ self._check_is_fitted()
1401
+ return self.best_model_
1402
+
1403
+ def hyperoptimize(self, x_train, y_train, x_val, y_val, model: BaseModel):
1404
+ """Choose between Ray Tune and HyperOpt standalone based on configuration."""
1405
+ if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
1406
+ return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
1407
+ elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
1408
+ return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
1409
+ else:
1410
+ raise ValueError(
1411
+ f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
1412
+ )
1413
+
1414
+ def hyperoptimize_hyperopt(self, x_train, y_train, x_val, y_val, model: BaseModel):
1415
+ """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
1416
+
1417
+ logger.info("Start tuning hyperparameters with HyperOpt standalone...")
1418
+
1419
+ # Convert Ray search space to HyperOpt search space
1420
+ def convert_search_space(ray_space):
1421
+ """Convert Ray Tune search space to HyperOpt format."""
1422
+ from ray.tune.search.sample import Categorical, Float, Integer
1423
+
1424
+ hp_space = {}
1425
+ for key, value in ray_space.items():
1426
+ if isinstance(value, Float):
1427
+ if (
1428
+ hasattr(value, "sampler")
1429
+ and value.sampler.__class__.__name__ == "LogUniform"
1430
+ ):
1431
+ # LogUniform distribution
1432
+ hp_space[key] = hp.loguniform(
1433
+ key, np.log(value.lower), np.log(value.upper)
1434
+ )
1435
+ else:
1436
+ # Uniform distribution
1437
+ hp_space[key] = hp.uniform(key, value.lower, value.upper)
1438
+ elif isinstance(value, Integer):
1439
+ # Integer uniform distribution
1440
+ hp_space[key] = hp.randint(key, value.lower, value.upper)
1441
+ elif isinstance(value, Categorical):
1442
+ # Categorical/choice distribution
1443
+ hp_space[key] = hp.choice(key, value.categories)
1444
+ elif isinstance(value, dict):
1445
+ # Nested dict, recurse
1446
+ hp_space[key] = convert_search_space(value)
1447
+ else:
1448
+ # Static value or unknown type
1449
+ hp_space[key] = value
1450
+ return hp_space
1451
+
1452
+ # Create objective function for HyperOpt
1453
+ def objective(params):
1454
+ """Objective function to minimize."""
1455
+ try:
1456
+ # Convert numpy types to native Python types
1457
+ params = serialize_for_json(params)
1458
+
1459
+ # Use existing trainable function based on perform_crossval
1460
+ if self.perform_crossval:
1461
+ score = trainable_cv(
1462
+ params,
1463
+ x_train,
1464
+ y_train,
1465
+ x_val,
1466
+ y_val,
1467
+ model.model_name,
1468
+ self.target_type,
1469
+ self.experiment_name,
1470
+ self.target_number,
1471
+ model.create_model,
1472
+ n_splits=3,
1473
+ plot=model.plot,
1474
+ log_dir=model.log_dir,
1475
+ target_clf_thresholds=self.target_clf_thresholds,
1476
+ time_series=self.time_series,
1477
+ recurrent=model.recurrent,
1478
+ )
1479
+ else:
1480
+ score, _, _ = trainable(
1481
+ params,
1482
+ x_train,
1483
+ y_train,
1484
+ x_val,
1485
+ y_val,
1486
+ model.model_name,
1487
+ self.target_type,
1488
+ self.experiment_name,
1489
+ self.target_number,
1490
+ model.create_model,
1491
+ plot=model.plot,
1492
+ log_dir=model.log_dir,
1493
+ target_clf_thresholds=self.target_clf_thresholds,
1494
+ )
1495
+
1496
+ # HyperOpt minimizes, so return the metric directly
1497
+ loss = score[self.metric]
1498
+
1499
+ # Log trial info
1500
+ logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
1501
+
1502
+ return {
1503
+ "loss": loss,
1504
+ "status": STATUS_OK,
1505
+ "score": score, # Keep full score dict for analysis
1506
+ }
1507
+
1508
+ except Exception as e:
1509
+ logger.error(f"Trial failed: {str(e)}")
1510
+ return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
1511
+
1512
+ # Convert search space
1513
+ hp_search_space = convert_search_space(model.search_params)
1514
+
1515
+ # Run optimization
1516
+ trials = Trials()
1517
+ best_params = fmin(
1518
+ fn=objective,
1519
+ space=hp_search_space,
1520
+ algo=tpe.suggest,
1521
+ max_evals=self.number_of_trials,
1522
+ trials=trials,
1523
+ verbose=True,
1524
+ show_progressbar=True,
1525
+ )
1526
+
1527
+ # Get the actual parameter values (not just indices for hp.choice)
1528
+ best_params = space_eval(hp_search_space, best_params)
1529
+
1530
+ # Convert numpy types to native Python types
1531
+ best_params = serialize_for_json(best_params)
1532
+
1533
+ # Get best score from trials
1534
+ best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
1535
+ best_score = trials.trials[best_trial_idx]["result"].get("score", {})
1536
+
1537
+ # Log results
1538
+ logger.info(f"Best hyperparameters found were:\n{best_params}")
1539
+ logger.info(f"Best Scores found were:\n{best_score}")
1540
+
1541
+ # Create summary DataFrame for consistency with Ray version
1542
+ results_df = pd.DataFrame(
1543
+ [
1544
+ {
1545
+ "trial_id": i,
1546
+ self.metric: t["result"]["loss"],
1547
+ **{
1548
+ k: v
1549
+ for k, v in t["result"].get("score", {}).items()
1550
+ if isinstance(v, (int, float))
1551
+ },
1552
+ }
1553
+ for i, t in enumerate(trials.trials)
1554
+ if t["result"]["status"] == STATUS_OK
1555
+ ]
1556
+ )
1557
+
1558
+ if not results_df.empty:
1559
+ logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
1227
1560
 
1228
- def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
1229
- self.type_name = "hyperopts"
1561
+ # Save trial history for analysis
1562
+ trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
1563
+ with open(trials_path, "wb") as f:
1564
+ pickle.dump(trials, f)
1565
+
1566
+ return best_params
1567
+
1568
+ def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: BaseModel):
1230
1569
 
1231
1570
  def collect_error_logs(target_dir: int, storage_path: str):
1232
1571
  output_error_file = f"{target_dir}/errors.log"
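hyperoptimize() now dispatches on LECRAPAUD_OPTIMIZATION_BACKEND, and the HyperOpt path converts the Ray search space before calling fmin. One detail the code above handles explicitly: fmin returns indices for hp.choice dimensions, so space_eval is needed to recover the actual values. A self-contained sketch with a toy objective, assuming the hyperopt package is installed:

    from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe

    space = {
        "learning_rate": hp.loguniform("learning_rate", -5, 0),   # roughly exp(-5)..1
        "booster": hp.choice("booster", ["gbtree", "dart"]),
    }

    def objective(params):
        # Toy loss: prefer small learning rates and the "gbtree" choice
        loss = params["learning_rate"] + (0.0 if params["booster"] == "gbtree" else 0.1)
        return {"loss": loss, "status": STATUS_OK}

    trials = Trials()
    best = fmin(objective, space, algo=tpe.suggest, max_evals=20,
                trials=trials, verbose=False, show_progressbar=False)

    print(best)                     # hp.choice entries come back as indices
    print(space_eval(space, best))  # mapped back to the actual parameter values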
@@ -1269,9 +1608,22 @@ class ModelSelectionEngine:
1269
1608
  }
1270
1609
  )
1271
1610
 
1611
+ # Choose between regular trainable or CV version based on perform_crossval flag
1612
+ # perform_crossval controls whether to use CV during hyperopt
1613
+ if self.perform_crossval:
1614
+ trainable_fn = trainable_cv
1615
+ additional_params = {
1616
+ "n_splits": 3, # Can be made configurable
1617
+ "time_series": self.time_series, # Controls whether to use TimeSeriesSplit or StratifiedKFold
1618
+ "recurrent": model.recurrent,
1619
+ }
1620
+ else:
1621
+ trainable_fn = trainable
1622
+ additional_params = {}
1623
+
1272
1624
  tuner = Tuner(
1273
1625
  trainable=with_parameters(
1274
- trainable,
1626
+ trainable_fn,
1275
1627
  x_train=x_train,
1276
1628
  y_train=y_train,
1277
1629
  x_val=x_val,
@@ -1281,10 +1633,10 @@ class ModelSelectionEngine:
1281
1633
  experiment_name=self.experiment_name,
1282
1634
  target_number=self.target_number,
1283
1635
  create_model=model.create_model,
1284
- type_name="hyperopts",
1285
1636
  plot=model.plot,
1286
1637
  log_dir=model.log_dir,
1287
1638
  target_clf_thresholds=self.target_clf_thresholds,
1639
+ **additional_params,
1288
1640
  ),
1289
1641
  param_space=model.search_params,
1290
1642
  tune_config=TuneConfig(
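On the Ray path, with_parameters binds the datasets to the trainable once (instead of serializing them into every trial), and additional_params only adds the CV-specific arguments when perform_crossval is set. A minimal sketch of that pattern with a toy trainable, assuming Ray Tune 2.x; the metric name here is made up:

    import numpy as np
    from ray import tune
    from ray.tune import Tuner, TuneConfig, with_parameters

    def toy_trainable(config, data=None):
        # `data` is injected by with_parameters; `config` comes from param_space
        residuals = data - config["x"]
        return {"rmse": float(np.sqrt(np.mean(residuals ** 2)))}

    data = np.array([2.9, 3.1, 3.0])
    tuner = Tuner(
        with_parameters(toy_trainable, data=data),       # bind large objects once
        param_space={"x": tune.uniform(-5.0, 5.0)},
        tune_config=TuneConfig(metric="rmse", mode="min", num_samples=10),
    )
    results = tuner.fit()
    print(results.get_best_result().config)              # best {"x": ...}, close to 3.0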
@@ -1324,7 +1676,7 @@ class ModelSelectionEngine:
1324
1676
 
1325
1677
  return best_params
1326
1678
 
1327
- def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
1679
+ def train_model(self, params, x_train, y_train, x_val, y_val, model: BaseModel):
1328
1680
  # Use the standalone training function to avoid duplication
1329
1681
  # For train_model, we pass the data directly (not as Ray references)
1330
1682
  return trainable(
@@ -1338,7 +1690,6 @@ class ModelSelectionEngine:
1338
1690
  self.experiment_name,
1339
1691
  self.target_number,
1340
1692
  model.create_model,
1341
- self.type_name,
1342
1693
  model.plot,
1343
1694
  log_dir=model.log_dir,
1344
1695
  target_clf_thresholds=self.target_clf_thresholds,
@@ -1444,11 +1795,11 @@ def evaluate(
1444
1795
  y_pred_proba = (
1445
1796
  prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
1446
1797
  )
1447
- if num_classes > 2:
1448
- lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
1449
- lb.fit(labels)
1450
- y_true_onhot = lb.transform(y_true)
1451
- y_pred_onehot = lb.transform(y_pred)
1798
+ # if num_classes > 2:
1799
+ # lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
1800
+ # lb.fit(labels)
1801
+ # y_true_onhot = lb.transform(y_true)
1802
+ # y_pred_onehot = lb.transform(y_pred)
1452
1803
 
1453
1804
  score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
1454
1805
  score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1825,6 +2176,20 @@ class Thresholds(BaseModel):
1825
2176
  def find_best_threshold(
1826
2177
  prediction: pd.DataFrame, metric: str = "recall", target_value: float | None = None
1827
2178
  ) -> Thresholds:
2179
+ def _normalize_class_label(cls):
2180
+ if isinstance(cls, (np.integer, int)):
2181
+ return int(cls)
2182
+ if isinstance(cls, (float, np.floating)) and cls.is_integer():
2183
+ return int(cls)
2184
+ if isinstance(cls, str):
2185
+ try:
2186
+ as_float = float(cls)
2187
+ if as_float.is_integer():
2188
+ return int(as_float)
2189
+ except ValueError:
2190
+ pass
2191
+ return cls
2192
+
1828
2193
  """
1829
2194
  General function to find best threshold optimizing recall, precision, or f1.
1830
2195
 
@@ -1843,10 +2208,15 @@ def find_best_threshold(
1843
2208
  pred_cols = [
1844
2209
  col for col in prediction.columns if col not in ["ID", "TARGET", "PRED"]
1845
2210
  ]
1846
- classes = [1] if len(pred_cols) <= 2 else sorted(y_true.unique())
2211
+ classes = (
2212
+ [1]
2213
+ if len(pred_cols) <= 2
2214
+ else sorted({_normalize_class_label(cls) for cls in y_true.unique()}, key=str)
2215
+ )
1847
2216
 
1848
2217
  results = {}
1849
- for cls in classes:
2218
+ for raw_cls in classes:
2219
+ cls = _normalize_class_label(raw_cls)
1850
2220
  cls_str = str(cls)
1851
2221
  if cls_str not in prediction.columns and cls not in prediction.columns:
1852
2222
  logger.warning(f"Missing predicted probabilities for class '{cls}'")