lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (42)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +277 -0
  3. lecrapaud/config.py +10 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/env.py +2 -2
  6. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
  7. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  8. lecrapaud/db/alembic.ini +116 -0
  9. lecrapaud/db/models/__init__.py +10 -10
  10. lecrapaud/db/models/base.py +176 -1
  11. lecrapaud/db/models/dataset.py +25 -20
  12. lecrapaud/db/models/feature.py +5 -6
  13. lecrapaud/db/models/feature_selection.py +3 -4
  14. lecrapaud/db/models/feature_selection_rank.py +3 -4
  15. lecrapaud/db/models/model.py +3 -4
  16. lecrapaud/db/models/model_selection.py +15 -8
  17. lecrapaud/db/models/model_training.py +15 -7
  18. lecrapaud/db/models/score.py +9 -6
  19. lecrapaud/db/models/target.py +16 -8
  20. lecrapaud/db/session.py +66 -0
  21. lecrapaud/experiment.py +64 -0
  22. lecrapaud/feature_engineering.py +747 -1022
  23. lecrapaud/feature_selection.py +915 -998
  24. lecrapaud/integrations/openai_integration.py +225 -0
  25. lecrapaud/jobs/__init__.py +2 -2
  26. lecrapaud/jobs/config.py +1 -1
  27. lecrapaud/jobs/scheduler.py +1 -1
  28. lecrapaud/jobs/tasks.py +6 -6
  29. lecrapaud/model_selection.py +1060 -960
  30. lecrapaud/search_space.py +4 -0
  31. lecrapaud/utils.py +2 -2
  32. lecrapaud-0.4.1.dist-info/METADATA +171 -0
  33. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
  34. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
  35. lecrapaud/db/crud.py +0 -179
  36. lecrapaud/db/services.py +0 -0
  37. lecrapaud/db/setup.py +0 -58
  38. lecrapaud/predictions.py +0 -292
  39. lecrapaud/training.py +0 -151
  40. lecrapaud-0.4.0.dist-info/METADATA +0 -103
  41. /lecrapaud/{directory_management.py → directories.py} +0 -0
  42. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
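
The substance of the release is the refactor of lecrapaud/model_selection.py shown below: the module-level fit_sklearn / fit_boosting / fit_recurrent / predict functions (and the module globals they shared) move into a new ModelEngine class, and the old model_selection() entry point becomes ModelSelectionEngine.run(). The following is a minimal sketch of how the new class appears to be driven, inferred only from the diff that follows; the model name, factory callable, data variables, and paths are hypothetical placeholders, not documented API:

    from lecrapaud.model_selection import ModelEngine

    # Hypothetical stand-ins: make_rf is a factory callable as consumed by
    # fit_sklearn, and x_train/y_train/x_val/y_val/params come from your pipeline.
    engine = ModelEngine(
        model_name="random_forest",   # assumed to match an entry in search_space.all_models
        target_type="classification",
        create_model=make_rf,
        plot=False,
    )
    engine.fit(x_train, y_train, x_val, y_val, params)  # dispatches to fit_sklearn/fit_boosting/fit_recurrent
    pred = engine.predict(x_val, threshold=0.5)         # "PRED" column plus per-class probabilities
    engine.save("run_dir/TARGET_1/random_forest")       # writes <model_name>.best or <model_name>.keras

    # Reloading uses the path-based constructor, which calls load() and reads the
    # saved threshold from scores_tracking.csv in that directory.
    restored = ModelEngine(path="run_dir/TARGET_1/random_forest")

Note the dispatch rule in fit(): recurrent configs go to fit_recurrent, a create_model equal to the string "lgb" or "xgb" goes to fit_boosting, and anything else is treated as a scikit-learn estimator factory.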
@@ -10,6 +10,7 @@ import warnings
  import joblib
  import glob
  from pathlib import Path
+ import pickle
 
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
@@ -63,12 +64,19 @@ from ray.tune.schedulers import ASHAScheduler
  from ray.air import session
 
  # Internal library
- from src.search_space import ml_models, dl_recurrent_models
- from src.directory_management import clean_directory
- from src.utils import copy_any, contains_best, logger, serialize_for_json
- from src.config import PYTHON_ENV
- from src.feature_selection import TARGETS_CLF, DATE_COLUMN, load_train_data
- from src.db.models import Model, ModelSelection, ModelTraining, Score, Target, Dataset
+ from lecrapaud.search_space import all_models
+ from lecrapaud.directories import clean_directory
+ from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
+ from lecrapaud.config import PYTHON_ENV
+ from lecrapaud.feature_selection import load_train_data
+ from lecrapaud.db import (
+     Model,
+     ModelSelection,
+     ModelTraining,
+     Score,
+     Target,
+     Dataset,
+ )
 
  # Reproducible result
  keras.utils.set_random_seed(42)
@@ -100,1116 +108,1216 @@ def test_hardware():
  warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 
 
- # Metrics
- def rmse_tf(y_true, y_pred):
-     y_true, y_pred = unscale_tf(y_true, y_pred)
-     results = K.sqrt(K.mean(K.square(y_pred - y_true)))
-     return results
-
-
- def mae_tf(y_true, y_pred):
-     y_true, y_pred = unscale_tf(y_true, y_pred)
-     results = K.mean(K.abs(y_pred - y_true))
-     return results
+ class ModelEngine:
 
+     def __init__(
+         self,
+         model_name: str = None,
+         target_type: str = None,
+         path: str = None,
+         search_params: dict = {},
+         create_model=None,
+         plot: bool = False,
+         log_dir: str = None,
+     ):
+         self.path = path
+         if path:
+             self.load()
+         else:
+             self.model_name = model_name
+             self.target_type = target_type
 
- def unscale_tf(y_true, y_pred):
-     if _target_type == "regression":
-         scale = K.constant(_scaler_y.scale_[0])
-         mean = K.constant(_scaler_y.mean_[0])
+         config = [
+             config for config in all_models if config["model_name"] == self.model_name
+         ]
+         if config is None or len(config) == 0:
+             Exception(
+                 f"Model {self.model_name} is not supported by this library."
+                 f"Choose a model from the list of supported models: {[model['model_name'] for model in all_models].join(', ')}"
+             )
+         config = config[0]
 
-     y_true = K.mul(y_true, scale)
-     y_true = K.bias_add(y_true, mean)
+         self.recurrent = config["recurrent"]
+         self.need_scaling = config["need_scaling"]
+         self.search_params = search_params
+         self.create_model = create_model
+         self.plot = plot
+         self.log_dir = log_dir
 
-     y_pred = K.mul(y_pred, scale)
-     y_pred = K.bias_add(y_pred, mean)
-     return y_true, y_pred
+         if self.need_scaling and self.target_type == "regression":
+             self.scaler_y = joblib.load(f"{self.path}/scaler_y.pkl")
+         else:
+             self.scaler_y = None
 
+         self.threshold = None
 
- def recall_tf(y_true, y_pred):
-     y_true = K.ones_like(y_true)
-     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
-     all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+     def fit(self, *args):
+         if self.recurrent:
+             fit = self.fit_recurrent
+         elif (self.create_model == "lgb") or (self.create_model == "xgb"):
+             fit = self.fit_boosting
+         else:
+             fit = self.fit_sklearn
+         model = fit(*args)
+         return model
+
+     # Functions to fit & evaluate models
+     def fit_sklearn(self, x_train, y_train, x_val, y_val, params):
+
+         # Create & Compile the model
+         model = self.create_model(**params)
+
+         # Train the model
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         model.fit(x_train, y_train)
+
+         if (
+             self.target_type == "classification"
+             and "loss" in model.get_params().keys()
+             and "hinge" in model.get_params()["loss"]
+         ):
+             # This is for SVC models with hinge loss
+             # You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
+             # TODO: investigate if we should use calibration for random forest, gradiant boosting models, and bagging models
+             logger.info(
+                 f"Re-Calibrating {self.model_name} to get predict probabilities..."
+             )
+             calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
+             model = calibrator.fit(x_train, y_train)
 
-     recall = true_positives / (all_positives + K.epsilon())
-     return recall
+         # set model_name after calibrator
+         model.model_name = self.model_name
+         model.target_type = self.target_type
 
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
 
- def precision_tf(y_true, y_pred):
-     y_true = K.ones_like(y_true)
-     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+         self._model = model
 
-     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
-     precision = true_positives / (predicted_positives + K.epsilon())
-     return precision
+         return model
 
+     def fit_boosting(self, x_train, y_train, x_val, y_val, params):
+         """
+         This is using lightGBM or XGboost C++ librairies
+         """
+         lightGBM = self.create_model == "lgb"
 
- def f1_score_tf(y_true, y_pred):
-     precision = precision_tf(y_true, y_pred)
-     recall = recall_tf(y_true, y_pred)
-     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
+         # Datasets
+         boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
+         train_data = boosting_dataset(x_train, label=y_train)
+         val_data = boosting_dataset(x_val, label=y_val)
 
+         # Create a TensorBoardX writer
+         writer = SummaryWriter(self.log_dir)
+         evals_result = {}
 
- def get_log_dir(training_target_dir: str, model_name="test_model"):
-     """Generates a structured log directory path for TensorBoard."""
-     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
-     log_dir = (
-         Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
-     )
-     log_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
-     return str(log_dir)
-
+         # Training
+         labels = np.unique(y_train)
+         num_class = (
+             labels.size
+             if self.target_type == "classification" and labels.size > 2
+             else 1
+         )
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
 
- # Functions to fit & evaluate models
- def fit_sklearn(x_train, y_train, x_val, y_val, create_model, params, config):
+         if lightGBM:
 
-     # Create & Compile the model
-     model = create_model(**params)
+             def tensorboard_callback(env):
+                 for i, metric in enumerate(env.evaluation_result_list):
+                     metric_name, _, metric_value, _ = metric
+                     writer.add_scalar(
+                         f"LightGBM/{metric_name}", metric_value, env.iteration
+                     )
 
-     # Train the model
-     logger.info("Fitting the model...")
-     logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
-     logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+             loss = (
+                 "regression"
+                 if self.target_type == "regression"
+                 else ("binary" if num_class <= 2 else "multiclass")
+             )
+             eval_metric = (
+                 "rmse"
+                 if self.target_type == "regression"
+                 else ("binary_logloss" if num_class <= 2 else "multi_logloss")
+             )
+             model = lgb.train(
+                 params={
+                     **params["model_params"],
+                     "objective": loss,
+                     "metric": eval_metric,
+                     "num_class": num_class,
+                 },
+                 num_boost_round=params["num_boost_round"],
+                 train_set=train_data,
+                 valid_sets=[train_data, val_data],
+                 valid_names=["train", "val"],
+                 callbacks=[
+                     lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
+                     lgb.record_evaluation(evals_result),
+                     tensorboard_callback,
+                 ],
+             )
+         else:
 
-     model.fit(x_train, y_train)
+             class TensorBoardCallback(xgb.callback.TrainingCallback):
 
-     if (
-         _target_type == "classification"
-         and "loss" in model.get_params().keys()
-         and "hinge" in model.get_params()["loss"]
-     ):
-         # This is for SVC models with hinge loss
-         # You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
-         # TODO: investigate if we should use calibration for random forest, gradiant boosting models, and bagging models
-         logger.info(
-             f"Re-Calibrating {config["model_name"]} to get predict probabilities..."
-         )
-         calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
-         model = calibrator.fit(x_train, y_train)
+                 def __init__(self, log_dir: str):
+                     self.writer = SummaryWriter(log_dir=log_dir)
 
-         # set model_name after calibrator
-         model.model_name = config["model_name"]
+                 def after_iteration(
+                     self,
+                     model,
+                     epoch: int,
+                     evals_log: xgb.callback.TrainingCallback.EvalsLog,
+                 ) -> bool:
+                     if not evals_log:
+                         return False
 
-     logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+                     for data, metric in evals_log.items():
+                         for metric_name, log in metric.items():
+                             score = (
+                                 log[-1][0] if isinstance(log[-1], tuple) else log[-1]
+                             )
+                             self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
 
-     return model
+                     return False
 
+             tensorboard_callback = TensorBoardCallback(self.log_dir)
 
- def fit_boosting(x_train, y_train, x_val, y_val, create_model, params, config):
-     """
-     This is using lightGBM or XGboost C++ librairies
-     """
-     lightGBM = create_model == "lgb"
+             loss = (
+                 "reg:squarederror"
+                 if self.target_type == "regression"
+                 else ("binary:logistic" if num_class <= 2 else "multi:softprob")
+             )
+             eval_metric = (
+                 "rmse"
+                 if self.target_type == "regression"
+                 else ("logloss" if num_class <= 2 else "mlogloss")
+             )
+             model = xgb.train(
+                 params={
+                     **params["model_params"],
+                     "objective": loss,
+                     "eval_metric": eval_metric,
+                     "num_class": num_class,
+                 },
+                 num_boost_round=params["num_boost_round"],
+                 dtrain=train_data,
+                 evals=[(val_data, "val"), (train_data, "train")],
+                 callbacks=[
+                     xgb.callback.EarlyStopping(
+                         rounds=params["early_stopping_rounds"], save_best=True
+                     ),
+                     xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
+                     tensorboard_callback,
+                 ],
+                 evals_result=evals_result,  # Record evaluation result
+                 verbose_eval=0,
+             )
 
-     # Datasets
-     Dataset = lgb.Dataset if lightGBM else xgb.DMatrix
-     train_data = Dataset(x_train, label=y_train)
-     val_data = Dataset(x_val, label=y_val)
+         model.model_name = self.create_model
+         model.target_type = self.target_type
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
 
-     # Callbacks
-     log_dir = get_log_dir(_training_target_dir, create_model)
+         # Close the writer after training is done
+         writer.close()
 
-     # Create a TensorBoardX writer
-     writer = SummaryWriter(log_dir)
-     evals_result = {}
+         if self.plot:
+             # Plot loss per epoch
+             train_loss = evals_result["train"][eval_metric]
+             val_loss = evals_result["val"][eval_metric]
+             logs = pd.DataFrame({"train": train_loss, "val": val_loss})
 
-     # Training
-     labels = np.unique(y_train)
-     num_class = (
-         labels.size if _target_type == "classification" and labels.size > 2 else 1
-     )
-     logger.info("Fitting the model...")
-     logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
-     logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+             plt.figure(figsize=(14, 4))
+             plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
+             plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.legend()
+             plt.show()
 
-     if lightGBM:
+         self._model = model
 
-         def tensorboard_callback(env):
-             for i, metric in enumerate(env.evaluation_result_list):
-                 metric_name, _, metric_value, _ = metric
-                 writer.add_scalar(
-                     f"LightGBM/{metric_name}", metric_value, env.iteration
-                 )
+         return model
 
-         loss = (
-             "regression"
-             if _target_type == "regression"
-             else ("binary" if num_class <= 2 else "multiclass")
-         )
-         eval_metric = (
-             "rmse"
-             if _target_type == "regression"
-             else ("binary_logloss" if num_class <= 2 else "multi_logloss")
-         )
-         model = lgb.train(
-             params={
-                 **params["model_params"],
-                 "objective": loss,
-                 "metric": eval_metric,
-                 "num_class": num_class,
-             },
-             num_boost_round=params["num_boost_round"],
-             train_set=train_data,
-             valid_sets=[train_data, val_data],
-             valid_names=["train", "val"],
-             callbacks=[
-                 lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
-                 lgb.record_evaluation(evals_result),
-                 tensorboard_callback,
-             ],
-         )
-     else:
+     def fit_recurrent(self, x_train, y_train, x_val, y_val, params):
 
-         class TensorBoardCallback(xgb.callback.TrainingCallback):
+         # metrics functions
+         def rmse_tf(y_true, y_pred):
+             y_true, y_pred = unscale_tf(y_true, y_pred)
+             results = K.sqrt(K.mean(K.square(y_pred - y_true)))
+             return results
 
-             def __init__(self, log_dir: str):
-                 self.writer = SummaryWriter(log_dir=log_dir)
+         def mae_tf(y_true, y_pred):
+             y_true, y_pred = unscale_tf(y_true, y_pred)
+             results = K.mean(K.abs(y_pred - y_true))
+             return results
 
-             def after_iteration(
-                 self,
-                 model,
-                 epoch: int,
-                 evals_log: xgb.callback.TrainingCallback.EvalsLog,
-             ) -> bool:
-                 if not evals_log:
-                     return False
+         def unscale_tf(y_true, y_pred):
+             if self.target_type == "regression":
+                 scale = K.constant(self.scaler_y.scale_[0])
+                 mean = K.constant(self.scaler_y.mean_[0])
 
-                 for data, metric in evals_log.items():
-                     for metric_name, log in metric.items():
-                         score = log[-1][0] if isinstance(log[-1], tuple) else log[-1]
-                         self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
+             y_true = K.mul(y_true, scale)
+             y_true = K.bias_add(y_true, mean)
 
-                 return False
+             y_pred = K.mul(y_pred, scale)
+             y_pred = K.bias_add(y_pred, mean)
+             return y_true, y_pred
 
-         tensorboard_callback = TensorBoardCallback(log_dir)
+         # Create the model
+         labels = np.unique(y_train[:, 0])
+         num_class = labels.size if self.target_type == "classification" else None
+         input_shape = (x_train.shape[1], x_train.shape[2])
+         model = self.create_model(params, input_shape, self.target_type, num_class)
+         model.target_type = self.target_type
 
+         # Compile the model
          loss = (
-             "reg:squarederror"
-             if _target_type == "regression"
-             else ("binary:logistic" if num_class <= 2 else "multi:softprob")
-         )
-         eval_metric = (
-             "rmse"
-             if _target_type == "regression"
-             else ("logloss" if num_class <= 2 else "mlogloss")
+             rmse_tf
+             if self.target_type == "regression"
+             else (
+                 BinaryCrossentropy(from_logits=False)
+                 if num_class <= 2
+                 else CategoricalCrossentropy(from_logits=False)
+             )
          )
-         model = xgb.train(
-             params={
-                 **params["model_params"],
-                 "objective": loss,
-                 "eval_metric": eval_metric,
-                 "num_class": num_class,
-             },
-             num_boost_round=params["num_boost_round"],
-             dtrain=train_data,
-             evals=[(val_data, "val"), (train_data, "train")],
-             callbacks=[
-                 xgb.callback.EarlyStopping(
-                     rounds=params["early_stopping_rounds"], save_best=True
-                 ),
-                 xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
-                 tensorboard_callback,
-             ],
-             evals_result=evals_result,  # Record evaluation result
-             verbose_eval=0,
+         optimizer = Adam(
+             learning_rate=params["learning_rate"], clipnorm=params["clipnorm"]
          )
-
-     model.model_name = create_model
-     logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
-
-     # Close the writer after training is done
-     writer.close()
-
-     if _plot:
-         # Plot loss per epoch
-         train_loss = evals_result["train"][eval_metric]
-         val_loss = evals_result["val"][eval_metric]
-         logs = pd.DataFrame({"train": train_loss, "val": val_loss})
-
-         plt.figure(figsize=(14, 4))
-         plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
-         plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
-         plt.xlabel("Epoch")
-         plt.ylabel("Loss")
-         plt.legend()
-         plt.show()
-
-     return model
-
-
- def fit_recurrent(x_train, y_train, x_val, y_val, create_model, params, config):
-
-     # Create the model
-     labels = np.unique(y_train[:, 0])
-     num_class = labels.size if _target_type == "classification" else None
-     input_shape = (x_train.shape[1], x_train.shape[2])
-     model = create_model(params, input_shape, _target_type, num_class)
-
-     # Compile the model
-     loss = (
-         rmse_tf
-         if _target_type == "regression"
-         else (
-             BinaryCrossentropy(from_logits=False)
-             if num_class <= 2
-             else CategoricalCrossentropy(from_logits=False)
+         metrics = (
+             [mae_tf]
+             if self.target_type == "regression"
+             else (
+                 ["accuracy", Precision(), Recall()]
+                 if num_class <= 2
+                 else ["categorical_accuracy"]
+             )
          )
-     )
-     optimizer = Adam(learning_rate=params["learning_rate"], clipnorm=params["clipnorm"])
-     metrics = (
-         [mae_tf]
-         if _target_type == "regression"
-         else (
-             ["accuracy", Precision(), Recall()]
-             if num_class <= 2
-             else ["categorical_accuracy"]
+         model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
+
+         # Callbacks
+         tensorboard_callback = TensorBoard(log_dir=self.log_dir)
+         early_stopping_callback = EarlyStopping(
+             monitor="val_loss",
+             patience=3,
+             restore_best_weights=True,
+             start_from_epoch=5,
          )
-     )
-     model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
-
-     # Callbacks
-     log_dir = get_log_dir(_training_target_dir, model.model_name)
-
-     tensorboard_callback = TensorBoard(log_dir=log_dir)
-     early_stopping_callback = EarlyStopping(
-         monitor="val_loss", patience=3, restore_best_weights=True, start_from_epoch=5
-     )
-
-     # Custom callbacks
-     class PrintTrainableWeights(keras.callbacks.Callback):
-         def on_epoch_end(self, epoch, logs={}):
-             logger.info(model.trainable_variables)
-
-     class GradientCalcCallback(keras.callbacks.Callback):
-         def __init__(self):
-             self.epoch_gradient = []
-
-         def get_gradient_func(self, model):
-             # grads = K.gradients(model.total_loss, model.trainable_weights)
-             grads = K.gradients(model.loss, model.trainable_weights)
-             # inputs = model.model.inputs + model.targets + model.sample_weights
-             # use below line of code if above line doesn't work for you
-             # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
-             inputs = (
-                 model._feed_inputs + model._feed_targets + model._feed_sample_weights
-             )
-             func = K.function(inputs, grads)
-             return func
-
-         def on_epoch_end(self, epoch, logs=None):
-             get_gradient = self.get_gradient_func(model)
-             grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
-             self.epoch_gradient.append(grads)
-
-     # Train the model
-     if _target_type == "classification" and num_class > 2:
-         lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
-         lb.fit(labels)
-         y_train = lb.transform(y_train[:, 0].flatten())
-         y_val = lb.transform(y_val[:, 0].flatten())
-     else:
-         y_train = y_train[:, 0].flatten()
-         y_val = y_val[:, 0].flatten()
-
-     logger.info("Fitting the model...")
-     logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
-     logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
-
-     history = model.fit(
-         x_train,
-         y_train,
-         batch_size=params["batch_size"],
-         verbose=0,
-         epochs=params["epochs"],
-         shuffle=False,
-         validation_data=(x_val, y_val),
-         callbacks=[early_stopping_callback, tensorboard_callback],
-     )
 
-     logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
-     # logger.info(pd.DataFrame(gradiant.epoch_gradient))
-
-     if _plot:
-         # Plot loss per epoch
-         logs = pd.DataFrame(history.history)
-
-         plt.figure(figsize=(14, 4))
-         plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
-         plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
-         plt.xlabel("Epoch")
-         plt.ylabel("Loss")
-         plt.legend()
-         plt.show()
-
-     return model
-
-
- def predict(
-     model, data: pd.DataFrame, target_type: str, config: dict, threshold: float = 0.5
- ):
-     """Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
-
-     Args:
-     - model: the train model to predict value
-     - data: the data for prediction
-     - target_type: classification or regression
-     - config: dict containing model config
-     """
-     if config["recurrent"] or model.model_name in ["lgb", "xgb"]:
-         # keras, lgb & xgb
-         if model.model_name == "lgb":
-             # Direct prediction for LightGBM
-             pred = model.predict(data)
-         elif model.model_name == "xgb":
-             # Convert val_data to DMatrix for XGBoost
-             d_data = xgb.DMatrix(data)
-             pred = model.predict(d_data)
-         else:
-             # Reshape (flatten) for keras if not multiclass
-             pred = model.predict(data)
-             if pred.shape[1] == 1:
-                 pred = pred.reshape(-1)
-
-         if target_type == "classification":
-             num_class = pred.shape[1] if len(pred.shape) > 1 else 2
-
-             if num_class <= 2:
-                 # For binary classification, concatenate the predicted probabilities for both classes
-                 pred_df = pd.DataFrame(
-                     {
-                         0: 1 - pred,  # Probability of class 0
-                         1: pred,  # Probability of class 1
-                     },
+         # Custom callbacks
+         class PrintTrainableWeights(keras.callbacks.Callback):
+             def on_epoch_end(self, epoch, logs={}):
+                 logger.info(model.trainable_variables)
+
+         class GradientCalcCallback(keras.callbacks.Callback):
+             def __init__(self):
+                 self.epoch_gradient = []
+
+             def get_gradient_func(self, model):
+                 # grads = K.gradients(model.total_loss, model.trainable_weights)
+                 grads = K.gradients(model.loss, model.trainable_weights)
+                 # inputs = model.model.inputs + model.targets + model.sample_weights
+                 # use below line of code if above line doesn't work for you
+                 # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
+                 inputs = (
+                     model._feed_inputs
+                     + model._feed_targets
+                     + model._feed_sample_weights
                  )
-             else:
-                 # For multi-class classification, use the predicted probabilities for each class
-                 pred_df = pd.DataFrame(pred, columns=range(num_class))
-
-             # Get final predictions (argmax for multi-class, threshold for binary)
-             if num_class == 2:
-                 pred_df["PRED"] = np.where(
-                     pred_df[1] >= threshold, 1, 0
-                 )  # Class 1 if prob >= threshold
-             else:
-                 pred_df["PRED"] = pred_df.idxmax(
-                     axis=1
-                 )  # Class with highest probability for multiclasses
+                 func = K.function(inputs, grads)
+                 return func
 
-             # Reorder columns to show predicted class first, then probabilities
-             pred = pred_df[["PRED"] + list(range(num_class))]
+             def on_epoch_end(self, epoch, logs=None):
+                 get_gradient = self.get_gradient_func(model)
+                 grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
+                 self.epoch_gradient.append(grads)
 
+         # Train the model
+         if self.target_type == "classification" and num_class > 2:
+             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+             lb.fit(labels)
+             y_train = lb.transform(y_train[:, 0].flatten())
+             y_val = lb.transform(y_val[:, 0].flatten())
          else:
-             pred = pd.Series(pred, name="PRED")
+             y_train = y_train[:, 0].flatten()
+             y_val = y_val[:, 0].flatten()
+
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         history = model.fit(
+             x_train,
+             y_train,
+             batch_size=params["batch_size"],
+             verbose=0,
+             epochs=params["epochs"],
+             shuffle=False,
+             validation_data=(x_val, y_val),
+             callbacks=[early_stopping_callback, tensorboard_callback],
+         )
 
-         # set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
-         if model.model_name in ["lgb", "xgb"]:
-             pred.index = data.index
-     else:
-         # sk learn
-         pred = pd.Series(model.predict(data), index=data.index, name="PRED")
-         if target_type == "classification":
-             pred_proba = pd.DataFrame(
-                 model.predict_proba(data),
-                 index=data.index,
-                 columns=[
-                     int(c) if isinstance(c, float) and c.is_integer() else c
-                     for c in model.classes_
-                 ],
-             )
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+         # logger.info(pd.DataFrame(gradiant.epoch_gradient))
 
-         # Apply threshold for binary classification
-         if len(model.classes_) == 2:
-             positive_class = model.classes_[1]  # Assuming classes are ordered
-             pred = (pred_proba[positive_class] >= threshold).astype(int)
-             pred.name = "PRED"
+         if self.plot:
+             # Plot loss per epoch
+             logs = pd.DataFrame(history.history)
 
-         pred = pd.concat([pred, pred_proba], axis=1)
+             plt.figure(figsize=(14, 4))
+             plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
+             plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.legend()
+             plt.show()
 
-     return pred
+         self._model = model
 
+         return model
 
- def evaluate(prediction: pd.DataFrame, target_type: str):
-     """
-     Function to evaluate model performance
+     def predict(
+         self,
+         data: pd.DataFrame,
+         threshold: float = 0.5,
+     ):
+         """Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
+
+         Args:
+         - data: the data for prediction
+         - threshold: the threshold for classification
+         """
+         if not self._model:
+             raise Exception(
+                 "Model is not fitted, cannot predict, run model.fit() first, or pass a fitted model when creating the Model object to the `model` parameter."
+             )
+         model = self._model
+
+         if self.threshold and threshold == 0.5:
+             threshold = self.threshold
+
+         if self.recurrent or model.model_name in ["lgb", "xgb"]:
+             # keras, lgb & xgb
+             if model.model_name == "lgb":
+                 # Direct prediction for LightGBM
+                 pred = model.predict(data)
+             elif model.model_name == "xgb":
+                 # Convert val_data to DMatrix for XGBoost
+                 d_data = xgb.DMatrix(data)
+                 pred = model.predict(d_data)
+             else:
+                 # Reshape (flatten) for keras if not multiclass
+                 pred = model.predict(data)
+                 if pred.shape[1] == 1:
+                     pred = pred.reshape(-1)
+
+             if self.target_type == "classification":
+                 num_class = pred.shape[1] if len(pred.shape) > 1 else 2
+
+                 if num_class <= 2:
+                     # For binary classification, concatenate the predicted probabilities for both classes
+                     pred_df = pd.DataFrame(
+                         {
+                             0: 1 - pred,  # Probability of class 0
+                             1: pred,  # Probability of class 1
+                         },
+                     )
+                 else:
+                     # For multi-class classification, use the predicted probabilities for each class
+                     pred_df = pd.DataFrame(pred, columns=range(num_class))
+
+                 # Get final predictions (argmax for multi-class, threshold for binary)
+                 if num_class == 2:
+                     pred_df["PRED"] = np.where(
+                         pred_df[1] >= threshold, 1, 0
+                     )  # Class 1 if prob >= threshold
+                 else:
+                     pred_df["PRED"] = pred_df.idxmax(
+                         axis=1
+                     )  # Class with highest probability for multiclasses
 
-     Args:
-     - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probablities for each class for classification tasks
-     - target_type: classification or regression
-     """
-     score = {}
-     y_true = prediction["TARGET"]
-     y_pred = prediction["PRED"]
+                 # Reorder columns to show predicted class first, then probabilities
+                 pred = pred_df[["PRED"] + list(range(num_class))]
 
-     if target_type == "regression":
-         # Main metrics
-         score["RMSE"] = root_mean_squared_error(y_true, y_pred)
-         score["MAE"] = mean_absolute_error(y_true, y_pred)
-         score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
-         score["R2"] = r2_score(y_true, y_pred)
+             else:
+                 pred = pd.Series(pred, name="PRED")
 
-         # Robustness: avoid division by zero
-         std_target = y_true.std()
-         mean_target = y_true.mean()
-         median_target = y_true.median()
+             # set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
+             if model.model_name in ["lgb", "xgb"]:
+                 pred.index = data.index
+         else:
+             # sk learn
+             pred = pd.Series(model.predict(data), index=data.index, name="PRED")
+             if self.target_type == "classification":
+                 pred_proba = pd.DataFrame(
+                     model.predict_proba(data),
+                     index=data.index,
+                     columns=[
+                         int(c) if isinstance(c, float) and c.is_integer() else c
+                         for c in model.classes_
+                     ],
+                 )
 
-         # RMSE / STD
-         score["RMSE_STD_RATIO"] = (
-             float(100 * score["RMSE"] / std_target) if std_target else 1000
-         )
+                 # Apply threshold for binary classification
+                 if len(model.classes_) == 2:
+                     positive_class = model.classes_[1]  # Assuming classes are ordered
+                     pred = (pred_proba[positive_class] >= threshold).astype(int)
+                     pred.name = "PRED"
 
-         # Median absolute deviation (MAD)
-         mam = (y_true - mean_target).abs().median()  # Median Abs around Mean
-         mad = (y_true - median_target).abs().median()  # Median Abs around Median
-         score["MAM"] = mam
-         score["MAD"] = mad
-         score["MAE_MAM_RATIO"] = (
-             float(100 * score["MAE"] / mam) if mam else 1000
-         )  # MAE / MAD → Plus stable, moins sensible aux outliers.
-         score["MAE_MAD_RATIO"] = (
-             float(100 * score["MAE"] / mad) if mad else 1000
-         )  # MAE / Médiane des écarts absolus autour de la moyenne: Moins robuste aux outliers
+                 pred = pd.concat([pred, pred_proba], axis=1)
 
-     else:
+         return pred
 
-         labels = np.unique(y_true)
-         num_classes = labels.size
-         y_pred_proba = (
-             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
+     def save(self, path):
+         if self.recurrent:
+             path += "/" + self.model_name + ".keras"
+             self._model.save(path)
+         else:
+             path += "/" + self.model_name + ".best"
+             joblib.dump(self._model, path)
+         self.path = path
+         return path
+
+     def load(self):
+         if not self.path:
+             raise ValueError("Path is not set, cannot load model")
+
+         training_target_dir = Path(self.path)
+
+         # Load threshold
+         scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
+         self.threshold = (
+             scores_tracking["THRESHOLD"].values[0]
+             if "THRESHOLD" in scores_tracking.columns
+             else None
          )
-         if num_classes > 2:
-             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
-             lb.fit(labels)
-             y_true_onhot = lb.transform(y_true)
-             y_pred_onehot = lb.transform(y_pred)
 
-         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
-         score["ACCURACY"] = accuracy_score(y_true, y_pred)
-         score["PRECISION"] = precision_score(
-             y_true,
-             y_pred,
-             average=("binary" if num_classes == 2 else "macro"),
-         )
-         score["RECALL"] = recall_score(
-             y_true,
-             y_pred,
-             average=("binary" if num_classes == 2 else "macro"),
-         )
-         score["F1"] = f1_score(
-             y_true,
-             y_pred,
-             average=("binary" if num_classes == 2 else "macro"),
-         )
-         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
-         (
-             score["THRESHOLD"],
-             score["PRECISION_AT_THRESHOLD"],
-             score["RECALL_AT_THRESHOLD"],
-         ) = (
-             find_best_precision_threshold(prediction)
-             if num_classes == 2
-             else (None, None, None)
+         # Search for files that contain '.best' or '.keras' in the name
+         best_files = list(training_target_dir.glob("*.best*")) + list(
+             training_target_dir.glob("*.keras*")
          )
-     return score
+         # If any files are found, try loading the first one (or process as needed)
+         if best_files:
+             file_path = best_files[
+                 0
+             ]  # Assuming you want to open the first matching file
+             try:
+                 # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
+                 self._model = joblib.load(file_path)
+                 logger.info(
+                     f"Loaded model {self._model.model_name} and threshold {self.threshold}"
+                 )
+             except (pickle.UnpicklingError, EOFError):
+                 # If it's not a pickle file, try loading it as a Keras model
+                 try:
+                     # Attempt to load the file as a Keras model
+                     self._model = keras.models.load_model(file_path)
+                     logger.info(
+                         f"Loaded model {self._model.model_name} and threshold {self.threshold}"
+                     )
+                 except Exception as e:
+                     raise FileNotFoundError(
+                         f"Model could not be loaded from path: {file_path}: {e}"
+                     )
+         else:
+             raise FileNotFoundError(
+                 f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
+             )
 
+         self.model_name = self._model.model_name
+         self.target_type = self._model.target_type
 
- def train_model(params, x_train, y_train, x_val, y_val, config):
-     if "_type_name" in config.keys() and config["_type_name"] == "hyperopts":
-         global _target_number
-         global _target_type
-         global _session_name
-         global _plot
-         global _type_name
-         global _scaler_y
-         global _training_target_dir
-         _target_number = config["_target_number"]
-         _target_type = config["_target_type"]
-         _session_name = config["_session_name"]
-         _plot = config["_plot"]
-         _type_name = config["_type_name"]
-         _scaler_y = config["_scaler_y"]
-         _training_target_dir = config["_training_target_dir"]
-
-     # warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
-     # logging.getLogger("ray").setLevel(logging.CRITICAL)
-     # logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
-     # logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
-     # logging.getLogger("raylet").setLevel(logging.CRITICAL)
 
-     logger.info(
-         f"TARGET_{_target_number} - Training a {config['model_name']} at {datetime.now()} : {_session_name}, TARGET_{_target_number}"
+ def trainable(
+     params,
+     x_train,
+     y_train,
+     x_val,
+     y_val,
+     model_name,
+     target_type,
+     session_name,
+     target_number,
+     create_model,
+     type_name="hyperopts",
+     plot=False,
+ ):
+     """Standalone version of train_model that doesn't depend on self"""
+     # Create model engine
+     model = ModelEngine(
+         model_name=model_name,
+         target_type=target_type,
+         create_model=create_model,
+         plot=plot,
      )
 
-     recurrent = config["recurrent"]
-     create_model = config["create_model"]
+     logger.info(
+         f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} : {session_name}, TARGET_{target_number}"
+     )
 
-     if recurrent:
+     if model.recurrent:
          timesteps = params["timesteps"]
          x_train = x_train[:, -timesteps:, :]
          x_val = x_val[:, -timesteps:, :]
 
      # Compile and fit model on train set
      start = time.time()
-     if recurrent:
-         fit = fit_recurrent
-     elif (create_model == "lgb") or (create_model == "xgb"):
-         fit = fit_boosting
-     else:
-         fit = fit_sklearn
-     model = fit(
-         x_train,
-         y_train,
-         x_val,
-         y_val,
-         create_model,
-         params=params,
-         config=config,
-     )
+     model.fit(x_train, y_train, x_val, y_val, params)
      stop = time.time()
 
      # Prediction on val set
-     y_pred = predict(model, x_val, _target_type, config)
+     y_pred = model.predict(x_val)
 
      # fix for recurrent model because x_val has no index as it is a 3D np array
-     if config["recurrent"]:
+     if model.recurrent:
          y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
          y_pred.index = y_val.index
 
      prediction = pd.concat([y_val, y_pred], axis=1)
 
      # Unscale the data
-     if config["need_scaling"] and _target_type == "regression":
+     if (
+         model.need_scaling
+         and model.target_type == "regression"
+         and model.scaler_y is not None
+     ):
          # scaler_y needs 2D array with shape (-1, 1)
-         prediction.loc[:, "TARGET"] = _scaler_y.inverse_transform(
+         prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
              prediction[["TARGET"]].values
          )
-         prediction.loc[:, "PRED"] = _scaler_y.inverse_transform(
+         prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
              prediction[["PRED"]].values
          )
 
      # Evaluate model
      score = {
          "DATE": datetime.now(),
-         "SESSION": _session_name,
+         "SESSION": session_name,
          "TRAIN_DATA": x_train.shape[0],
          "VAL_DATA": x_val.shape[0],
          "FEATURES": x_train.shape[-1],
          "MODEL_NAME": model.model_name,
-         "TYPE": _type_name,
+         "TYPE": type_name,
          "TRAINING_TIME": stop - start,
          "EVAL_DATA_STD": prediction["TARGET"].std(),
      }
 
-     score.update(evaluate(prediction, _target_type))
+     score.update(evaluate(prediction, target_type))
 
-     if _type_name == "hyperopts":
+     if type_name == "hyperopts":
          session.report(metrics=score)
-         ray.tune.report(metrics=score)
          return score
 
      return score, model, prediction
 
 
712
- # Main training function
713
- def model_selection(
714
- dataset_id: int,
715
- models_idx: list,
716
- target_number: int,
717
- session_name,
718
- perform_hyperoptimization=True,
719
- perform_crossval=False,
720
- number_of_trials=20,
721
- plot=True,
722
- clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
723
- preserve_model=True,
724
- reshaped_data=None,
725
- data=None,
726
- ):
727
- """
728
- Selects the best models based on a target variable, optionally performing hyperparameter optimization
729
- and cross-validation, and manages outputs in a session-specific directory.
730
-
731
- Args:
732
- models_idx (list):
733
- A list of indices or identifiers representing the models to evaluate.
734
- Each identifier corresponds to a predefined or available model.
735
-
736
- target_number (int):
737
- The number of the target variable (e.g., column index or predefined target) to predict.
738
- This determines the dataset's output variable for training and evaluation.
739
-
740
- session_name (str):
741
- A name for the current session, used to organize and store results
742
- (e.g., logs, metrics, trained models) in a session-specific directory.
743
-
744
- perform_hyperoptimization (bool, optional):
745
- Whether to perform hyperparameter optimization for the models.
746
- If `True`, the function will attempt to tune the hyperparameters of each model.
747
- Defaults to `True`.
748
-
749
- perform_crossval (bool, optional):
750
- Whether to perform cross-validation to evaluate model performance.
751
- If `True`, the function will use cross-validation to compute metrics.
752
- Defaults to `True`.
753
-
754
- number_of_trials (int, optional):
755
- The number of trials to run for hyperparameter optimization.
756
- Ignored if `perform_hyperoptimization` is `False`.
757
- Defaults to `20`.
758
-
759
- plot (bool, optional):
760
- Whether to enable plotting during the process.
761
- If `True`, plot will be displayed.
762
- Defaults to `True`.
763
-
764
- clean_dir (bool, optional):
765
- Whether to clean the entire target training directory before starting the process.
766
- If `True`, any existing files in the target training directory will be removed.
767
- Defaults to `False`.
768
-
769
- preserve_model (bool, optional):
770
- Whether to run the search even if there is already a best model in the directory.
771
- If `False`, previous best models won't be erased and the search will be skipped.
772
- Defaults to `False`.
773
-
774
- Returns:
775
- None
776
- The function runs the model selection process and outputs results
777
- (e.g., logs, metrics, and optionally models) to the session directory.
778
- """
779
- global _target_number
780
- global _target_type
781
- global _session_name
782
- global _plot
783
- global _type_name
784
- global _scaler_y
785
- global _training_target_dir
786
-
787
- global_vars = [
788
- "_target_number",
789
- "_target_type",
790
- "_session_name",
791
- "_plot",
792
- "_type_name",
793
- "_scaler_y",
794
- "_training_target_dir",
795
- ]
796
-
797
- _target_number = target_number
798
- _target_type = "classification" if target_number in TARGETS_CLF else "regression"
799
- _session_name = session_name
800
- _plot = plot
801
-
802
- if dataset_id is None:
803
- raise ValueError("dataset_id is not provided.")
804
-
805
- dataset = Dataset.get(dataset_id)
806
- dataset_dir = dataset.path
807
-
808
- training_target_dir = f"{dataset_dir}/TARGET_{_target_number}"
809
- _training_target_dir = training_target_dir
810
-
811
- metric = "RMSE" if _target_type == "regression" else "LOGLOSS"
812
-
813
- # load features, scalers and data
814
- features = dataset.get_features(target_number)
815
- all_features = dataset.get_all_features()
816
-
817
- if data:
818
- train = data["train"]
819
- val = data["val"]
820
- train_scaled = data["train_scaled"]
821
- val_scaled = data["val_scaled"]
822
- _scaler_y = (
823
- data["scalers_y"][f"scaler_y_{target_number}"]
824
- if _target_type == "regression"
825
- else None
718
+ class ModelSelectionEngine:
719
+
720
+ def __init__(
721
+ self,
722
+ data,
723
+ reshaped_data,
724
+ target_number,
725
+ target_clf,
726
+ dataset,
727
+ models_idx,
728
+ time_series,
729
+ date_column,
730
+ group_column,
731
+ **kwargs,
732
+ ):
733
+ self.data = data
734
+ self.reshaped_data = reshaped_data
735
+ self.target_number = target_number
736
+ self.dataset = dataset
737
+ self.target_clf = target_clf
738
+ self.models_idx = models_idx
739
+ self.time_series = time_series
740
+ self.date_column = date_column
741
+ self.group_column = group_column
742
+
743
+ self.target_type = (
744
+ "classification" if self.target_number in self.target_clf else "regression"
826
745
  )
827
- else:
828
- train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
829
- dataset_dir, target_number, _target_type
746
+ self.dataset_dir = self.dataset.path
747
+ self.dataset_id = self.dataset.id
748
+ self.data_dir = f"{self.dataset_dir}/data"
749
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
750
+ self.training_target_dir = f"{self.dataset_dir}/TARGET_{self.target_number}"
751
+ self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
752
+ self.features = self.dataset.get_features(self.target_number)
753
+ self.all_features = self.dataset.get_all_features(
754
+ date_column=self.date_column, group_column=self.group_column
830
755
  )
831
756
 
832
- list_models = ml_models + dl_recurrent_models
833
-
834
- if any(list_models[i].get("recurrent") for i in models_idx):
835
- if reshaped_data is None:
836
- raise ValueError("reshaped_data is not provided.")
837
-
838
- logger.info("Loading reshaped data...")
839
- x_train_reshaped = reshaped_data["x_train_reshaped"]
840
- y_train_reshaped = reshaped_data["y_train_reshaped"]
841
- x_val_reshaped = reshaped_data["x_val_reshaped"]
842
- y_val_reshaped = reshaped_data["y_val_reshaped"]
843
-
844
- # create model selection in db
845
- target = Target.find_by(name=f"TARGET_{target_number}")
846
- model_selection = ModelSelection.upsert(
847
- match_fields=["target_id", "dataset_id"],
848
- target_id=target.id,
849
- dataset_id=dataset.id,
850
- )
757
+ # Main training function
758
+ def run(
759
+ self,
760
+ session_name,
761
+ perform_hyperopt=True,
762
+ number_of_trials=20,
763
+ perform_crossval=False,
764
+ plot=True,
765
+ clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
766
+ preserve_model=True,
767
+ ):
768
+ """
769
+ Selects the best models based on a target variable, optionally performing hyperparameter optimization
770
+ and cross-validation, and manages outputs in a session-specific directory.
771
+ """
772
+ self.session_name = session_name
773
+ self.plot = plot
774
+ self.number_of_trials = number_of_trials
775
+
776
+ if self.dataset_id is None:
777
+ raise ValueError("Please provide a dataset.")
778
+
779
+ if self.data:
780
+ train = self.data["train"]
781
+ val = self.data["val"]
782
+ test = self.data["test"]
783
+ train_scaled = self.data["train_scaled"]
784
+ val_scaled = self.data["val_scaled"]
785
+ test_scaled = self.data["test_scaled"]
786
+ else:
787
+ (
788
+ train,
789
+ val,
790
+ test,
791
+ train_scaled,
792
+ val_scaled,
793
+ test_scaled,
794
+ ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)
795
+
796
+ if (
797
+ any(all_models[i].get("recurrent") for i in self.models_idx)
798
+ and not self.time_series
799
+ ):
800
+ ValueError(
801
+ "You need to set time_series to true to use recurrent model, or remove recurrent models from models_idx chosen"
802
+ )
851
803
 
852
- # recurrent models starts at 9 # len(list_models)
853
- for i in models_idx:
854
- config = list_models[i]
855
- if config["recurrent"] is False and config[_target_type] is None:
856
- continue # for naive bayes models that cannot be used in regression
857
-
858
- results_dir = f"{training_target_dir}/{config['model_name']}"
859
- if not os.path.exists(f"{results_dir}"):
860
- os.makedirs(f"{results_dir}")
861
- elif preserve_model and contains_best(results_dir):
862
- continue
863
- elif perform_hyperoptimization:
864
- clean_directory(results_dir)
865
-
866
- logger.info(f"Training a {config['model_name']}")
867
- model = Model.upsert(
868
- match_fields=["name", "type"],
869
- name=config["model_name"],
870
- type=_target_type,
871
- )
872
- model_training = ModelTraining.upsert(
873
- match_fields=["model_id", "model_selection_id"],
874
- model_id=model.id,
875
- model_selection_id=model_selection.id,
804
+ if (
805
+ any(all_models[i].get("recurrent") for i in self.models_idx)
806
+ and self.time_series
807
+ ):
808
+ if self.reshaped_data is None:
809
+ raise ValueError("reshaped_data is not provided.")
810
+
811
+ logger.info("Loading reshaped data...")
812
+ x_train_reshaped = self.reshaped_data["x_train_reshaped"]
813
+ y_train_reshaped = self.reshaped_data["y_train_reshaped"]
814
+ x_val_reshaped = self.reshaped_data["x_val_reshaped"]
815
+ y_val_reshaped = self.reshaped_data["y_val_reshaped"]
816
+ x_test_reshaped = self.reshaped_data["x_test_reshaped"]
817
+ y_test_reshaped = self.reshaped_data["y_test_reshaped"]
818
+
819
+ # create model selection in db
820
+ target = Target.find_by(name=f"TARGET_{self.target_number}")
821
+ model_selection = ModelSelection.upsert(
822
+ match_fields=["target_id", "dataset_id"],
823
+ target_id=target.id,
824
+ dataset_id=self.dataset_id,
876
825
  )
877
826
 
878
- # getting data
879
- if config["recurrent"]:
880
- # Clear cluster from previous Keras session graphs.
881
- K.clear_session()
882
-
883
- features_idx = [i for i, e in enumerate(all_features) if e in set(features)]
884
- # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
885
- x_train = x_train_reshaped[:, :, features_idx]
886
- y_train = y_train_reshaped[:, [target_number, 0]]
887
- x_val = x_val_reshaped[:, :, features_idx]
888
- y_val = y_val_reshaped[:, [target_number, 0]]
889
- else:
890
- new_config = config[_target_type]
891
- new_config["model_name"] = config["model_name"]
892
- new_config["recurrent"] = config["recurrent"]
893
- new_config["need_scaling"] = config["need_scaling"]
894
- config = new_config
895
-
896
- if config["need_scaling"] and _target_type == "regression":
897
- x_train = train_scaled[features]
898
- y_train = train_scaled[f"TARGET_{target_number}"].rename("TARGET")
899
- x_val = val_scaled[features]
900
- y_val = val_scaled[f"TARGET_{target_number}"].rename("TARGET")
901
- else:
902
- x_train = train[features]
903
- y_train = train[f"TARGET_{target_number}"].rename("TARGET")
904
- x_val = val[features]
905
- y_val = val[f"TARGET_{target_number}"].rename("TARGET")
906
-
907
- start = time.time()
908
- # Tuning hyperparameters
909
- if perform_hyperoptimization:
910
- _type_name = "hyperopts"
911
-
912
- for var in global_vars:
913
- config[var] = globals()[var]
914
-
915
- logger.info("Start tuning hyperparameters...")
916
-
917
- storage_path = f"{results_dir}/ray_results"
918
- # ray.shutdown()
919
- # ray.init(
920
- # runtime_env={
921
- # "working_dir": ".", # or your project path
922
- # "env_vars": {"PYTHONPATH": "."}
923
- # }
924
- # )
925
- tuner = Tuner(
926
- trainable=with_parameters(
927
- train_model,
928
- x_train=x_train,
929
- y_train=y_train,
930
- x_val=x_val,
931
- y_val=y_val,
932
- config=config,
933
- ),
934
- param_space=config["search_params"],
935
- tune_config=TuneConfig(
936
- metric=metric,
937
- mode="min",
938
- search_alg=HyperOptSearch(),
939
- num_samples=number_of_trials,
940
- scheduler=ASHAScheduler(max_t=100, grace_period=10),
941
- ),
942
- run_config=RunConfig(
943
- stop={"training_iteration": 100},
944
- storage_path=storage_path,
945
- # name=datetime.now().strftime("%d-%m-%Y") + "-" + session_name,
946
- callbacks=[TBXLoggerCallback()],
947
- # log_to_file=("stdout.log", "stderr.log"), # depreciated
948
- # verbose=0,
949
- ),
827
+ # recurrent models starts at 9 # len(list_models)
828
+ for i in self.models_idx:
829
+ config = all_models[i]
830
+ recurrent = config["recurrent"]
831
+ need_scaling = config["need_scaling"]
832
+ model_name = config["model_name"]
833
+
834
+ if recurrent is False and config[self.target_type] is None:
835
+ continue # for naive bayes models that cannot be used in regression
836
+
837
+ self.results_dir = f"{self.training_target_dir}/{model_name}"
838
+ if not os.path.exists(f"{self.results_dir}"):
839
+ os.makedirs(f"{self.results_dir}")
840
+ elif preserve_model and contains_best(self.results_dir):
841
+ continue
842
+ elif perform_hyperopt:
843
+ clean_directory(self.results_dir)
844
+
845
+ logger.info(f"Training a {model_name}")
846
+ model = Model.upsert(
847
+ match_fields=["name", "type"],
848
+ name=model_name,
849
+ type=self.target_type,
850
+ )
851
+ model_training = ModelTraining.upsert(
852
+ match_fields=["model_id", "model_selection_id"],
853
+ model_id=model.id,
854
+ model_selection_id=model_selection.id,
950
855
  )
951
- try:
952
- results = tuner.fit()
953
856
 
954
- best_result = results.get_best_result(metric, "max")
955
- best_params = best_result.config
956
- best_score = best_result.metrics
857
+ # getting data
858
+ if recurrent:
859
+ # Clear cluster from previous Keras session graphs.
860
+ K.clear_session()
861
+
862
+ features_idx = [
863
+ i
864
+ for i, e in enumerate(self.all_features)
865
+ if e in set(self.features)
866
+ ]
867
+ # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
868
+ x_train = x_train_reshaped[:, :, features_idx]
869
+ y_train = y_train_reshaped[:, [self.target_number, 0]]
870
+ x_val = x_val_reshaped[:, :, features_idx]
871
+ y_val = y_val_reshaped[:, [self.target_number, 0]]
872
+ x_test = x_test_reshaped[:, :, features_idx]
873
+ y_test = y_test_reshaped[:, [self.target_number, 0]]
874
+ else:
875
+ config = config[self.target_type]

-                # log results
-                logger.info(f"Best hyperparameters found were:\n{best_params}")
-                logger.info(f"Best Scores found were:\n{best_score}")
+            if need_scaling and self.target_type == "regression":
+                x_train = train_scaled[self.features]
+                y_train = train_scaled[f"TARGET_{self.target_number}"].rename(
+                    "TARGET"
+                )
+                x_val = val_scaled[self.features]
+                y_val = val_scaled[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_test = test_scaled[self.features]
+                y_test = test_scaled[f"TARGET_{self.target_number}"].rename(
+                    "TARGET"
+                )
+            else:
+                x_train = train[self.features]
+                y_train = train[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_val = val[self.features]
+                y_val = val[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_test = test[self.features]
+                y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
+
+            log_dir = get_log_dir(self.training_target_dir, model_name)
+            # instantiate model
+            model = ModelEngine(
+                model_name=model_name,
+                search_params=config["search_params"],
+                target_type=self.target_type,
+                create_model=config["create_model"],
+                plot=self.plot,
+                log_dir=log_dir,
+            )

-                df_results = results.get_dataframe()
-                logger.info(
-                    f"Markdown table with all trials:\n{df_results.to_markdown()}"
-                )
+            start = time.time()
+            # Tuning hyperparameters
+            if perform_hyperopt:
+                best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)

                 # save best params
-                best_params_file = f"{training_target_dir}/best_params.json"
+                best_params_file = f"{self.training_target_dir}/best_params.json"
                 try:
                     with open(best_params_file, "r") as f:
                         json_dict = json.load(f)
                 except FileNotFoundError:
                     json_dict = {}

-                json_dict[config["model_name"]] = serialize_for_json(best_params)
+                json_dict[model.model_name] = serialize_for_json(best_params)
                 with open(best_params_file, "w") as f:
                     json.dump(json_dict, f, indent=4)
+            else:
+                try:
+                    with open(f"{self.training_target_dir}/best_params.json") as f:
+                        json_dict = json.load(f)
+                        best_params = json_dict[model_name]
+                except Exception:
+                    raise FileNotFoundError(
+                        f"Could not find {model_name} in current data. Try running a hyperoptimization by setting `perform_hyperopt` to True"
+                    )
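
The `best_params.json` handling above amounts to a small read-merge-write cache keyed by model name, with a `FileNotFoundError` raised when tuning was skipped but no cached params exist. Extracted into a standalone pair of helpers (the function names are ours, not the library's):

```python
import json
from pathlib import Path

def save_best_params(training_target_dir: str, model_name: str, best_params: dict) -> None:
    """Merge one model's tuned params into the shared best_params.json."""
    path = Path(training_target_dir) / "best_params.json"
    try:
        json_dict = json.loads(path.read_text())
    except FileNotFoundError:
        json_dict = {}
    json_dict[model_name] = best_params
    path.write_text(json.dumps(json_dict, indent=4))

def load_best_params(training_target_dir: str, model_name: str) -> dict:
    """Read one model's cached params back; fail loudly if tuning never ran."""
    path = Path(training_target_dir) / "best_params.json"
    try:
        return json.loads(path.read_text())[model_name]
    except (FileNotFoundError, KeyError):
        raise FileNotFoundError(
            f"No tuned params for {model_name}; run with perform_hyperopt=True first"
        )
```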

-            except Exception as e:
-                ray.shutdown()
-                raise Exception(e)
-                logger.error(e)
-
-            ray.shutdown()
-
-            # Collect errors in single file
-            collect_error_logs(
-                training_target_dir=training_target_dir, storage_path=storage_path
-            )
-
-            # Clean up
-            for var in global_vars:
-                del config[var]
-        else:
-            try:
-                with open(f"{training_target_dir}/best_params.json") as f:
-                    json_dict = json.load(f)
-                    best_params = json_dict[config["model_name"]]
-            except Exception:
-                raise FileNotFoundError(
-                    f"Could not find {config['model_name']} in current data. Try to run an hyperoptimization by setting `perform_hyperoptimization` to true"
+            # Perform cross-validation of the best model on k-folds of train + val set
+            if perform_crossval:
+                x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
+                y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
+                n_splits = 4
+                n_samples = len(x_train_val)
+                test_size = int(n_samples / (n_splits + 4))
+                tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+
+                # Store the scores
+                cross_validation_scores = []
+
+                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
+                    self.type_name = f"crossval_fold_{i}"
+
+                    if self.time_series:
+                        date_series = train[self.date_column].copy()
+
+                        if need_scaling:
+                            date_series = date_series.map(pd.Timestamp.fromordinal)
+
+                        # Now you can use the actual train/val indices to extract ranges
+                        train_start = date_series.iloc[train_index[0]]
+                        train_end = date_series.iloc[train_index[-1]]
+                        val_start = date_series.iloc[val_index[0]]
+                        val_end = date_series.iloc[val_index[-1]]
+
+                        logger.info(
+                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
+                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
+                        )
+                    else:
+                        logger.info(
+                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
+                        )
+
+                    # Train the model and get the score
+                    if recurrent:
+                        cross_validation_score, _, _ = self.train_model(
+                            params=best_params,
+                            x_train=x_train_val[train_index],
+                            y_train=y_train_val[train_index],
+                            x_val=x_train_val[val_index],
+                            y_val=y_train_val[val_index],
+                            model=model,
+                        )
+                    else:
+                        cross_validation_score, _, _ = self.train_model(
+                            params=best_params,
+                            x_train=x_train_val.iloc[train_index],
+                            y_train=y_train_val.iloc[train_index],
+                            x_val=x_train_val.iloc[val_index],
+                            y_val=y_train_val.iloc[val_index],
+                            model=model,
+                        )
+
+                    # Append score to the list
+                    cross_validation_scores.append(cross_validation_score)
+
+                # Calculate and log the mean score
+                cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
+                    self.metric
+                ].mean()
+                logger.info(
+                    f"Best model mean cross-validation score on entire dataset: {cross_validation_mean_score}"
                 )
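
`TimeSeriesSplit` with an explicit `test_size` produces forward-chaining folds: each fold trains on a prefix of the data and validates on the window that immediately follows, which is why the fold logging above can report contiguous date ranges. A runnable illustration of the fold geometry, using the same sizing rule as the code above:

```python
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_samples, n_splits = 800, 4
test_size = n_samples // (n_splits + 4)  # same sizing rule as in the diff
X = np.arange(n_samples).reshape(-1, 1)

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    # each fold trains on everything before its validation window
    print(f"fold {fold}: train=[0..{train_idx[-1]}] val=[{val_idx[0]}..{val_idx[-1]}]")
```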

-        # Perform cross-validation of the best model on k-folds of train + val set
-        if perform_crossval:
-            x_train_val = pd.concat([x_train, x_val], axis=0)
-            y_train_val = pd.concat([y_train, y_val], axis=0)
-            n_splits = 4
-            n_samples = len(x_train_val)
-            test_size = int(n_samples / (n_splits + 4))
-            tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
-
-            # Store the scores
-            cross_validation_scores = []
-
-            for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                _type_name = f"crossval_fold_{i}"
-
-                if DATE_COLUMN:
-                    date_column = train[DATE_COLUMN].copy()
+                # Retrain on entire training set, but keep score on cross-validation folds
+                best_score, best_model, best_pred = self.train_model(
+                    params=best_params,
+                    x_train=pd.concat([x_train, x_val], axis=0),
+                    y_train=pd.concat([y_train, y_val], axis=0),
+                    x_val=x_test,
+                    y_val=y_test,
+                    model=model,
+                )
+                best_score = cross_validation_mean_score
+            else:
+                # Evaluate on validation set
+                self.type_name = "validation"
+                best_score, best_model, best_pred = self.train_model(
+                    params=best_params,
+                    x_train=pd.concat([x_train, x_val], axis=0),
+                    y_train=pd.concat([y_train, y_val], axis=0),
+                    x_val=x_test,
+                    y_val=y_test,
+                    model=model,
+                )

-                    if config.get("need_scaling"):
-                        date_column = date_column.map(pd.Timestamp.fromordinal)
+            logger.info(f"Best model scores on test set: {best_score}")

-                    # Now you can use the actual train/val indices to extract ranges
-                    train_start = date_column.iloc[train_index[0]]
-                    train_end = date_column.iloc[train_index[-1]]
-                    val_start = date_column.iloc[val_index[0]]
-                    val_end = date_column.iloc[val_index[-1]]
+            # Save validation predictions
+            best_pred.to_csv(
+                f"{self.results_dir}/pred_val.csv",
+                index=True,
+                header=True,
+                index_label="ID",
+            )

-                    logger.info(
-                        f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                        f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                    )
-                else:
-                    logger.info(
-                        f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                    )
+            # Save best model
+            model_path = best_model.save(self.results_dir)

-                # Train the model and get the score
-                if config["recurrent"]:
-                    cross_validation_score, _, _ = train_model(
-                        params=best_params,
-                        x_train=x_train_val[train_index],
-                        y_train=y_train_val[train_index],
-                        x_val=x_train_val[val_index],
-                        y_val=y_train_val[val_index],
-                        config=config,
-                    )
-                else:
-                    cross_validation_score, _, _ = train_model(
-                        params=best_params,
-                        x_train=x_train_val.iloc[train_index],
-                        y_train=y_train_val.iloc[train_index],
-                        x_val=x_train_val.iloc[val_index],
-                        y_val=y_train_val.iloc[val_index],
-                        config=config,
-                    )
+            model_path = Path(model_path).resolve()
+            best_score["MODEL_PATH"] = model_path

-                # Append score to the list
-                cross_validation_scores.append(cross_validation_score)
+            # Track scores
+            scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
+            best_score_df = pd.DataFrame([best_score])

-            # Calculate and log the mean score
-            cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
-                metric
-            ].mean()
-            logger.info(
-                f"Best model mean cross-validation score: {cross_validation_mean_score}"
+            if os.path.exists(scores_tracking_path):
+                existing_scores = pd.read_csv(scores_tracking_path)
+                common_cols = existing_scores.columns.intersection(
+                    best_score_df.columns
+                )
+                best_score_df = best_score_df[common_cols]
+                scores_tracking = pd.concat(
+                    [existing_scores, best_score_df], ignore_index=True
+                )
+            else:
+                scores_tracking = best_score_df
+
+            scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
+            scores_tracking.to_csv(scores_tracking_path, index=False)
+
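
The score-tracking logic above appends one row per trained model to `scores_tracking.csv`, keeping only the columns shared with the existing history and re-sorting ascending on the metric (so row 0 is the best model, given a lower-is-better metric). Isolated for clarity (the helper name is ours):

```python
import os
import pandas as pd

def track_score(best_score: dict, scores_tracking_path: str, metric: str) -> pd.DataFrame:
    """Append one score row, aligning columns with any existing history."""
    best_score_df = pd.DataFrame([best_score])
    if os.path.exists(scores_tracking_path):
        existing = pd.read_csv(scores_tracking_path)
        # drop any columns the history does not already know about
        common = existing.columns.intersection(best_score_df.columns)
        scores = pd.concat([existing, best_score_df[common]], ignore_index=True)
    else:
        scores = best_score_df
    scores.sort_values(metric, ascending=True, inplace=True)  # best model first
    scores.to_csv(scores_tracking_path, index=False)
    return scores
```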
+            # Save model training metadata
+            stop = time.time()
+            training_time = stop - start
+            model_training.best_params = best_params
+            model_training.model_path = model_path
+            model_training.training_time = training_time
+            model_training.save()
+
+            # Store metrics in DB
+            drop_cols = [
+                "DATE",
+                "SESSION",
+                "TRAIN_DATA",
+                "VAL_DATA",
+                "FEATURES",
+                "MODEL_NAME",
+                "MODEL_PATH",
+            ]
+            best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
+            score_data = {k.lower(): v for k, v in best_score.items()}
+
+            Score.upsert(
+                match_fields=["model_training_id"],
+                model_training_id=model_training.id,
+                **score_data,
             )

-            # Retrain on entire training set, but keep score on cross-validation folds
-            best_score, best_model, best_pred = train_model(
-                params=best_params,
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+
+        # find best model type
+        scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
+        scores_tracking = pd.read_csv(scores_tracking_path)
+        best_score_overall = scores_tracking.iloc[0, :]
+        best_model_name = best_score_overall["MODEL_NAME"]
+
+        # Remove any .best or .keras files
+        for file_path in glob.glob(
+            os.path.join(self.training_target_dir, "*.best")
+        ) + glob.glob(os.path.join(self.training_target_dir, "*.keras")):
+            os.remove(file_path)
+        # Copy the best model in root training folder for this target
+        best_model_path = Path(
+            f"{self.training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
+        ).resolve()
+        copy_any(
+            best_score_overall["MODEL_PATH"],
+            best_model_path,
+        )
+
+        with open(f"{self.training_target_dir}/best_params.json", "r") as f:
+            best_model_params = json.load(f)[best_model_name]
+
+        # save model_selection results to db
+        model_selection = ModelSelection.get(model_selection.id)
+        model_selection.best_model_id = Model.find_by(
+            name=best_score_overall["MODEL_NAME"], type=self.target_type
+        ).id
+        model_selection.best_model_params = best_model_params
+        model_selection.best_model_path = best_model_path
+        model_selection.save()
+
+        logger.info(f"Best model overall is: {best_score_overall}")
+
+ def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
1121
+ self.type_name = "hyperopts"
1122
+
1123
+ def collect_error_logs(training_target_dir: int, storage_path: str):
1124
+ output_error_file = f"{training_target_dir}/errors.log"
1125
+
1126
+ with open(output_error_file, "a") as outfile:
1127
+ # Walk through the ray_results directory
1128
+ for root, dirs, files in os.walk(storage_path):
1129
+ # Check if 'error.txt' exists in the current directory
1130
+ if "error.txt" in files:
1131
+ error_file_path = os.path.join(root, "error.txt")
1132
+ logger.info(f"Processing error file: {error_file_path}")
1133
+ # Read and append the content of the error.txt file
1134
+ with open(error_file_path, "r") as infile:
1135
+ outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
1136
+ outfile.write(infile.read())
1137
+ logger.info(f"All errors written to {output_error_file}")
1138
+
1139
+ logger.info("Start tuning hyperparameters...")
1140
+
1141
+ storage_path = f"{self.results_dir}/ray_results"
1142
+
1143
+ tuner = Tuner(
1144
+ trainable=with_parameters(
1145
+ trainable,
1074
1146
  x_train=x_train,
1075
1147
  y_train=y_train,
1076
1148
  x_val=x_val,
1077
1149
  y_val=y_val,
1078
- config=config,
1150
+ model_name=model.model_name,
1151
+ target_type=self.target_type,
1152
+ session_name=self.session_name,
1153
+ target_number=self.target_number,
1154
+ create_model=model.create_model,
1155
+ type_name="hyperopts",
1156
+ plot=model.plot,
1157
+ ),
1158
+ param_space=model.search_params,
1159
+ tune_config=TuneConfig(
1160
+ metric=self.metric,
1161
+ mode="min",
1162
+ search_alg=HyperOptSearch(),
1163
+ num_samples=self.number_of_trials,
1164
+ scheduler=ASHAScheduler(max_t=100, grace_period=10),
1165
+ ),
1166
+ run_config=RunConfig(
1167
+ stop={"training_iteration": 100},
1168
+ storage_path=storage_path,
1169
+ callbacks=[TBXLoggerCallback()],
1170
+ ),
1171
+ )
1172
+ try:
1173
+ results = tuner.fit()
1174
+
1175
+ best_result = results.get_best_result(self.metric, "max")
1176
+ best_params = best_result.config
1177
+ best_score = best_result.metrics
1178
+
1179
+ # log results
1180
+ logger.info(f"Best hyperparameters found were:\n{best_params}")
1181
+ logger.info(f"Best Scores found were:\n{best_score}")
1182
+ logger.info(
1183
+ f"Markdown table with all trials :\n{results.get_dataframe().to_markdown()}"
1079
1184
  )
1080
- best_score = cross_validation_mean_score
1081
- else:
1082
- # Evaluate on validation set
1083
- _type_name = "validation"
1084
- best_score, best_model, best_pred = train_model(
1085
- params=best_params,
1086
- x_train=x_train,
1087
- y_train=y_train,
1088
- x_val=x_val,
1089
- y_val=y_val,
1090
- config=config,
1185
+ # Collect errors in single file
1186
+ collect_error_logs(
1187
+ training_target_dir=self.training_target_dir, storage_path=storage_path
1091
1188
  )
1092
1189
 
1093
- logger.info(f"Best model scores on validation set: {best_score}")
1190
+ except Exception as e:
1191
+ raise Exception(e)
1094
1192
 
1095
- # Save validation predictions
1096
- best_pred.to_csv(
1097
- f"{results_dir}/pred_val.csv",
1098
- index=True,
1099
- header=True,
1100
- index_label="ID",
1193
+ finally:
1194
+ ray.shutdown()
1195
+
1196
+ return best_params
1197
+
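
Both versions drive tuning through the same Ray Tune stack — a `Tuner` with a `HyperOptSearch` searcher and an `ASHAScheduler` for early stopping — and 0.4.1 mainly replaces the closed-over `config` dict with explicit keyword arguments bound via `with_parameters`. One detail worth flagging: in both versions the tuner is configured with `mode="min"` while results are read back with `get_best_result(self.metric, "max")`, which looks inconsistent. A self-contained sketch of the pattern with a toy objective, assuming the Ray 2.x `ray.air`-era API imported at the top of this file (and keeping min/min consistent):

```python
import ray
from ray import tune
from ray.air import RunConfig, session
from ray.tune import Tuner, TuneConfig, with_parameters
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

def trainable(params, offset):
    # Toy objective: minimize (x - offset)^2; `offset` is bound by with_parameters.
    loss = (params["x"] - offset) ** 2
    session.report({"loss": loss})

tuner = Tuner(
    trainable=with_parameters(trainable, offset=3.0),  # bind constant/large args once
    param_space={"x": tune.uniform(-10, 10)},
    tune_config=TuneConfig(
        metric="loss",
        mode="min",
        search_alg=HyperOptSearch(),
        num_samples=20,
        scheduler=ASHAScheduler(max_t=100, grace_period=10),
    ),
    run_config=RunConfig(stop={"training_iteration": 100}),
)
try:
    results = tuner.fit()
    best = results.get_best_result("loss", "min")
    print(best.config, best.metrics["loss"])
finally:
    ray.shutdown()
```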
+    def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
+        # Use the standalone training function to avoid duplication
+        # For train_model, we pass the data directly (not as Ray references)
+        return trainable(
+            params,
+            x_train,
+            y_train,
+            x_val,
+            y_val,
+            model.model_name,
+            self.target_type,
+            self.session_name,
+            self.target_number,
+            model.create_model,
+            self.type_name,
+            model.plot,
         )

-        # Save best model
-        if config["recurrent"]:
-            model_path = f"{results_dir}/{best_model.model_name}.keras"
-            best_model.save(model_path)
-        else:
-            model_path = f"{results_dir}/{best_model.model_name}.best"
-            joblib.dump(best_model, model_path)
-
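
The removed branch shows how 0.4.0 persisted the winning estimator: Keras-native `.keras` files for recurrent networks, `joblib` pickles with a `.best` suffix otherwise. In 0.4.1 this moves behind `ModelEngine.save`, whose internals are not in this hunk; the underlying pattern, as a standalone sketch:

```python
import joblib
from pathlib import Path

def save_model(model, results_dir: str, model_name: str, recurrent: bool) -> Path:
    """Persist a model: .keras for recurrent nets, a joblib '.best' file otherwise."""
    if recurrent:
        model_path = Path(results_dir) / f"{model_name}.keras"
        model.save(str(model_path))  # Keras native serialization
    else:
        model_path = Path(results_dir) / f"{model_name}.best"
        joblib.dump(model, model_path)  # pickle-based; fine for sklearn-style models
    return model_path.resolve()
```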
-        model_path = Path(model_path).resolve()
-        best_score["MODEL_PATH"] = model_path
-
-        # Track scores
-        scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
-        best_score_df = pd.DataFrame([best_score])
-
-        if os.path.exists(scores_tracking_path):
-            existing_scores = pd.read_csv(scores_tracking_path)
-            common_cols = existing_scores.columns.intersection(best_score_df.columns)
-            best_score_df = best_score_df[common_cols]
-            scores_tracking = pd.concat(
-                [existing_scores, best_score_df], ignore_index=True
-            )
-        else:
-            scores_tracking = best_score_df
-
-        scores_tracking.sort_values(metric, ascending=True, inplace=True)
-        scores_tracking.to_csv(scores_tracking_path, index=False)
-
-        # Save model training metadata
-        stop = time.time()
-        training_time = stop - start
-        model_training.best_params = best_params
-        model_training.model_path = model_path
-        model_training.training_time = training_time
-        model_training.save()
-
-        # Store metrics in DB
-        drop_cols = [
-            "DATE",
-            "SESSION",
-            "TRAIN_DATA",
-            "VAL_DATA",
-            "FEATURES",
-            "MODEL_NAME",
-            "MODEL_PATH",
-        ]
-        best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
-        score_data = {k.lower(): v for k, v in best_score.items()}

-        Score.upsert(
-            match_fields=["model_training_id"],
-            model_training_id=model_training.id,
-            **score_data,
-        )
+def evaluate(prediction: pd.DataFrame, target_type: str):
+    """
+    Function to evaluate model performance.
+
+    Args:
+    - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
+    - target_type: classification or regression
+    """
+    score = {}
+    y_true = prediction["TARGET"]
+    y_pred = prediction["PRED"]

-            logger.info(f"Model training finished in {training_time:.2f} seconds")
+    if target_type == "regression":
+        # Main metrics
+        score["RMSE"] = root_mean_squared_error(y_true, y_pred)
+        score["MAE"] = mean_absolute_error(y_true, y_pred)
+        score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
+        score["R2"] = r2_score(y_true, y_pred)

-        # find best model type
-        scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
-        scores_tracking = pd.read_csv(scores_tracking_path)
-        best_score_overall = scores_tracking.iloc[0, :]
-        best_model_name = best_score_overall["MODEL_NAME"]
+        # Robustness: avoid division by zero
+        std_target = y_true.std()
+        mean_target = y_true.mean()
+        median_target = y_true.median()

-        # Remove any .best or .keras files
-        for file_path in glob.glob(os.path.join(training_target_dir, "*.best")) + glob.glob(
-            os.path.join(training_target_dir, "*.keras")
-        ):
-            os.remove(file_path)
-        # Copy the best model in root training folder for this target
-        best_model_path = Path(
-            f"{training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
-        ).resolve()
-        copy_any(
-            best_score_overall["MODEL_PATH"],
-            best_model_path,
-        )
+        # RMSE / STD
+        score["RMSE_STD_RATIO"] = (
+            float(100 * score["RMSE"] / std_target) if std_target else 1000
+        )

-        with open(f"{training_target_dir}/best_params.json", "r") as f:
-            best_model_params = json.load(f)[best_model_name]
+        # Median absolute deviation (MAD)
+        mam = (y_true - mean_target).abs().median()  # median abs deviation around the mean
+        mad = (y_true - median_target).abs().median()  # median abs deviation around the median
+        score["MAM"] = mam
+        score["MAD"] = mad
+        score["MAE_MAM_RATIO"] = (
+            float(100 * score["MAE"] / mam) if mam else 1000
+        )  # MAE normalized by deviation around the mean; less robust to outliers
+        score["MAE_MAD_RATIO"] = (
+            float(100 * score["MAE"] / mad) if mad else 1000
+        )  # MAE normalized by deviation around the median; more stable, less sensitive to outliers
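
The ratios computed above normalize absolute errors by the spread of the target so that scores are comparable across targets: `RMSE_STD_RATIO` is 100 × RMSE / σ(y) (a value near 100 means roughly "no better than predicting the mean"), while MAE is compared to the median absolute deviation around the mean (`MAM`) and around the median (`MAD`). A tiny worked example:

```python
import pandas as pd
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

y_true = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
y_pred = pd.Series([1.1, 2.2, 2.8, 4.3, 4.9])

rmse = root_mean_squared_error(y_true, y_pred)   # ~0.195
mae = mean_absolute_error(y_true, y_pred)        # 0.18

std = y_true.std()                               # sample std (ddof=1) ~1.581
mam = (y_true - y_true.mean()).abs().median()    # 1.0
mad = (y_true - y_true.median()).abs().median()  # 1.0

print(round(100 * rmse / std, 1))  # RMSE_STD_RATIO ~ 12.3
print(round(100 * mae / mam, 1))   # MAE_MAM_RATIO = 18.0
print(round(100 * mae / mad, 1))   # MAE_MAD_RATIO = 18.0
```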

-        # save model_selection results to db
-        model_selection = ModelSelection.get(model_selection.id)
-        model_selection.best_model_id = Model.find_by(
-            name=best_score_overall["MODEL_NAME"], type=_target_type
-        ).id
-        model_selection.best_model_params = best_model_params
-        model_selection.best_model_path = best_model_path
-        model_selection.save()
+    else:
+
+        labels = np.unique(y_true)
+        num_classes = labels.size
+        y_pred_proba = (
+            prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
+        )
+        if num_classes > 2:
+            lb = LabelBinarizer(sparse_output=False)  # change to True for a sparse matrix
+            lb.fit(labels)
+            y_true_onehot = lb.transform(y_true)
+            y_pred_onehot = lb.transform(y_pred)

-        logger.info(f"Best model overall is : {best_score_overall}")
+        score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
+        score["ACCURACY"] = accuracy_score(y_true, y_pred)
+        score["PRECISION"] = precision_score(
+            y_true,
+            y_pred,
+            average=("binary" if num_classes == 2 else "macro"),
+        )
+        score["RECALL"] = recall_score(
+            y_true,
+            y_pred,
+            average=("binary" if num_classes == 2 else "macro"),
+        )
+        score["F1"] = f1_score(
+            y_true,
+            y_pred,
+            average=("binary" if num_classes == 2 else "macro"),
+        )
+        score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
+        (
+            score["THRESHOLD"],
+            score["PRECISION_AT_THRESHOLD"],
+            score["RECALL_AT_THRESHOLD"],
+        ) = (
+            find_best_precision_threshold(prediction)
+            if num_classes == 2
+            else (None, None, None)
+        )
+    return score
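
`find_best_precision_threshold` is defined outside this hunk, so its exact objective is not visible here. A common way to derive such a `(threshold, precision, recall)` triple for the binary case is scikit-learn's `precision_recall_curve`; the rule below (maximize precision subject to a recall floor) is purely illustrative, not necessarily what the library does:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

def best_precision_threshold(y_true, y_proba, min_recall: float = 0.2):
    """Pick the threshold maximizing precision subject to a recall floor (illustrative)."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    # precision/recall have one more entry than thresholds; drop the final point
    precision, recall = precision[:-1], recall[:-1]
    mask = recall >= min_recall
    if not mask.any():
        return None, None, None
    candidates = np.flatnonzero(mask)
    best = candidates[np.argmax(precision[mask])]
    return float(thresholds[best]), float(precision[best]), float(recall[best])

y_true = np.array([0, 0, 1, 1, 1, 0, 1])
y_proba = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.9])
print(best_precision_threshold(y_true, y_proba))
```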


-def collect_error_logs(training_target_dir: int, storage_path: str):
+# utils
+def get_log_dir(training_target_dir: str, model_name="test_model"):
+    """Generates a structured log directory path for TensorBoard."""
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
+    log_dir = (
+        Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
+    )
+    log_dir.mkdir(parents=True, exist_ok=True)  # create directories if they don't exist
+    return str(log_dir)
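
`get_log_dir` gives every model its own timestamped run directory, so pointing TensorBoard at the `tensorboard/` parent shows all runs side by side. A usage example (the path is invented):

```python
log_dir = get_log_dir("data/training/TARGET_1", model_name="lstm")
# e.g. data/training/TARGET_1/tensorboard/lstm/run_2025-06-17_16-52
# Inspect with: tensorboard --logdir data/training/TARGET_1/tensorboard
```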

-    output_error_file = f"{training_target_dir}/errors.log"

-    with open(output_error_file, "a") as outfile:
-        # Walk through the ray_results directory
-        for root, dirs, files in os.walk(storage_path):
-            # Check if 'error.txt' exists in the current directory
-            if "error.txt" in files:
-                error_file_path = os.path.join(root, "error.txt")
-                logger.info(f"Processing error file: {error_file_path}")
-                # Read and append the content of the error.txt file
-                with open(error_file_path, "r") as infile:
-                    outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
-                    outfile.write(infile.read())
-    logger.info(f"All errors written to {output_error_file}")
+def print_scores(training_target_dir: str):
+    """
+    Monitor scores
+    """
+    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
+    return scores_tracking


+# plots
 def plot_evaluation_for_classification(prediction: dict):
     """
     Args
@@ -1272,7 +1380,7 @@ def plot_confusion_matrix(y_true, y_pred):
     plt.show()


-# THRESHOLD
+# thresholds
 def find_max_f1_threshold(prediction):
     """
     Finds the threshold that maximizes the F1 score for a binary classification task.
@@ -1515,14 +1623,6 @@ def plot_threshold(prediction, threshold, precision, recall):
     return threshold


-def print_scores(training_target_dir: str):
-    """
-    Monitor scores
-    """
-    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
-    return scores_tracking
-
-
 # OLD - to sort out
 def get_pred_distribution(training_target_dir: str, model_name="linear"):
     """