lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +116 -65
- lecrapaud/db/models/experiment.py +195 -182
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +4 -0
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +612 -242
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.19.0.dist-info/METADATA +0 -249
- lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/model_selection.py
CHANGED
@@ -15,7 +15,7 @@ from pydantic import BaseModel
 import ast
 
 # ML models
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import (
     mean_absolute_percentage_error,
@@ -55,31 +55,46 @@ from tensorboardX import SummaryWriter
 
 # Optimization
 import ray
-from ray.tune import Tuner, TuneConfig, with_parameters
-from ray.train import RunConfig
+from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
 from ray.tune.search.hyperopt import HyperOptSearch
 from ray.tune.search.bayesopt import BayesOptSearch
 from ray.tune.logger import TBXLoggerCallback
 from ray.tune.schedulers import ASHAScheduler
 from ray.air import session
 
+# HyperOpt standalone
+from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
+
 # Internal library
 from lecrapaud.search_space import all_models
 from lecrapaud.directories import clean_directory
 from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
-from lecrapaud.config import PYTHON_ENV
+from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
 from lecrapaud.feature_selection import load_train_data
 from lecrapaud.db import (
     Model,
     ModelSelection,
-
-    Score,
+    ModelSelectionScore,
     Target,
     Experiment,
 )
+from lecrapaud.mixins import LeCrapaudEstimatorMixin
 
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
+# Suppress XGBoost and LightGBM logging
+import logging
+
+logging.getLogger("lightgbm").setLevel(logging.ERROR)
+logging.getLogger("xgboost").setLevel(logging.ERROR)
+
+# Set global verbosity for XGBoost
+xgb.set_config(verbosity=0)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
 # Reproducible result
 keras.utils.set_random_seed(42)
 np.random.seed(42)
@@ -110,7 +125,64 @@ def test_hardware():
     warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 
 
-class ModelEngine:
+class CatBoostWrapper:
+    """
+    Transparent proxy for a CatBoost model that accepts arbitrary keyword arguments
+    as direct attributes, while forwarding all method calls and properties.
+    """
+
+    __slots__ = ("_model", "_extra_attrs")
+
+    def __init__(self, model, **kwargs):
+        object.__setattr__(self, "_model", model)
+        object.__setattr__(self, "_extra_attrs", {})
+        # Register kwargs as direct attributes
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    # ---- Transparent access ----
+    def __getattr__(self, name):
+        """Forward attribute access to the underlying model if not found."""
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            return getattr(model, name)
+        extra_attrs = object.__getattribute__(self, "_extra_attrs")
+        if name in extra_attrs:
+            return extra_attrs[name]
+        raise AttributeError(f"{type(self).__name__!r} has no attribute {name!r}")
+
+    def __setattr__(self, name, value):
+        """Set to wrapper or forward to model when appropriate."""
+        if name in CatBoostWrapper.__slots__:
+            object.__setattr__(self, name, value)
+            return
+
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            setattr(model, name, value)
+        else:
+            extra_attrs = object.__getattribute__(self, "_extra_attrs")
+            extra_attrs[name] = value
+
+    def __dir__(self):
+        """Merge dir() from wrapper, model, and custom attributes."""
+        base = set(super().__dir__())
+        model_attrs = set(dir(object.__getattribute__(self, "_model")))
+        extra_attrs = set(object.__getattribute__(self, "_extra_attrs").keys())
+        return sorted(base | model_attrs | extra_attrs)
+
+    def __repr__(self):
+        model = object.__getattribute__(self, "_model")
+        extras = object.__getattribute__(self, "_extra_attrs")
+        return f"CatBoostWrapper(model={model.__class__.__name__}, extras={extras})"
+
+    @property
+    def model(self):
+        """Access the raw CatBoost model."""
+        return object.__getattribute__(self, "_model")
+
+
+class BaseModel:
 
     def __init__(
         self,
@@ -296,12 +368,15 @@ class ModelEngine:
             )
 
             # Attach metadata for consistency with sklearn path
-
-
-
+            model_wrapped = CatBoostWrapper(
+                model, model_name=self.model_name, target_type=self.target_type
+            )
+            logger.info(
+                f"Successfully created a {model_wrapped.model_name} at {datetime.now()}"
+            )
 
-            self._model =
-            return
+            self._model = model_wrapped
+            return model_wrapped
 
     def fit_boosting(self, x_train, y_train, x_val, y_val, params):
         """
@@ -350,6 +425,7 @@ class ModelEngine:
                     "metric": eval_metric,
                     "num_class": num_class,
                     "verbose": -1,
+                    "verbose_eval": False,
                 },
                 num_boost_round=params["num_boost_round"],
                 train_set=train_data,
@@ -361,6 +437,7 @@ class ModelEngine:
                     ),
                     lgb.record_evaluation(evals_result),
                     tensorboard_callback,
+                    lgb.log_evaluation(period=0),  # Disable evaluation logging
                 ],
             )
         else:
@@ -402,7 +479,7 @@ class ModelEngine:
                 if self.target_type == "regression"
                 else ("logloss" if num_class <= 2 else "mlogloss")
             )
-
+            # XGBoost verbosity already set globally
             model = xgb.train(
                 params={
                     **params["model_params"],
@@ -417,11 +494,11 @@ class ModelEngine:
                     xgb.callback.EarlyStopping(
                         rounds=params["early_stopping_rounds"], save_best=True
                     ),
-
+                    # Removed EvaluationMonitor to suppress logs
                     tensorboard_callback,
                 ],
                 evals_result=evals_result,  # Record evaluation result
-                verbose_eval=
+                verbose_eval=False,  # Disable evaluation logging
             )
 
         model.model_name = self.create_model
@@ -686,6 +763,171 @@ class ModelEngine:
         )
 
 
+def trainable_cv(
+    params,
+    x_train,
+    y_train,
+    x_val,
+    y_val,
+    model_name,
+    target_type,
+    experiment_name,
+    target_number,
+    create_model,
+    n_splits=3,
+    plot=False,
+    log_dir=None,
+    target_clf_thresholds: dict = None,
+    time_series=True,
+    recurrent=False,
+):
+    """Cross-validation version of trainable for hyperopt.
+
+    Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
+    Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
+    """
+    # Combine train and validation data for cross-validation
+    if recurrent:
+        x_train_val = np.concatenate([x_train, x_val], axis=0)
+        y_train_val = np.concatenate([y_train, y_val], axis=0)
+    else:
+        x_train_val = pd.concat([x_train, x_val], axis=0)
+        y_train_val = pd.concat([y_train, y_val], axis=0)
+        # Store original index for later use if needed
+        original_index = x_train_val.index.copy()
+        # Reset index for proper iloc indexing with CV splits
+        x_train_val = x_train_val.reset_index(drop=True)
+        y_train_val = y_train_val.reset_index(drop=True)
+
+    # Choose appropriate cross-validation splitter
+    if time_series:
+        # Time series split for temporal data
+        n_samples = len(x_train_val)
+        test_size = int(n_samples / (n_splits + 1))  # Ensure reasonable test size
+        cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+    else:
+        # Stratified or regular K-fold for i.i.d. data
+        if target_type == "classification":
+            cv_splitter = StratifiedKFold(
+                n_splits=n_splits, shuffle=True, random_state=42
+            )
+        else:
+            cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+    # Store all predictions and true values for pooled metrics
+    all_predictions = []
+    all_y_true = []
+    fold_times = []
+
+    # Get splits based on the CV strategy
+    if time_series or target_type == "regression":
+        splits = cv_splitter.split(x_train_val)
+    else:
+        # For stratified split, we need to pass y
+        if recurrent:
+            # Extract the target from the 2D array (first column is target)
+            y_for_split = y_train_val[:, 0]
+        else:
+            y_for_split = y_train_val
+        splits = cv_splitter.split(x_train_val, y_for_split)
+
+    for fold_idx, (train_idx, val_idx) in enumerate(splits):
+        # Extract fold data
+        if recurrent:
+            x_fold_train = x_train_val[train_idx]
+            y_fold_train = y_train_val[train_idx]
+            x_fold_val = x_train_val[val_idx]
+            y_fold_val = y_train_val[val_idx]
+        else:
+            x_fold_train = x_train_val.iloc[train_idx]
+            y_fold_train = y_train_val.iloc[train_idx]
+            x_fold_val = x_train_val.iloc[val_idx]
+            y_fold_val = y_train_val.iloc[val_idx]
+
+        # Train model for this fold
+        model = BaseModel(
+            model_name=model_name,
+            target_type=target_type,
+            target_number=target_number,
+            create_model=create_model,
+            plot=False,  # Disable individual fold plots
+            log_dir=log_dir,
+        )
+
+        if recurrent:
+            timesteps = params["timesteps"]
+            x_fold_train = x_fold_train[:, -timesteps:, :]
+            x_fold_val = x_fold_val[:, -timesteps:, :]
+
+        # Fit model
+        model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
+
+        # Get predictions
+        y_pred = model.predict(x_fold_val)
+
+        # Handle recurrent model indexing
+        if recurrent:
+            y_fold_val = pd.DataFrame(
+                y_fold_val, columns=["TARGET", "index"]
+            ).set_index("index")
+            y_pred.index = y_fold_val.index
+
+        # Store predictions and true values
+        all_predictions.append(y_pred)
+        all_y_true.append(y_fold_val)
+
+    # Concatenate all fold predictions
+    if target_type == "classification":
+        # For classification, we need to handle probability columns
+        all_pred_df = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        # Ensure we have a DataFrame with TARGET column
+        if isinstance(all_y_series, pd.Series):
+            all_y_df = pd.DataFrame({"TARGET": all_y_series})
+        else:
+            all_y_df = all_y_series
+    else:
+        # For regression, just concatenate the predictions
+        all_pred_series = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        all_pred_df = pd.DataFrame({"PRED": all_pred_series})
+        all_y_df = pd.DataFrame({"TARGET": all_y_series})
+
+    # Create combined prediction DataFrame
+    prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
+
+    # Calculate pooled metrics
+    score = {
+        "DATE": datetime.now(),
+        "MODEL_NAME": model_name,
+        "EVAL_DATA_STD": prediction["TARGET"].std(),
+    }
+
+    # Unscale if needed (for regression with scaling)
+    if (
+        model.need_scaling
+        and target_type == "regression"
+        and model.scaler_y is not None
+    ):
+        prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+            prediction[["TARGET"]].values
+        )
+        prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+            prediction[["PRED"]].values
+        )
+
+    # Evaluate with pooled predictions
+    score.update(evaluate(prediction, target_type, target_clf_thresholds))
+
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
+
+    # Report to Ray if in Ray context
+    if session.get_session():
+        session.report(metrics=score)
+    return score
+
+
 def trainable(
     params,
     x_train,
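`trainable_cv` above pools the out-of-fold predictions and computes a single metric over the concatenation, rather than averaging per-fold scores. The two disagree whenever folds have unequal sizes, as this small self-contained comparison shows (the numbers are made up for illustration):

    import numpy as np
    from sklearn.metrics import mean_squared_error

    # Hypothetical (y_true, y_pred) pairs from three folds of unequal size.
    folds = [
        (np.array([1.0, 2.0]), np.array([1.1, 1.9])),
        (np.array([3.0]), np.array([2.0])),
        (np.array([4.0, 5.0, 6.0]), np.array([4.1, 5.2, 5.8])),
    ]

    # Averaging per-fold RMSEs weights every fold equally, regardless of size.
    mean_of_folds = np.mean([mean_squared_error(y, p) ** 0.5 for y, p in folds])

    # Pooling (the trainable_cv approach) weights every sample equally.
    y_all = np.concatenate([y for y, _ in folds])
    p_all = np.concatenate([p for _, p in folds])
    pooled = mean_squared_error(y_all, p_all) ** 0.5

    print(f"mean of per-fold RMSE: {mean_of_folds:.3f}, pooled RMSE: {pooled:.3f}")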
@@ -697,14 +939,13 @@ def trainable(
     experiment_name,
     target_number,
     create_model,
-    type_name="hyperopts",
     plot=False,
     log_dir=None,
     target_clf_thresholds: dict = None,
 ):
     """Standalone version of train_model that doesn't depend on self"""
     # Create model engine
-    model =
+    model = BaseModel(
         model_name=model_name,
         target_type=target_type,
         target_number=target_number,
@@ -723,9 +964,7 @@ def trainable(
         x_val = x_val[:, -timesteps:, :]
 
     # Compile and fit model on train set
-    start = time.time()
     model.fit(x_train, y_train, x_val, y_val, params)
-    stop = time.time()
 
     # Prediction on val set
     y_pred = model.predict(x_val)
@@ -755,8 +994,6 @@ def trainable(
     score = {
         "DATE": datetime.now(),
        "MODEL_NAME": model.model_name,
-        "TYPE": type_name,
-        "TRAINING_TIME": stop - start,
         "EVAL_DATA_STD": prediction["TARGET"].std(),
     }
 
@@ -765,77 +1002,107 @@ def trainable(
     metric = "RMSE" if target_type == "regression" else "LOGLOSS"
     logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
 
-    if
+    # Report to Ray if in Ray context
+    if session.get_session():
         session.report(metrics=score)
         return score
 
     return score, model, prediction
 
 
-class ModelSelectionEngine:
+class ModelSelector(LeCrapaudEstimatorMixin):
 
     def __init__(
         self,
-
-
-        target_number,
-        target_clf,
-        experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
+        experiment: Experiment = None,
+        target_number: int = None,
         **kwargs,
     ):
-
-
+        # The mixin will automatically set all experiment.context parameters as attributes
+        super().__init__(experiment=experiment, target_number=target_number, **kwargs)
+
+        # Set defaults for required parameters if not provided
+        if not hasattr(self, "target_clf"):
+            self.target_clf = []
+        if not hasattr(self, "models_idx"):
+            self.models_idx = []
+        if not hasattr(self, "time_series"):
+            self.time_series = False
+        if not hasattr(self, "date_column"):
+            self.date_column = None
+        if not hasattr(self, "group_column"):
+            self.group_column = None
+        if not hasattr(self, "target_clf_thresholds"):
+            self.target_clf_thresholds = {}
         self.target_number = target_number
-        self.experiment = experiment
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
-        self.target_clf_thresholds = (
-            target_clf_thresholds[target_number]
-            if target_number in target_clf_thresholds.keys()
-            else None
-        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Handle target_clf_thresholds for specific target
+        # Handle both string and integer keys for backward compatibility
+        if self.target_number and self.target_clf_thresholds:
+            # Try both integer and string versions of the target number
+            if self.target_number in self.target_clf_thresholds:
+                self.target_clf_thresholds = self.target_clf_thresholds[
+                    self.target_number
+                ]
+            elif str(self.target_number) in self.target_clf_thresholds:
+                self.target_clf_thresholds = self.target_clf_thresholds[
+                    str(self.target_number)
+                ]
+
+        # Derived attributes
+        if self.target_number is not None:
+            self.target_type = (
+                "classification"
+                if self.target_number in self.target_clf
+                else "regression"
+            )
+            self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
+
+        # Set paths and features if experiment is available
+        if self.experiment:
+            self.experiment_dir = self.experiment.path
+            self.experiment_id = self.experiment.id
+            self.data_dir = f"{self.experiment_dir}/data"
+            self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
+
+            if self.target_number is not None:
+                self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+                self.features = self.experiment.get_features(self.target_number)
+
+            self.all_features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
 
     # Main training function
-    def
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def fit(self, X, y=None, reshaped_data=None, best_params=None):
         """
-
-
+        Fit the model selector (train and select best model).
+
+        Args:
+            X: Either a DataFrame or a dict with train/val/test data
+            y: Target values (ignored, uses TARGET columns)
+            reshaped_data: Optional reshaped data for recurrent models
+            best_params: Optional pre-defined best parameters
+
+        Returns:
+            self: Returns self for chaining
         """
-
-
-
+        # Handle both DataFrame and dict inputs
+        if isinstance(X, dict):
+            self.data = X
+            self.reshaped_data = reshaped_data
+        else:
+            # For simple DataFrame input, we expect it to be just training data
+            # This is less common for ModelSelector which typically needs train/val/test
+            raise ValueError("ModelSelector requires a dict with train/val/test data")
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
 
         if self.experiment_id is None:
             raise ValueError("Please provide a experiment.")
@@ -885,12 +1152,11 @@ class ModelSelectionEngine:
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
         model_selection = ModelSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
 
-        #
+        # STEP 1 : TRAINING MODELS
         for i in self.models_idx:
             config = all_models[i]
             recurrent = config["recurrent"]
@@ -903,24 +1169,16 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
 
-            logger.info(
-
-                match_fields=["name", "type"],
-                name=model_name,
-                type=self.target_type,
-            )
-            model_training = ModelTraining.upsert(
-                match_fields=["model_id", "model_selection_id"],
-                model_id=model.id,
-                model_selection_id=model_selection.id,
+            logger.info(
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
-            #
+            # Getting data
             if recurrent:
                 # Clear cluster from previous Keras session graphs.
                 K.clear_session()
@@ -930,7 +1188,7 @@ class ModelSelectionEngine:
                     for i, e in enumerate(self.all_features)
                    if e in set(self.features)
                 ]
-                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
+                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
                 x_train = x_train_reshaped[:, :, features_idx]
                 y_train = y_train_reshaped[:, [self.target_number, 0]]
                 x_val = x_val_reshaped[:, :, features_idx]
@@ -960,8 +1218,9 @@ class ModelSelectionEngine:
                 y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
 
             log_dir = get_log_dir(self.target_dir, model_name)
-
-
+
+            # Instantiate model
+            model = BaseModel(
                 target_number=self.target_number,
                 model_name=model_name,
                 search_params=config["search_params"],
@@ -971,9 +1230,9 @@ class ModelSelectionEngine:
                 log_dir=log_dir,
             )
 
-            start = time.time()
             # Tuning hyperparameters
-
+            start = time.time()
+            if self.perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
                 )
@@ -989,7 +1248,7 @@ class ModelSelectionEngine:
                     f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
                 )
 
-            #
+            # Save best params
             best_params_file = f"{self.target_dir}/best_params.json"
             try:
                 with open(best_params_file, "r") as f:
@@ -1001,114 +1260,25 @@ class ModelSelectionEngine:
             with open(best_params_file, "w") as f:
                 json.dump(json_dict, f, indent=4)
 
-            #
-            if
-
-
-
-
-
-
-
-
-
-                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                    self.type_name = f"crossval_fold_{i}"
-
-                    if self.time_series:
-                        date_series = pd.concat(
-                            [
-                                train[self.date_column],
-                                val[self.date_column],
-                                test[self.date_column],
-                            ],
-                            axis=0,
-                        ).reset_index(drop=True)
-
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
-
-                        # Now you can use the actual train/val indices to extract ranges
-                        train_start = date_series.iloc[train_index[0]]
-                        train_end = date_series.iloc[train_index[-1]]
-                        val_start = date_series.iloc[val_index[0]]
-                        val_end = date_series.iloc[val_index[-1]]
-
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                        )
-                    else:
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                        )
-
-                    # Train the model and get the score
-                    if recurrent:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val[train_index],
-                            y_train=y_train_val[train_index],
-                            x_val=x_train_val[val_index],
-                            y_val=y_train_val[val_index],
-                            model=model,
-                        )
-                    else:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val.iloc[train_index],
-                            y_train=y_train_val.iloc[train_index],
-                            x_val=x_train_val.iloc[val_index],
-                            y_val=y_train_val.iloc[val_index],
-                            model=model,
-                        )
-
-                    # Append score to the list
-                    cv_scores.append(cv_score)
-
-                # Calculate mean of all numerical metrics across all cross-validation folds
-                cv_scores_df = pd.DataFrame(cv_scores)
-                # Get mean of all numeric columns
-                cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+            # Always evaluate on test set (no cross-validation here)
+            # The hyperopt already did CV if needed to find best params
+            best_score, best_model, best_pred = self.train_model(
+                params=model_best_params,
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
+                model=model,
+            )
+            stop = time.time()
+            training_time = stop - start
 
-
-
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+            logger.info(f"👉 {model.model_name} scores on test set:")
+            for metric, value in best_score.items():
+                if isinstance(value, (int, float)):
                     logger.info(f"    {metric}: {value:.4f}")
 
-                # Retrain on entire training set, but keep score on cross-validation folds
-                # Get the test score using the best model
-                test_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                # Update all metrics with cross-validation means
-                for metric, value in cv_means.items():
-                    if metric in test_score:  # Only update existing metrics
-                        test_score[metric] = value
-                best_score = test_score
-                best_score["TYPE"] = "crossval"
-            else:
-                # Evaluate on test set
-                self.type_name = "testset"
-                best_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                logger.info(f"👉 {model.model_name} scores on test set:")
-                for metric, value in best_score.items():
-                    if isinstance(value, (int, float)):
-                        logger.info(f"    {metric}: {value:.4f}")
-
             # Save predictions
             best_pred.to_csv(
                 f"{self.results_dir}/prediction.csv",
@@ -1119,7 +1289,6 @@ class ModelSelectionEngine:
 
             # Save best model
             model_path = best_model.save(self.results_dir)
-
             model_path = Path(model_path).resolve()
             best_score["MODEL_PATH"] = model_path
 
@@ -1142,32 +1311,26 @@ class ModelSelectionEngine:
             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
             scores_tracking.to_csv(scores_tracking_path, index=False)
 
-            # Save
-            stop = time.time()
-            training_time = stop - start
-            model_training.best_params = model_best_params
-            model_training.model_path = model_path
-            model_training.training_time = training_time
-            model_training.save()
-
-            # Store metrics in DB
+            # Save in db
             drop_cols = [
                 "DATE",
                 "MODEL_NAME",
-                "MODEL_PATH",
             ]
             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
             score_data = {k.lower(): v for k, v in best_score.items()}
-
-
-
-
+            model = Model.upsert(
+                name=model_name,
+                type=self.target_type,
+            )
+            ModelSelectionScore.upsert(
+                model_id=model.id,
+                model_selection_id=model_selection.id,
+                best_params=serialize_for_json(model_best_params),
+                training_time=training_time,
                 **score_data,
             )
 
-
-
-        # find best model type
+        # STEP 2 :FINDING BEST MODEL OVERALL
         scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         scores_tracking = pd.read_csv(scores_tracking_path)
         best_score_overall = scores_tracking.iloc[0, :]
@@ -1178,12 +1341,11 @@ class ModelSelectionEngine:
         else:
             best_thresholds = None
 
-        # Remove any .best or .keras files
+        # Remove any .best or .keras files, and save best model in target_dir
         for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
             os.path.join(self.target_dir, "*.keras")
         ):
             os.remove(file_path)
-        # Copy the best model in root training folder for this target
         best_model_path = Path(
             f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
         ).resolve()
@@ -1195,13 +1357,13 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # Save
-
+        # Save to db
         model_selection = ModelSelection.get(model_selection.id)
-
+        model = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
-        )
-        model_selection.
+        )
+        model_selection.best_model_id = model.id
+        model_selection.best_model_params = serialize_for_json(best_model_params)
         model_selection.best_thresholds = best_thresholds
         model_selection.best_model_path = best_model_path
 
@@ -1214,7 +1376,7 @@ class ModelSelectionEngine:
             k: v for k, v in best_score_overall.items() if k not in drop_cols
         }
         score_data = {k.lower(): v for k, v in best_score_overall.items()}
-        model_selection.best_score = score_data
+        model_selection.best_score = serialize_for_json(score_data)
         model_selection.save()
 
         logger.info(f"Best model overall is : {best_score_overall}")
@@ -1222,11 +1384,188 @@ class ModelSelectionEngine:
         # Consolidate best parameters from all targets into a single file
         self.consolidate_best_params()
 
-
-
+        self.best_model_ = BaseModel(
+            path=self.target_dir, target_number=self.target_number
+        )
+        self._set_fitted()
+        return self
+
+    def get_best_model(self):
+        """
+        Get the best trained model.
+
+        Returns:
+            The best model found during training
+        """
+        self._check_is_fitted()
+        return self.best_model_
+
+    def hyperoptimize(self, x_train, y_train, x_val, y_val, model: BaseModel):
+        """Choose between Ray Tune and HyperOpt standalone based on configuration."""
+        if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
+            return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
+        elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
+            return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+        else:
+            raise ValueError(
+                f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
+            )
+
+    def hyperoptimize_hyperopt(self, x_train, y_train, x_val, y_val, model: BaseModel):
+        """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
+
+        logger.info("Start tuning hyperparameters with HyperOpt standalone...")
+
+        # Convert Ray search space to HyperOpt search space
+        def convert_search_space(ray_space):
+            """Convert Ray Tune search space to HyperOpt format."""
+            from ray.tune.search.sample import Categorical, Float, Integer
+
+            hp_space = {}
+            for key, value in ray_space.items():
+                if isinstance(value, Float):
+                    if (
+                        hasattr(value, "sampler")
+                        and value.sampler.__class__.__name__ == "LogUniform"
+                    ):
+                        # LogUniform distribution
+                        hp_space[key] = hp.loguniform(
+                            key, np.log(value.lower), np.log(value.upper)
+                        )
+                    else:
+                        # Uniform distribution
+                        hp_space[key] = hp.uniform(key, value.lower, value.upper)
+                elif isinstance(value, Integer):
+                    # Integer uniform distribution
+                    hp_space[key] = hp.randint(key, value.lower, value.upper)
+                elif isinstance(value, Categorical):
+                    # Categorical/choice distribution
+                    hp_space[key] = hp.choice(key, value.categories)
+                elif isinstance(value, dict):
+                    # Nested dict, recurse
+                    hp_space[key] = convert_search_space(value)
+                else:
+                    # Static value or unknown type
+                    hp_space[key] = value
+            return hp_space
+
+        # Create objective function for HyperOpt
+        def objective(params):
+            """Objective function to minimize."""
+            try:
+                # Convert numpy types to native Python types
+                params = serialize_for_json(params)
+
+                # Use existing trainable function based on perform_crossval
+                if self.perform_crossval:
+                    score = trainable_cv(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        n_splits=3,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                        time_series=self.time_series,
+                        recurrent=model.recurrent,
+                    )
+                else:
+                    score, _, _ = trainable(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                    )
+
+                # HyperOpt minimizes, so return the metric directly
+                loss = score[self.metric]
+
+                # Log trial info
+                logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
+
+                return {
+                    "loss": loss,
+                    "status": STATUS_OK,
+                    "score": score,  # Keep full score dict for analysis
+                }
+
+            except Exception as e:
+                logger.error(f"Trial failed: {str(e)}")
+                return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
+
+        # Convert search space
+        hp_search_space = convert_search_space(model.search_params)
+
+        # Run optimization
+        trials = Trials()
+        best_params = fmin(
+            fn=objective,
+            space=hp_search_space,
+            algo=tpe.suggest,
+            max_evals=self.number_of_trials,
+            trials=trials,
+            verbose=True,
+            show_progressbar=True,
+        )
+
+        # Get the actual parameter values (not just indices for hp.choice)
+        best_params = space_eval(hp_search_space, best_params)
+
+        # Convert numpy types to native Python types
+        best_params = serialize_for_json(best_params)
+
+        # Get best score from trials
+        best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
+        best_score = trials.trials[best_trial_idx]["result"].get("score", {})
+
+        # Log results
+        logger.info(f"Best hyperparameters found were:\n{best_params}")
+        logger.info(f"Best Scores found were:\n{best_score}")
+
+        # Create summary DataFrame for consistency with Ray version
+        results_df = pd.DataFrame(
+            [
+                {
+                    "trial_id": i,
+                    self.metric: t["result"]["loss"],
+                    **{
+                        k: v
+                        for k, v in t["result"].get("score", {}).items()
+                        if isinstance(v, (int, float))
+                    },
+                }
+                for i, t in enumerate(trials.trials)
+                if t["result"]["status"] == STATUS_OK
+            ]
+        )
+
+        if not results_df.empty:
+            logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
 
-
-
+        # Save trial history for analysis
+        trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
+        with open(trials_path, "wb") as f:
+            pickle.dump(trials, f)
+
+        return best_params
+
+    def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: BaseModel):
 
         def collect_error_logs(target_dir: int, storage_path: str):
             output_error_file = f"{target_dir}/errors.log"
@@ -1269,9 +1608,22 @@ class ModelSelectionEngine:
                 }
             )
 
+        # Choose between regular trainable or CV version based on perform_crossval flag
+        # perform_crossval controls whether to use CV during hyperopt
+        if self.perform_crossval:
+            trainable_fn = trainable_cv
+            additional_params = {
+                "n_splits": 3,  # Can be made configurable
+                "time_series": self.time_series,  # Controls whether to use TimeSeriesSplit or StratifiedKFold
+                "recurrent": model.recurrent,
+            }
+        else:
+            trainable_fn = trainable
+            additional_params = {}
+
         tuner = Tuner(
             trainable=with_parameters(
-
+                trainable_fn,
                 x_train=x_train,
                 y_train=y_train,
                 x_val=x_val,
@@ -1281,10 +1633,10 @@ class ModelSelectionEngine:
                 experiment_name=self.experiment_name,
                 target_number=self.target_number,
                 create_model=model.create_model,
-                type_name="hyperopts",
                 plot=model.plot,
                 log_dir=model.log_dir,
                 target_clf_thresholds=self.target_clf_thresholds,
+                **additional_params,
             ),
             param_space=model.search_params,
             tune_config=TuneConfig(
@@ -1324,7 +1676,7 @@ class ModelSelectionEngine:
 
         return best_params
 
-    def train_model(self, params, x_train, y_train, x_val, y_val, model:
+    def train_model(self, params, x_train, y_train, x_val, y_val, model: BaseModel):
         # Use the standalone training function to avoid duplication
         # For train_model, we pass the data directly (not as Ray references)
         return trainable(
@@ -1338,7 +1690,6 @@ class ModelSelectionEngine:
             self.experiment_name,
             self.target_number,
             model.create_model,
-            self.type_name,
             model.plot,
             log_dir=model.log_dir,
             target_clf_thresholds=self.target_clf_thresholds,
@@ -1444,11 +1795,11 @@ def evaluate(
         y_pred_proba = (
             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
         )
-        if num_classes > 2:
-
-
-
-
+        # if num_classes > 2:
+        #     lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+        #     lb.fit(labels)
+        #     y_true_onhot = lb.transform(y_true)
+        #     y_pred_onehot = lb.transform(y_pred)
 
         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
         score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1825,6 +2176,20 @@ class Thresholds(BaseModel):
 def find_best_threshold(
     prediction: pd.DataFrame, metric: str = "recall", target_value: float | None = None
 ) -> Thresholds:
+    def _normalize_class_label(cls):
+        if isinstance(cls, (np.integer, int)):
+            return int(cls)
+        if isinstance(cls, (float, np.floating)) and cls.is_integer():
+            return int(cls)
+        if isinstance(cls, str):
+            try:
+                as_float = float(cls)
+                if as_float.is_integer():
+                    return int(as_float)
+            except ValueError:
+                pass
+        return cls
+
     """
     General function to find best threshold optimizing recall, precision, or f1.
 
@@ -1843,10 +2208,15 @@ def find_best_threshold(
     pred_cols = [
         col for col in prediction.columns if col not in ["ID", "TARGET", "PRED"]
     ]
-    classes =
+    classes = (
+        [1]
+        if len(pred_cols) <= 2
+        else sorted({_normalize_class_label(cls) for cls in y_true.unique()}, key=str)
+    )
 
     results = {}
-    for
+    for raw_cls in classes:
+        cls = _normalize_class_label(raw_cls)
         cls_str = str(cls)
         if cls_str not in prediction.columns and cls not in prediction.columns:
             logger.warning(f"Missing predicted probabilities for class '{cls}'")
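Given the `_normalize_class_label` helper added above, integer labels, integer-valued floats, and numeric strings all collapse to the same key, so `1`, `1.0`, and `"1.0"` address the same probability column while non-numeric labels pass through untouched:

    import numpy as np

    assert _normalize_class_label(np.int64(1)) == 1
    assert _normalize_class_label(1.0) == 1
    assert _normalize_class_label("1.0") == 1
    assert _normalize_class_label("2") == 2
    assert _normalize_class_label("positive") == "positive"  # non-numeric passes through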