lecrapaud 0.12.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

lecrapaud/api.py CHANGED
@@ -175,7 +175,7 @@ class ExperimentEngine:
         )
         features = self.experiment.get_features(target_number)
 
-        model = ModelEngine(path=target_dir)
+        model = ModelEngine(path=target_dir, target_number=target_number)
 
         # getting data
         if model.recurrent:
@@ -335,6 +335,10 @@ class ExperimentEngine:
             group_column=self.group_column,
             target_clf_thresholds=self.target_clf_thresholds,
         )
+        if best_params and target_number not in best_params.keys():
+            raise ValueError(
+                f"Target {target_number} not found in best_params passed as argument"
+            )
         app.run(
             self.experiment_name,
             perform_hyperopt=self.perform_hyperopt,
@@ -342,7 +346,7 @@ class ExperimentEngine:
             perform_crossval=self.perform_crossval,
             plot=self.plot,
             preserve_model=self.preserve_model,
-            best_params=best_params[target_number],
+            best_params=best_params[target_number] if best_params else None,
         )
 
     def get_scores(self, target_number: int):
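
The guard added above means a best_params dict that lacks the requested target now fails fast instead of surfacing as a KeyError deeper in the run. A toy sketch of the check in isolation (the dict contents are hypothetical):

# Hypothetical params keyed by target number; target 2 is deliberately missing.
best_params = {1: {"max_depth": 5}}
target_number = 2

if best_params and target_number not in best_params.keys():
    raise ValueError(
        f"Target {target_number} not found in best_params passed as argument"
    )
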
@@ -466,3 +470,52 @@ class ExperimentEngine:
             plot_threshold(tmp_pred, threshold, precision, recall)
         else:
             logger.info(f"No threshold found for class {class_label}")
+
+    def get_best_params(self, target_number: int = None) -> dict:
+        """
+        Load the best parameters for the experiment.
+
+        Args:
+            target_number (int, optional): If provided, returns parameters for this specific target.
+                If None, returns parameters for all targets.
+
+        Returns:
+            dict: Dictionary containing the best parameters. If target_number is provided,
+                returns parameters for that target only. Otherwise, returns a dictionary
+                with target numbers as keys.
+        """
+        import json
+        import os
+
+        params_file = os.path.join(
+            self.experiment.path, "preprocessing", "all_targets_best_params.json"
+        )
+
+        if not os.path.exists(params_file):
+            raise FileNotFoundError(
+                f"Best parameters file not found at {params_file}. "
+                "Make sure to run model training first."
+            )
+
+        try:
+            with open(params_file, "r") as f:
+                all_params = json.load(f)
+
+            # Convert string keys to integers
+            all_params = {int(k): v for k, v in all_params.items()}
+
+            if target_number is not None:
+                if target_number not in all_params:
+                    available_targets = list(all_params.keys())
+                    raise ValueError(
+                        f"No parameters found for target {target_number}. "
+                        f"Available targets: {available_targets}"
+                    )
+                return all_params[target_number]
+
+            return all_params
+
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error parsing best parameters file: {str(e)}")
+        except Exception as e:
+            raise Exception(f"Error loading best parameters: {str(e)}")
lecrapaud/model_selection.py CHANGED
@@ -114,6 +114,7 @@ class ModelEngine:
         self,
         model_name: str = None,
         target_type: str = None,
+        target_number: int = None,
         path: str = None,
         search_params: dict = {},
         create_model=None,
@@ -126,6 +127,7 @@ class ModelEngine:
         else:
             self.model_name = model_name
             self.target_type = target_type
+        self.target_number = target_number
 
         config = [
             config for config in all_models if config["model_name"] == self.model_name
@@ -326,18 +328,16 @@ class ModelEngine:
             writer.close()
 
         if self.plot:
-            # Plot loss per epoch
-            train_loss = evals_result["train"][eval_metric]
-            val_loss = evals_result["val"][eval_metric]
-            logs = pd.DataFrame({"train": train_loss, "val": val_loss})
-
-            plt.figure(figsize=(14, 4))
-            plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
-            plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
-            plt.xlabel("Epoch")
-            plt.ylabel("Loss")
-            plt.legend()
-            plt.show()
+            # Plot training progress
+            plot_training_progress(
+                logs={
+                    "train": evals_result["train"][eval_metric],
+                    "val": evals_result["val"][eval_metric],
+                },
+                model_name=self.model_name,
+                target_number=self.target_number,
+                title_suffix=f"Training Progress - {eval_metric}",
+            )
 
         self._model = model
 
@@ -465,16 +465,12 @@ class ModelEngine:
             # logger.info(pd.DataFrame(gradiant.epoch_gradient))
 
         if self.plot:
-            # Plot loss per epoch
-            logs = pd.DataFrame(history.history)
-
-            plt.figure(figsize=(14, 4))
-            plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
-            plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
-            plt.xlabel("Epoch")
-            plt.ylabel("Loss")
-            plt.legend()
-            plt.show()
+            # Plot training progress using the utility function
+            plot_training_progress(
+                logs=history.history,
+                model_name=self.model_name,
+                target_number=self.target_number,
+            )
 
         self._model = model
 
@@ -605,6 +601,7 @@ def trainable(
     model = ModelEngine(
         model_name=model_name,
         target_type=target_type,
+        target_number=target_number,
        create_model=create_model,
        plot=plot,
        log_dir=log_dir,
@@ -659,6 +656,9 @@ def trainable(
 
     score.update(evaluate(prediction, target_type, target_clf_thresholds))
 
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
+
     if type_name == "hyperopts":
         session.report(metrics=score)
         return score
@@ -856,6 +856,7 @@ class ModelSelectionEngine:
         log_dir = get_log_dir(self.target_dir, model_name)
         # instantiate model
         model = ModelEngine(
+            target_number=self.target_number,
             model_name=model_name,
             search_params=config["search_params"],
             target_type=self.target_type,
@@ -904,16 +905,22 @@ class ModelSelectionEngine:
             tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
 
             # Store the scores
-            cross_validation_scores = []
+            cv_scores = []
 
             for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
                 self.type_name = f"crossval_fold_{i}"
 
                 if self.time_series:
-                    date_series = train[self.date_column].copy()
+                    date_series = pd.concat(
+                        [
+                            train[self.date_column],
+                            val[self.date_column],
+                            test[self.date_column],
+                        ],
+                        axis=0,
+                    ).reset_index(drop=True)
 
-                    if need_scaling:
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
+                    date_series = date_series.map(pd.Timestamp.fromordinal)
 
                     # Now you can use the actual train/val indices to extract ranges
                     train_start = date_series.iloc[train_index[0]]
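
The date handling fix matters because TimeSeriesSplit produces positional indices over the full train+val+test window, so slicing dates from train alone could fall out of range. A self-contained sketch with synthetic data:

import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# Synthetic ordinal dates standing in for the train/val/test date columns.
dates = pd.Series(pd.date_range("2024-01-01", periods=10)).map(pd.Timestamp.toordinal)
train, val, test = dates[:6], dates[6:8], dates[8:]

# Mirror the fixed logic: concatenate every split before mapping back to timestamps.
date_series = pd.concat([train, val, test], axis=0).reset_index(drop=True)
date_series = date_series.map(pd.Timestamp.fromordinal)

tscv = TimeSeriesSplit(n_splits=3)
for i, (train_index, val_index) in enumerate(tscv.split(date_series)):
    print(
        f"fold {i}: train ends {date_series.iloc[train_index[-1]].date()}, "
        f"val ends {date_series.iloc[val_index[-1]].date()}"
    )
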
@@ -932,7 +939,7 @@ class ModelSelectionEngine:
 
                 # Train the model and get the score
                 if recurrent:
-                    cross_validation_score, _, _ = self.train_model(
+                    cv_score, _, _ = self.train_model(
                         params=model_best_params,
                         x_train=x_train_val[train_index],
                         y_train=y_train_val[train_index],
@@ -941,7 +948,7 @@ class ModelSelectionEngine:
                         model=model,
                     )
                 else:
-                    cross_validation_score, _, _ = self.train_model(
+                    cv_score, _, _ = self.train_model(
                         params=model_best_params,
                         x_train=x_train_val.iloc[train_index],
                         y_train=y_train_val.iloc[train_index],
@@ -951,18 +958,20 @@ class ModelSelectionEngine:
                     )
 
                 # Append score to the list
-                cross_validation_scores.append(cross_validation_score)
-
-            # Calculate and log the mean score
-            cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
-                self.metric
-            ].mean()
-            logger.info(
-                f"Best model mean cross-validation score on entire experiment: {cross_validation_mean_score}"
-            )
+                cv_scores.append(cv_score)
+
+            # Calculate mean of all numerical metrics across all cross-validation folds
+            cv_scores_df = pd.DataFrame(cv_scores)
+            # Get mean of all numeric columns
+            cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+
+            logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
+            for metric, value in cv_means.items():
+                logger.info(f"    {metric}: {value:.4f}")
 
             # Retrain on entire training set, but keep score on cross-validation folds
-            best_score, best_model, best_pred = self.train_model(
+            # Get the test score using the best model
+            test_score, best_model, best_pred = self.train_model(
                 params=model_best_params,
                 x_train=pd.concat([x_train, x_val], axis=0),
                 y_train=pd.concat([y_train, y_val], axis=0),
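
The aggregation change replaces a single-metric mean with a per-metric mean over every numeric column returned by the folds. A small sketch with made-up fold scores:

import pandas as pd

# Synthetic fold scores; non-numeric columns are skipped by numeric_only=True.
cv_scores = [
    {"RMSE": 0.12, "R2": 0.85, "MODEL_NAME": "xgboost"},
    {"RMSE": 0.10, "R2": 0.88, "MODEL_NAME": "xgboost"},
]
cv_means = pd.DataFrame(cv_scores).mean(numeric_only=True).to_dict()
print(cv_means)  # ≈ {'RMSE': 0.11, 'R2': 0.865}
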
@@ -970,10 +979,16 @@ class ModelSelectionEngine:
                 y_val=y_test,
                 model=model,
             )
-            best_score = cross_validation_mean_score
+
+            # Update all metrics with cross-validation means
+            for metric, value in cv_means.items():
+                if metric in test_score:  # Only update existing metrics
+                    test_score[metric] = value
+            best_score = test_score
+            best_score["TYPE"] = "crossval"
         else:
-            # Evaluate on validation set
-            self.type_name = "validation"
+            # Evaluate on test set
+            self.type_name = "testset"
             best_score, best_model, best_pred = self.train_model(
                 params=model_best_params,
                 x_train=pd.concat([x_train, x_val], axis=0),
@@ -983,9 +998,11 @@ class ModelSelectionEngine:
                 model=model,
             )
 
-        logger.info(f"Best model scores on test set: {best_score}")
+        logger.info(f"👉 {model.model_name} scores on test set:")
+        for metric, value in best_score.items():
+            logger.info(f"    {metric}: {value:.4f}")
 
-        # Save validation predictions
+        # Save predictions
         best_pred.to_csv(
             f"{self.results_dir}/prediction.csv",
             index=True,
@@ -999,7 +1016,7 @@ class ModelSelectionEngine:
         model_path = Path(model_path).resolve()
         best_score["MODEL_PATH"] = model_path
 
-        # Track scores
+        # Save best scores
         scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         best_score_df = pd.DataFrame([best_score])
 
@@ -1071,7 +1088,7 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # save model_selection results to db
+        # Save model_selection results to db
         model_selection = ModelSelection.get(model_selection.id)
         model_selection.best_model_id = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
@@ -1083,6 +1100,9 @@ class ModelSelectionEngine:
 
         logger.info(f"Best model overall is : {best_score_overall}")
 
+        # Consolidate best parameters from all targets into a single file
+        self.consolidate_best_params()
+
         best_model = joblib.load(best_model_path)
         return best_model
 
@@ -1184,11 +1204,53 @@ class ModelSelectionEngine:
             target_clf_thresholds=self.target_clf_thresholds,
         )
 
+    def consolidate_best_params(self):
+        """
+        Consolidate best parameters from all targets into a single JSON file in the preprocessing folder.
+        The output will be a dictionary with target numbers as keys and their best parameters as values.
+        """
+        # Initialize the consolidated parameters dictionary
+        all_best_params = {}
+
+        # Find all target directories
+        target_dirs = [
+            d for d in os.listdir(self.experiment_dir) if d.startswith("TARGET_")
+        ]
+
+        for target_dir in target_dirs:
+            target_number = target_dir.split("_")[1]
+            best_params_file = os.path.join(
+                self.experiment_dir, target_dir, "best_params.json"
+            )
+
+            # Check if best_params.json exists for this target
+            if os.path.exists(best_params_file):
+                try:
+                    with open(best_params_file, "r") as f:
+                        target_params = json.load(f)
+                    all_best_params[target_number] = target_params
+                except Exception as e:
+                    logger.warning(
+                        f"Error loading best params for {target_dir}: {str(e)}"
+                    )
+
+        # Save consolidated parameters to preprocessing folder
+        if all_best_params:
+            output_file = os.path.join(
+                self.preprocessing_dir, "all_targets_best_params.json"
+            )
+            os.makedirs(os.path.dirname(output_file), exist_ok=True)
+            with open(output_file, "w") as f:
+                json.dump(all_best_params, f, indent=4)
+            logger.info(f"Consolidated best parameters saved to {output_file}")
+
+        return all_best_params
+
 
 def evaluate(
     prediction: pd.DataFrame,
     target_type: str,
-    target_clf_thresholds: dict = {"precision": 0.80},
+    target_clf_thresholds: dict = None,
 ):
     """
     Function to evaluate model performance
@@ -1202,6 +1264,10 @@ def evaluate(
     y_true = prediction["TARGET"]
     y_pred = prediction["PRED"]
 
+    # Set default threshold if not provided
+    if target_clf_thresholds is None:
+        target_clf_thresholds = {"precision": 0.80}
+
     if target_type == "regression":
         # Main metrics
         score["RMSE"] = root_mean_squared_error(y_true, y_pred)
@@ -1330,6 +1396,46 @@ def load_model(target_dir: str):
     )
 
 
+def plot_training_progress(
+    logs, model_name, target_number, title_suffix="Training Progress"
+):
+    """
+    Plot training and validation metrics during model training.
+
+    Args:
+        logs: DataFrame or dict containing training history
+        model_name: Name of the model being trained
+        target_number: Target number for the model
+        title_suffix: Optional suffix for the plot title
+    """
+    if isinstance(logs, dict):
+        logs = pd.DataFrame(logs)
+
+    plt.figure(figsize=(14, 4))
+
+    # Plot all metrics that exist in the logs
+    if "loss" in logs.columns:
+        plt.plot(logs["loss"], lw=2, label="Training loss")
+    if "val_loss" in logs.columns:
+        plt.plot(logs["val_loss"], lw=2, label="Validation loss")
+
+    # If no specific loss columns, plot all available metrics
+    if "loss" not in logs.columns and "val_loss" not in logs.columns and not logs.empty:
+        for col in logs.columns:
+            if col.startswith("val_"):
+                plt.plot(logs[col], "--", lw=2, label=f"Validation {col[4:]}")
+            else:
+                plt.plot(logs[col], lw=2, label=f"Training {col}")
+
+    plt.title(f"{model_name} - Target {target_number}\n{title_suffix}")
+    plt.xlabel("Epoch")
+    plt.ylabel("Metric Value")
+    plt.legend()
+    plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.show()
+
+
 # plots
 def plot_evaluation_for_classification(prediction: dict):
     """
lecrapaud-0.12.1.dist-info/METADATA → lecrapaud-0.13.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.12.1
+Version: 0.13.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet
lecrapaud-0.12.1.dist-info/RECORD → lecrapaud-0.13.0.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
-lecrapaud/api.py,sha256=YKxeWcTuq3QZ-UI9UjRkq7OI7W1UXW0EWgBbY3uEmLE,17409
+lecrapaud/api.py,sha256=K5eM5dXtU8DGH6je7Ai60hOgycXUAIVE1OvMh3Qvh5c,19541
 lecrapaud/config.py,sha256=eYnrktVq457xMIMGcUSilJdNxCsaGP_gRAlzCSwd6Vo,1047
 lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
 lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -35,10 +35,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
 lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
 lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
 lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
-lecrapaud/model_selection.py,sha256=8TfYjVJnFDviycX4DMe6mpHm7oxTfS-UXO55TvOLPJs,63377
+lecrapaud/model_selection.py,sha256=S16Zc6PxyNx-HrB_5JucCijFMDAjZlHiHPrl7mer4Cw,67517
 lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
 lecrapaud/utils.py,sha256=JdBB1NvbNIx4y0Una-kSZdo1_ZEocc5hwyYFIZKHmGg,8305
-lecrapaud-0.12.1.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
-lecrapaud-0.12.1.dist-info/METADATA,sha256=AIsGG4s0ZD_P3d2rRj-vGgDFvQzTO8ipvM_zmEbKZv8,11016
-lecrapaud-0.12.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-lecrapaud-0.12.1.dist-info/RECORD,,
+lecrapaud-0.13.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
+lecrapaud-0.13.0.dist-info/METADATA,sha256=OhgqiesFiciX8XtyC_wXTRPcWlWwCwGUuC1zVpoWIOI,11016
+lecrapaud-0.13.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+lecrapaud-0.13.0.dist-info/RECORD,,