PyPI - lecrapaud - Versions diffs - 0.16.6__tar.gz → 0.17.0__tar.gz - Mend

lecrapaud 0.16.6 (tar.gz) → 0.17.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic. Click here for more details.

Files changed (45) hide show

{lecrapaud-0.16.6 → lecrapaud-0.17.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.16.6
+Version: 0.17.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet

{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/api.py RENAMED Viewed

@@ -100,6 +100,68 @@ class LeCrapaud:
             id=Experiment.get_best_by_score(name=name, metric=metric).id, **kwargs
         )
+    def compare_experiment_scores(self, name: str):
+        """Compare scores of experiments with matching names.
+        Experiments are selected with an ``ilike`` filter on ``Experiment.name``
+        using the pattern ``%name%`` (substring match).
+        Args:
+            name (str): Name or partial name of experiments to compare
+        Returns:
+            dict: Dictionary containing experiment names as keys and their scores as values.
+                Each value is a dict with the keys "rmse", "logloss", "accuracy",
+                "f1" and "roc_auc"; classification metrics that were never found
+                stay None. When no experiment matches, or when any exception is
+                raised while querying, a dict with a single "error" key holding a
+                message is returned instead (nothing is raised to the caller).
+        """
+        from lecrapaud.db import SessionLocal
+        from sqlalchemy.orm import joinedload
+        db = SessionLocal()
+        try:
+            # Get all experiments with the given name pattern
+            # (model selections and their scores are loaded with joinedload)
+            experiments = (
+                db.query(Experiment)
+                .options(
+                    joinedload(Experiment.model_selections).joinedload(
+                        ModelSelection.scores
+                    )
+                )
+                .filter(Experiment.name.ilike(f"%{name}%"))
+                .all()
+            )
+            if not experiments:
+                return {"error": f"No experiments found with name containing '{name}'"}
+            comparison = {}
+            for exp in experiments:
+                scores = {
+                    "rmse": exp.avg_rmse,
+                    "logloss": exp.avg_logloss,
+                    "accuracy": None,
+                    "f1": None,
+                    "roc_auc": None,
+                }
+                # Get classification metrics from the first "validation" score of
+                # each model selection that has scores.
+                # NOTE(review): the `break` below only leaves the inner loop over
+                # scores, so every model selection is visited and a later one
+                # overwrites any non-None metric set by an earlier one. Confirm
+                # whether only the first model selection was intended.
+                for model_sel in exp.model_selections:
+                    if model_sel.scores:
+                        for score in model_sel.scores:
+                            if score.type == "validation":  # Use validation scores
+                                if score.accuracy is not None:
+                                    scores["accuracy"] = score.accuracy
+                                if score.f1 is not None:
+                                    scores["f1"] = score.f1
+                                if score.roc_auc is not None:
+                                    scores["roc_auc"] = score.roc_auc
+                                break
+                # NOTE(review): keyed by experiment name, so two matching
+                # experiments sharing the same name overwrite each other.
+                comparison[exp.name] = scores
+            return comparison
+        except Exception as e:
+            # Failures are reported through the returned dict, not re-raised
+            return {"error": f"Error comparing experiment scores: {str(e)}"}
+        finally:
+            # Always release the session, on success and on error
+            db.close()
     def list_experiments(
         self, name: str = None, limit: int = 1000
     ) -> list["ExperimentEngine"]:
@@ -289,6 +351,8 @@ class ExperimentEngine:
             val_size=self.val_size,
             test_size=self.test_size,
             columns_pca=self.columns_pca,
+            pca_temporal=self.pca_temporal,
+            pca_cross_sectional=self.pca_cross_sectional,
             columns_onehot=self.columns_onehot,
             columns_binary=self.columns_binary,
             columns_frequency=self.columns_frequency,

{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/experiment.py RENAMED Viewed

@@ -303,6 +303,83 @@ class Experiment(Base):
         else:
             raise ValueError("Invalid metric. Must be 'rmse', 'logloss', or 'both'.")
+    def best_score(self, target_number: int) -> dict:
+        """
+        Returns the scores for the best model of the specified target.
+        Args:
+            target_number (int): The target number to get scores for
+        Returns:
+            dict: A dictionary containing the experiment name, target number, and the best model's scores.
+                On success the keys are 'experiment_name', 'target_number',
+                'model' (with 'model_type', 'model_name',
+                'training_time_seconds') and 'scores' (only metrics whose value
+                is not None). When the target, a model selection with trainings,
+                or a validation score cannot be found, the dict instead carries
+                an 'error' message and an empty 'scores' dict; nothing is raised.
+        """
+        # Find the target (targets are looked up by the name "TARGET_<number>")
+        target_name = f"TARGET_{target_number}"
+        target = next((t for t in self.targets if t.name == target_name), None)
+        if not target:
+            return {
+                'experiment_name': self.name,
+                'target_number': target_number,
+                'error': f'Target {target_name} not found in this experiment',
+                'scores': {}
+            }
+        # Find the best model selection for this target
+        # (the first model selection whose target_id matches is used)
+        best_model_selection = next(
+            (ms for ms in self.model_selections if ms.target_id == target.id),
+            None
+        )
+        if not best_model_selection or not best_model_selection.model_trainings:
+            return {
+                'experiment_name': self.name,
+                'target_number': target_number,
+                'error': 'No model found for this target',
+                'scores': {}
+            }
+        # Get the best model training (assuming the first one is the best)
+        # NOTE(review): nothing here sorts model_trainings; confirm that the
+        # relationship is ordered so that index 0 really is the best one.
+        best_training = best_model_selection.model_trainings[0]
+        # Get the validation score for this training
+        validation_scores = [s for s in best_training.score if s.type == 'validation']
+        if not validation_scores:
+            return {
+                'experiment_name': self.name,
+                'target_number': target_number,
+                'error': 'No validation scores found for the best model',
+                'scores': {}
+            }
+        # Get all available metrics from the first validation score
+        score = validation_scores[0]
+        available_metrics = [
+            'rmse', 'mae', 'r2', 'logloss', 'accuracy',
+            'precision', 'recall', 'f1', 'roc_auc'
+        ]
+        scores = {}
+        for metric in available_metrics:
+            # getattr with a default tolerates score objects lacking a metric
+            value = getattr(score, metric, None)
+            if value is not None:
+                scores[metric] = value
+        # Get the model info ('unknown' is used when the training has no model)
+        model_info = {
+            'model_type': best_training.model.model_type if best_training.model else 'unknown',
+            'model_name': best_training.model.name if best_training.model else 'unknown',
+            'training_time_seconds': best_training.training_time
+        }
+        return {
+            'experiment_name': self.name,
+            'target_number': target_number,
+            'model': model_info,
+            'scores': scores
+        }
     def get_features(self, target_number: int):
         targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
         if targets:

{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/feature_engineering.py RENAMED Viewed

@@ -52,6 +52,9 @@ import os
 from sklearn.compose import ColumnTransformer
 from sklearn.decomposition import PCA
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
 from category_encoders import BinaryEncoder, CountEncoder
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 from sklearn.model_selection import train_test_split
@@ -316,6 +319,8 @@ class PreprocessFeature:
         val_size: float = 0.2,
         test_size: float = 0.2,
         columns_pca: list[str] = [],
+        pca_temporal: dict[str, list[str]] = {},
+        pca_cross_sectional: dict[str, list[str]] = {},
         columns_onehot: list[str] = [],
         columns_binary: list[str] = [],
         columns_ordinal: list[str] = [],
@@ -329,6 +334,8 @@ class PreprocessFeature:
         self.experiment = experiment
         self.columns_pca = [col.upper() for col in columns_pca]
+        self.pca_temporal = pca_temporal
+        self.pca_cross_sectional = pca_cross_sectional
         self.columns_onehot = [col.upper() for col in columns_onehot]
         self.columns_binary = [col.upper() for col in columns_binary]
         self.columns_ordinal = [col.upper() for col in columns_ordinal]
@@ -364,6 +371,26 @@ class PreprocessFeature:
         joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+        train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
+        val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
+        test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
+        joblib.dump(
+            pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
+        )
+        train, pcas_temporal = self.add_pca_feature_temporal(train)
+        val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
+        test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
+        joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
+        # Save all features before encoding
+        joblib.dump(
+            list(train.columns),
+            f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
+        )
         # Encoding
         train, transformer = self.encode_categorical_features(train)
         val, _ = self.encode_categorical_features(
@@ -382,7 +409,8 @@ class PreprocessFeature:
         # Save all features before selection
         joblib.dump(
-            train, f"{self.preprocessing_dir}/all_features_before_selection.pkl"
+            list(train.columns),
+            f"{self.preprocessing_dir}/all_features_before_selection.pkl",
         )
         return train, val, test
@@ -395,6 +423,18 @@ class PreprocessFeature:
             pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
             data, _ = self.add_pca_features(data, pcas=pcas)
+        if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
+            pcas_cross_sectional = joblib.load(
+                f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
+            )
+            data, _ = self.add_pca_feature_cross_sectional(
+                data, pcas=pcas_cross_sectional
+            )
+        if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
+            pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
+            data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
         # Encoding
         transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
         data, _ = self.encode_categorical_features(
@@ -570,6 +610,120 @@ class PreprocessFeature:
         return df, pcas_dict
+    def add_pca_feature_cross_sectional(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
+        impute_strategy: str = "median",
+        standardize: bool = True,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        For each entry of self.pca_cross_sectional, builds a pivot
+        (index=index_col, columns=columns_col, values=value_col), fits (or
+        reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
+        (on index_col) into df as columns named "CS_PC_{name}_{i}".
+        Each entry is read through the keys "name", "index", "columns" and
+        "value". When pcas is given, the pipeline stored under the entry's
+        name is reused and nothing is fitted.
+        Returns (df_with_features, dict mapping each entry name to its pipeline).
+        """
+        pcas_dict = {}
+        for pca_cross_sectional in self.pca_cross_sectional:
+            name, index_col, columns_col, value_col = (
+                pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
+            )
+            prefix = f"CS_PC_{name}"
+            pivot = df.pivot_table(
+                index=index_col, columns=columns_col, values=value_col
+            ).sort_index()
+            # Pipeline to reuse between train and test
+            if pcas is None:
+                steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
+                if standardize:
+                    steps.append(
+                        ("scaler", StandardScaler(with_mean=True, with_std=True))
+                    )
+                pca = PCA(n_components=n_components, random_state=0)
+                steps.append(("pca", pca))
+                pipe = Pipeline(steps)
+                pipe.fit(pivot)  # <- fit on TRAIN only
+            else:
+                # NOTE(review): presumably the pivot built from val/test data must
+                # expose the same columns as the pivot used for fitting - confirm.
+                pipe = pcas[name]  # <- TEST: reuse the existing pipe
+            scores = pipe.transform(pivot)  # shape: (n_index, n_components)
+            cols = [f"{prefix}_{i}" for i in range(n_components)]
+            scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
+            df = df.merge(scores_df.reset_index(), on=index_col, how="left")
+            pcas_dict.update({name: pipe})
+        return df, pcas_dict
+    # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
+    def add_pca_feature_temporal(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
+        impute_strategy: (
+            str | None
+        ) = None,  # None = require all columns to be present
+        standardize: bool = True,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Applies a PCA on a matrix (rows = df rows, cols = lags) for each entry
+        of self.pca_temporal (read through the keys "name" and "columns").
+        Fits the Pipeline on TRAIN if pcas=None; otherwise uses the pipeline
+        stored in pcas under the entry's name and only transforms.
+        Adds the columns f"{prefix}_{i}" (prefix = "TMP_PC_{name}") to df; the
+        passed df is modified in place and also returned.
+        Returns (df, dict mapping each entry name to its pipeline).
+        """
+        pcas_dict = {}
+        for pca_temporal in self.pca_temporal:
+            name, cols = (pca_temporal[k] for k in ("name", "columns"))
+            prefix = f"TMP_PC_{name}"
+            # Mask of usable rows
+            if impute_strategy is None:
+                mask = (
+                    df[cols].notna().all(axis=1)
+                )  # no imputation -> complete rows only
+                X_fit = df.loc[mask, cols]
+            else:
+                mask = df[cols].notna().any(axis=1)  # will impute -> at least one value
+                X_fit = df.loc[mask, cols]
+            # Pipeline
+            if pcas is None:
+                steps = []
+                if impute_strategy is not None:
+                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                if standardize:
+                    steps.append(
+                        ("scaler", StandardScaler(with_mean=True, with_std=True))
+                    )
+                pca = PCA(n_components=n_components, random_state=0)
+                steps.append(("pca", pca))
+                pipe = Pipeline(steps)
+                # The pipeline stays unfitted when no row is usable; it is still
+                # stored in the returned dict below.
+                if not X_fit.empty:
+                    pipe.fit(X_fit)  # <- fit on TRAIN only
+            else:
+                pipe = pcas[name]  # <- TEST
+            # Transform only the valid rows (mask)
+            if not df.loc[mask, cols].empty:
+                Z = pipe.transform(df.loc[mask, cols])
+                for i in range(n_components):
+                    df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
+            else:
+                # create empty columns when no row is valid (schema consistency)
+                for i in range(n_components):
+                    df[f"{prefix}_{i}"] = pd.NA
+            pcas_dict.update({name: pipe})
+        return df, pcas_dict
     # encoding categorical features
     def encode_categorical_features(
         self,

{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/model_selection.py RENAMED Viewed

@@ -1592,20 +1592,104 @@ def plot_evaluation_for_classification(prediction: dict):
 def plot_confusion_matrix(y_true, y_pred):
+    """Show a confusion-matrix heatmap and, below it, the class distribution of y_true.
+    Both plots are drawn in one figure (two rows, one column) and displayed
+    with plt.show(); nothing is returned.
+    Args:
+        y_true: True labels; must support ``.astype(int)`` and ``len()``.
+        y_pred: Predicted labels, concatenated with y_true to derive the labels.
+    """
-    unique_labels = np.unique(np.concatenate((y_true, y_pred)))
+    # Calculate confusion matrix
     cm = confusion_matrix(y_true, y_pred)
-    labels = np.sort(unique_labels)  # Sort labels based on numerical order
+    # Get unique, sorted class labels
+    labels = np.unique(np.concatenate((y_true, y_pred)))
+    labels = np.sort(labels)
+    # Calculate class distribution
+    # NOTE(review): np.bincount counts every integer from 0 to max(y_true),
+    # whereas `labels` holds the distinct values of y_true and y_pred. The bar
+    # plot below passes both, so presumably this only works when the labels
+    # are exactly 0..k-1 and all appear in y_true - confirm.
+    class_dist = np.bincount(y_true.astype(int))
+    class_dist_pct = class_dist / len(y_true) * 100
+    # Create figure with two subplots stacked vertically
+    fig = plt.figure(figsize=(10, 12))
+    # Subplot 1: Confusion Matrix
+    ax1 = plt.subplot(2, 1, 1)  # Changed to 2 rows, 1 column, first subplot
+    # Create a custom colormap (blue to white to red)
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+    # Plot heatmap with better styling
+    sns.heatmap(
+        cm,
+        annot=True,
+        fmt="d",
+        cmap=cmap,
+        center=0,
+        linewidths=0.5,
+        linecolor="lightgray",
+        cbar_kws={"label": "Number of Samples"},
+        ax=ax1,
+    )
+    # Add title and labels with better styling
+    ax1.set_title("Confusion Matrix", fontsize=14, pad=20, weight="bold")
+    ax1.set_xlabel("Predicted Label", fontsize=12, labelpad=10)
+    ax1.set_ylabel("True Label", fontsize=12, labelpad=10)
+    # Set tick labels to be centered and more readable
+    ax1.set_xticks(np.arange(len(labels)) + 0.5)
+    ax1.set_yticks(np.arange(len(labels)) + 0.5)
+    ax1.set_xticklabels(labels, fontsize=10)
+    ax1.set_yticklabels(labels, fontsize=10, rotation=0)
+    # Add grid lines for better readability
+    ax1.set_xticks(np.arange(len(labels) + 1) - 0.5, minor=True)
+    ax1.set_yticks(np.arange(len(labels) + 1) - 0.5, minor=True)
+    ax1.grid(which="minor", color="w", linestyle="-", linewidth=2)
+    ax1.tick_params(which="minor", bottom=False, left=False)
+    # Subplot 2: Class Distribution
+    ax2 = plt.subplot(2, 1, 2)  # Changed to 2 rows, 1 column, second subplot
+    # Create a bar plot for class distribution
+    bars = ax2.bar(
+        labels.astype(str),
+        class_dist_pct,
+        color=sns.color_palette("viridis", len(labels)),
+    )
-    plt.figure(figsize=(10, 7))
-    sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
-    plt.xlabel("Predicted", fontsize=12)
-    plt.ylabel("True", fontsize=12)
-    plt.title("Confusion Matrix", fontsize=14)
+    # Add percentage labels on top of bars
+    for bar in bars:
+        height = bar.get_height()
+        ax2.text(
+            bar.get_x() + bar.get_width() / 2.0,
+            height + 1,
+            f"{height:.1f}%",
+            ha="center",
+            va="bottom",
+            fontsize=10,
+        )
-    plt.xticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
-    plt.yticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
+    # Add title and labels
+    ax2.set_title("Class Distribution", fontsize=14, pad=20, weight="bold")
+    ax2.set_xlabel("Class", fontsize=12, labelpad=10)
+    ax2.set_ylabel("Percentage of Total Samples", fontsize=12, labelpad=10)
+    ax2.set_ylim(0, 100)
+    ax2.grid(axis="y", linestyle="--", alpha=0.7)
+    # Add total count annotation
+    total = len(y_true)
+    ax2.text(
+        0.5,
+        -0.15,  # Adjusted y-position for better spacing
+        f"Total samples: {total:,}",
+        transform=ax2.transAxes,
+        ha="center",
+        fontsize=10,
+        bbox=dict(
+            facecolor="white",
+            alpha=0.8,
+            edgecolor="lightgray",
+            boxstyle="round,pad=0.5",
+        ),
+    )
+    # Adjust layout to prevent overlap with more vertical space
+    plt.tight_layout(rect=[0, 0.03, 1, 0.98])
     plt.show()
@@ -1697,7 +1781,17 @@ def find_best_threshold(
                     logger.warning(
                         f"[Class {cls}] No threshold with precision ≥ {target_value}"
                     )
-                    best_idx = int(np.argmax(precision))  # fallback
+                    # fallback: meilleure precision parmi ceux avec recall>0
+                    cand = np.where(recall > 0)[0]
+                    if cand.size:
+                        best_idx = cand[int(np.argmax(precision[cand]))]
+                        logger.warning(
+                            f"[Class {cls}] Fallback to best precision with recall>0: "
+                            f"idx={best_idx}, precision={precision[best_idx]:.4f}, recall={recall[best_idx]:.4f}"
+                        )
+                    else:
+                        logger.error(f"[Class {cls}] No threshold achieves recall>0.")
+                        best_idx = None
             elif metric == "f1":
                 valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
@@ -1711,6 +1805,15 @@ def find_best_threshold(
         else:
             best_idx = int(np.argmax(values))  # no constraint, get best value
+        if best_idx is None:
+            results[cls_str] = {
+                "threshold": None,
+                "precision": None,
+                "recall": None,
+                "f1": None,
+            }
+            continue
         results[cls_str] = {
             "threshold": float(thresholds[best_idx]),
             "precision": float(precision[best_idx]),

{lecrapaud-0.16.6 → lecrapaud-0.17.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.16.6"
+version = "0.17.0"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}