PyPI - lecrapaud - Versions diffs - 0.16.6__tar.gz → 0.16.7__tar.gz - Mend

lecrapaud 0.16.6.tar.gz → 0.16.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic. Click here for more details.

Files changed (45) hide show

{lecrapaud-0.16.6 → lecrapaud-0.16.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.16.6
+Version: 0.16.7
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet

{lecrapaud-0.16.6 → lecrapaud-0.16.7}/lecrapaud/api.py RENAMED Viewed

@@ -100,6 +100,65 @@ class LeCrapaud:
             id=Experiment.get_best_by_score(name=name, metric=metric).id, **kwargs
         )
+    def compare_experiment_scores(self, name: str):
+        """Compare scores of experiments with matching names.
+        Args:
+            name (str): Name or partial name of experiments to compare
+        Returns:
+            dict: Dictionary containing experiment names as keys and their scores as values
+        """
+        from lecrapaud.db import SessionLocal
+        from sqlalchemy.orm import joinedload
+        db = SessionLocal()
+        try:
+            # Get all experiments with the given name pattern
+            experiments = (
+                db.query(Experiment)
+                .options(joinedload(Experiment.model_selections)
+                        .joinedload(ModelSelection.scores))
+                .filter(Experiment.name.ilike(f"%{name}%"))
+                .all()
+            )
+            if not experiments:
+                return {"error": f"No experiments found with name containing '{name}'"}
+            comparison = {}
+            for exp in experiments:
+                scores = {
+                    "rmse": exp.avg_rmse,
+                    "logloss": exp.avg_logloss,
+                    "accuracy": None,
+                    "f1": None,
+                    "roc_auc": None
+                }
+                # Get classification metrics from the first model selection with scores
+                for model_sel in exp.model_selections:
+                    if model_sel.scores:
+                        for score in model_sel.scores:
+                            if score.type == 'validation':  # Use validation scores
+                                if score.accuracy is not None:
+                                    scores["accuracy"] = score.accuracy
+                                if score.f1 is not None:
+                                    scores["f1"] = score.f1
+                                if score.roc_auc is not None:
+                                    scores["roc_auc"] = score.roc_auc
+                                break
+                comparison[exp.name] = scores
+            return comparison
+        except Exception as e:
+            return {"error": f"Error comparing experiment scores: {str(e)}"}
+        finally:
+            db.close()
     def list_experiments(
         self, name: str = None, limit: int = 1000
     ) -> list["ExperimentEngine"]:

{lecrapaud-0.16.6 → lecrapaud-0.16.7}/lecrapaud/db/models/experiment.py RENAMED Viewed

@@ -303,6 +303,83 @@ class Experiment(Base):
         else:
             raise ValueError("Invalid metric. Must be 'rmse', 'logloss', or 'both'.")
+    def best_score(self, target_number: int) -> dict:
+        """
+        Returns the scores for the best model of the specified target.
+        Args:
+            target_number (int): The target number to get scores for
+        Returns:
+            dict: A dictionary containing the experiment name, target number, and the best model's scores
+        """
+        # Find the target
+        target_name = f"TARGET_{target_number}"
+        target = next((t for t in self.targets if t.name == target_name), None)
+        if not target:
+            return {
+                'experiment_name': self.name,
+                'target_number': target_number,
+                'error': f'Target {target_name} not found in this experiment',
+                'scores': {}
+            }
+        # Find the best model selection for this target
+        best_model_selection = next(
+            (ms for ms in self.model_selections if ms.target_id == target.id),
+            None
+        )
+        if not best_model_selection or not best_model_selection.model_trainings:
+            return {
+                'experiment_name': self.name,
+                'target_number': target_number,
+                'error': 'No model found for this target',
+                'scores': {}
+            }
+        # Get the best model training (assuming the first one is the best)
+        best_training = best_model_selection.model_trainings[0]
+        # Get the validation score for this training
+        validation_scores = [s for s in best_training.score if s.type == 'validation']
+        if not validation_scores:
+            return {
+                'experiment_name': self.name,
+                'target_number': target_number,
+                'error': 'No validation scores found for the best model',
+                'scores': {}
+            }
+        # Get all available metrics from the first validation score
+        score = validation_scores[0]
+        available_metrics = [
+            'rmse', 'mae', 'r2', 'logloss', 'accuracy',
+            'precision', 'recall', 'f1', 'roc_auc'
+        ]
+        scores = {}
+        for metric in available_metrics:
+            value = getattr(score, metric, None)
+            if value is not None:
+                scores[metric] = value
+        # Get the model info
+        model_info = {
+            'model_type': best_training.model.model_type if best_training.model else 'unknown',
+            'model_name': best_training.model.name if best_training.model else 'unknown',
+            'training_time_seconds': best_training.training_time
+        }
+        return {
+            'experiment_name': self.name,
+            'target_number': target_number,
+            'model': model_info,
+            'scores': scores
+        }
     def get_features(self, target_number: int):
         targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
         if targets:

{lecrapaud-0.16.6 → lecrapaud-0.16.7}/lecrapaud/feature_engineering.py RENAMED Viewed

@@ -364,6 +364,12 @@ class PreprocessFeature:
         joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+        # Save all features before encoding
+        joblib.dump(
+            list(train.columns),
+            f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
+        )
         # Encoding
         train, transformer = self.encode_categorical_features(train)
         val, _ = self.encode_categorical_features(
@@ -382,7 +388,8 @@ class PreprocessFeature:
         # Save all features before selection
         joblib.dump(
-            train, f"{self.preprocessing_dir}/all_features_before_selection.pkl"
+            list(train.columns),
+            f"{self.preprocessing_dir}/all_features_before_selection.pkl",
         )
         return train, val, test

{lecrapaud-0.16.6 → lecrapaud-0.16.7}/lecrapaud/model_selection.py RENAMED Viewed

@@ -1592,20 +1592,104 @@ def plot_evaluation_for_classification(prediction: dict):
 def plot_confusion_matrix(y_true, y_pred):
-    unique_labels = np.unique(np.concatenate((y_true, y_pred)))
+    # Calculate confusion matrix
     cm = confusion_matrix(y_true, y_pred)
-    labels = np.sort(unique_labels)  # Sort labels based on numerical order
+    # Get unique, sorted class labels
+    labels = np.unique(np.concatenate((y_true, y_pred)))
+    labels = np.sort(labels)
+    # Calculate class distribution
+    class_dist = np.bincount(y_true.astype(int))
+    class_dist_pct = class_dist / len(y_true) * 100
+    # Create figure with two subplots stacked vertically
+    fig = plt.figure(figsize=(10, 12))
+    # Subplot 1: Confusion Matrix
+    ax1 = plt.subplot(2, 1, 1)  # Changed to 2 rows, 1 column, first subplot
+    # Create a custom colormap (blue to white to red)
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+    # Plot heatmap with better styling
+    sns.heatmap(
+        cm,
+        annot=True,
+        fmt="d",
+        cmap=cmap,
+        center=0,
+        linewidths=0.5,
+        linecolor="lightgray",
+        cbar_kws={"label": "Number of Samples"},
+        ax=ax1,
+    )
-    plt.figure(figsize=(10, 7))
-    sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
-    plt.xlabel("Predicted", fontsize=12)
-    plt.ylabel("True", fontsize=12)
-    plt.title("Confusion Matrix", fontsize=14)
+    # Add title and labels with better styling
+    ax1.set_title("Confusion Matrix", fontsize=14, pad=20, weight="bold")
+    ax1.set_xlabel("Predicted Label", fontsize=12, labelpad=10)
+    ax1.set_ylabel("True Label", fontsize=12, labelpad=10)
+    # Set tick labels to be centered and more readable
+    ax1.set_xticks(np.arange(len(labels)) + 0.5)
+    ax1.set_yticks(np.arange(len(labels)) + 0.5)
+    ax1.set_xticklabels(labels, fontsize=10)
+    ax1.set_yticklabels(labels, fontsize=10, rotation=0)
+    # Add grid lines for better readability
+    ax1.set_xticks(np.arange(len(labels) + 1) - 0.5, minor=True)
+    ax1.set_yticks(np.arange(len(labels) + 1) - 0.5, minor=True)
+    ax1.grid(which="minor", color="w", linestyle="-", linewidth=2)
+    ax1.tick_params(which="minor", bottom=False, left=False)
+    # Subplot 2: Class Distribution
+    ax2 = plt.subplot(2, 1, 2)  # Changed to 2 rows, 1 column, second subplot
+    # Create a bar plot for class distribution
+    bars = ax2.bar(
+        labels.astype(str),
+        class_dist_pct,
+        color=sns.color_palette("viridis", len(labels)),
+    )
-    plt.xticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
-    plt.yticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
+    # Add percentage labels on top of bars
+    for bar in bars:
+        height = bar.get_height()
+        ax2.text(
+            bar.get_x() + bar.get_width() / 2.0,
+            height + 1,
+            f"{height:.1f}%",
+            ha="center",
+            va="bottom",
+            fontsize=10,
+        )
+    # Add title and labels
+    ax2.set_title("Class Distribution", fontsize=14, pad=20, weight="bold")
+    ax2.set_xlabel("Class", fontsize=12, labelpad=10)
+    ax2.set_ylabel("Percentage of Total Samples", fontsize=12, labelpad=10)
+    ax2.set_ylim(0, 100)
+    ax2.grid(axis="y", linestyle="--", alpha=0.7)
+    # Add total count annotation
+    total = len(y_true)
+    ax2.text(
+        0.5,
+        -0.15,  # Adjusted y-position for better spacing
+        f"Total samples: {total:,}",
+        transform=ax2.transAxes,
+        ha="center",
+        fontsize=10,
+        bbox=dict(
+            facecolor="white",
+            alpha=0.8,
+            edgecolor="lightgray",
+            boxstyle="round,pad=0.5",
+        ),
+    )
+    # Adjust layout to prevent overlap with more vertical space
+    plt.tight_layout(rect=[0, 0.03, 1, 0.98])
     plt.show()

{lecrapaud-0.16.6 → lecrapaud-0.16.7}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.16.6"
+version = "0.16.7"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}