validmind 2.4.13__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/tests/__types__.py +4 -0
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +15 -6
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +10 -3
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +349 -291
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -37
- validmind/tests/ongoing_monitoring/FeatureDrift.py +182 -0
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +76 -0
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +91 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +57 -0
- validmind/unit_metrics/classification/sklearn/ROC_AUC.py +22 -1
- validmind/utils.py +1 -1
- validmind/vm_models/dataset/dataset.py +2 -1
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/METADATA +1 -1
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/RECORD +20 -16
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/LICENSE +0 -0
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/WHEEL +0 -0
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py CHANGED

```diff
@@ -65,7 +65,7 @@ class PrecisionRecallCurve(Metric):
             raise SkipTestError("Skipping PrecisionRecallCurve for Foundation models")
 
         y_true = self.inputs.dataset.y
-        y_pred = self.inputs.
+        y_pred = self.inputs.dataset.y_prob(self.inputs.model)
 
         # PR curve is only supported for binary classification
         if len(np.unique(y_true)) > 2:
```
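The one-line fix above feeds probability scores (`y_prob`) rather than hard class labels into the precision-recall computation; `sklearn.metrics.precision_recall_curve` sweeps a threshold over a continuous score, so hard 0/1 predictions collapse the curve to a single operating point. A minimal standalone sketch with synthetic data (not part of the package) illustrating the difference:

```python
# Minimal sketch: precision-recall curves need scores, not hard labels.
# Synthetic data only; not part of the validmind package.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

X, y = make_classification(n_samples=500, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Probability scores yield a full curve across many thresholds ...
precision, recall, thresholds = precision_recall_curve(y, clf.predict_proba(X)[:, 1])
print(len(thresholds))

# ... while hard 0/1 predictions reduce it to a single operating point.
precision_hard, recall_hard, thresholds_hard = precision_recall_curve(y, clf.predict(X))
print(len(thresholds_hard))
```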
validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py CHANGED

```diff
@@ -12,6 +12,7 @@ import pandas as pd
 import seaborn as sns
 from sklearn import metrics
 
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.vm_models import (
     Figure,
     ResultSummary,
@@ -22,6 +23,7 @@ from validmind.vm_models import (
 )
 
 
+# TODO: make this support regression and classification as well as more performance metrics
 @dataclass
 class RobustnessDiagnosis(ThresholdTest):
     """
@@ -39,13 +41,13 @@ class RobustnessDiagnosis(ThresholdTest):
 
     This test is conducted by adding Gaussian noise, proportional to a particular standard deviation scale, to numeric
     input features of both the training and testing datasets. The model performance in the face of these perturbed
-    features is then evaluated using
-    factors. The resulting
-    predetermined threshold determines what level of
+    features is then evaluated using the ROC_AUC score. This process is iterated over a range of scale
+    factors. The resulting auc trend against the amount of noise introduced is illustrated with a line chart. A
+    predetermined threshold determines what level of auc decay due to perturbation is considered acceptable.
 
     **Signs of High Risk**:
-    - Substantial decreases in
-    - The decay in
+    - Substantial decreases in auc when noise is introduced to feature inputs.
+    - The decay in auc surpasses the configured threshold, indicating that the model is not robust against input
     noise.
     - Instances where one or more elements provided in the features list don't match with the training dataset's
     numerical feature columns.
@@ -57,15 +59,12 @@ class RobustnessDiagnosis(ThresholdTest):
     - Detailed results visualization helps in interpreting the outcome of robustness testing.
 
     **Limitations**:
+    - The default threshold for auc decay is set to 0.05, which is unlikely to be optimal for most use cases and
+    should be adjusted based on domain expertise to suit the needs of the specific model.
     - Only numerical features are perturbed, leaving out non-numerical features, which can lead to an incomplete
     analysis of robustness.
-    - The default metric used is accuracy, which might not always give the best measure of a model's success,
-    particularly for imbalanced datasets.
     - The test is contingent on the assumption that the added Gaussian noise sufficiently represents potential data
     corruption or incompleteness in real-world scenarios.
-    - There might be a requirement to fine-tune the set decay threshold for accuracy with the help of domain knowledge
-    or specific project requisites.
-    - The robustness test might not deliver the expected results for datasets with a text column.
     """
 
     name = "robustness"
@@ -73,9 +72,9 @@ class RobustnessDiagnosis(ThresholdTest):
     default_params = {
         "features_columns": None,
         "scaling_factor_std_dev_list": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
-        "
+        "auc_decay_threshold": 0.05,
     }
-    tasks = ["classification"
+    tasks = ["classification"]
     tags = [
         "sklearn",
         "binary_classification",
@@ -84,17 +83,15 @@ class RobustnessDiagnosis(ThresholdTest):
         "visualization",
     ]
 
-    default_metrics = {"accuracy": metrics.accuracy_score}
-
     def run(self):
         # Validate X std deviation parameter
         if "scaling_factor_std_dev_list" not in self.params:
             raise ValueError("scaling_factor_std_dev_list must be provided in params")
         x_std_dev_list = self.params["scaling_factor_std_dev_list"]
 
-        if self.params["
-            raise ValueError("
-
+        if self.params["auc_decay_threshold"] is None:
+            raise ValueError("auc_decay_threshold must be provided in params")
+        auc_threshold = self.params["auc_decay_threshold"]
 
         if self.inputs.model is None:
             raise ValueError("model must of provided to run this test")
@@ -131,9 +128,7 @@ class RobustnessDiagnosis(ThresholdTest):
         test_results = []
         test_figures = []
 
-        results_headers = ["Perturbation Size", "Dataset Type", "Records"]
-            self.default_metrics.keys()
-        )
+        results_headers = ["Perturbation Size", "Dataset Type", "Records", "AUC"]
         results = {k: [] for k in results_headers}
         # Iterate scaling factor for the standard deviation list
        for x_std_dev in x_std_dev_list:
@@ -159,32 +154,32 @@ class RobustnessDiagnosis(ThresholdTest):
         test_figures.append(
             Figure(
                 for_object=self,
-                key=f"{self.name}:
+                key=f"{self.name}:auc",
                 figure=fig,
                 metadata={
-                    "metric": "
+                    "metric": "AUC",
                     "features_list": features_list,
                 },
             )
         )
 
-
-
+        train_auc = df.loc[(df["Dataset Type"] == "Training"), "AUC"].values[0]
+        test_auc = df.loc[(df["Dataset Type"] == "Test"), "AUC"].values[0]
 
         df["Passed"] = np.where(
             (df["Dataset Type"] == "Training")
-            & (df["
+            & (df["AUC"] >= (train_auc - auc_threshold)),
             True,
             np.where(
                 (df["Dataset Type"] == "Test")
-                & (df["
+                & (df["AUC"] >= (test_auc - auc_threshold)),
                 True,
                 False,
             ),
         )
         test_results.append(
             ThresholdTestResult(
-                test_name="
+                test_name="AUC",
                 column=features_list,
                 passed=True,
                 values={"records": df.to_dict("records")},
@@ -194,7 +189,7 @@ class RobustnessDiagnosis(ThresholdTest):
             test_results, passed=df["Passed"].all(), figures=test_figures
         )
 
-    def summary(self, results: List[ThresholdTestResult],
+    def summary(self, results: List[ThresholdTestResult], _):
         results_table = [
             record for result in results for record in result.values["records"]
         ]
@@ -229,9 +224,13 @@ class RobustnessDiagnosis(ThresholdTest):
         results["Dataset Type"].append(dataset_type)
         results["Perturbation Size"].append(x_std_dev)
         results["Records"].append(df.shape[0])
-
-
-
+
+        try:
+            y_proba = self.inputs.model.predict_proba(df)
+        except MissingOrInvalidModelPredictFnError:
+            y_proba = self.inputs.model.predict(df)
+
+        results["AUC"].append(metrics.roc_auc_score(y_true, y_proba))
 
     def _add_noise_std_dev(
         self, values: List[float], x_std_dev: float
@@ -256,14 +255,14 @@ class RobustnessDiagnosis(ThresholdTest):
 
     def _plot_robustness(self, results: dict, features_columns: List[str]):
         """
-        Plots the model's
+        Plots the model's auc under feature perturbations.
         Args:
            results (dict): A dictionary containing the results of the evaluation.
                It has the following keys:
                - 'Dataset Type': the type of dataset evaluated, e.g. 'Training' or 'Test'.
                - 'Perturbation Size': the size of the perturbation applied to the features.
                - 'Records': the number of records evaluated.
-                -
+                - 'auc': the ROC AUC score obtained for the evaluation.
            The values of each key are lists containing the results for each evaluation.
            features_columns (list[str]): A list containing the names of the features perturbed.
         Returns:
@@ -277,7 +276,7 @@ class RobustnessDiagnosis(ThresholdTest):
         sns.lineplot(
             data=df,
             x="Perturbation Size",
-            y="
+            y="AUC",
             hue="Dataset Type",
             style="Dataset Type",
             linewidth=3,
@@ -288,7 +287,7 @@ class RobustnessDiagnosis(ThresholdTest):
             ax=ax,
         )
         ax.tick_params(axis="x")
-        ax.set_ylabel("
+        ax.set_ylabel("AUC", weight="bold", fontsize=18)
         ax.legend(fontsize=18)
         ax.set_xlabel(
             "Perturbation Size (X * Standard Deviation)", weight="bold", fontsize=18
@@ -321,9 +320,9 @@ class RobustnessDiagnosis(ThresholdTest):
         assert isinstance(test_result.values, dict)
         assert "records" in test_result.values
 
-        # For unperturbed training dataset,
+        # For unperturbed training dataset, auc should be present
         if (
             test_result.column == self.params["features_columns"]
             and 0.0 in test_result.values["records"][0]["Perturbation Size"]
         ):
-            assert "
+            assert "AUC" in test_result.values["records"][0]
```
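With this change the robustness check scores ROC AUC on noise-perturbed copies of the data and compares each score against the unperturbed baseline minus `auc_decay_threshold`. A self-contained sketch of that perturb-and-rescore loop on synthetic data (an illustration of the idea, not the package's implementation):

```python
# Minimal sketch of the perturb-and-rescore idea behind RobustnessDiagnosis.
# Synthetic data; independent of the validmind API.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
X, y = make_classification(n_samples=1000, n_features=5, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X, y)

baseline_auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
auc_decay_threshold = 0.05

for scale in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]:
    # Add Gaussian noise proportional to each feature's standard deviation.
    X_noisy = X + rng.normal(0.0, scale * X.std(axis=0), size=X.shape)
    auc = roc_auc_score(y, model.predict_proba(X_noisy)[:, 1])
    passed = auc >= baseline_auc - auc_decay_threshold
    print(f"scale={scale:.1f}  AUC={auc:.3f}  passed={passed}")
```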
validmind/tests/ongoing_monitoring/FeatureDrift.py ADDED

```diff
@@ -0,0 +1,182 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+from validmind import tags, tasks
+
+
+@tags("visualization")
+@tasks("monitoring")
+def FeatureDrift(
+    datasets, bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], feature_columns=None
+):
+    """
+    **Purpose**:
+
+    The Feature Drift test aims to evaluate how much the distribution of features has shifted over time between two
+    datasets, typically training and monitoring datasets. It uses the Population Stability Index (PSI) to quantify this
+    change, providing insights into the model's robustness and the necessity for retraining or feature engineering.
+
+    **Test Mechanism**:
+
+    This test calculates the PSI by:
+    - Bucketing the distributions of each feature in both datasets.
+    - Comparing the percentage of observations in each bucket between the two datasets.
+    - Aggregating the differences across all buckets for each feature to produce the PSI score for that feature.
+
+    The PSI score is interpreted as:
+    - PSI < 0.1: No significant population change.
+    - PSI < 0.2: Moderate population change.
+    - PSI >= 0.2: Significant population change.
+
+    **Signs of High Risk**:
+
+    - PSI >= 0.2 for any feature, indicating a significant distribution shift.
+    - Consistently high PSI scores across multiple features.
+    - Sudden spikes in PSI in recent monitoring data compared to historical data.
+
+    **Strengths**:
+
+    - Provides a quantitative measure of feature distribution changes.
+    - Easily interpretable thresholds for decision-making.
+    - Helps in early detection of data drift, prompting timely interventions.
+
+    **Limitations**:
+
+    - May not capture more intricate changes in data distribution nuances.
+    - Assumes that bucket thresholds (quantiles) adequately represent distribution shifts.
+    - PSI score interpretation can be overly simplistic for complex datasets.
+    """
+
+    # Feature columns for both datasets should be the same if not given
+    default_feature_columns = datasets[0].feature_columns
+    feature_columns = feature_columns or default_feature_columns
+
+    x_train_df = datasets[0].x_df()
+    x_test_df = datasets[1].x_df()
+
+    quantiles_train = x_train_df[feature_columns].quantile(
+        bins, method="single", interpolation="nearest"
+    )
+    PSI_QUANTILES = quantiles_train.to_dict()
+
+    PSI_BUCKET_FRAC, col, n = get_psi_buckets(
+        x_test_df, x_train_df, feature_columns, bins, PSI_QUANTILES
+    )
+
+    def nest(d: dict) -> dict:
+        result = {}
+        for key, value in d.items():
+            target = result
+            for k in key[:-1]:  # traverse all keys but the last
+                target = target.setdefault(k, {})
+            target[key[-1]] = value
+        return result
+
+    PSI_BUCKET_FRAC = nest(PSI_BUCKET_FRAC)
+
+    PSI_SCORES = {}
+    for col in feature_columns:
+        psi = 0
+        for n in bins:
+            actual = PSI_BUCKET_FRAC["test"][col][n]
+            expected = PSI_BUCKET_FRAC["train"][col][n]
+            psi_of_bucket = (actual - expected) * np.log(
+                (actual + 1e-6) / (expected + 1e-6)
+            )
+            psi += psi_of_bucket
+        PSI_SCORES[col] = psi
+
+    psi_df = pd.DataFrame(list(PSI_SCORES.items()), columns=["Features", "PSI Score"])
+
+    psi_df.sort_values(by=["PSI Score"], inplace=True, ascending=False)
+
+    psi_table = [
+        {"Features": values["Features"], "PSI Score": values["PSI Score"]}
+        for i, values in enumerate(psi_df.to_dict(orient="records"))
+    ]
+
+    save_fig = plot_hist(PSI_BUCKET_FRAC, bins)
+
+    final_psi = pd.DataFrame(psi_table)
+
+    return (final_psi, *save_fig)
+
+
+def get_psi_buckets(x_test_df, x_train_df, feature_columns, bins, PSI_QUANTILES):
+    DATA = {"test": x_test_df, "train": x_train_df}
+    PSI_BUCKET_FRAC = {}
+    for table in DATA.keys():
+        total_count = DATA[table].shape[0]
+        for col in feature_columns:
+            count_sum = 0
+            for n in bins:
+                if n == 0:
+                    bucket_count = (DATA[table][col] < PSI_QUANTILES[col][n]).sum()
+                elif n < 9:
+                    bucket_count = (
+                        total_count
+                        - count_sum
+                        - ((DATA[table][col] >= PSI_QUANTILES[col][n]).sum())
+                    )
+                elif n == 9:
+                    bucket_count = total_count - count_sum
+                count_sum += bucket_count
+                PSI_BUCKET_FRAC[table, col, n] = bucket_count / total_count
+    return PSI_BUCKET_FRAC, col, n
+
+
+def plot_hist(PSI_BUCKET_FRAC, bins):
+    bin_table_psi = pd.DataFrame(PSI_BUCKET_FRAC)
+    save_fig = []
+    for i in range(len(bin_table_psi)):
+
+        x = pd.DataFrame(
+            bin_table_psi.iloc[i]["test"].items(),
+            columns=["Bin", "Population % Reference"],
+        )
+        y = pd.DataFrame(
+            bin_table_psi.iloc[i]["train"].items(),
+            columns=["Bin", "Population % Monitoring"],
+        )
+        xy = x.merge(y, on="Bin")
+        xy.index = xy["Bin"]
+        xy = xy.drop(columns="Bin", axis=1)
+        feature_name = bin_table_psi.index[i]
+
+        n = len(bins)
+        r = np.arange(n)
+        width = 0.25
+
+        fig = plt.figure()
+
+        plt.bar(
+            r,
+            xy["Population % Reference"],
+            color="b",
+            width=width,
+            edgecolor="black",
+            label="Reference {0}".format(feature_name),
+        )
+        plt.bar(
+            r + width,
+            xy["Population % Monitoring"],
+            color="g",
+            width=width,
+            edgecolor="black",
+            label="Monitoring {0}".format(feature_name),
+        )
+
+        plt.xlabel("Bin")
+        plt.ylabel("Population %")
+        plt.title("Histogram of Population Differences {0}".format(feature_name))
+        plt.legend()
+        plt.tight_layout()
+        plt.close()
+        save_fig.append(fig)
+    return save_fig
```
validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py ADDED

```diff
@@ -0,0 +1,76 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import matplotlib.pyplot as plt
+
+from validmind import tags, tasks
+
+
+@tags("visualization")
+@tasks("monitoring")
+def PredictionAcrossEachFeature(datasets, model):
+    """
+    **Purpose:**
+    This test shows visually the prediction using reference data and monitoring data across each individual feature. If
+    there are significant differences in predictions across feature values from reference to monitoring dataset, then
+    further investigation is needed as the model is producing predictions that are different than what was observed
+    during the training of the model.
+
+    **Test Mechanism:**
+    The test creates scatter plots for each feature, comparing the reference dataset (used for training) with the
+    monitoring dataset (used in production). Each plot has two subplots: one for the reference data and one for the
+    monitoring data, visualizing the prediction probabilities. This allows for a visual comparison of the model's
+    behavior across different datasets.
+
+    **Signs of High Risk:**
+    - Significant discrepancies between the reference and monitoring subplots for the same feature
+    - Unexpected patterns or trends in monitoring data that weren't present in reference data
+
+    **Strengths:**
+    - Provides a clear visual representation of model performance across different features
+    - Allows for easy identification of features where the model's predictions have changed
+    - Facilitates quick detection of potential issues with the model when deployed in production
+
+    **Limitations:**
+    - Interpretation of scatter plots can be subjective and may require expertise
+    - Visualizations do not provide quantitative metrics for objective evaluation
+    - May not capture all types of distribution changes or issues with the model's predictions
+    """
+
+    """
+    This test shows visually the prediction using reference data and monitoring data
+    across each individual feature. If there are significant differences in predictions
+    across feature values from reference to monitoring dataset then futher investigation
+    is needed as the model is producing predictions that are different then what was
+    observed during the training of the model.
+    """
+
+    df_reference = datasets[0]._df
+    df_monitoring = datasets[1]._df
+
+    figures_to_save = []
+    for column in df_reference:
+        prediction_prob_column = f"{model.input_id}_probabilities"
+        prediction_column = f"{model.input_id}_prediction"
+        if column == prediction_prob_column or column == prediction_column:
+            pass
+        else:
+            fig, axs = plt.subplots(1, 2, figsize=(20, 10), sharey="row")
+
+            ax1, ax2 = axs
+
+            ax1.scatter(df_reference[column], df_reference[prediction_prob_column])
+            ax2.scatter(df_monitoring[column], df_monitoring[prediction_prob_column])
+
+            ax1.set_title("Reference")
+            ax1.set_xlabel(column)
+            ax1.set_ylabel("Prediction Value")
+
+            ax2.set_title("Monitoring")
+            ax2.set_xlabel(column)
+            figures_to_save.append(fig)
+            plt.close()
+
+    return tuple(figures_to_save)
```
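A rough standalone illustration of the side-by-side scatter comparison this test produces, using synthetic data with an induced shift (column names and relationships are made up for the example):

```python
# Minimal sketch of the two-panel reference-vs-monitoring scatter comparison.
# Synthetic, drifted data; not the validmind implementation.
import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(0)
feature_ref = rng.normal(0, 1, 500)
prob_ref = 1 / (1 + np.exp(-feature_ref))            # reference: monotone relationship
feature_mon = rng.normal(1.5, 1, 500)                # monitoring: shifted feature values
prob_mon = 1 / (1 + np.exp(-(feature_mon - 2.0)))    # and a changed response

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharey="row")
ax1.scatter(feature_ref, prob_ref)
ax1.set(title="Reference", xlabel="feature", ylabel="Prediction Value")
ax2.scatter(feature_mon, prob_mon)
ax2.set(title="Monitoring", xlabel="feature")
plt.show()
```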
validmind/tests/ongoing_monitoring/PredictionCorrelation.py ADDED

```diff
@@ -0,0 +1,91 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from validmind import tags, tasks
+
+
+@tags("visualization")
+@tasks("monitoring")
+def PredictionCorrelation(datasets, model):
+    """
+    **Purpose:**
+    The test is used to assess the correlation pairs for each feature between model predictions from reference and
+    monitoring datasets. The primary goal is to detect significant changes in these pairs, which may signal target
+    drift, leading to lower model performance.
+
+    **Test Mechanism:**
+    The test calculates the correlation of each feature with model predictions for both reference and monitoring
+    datasets. The test then compares these correlations side-by-side via a bar plot and a correlation table. Features
+    with significant changes in correlation pairs highlight potential risks of model drift.
+
+    **Signs of High Risk:**
+    - Significant changes in correlation pairs between the reference and monitoring predictions.
+    - Notable correlation differences indicating a potential shift in the relationship between features and the target
+    variable.
+
+    **Strengths:**
+    - Allows for visual identification of drift in feature relationships with model predictions.
+    - Comparison via a clear bar plot assists in understanding model stability over time.
+    - Helps in early detection of target drift, enabling timely interventions.
+
+    **Limitations:**
+    - May require substantial reference and monitoring data for accurate comparison.
+    - Correlation does not imply causation, and other factors might influence changes.
+    - The method solely focuses on linear relationships, potentially missing non-linear interactions.
+    """
+
+    prediction_prob_column = f"{model.input_id}_probabilities"
+    prediction_column = f"{model.input_id}_prediction"
+
+    df_corr = datasets[0]._df.corr()
+    df_corr = df_corr[[prediction_prob_column]]
+
+    df_corr2 = datasets[1]._df.corr()
+    df_corr2 = df_corr2[[prediction_prob_column]]
+
+    corr_final = df_corr.merge(df_corr2, left_index=True, right_index=True)
+    corr_final.columns = ["Reference Predictions", "Monitoring Predictions"]
+    corr_final = corr_final.drop(index=[prediction_column, prediction_prob_column])
+
+    n = len(corr_final)
+    r = np.arange(n)
+    width = 0.25
+
+    fig = plt.figure()
+
+    plt.bar(
+        r,
+        corr_final["Reference Predictions"],
+        color="b",
+        width=width,
+        edgecolor="black",
+        label="Reference Prediction Correlation",
+    )
+    plt.bar(
+        r + width,
+        corr_final["Monitoring Predictions"],
+        color="g",
+        width=width,
+        edgecolor="black",
+        label="Monitoring Prediction Correlation",
+    )
+
+    plt.xlabel("Features")
+    plt.ylabel("Correlation")
+    plt.title("Correlation between Predictions and Features")
+
+    features = corr_final.index.to_list()
+    plt.xticks(r + width / 2, features, rotation=45)
+    plt.legend()
+    plt.tight_layout()
+
+    corr_final["Features"] = corr_final.index
+    corr_final = corr_final[
+        ["Features", "Reference Predictions", "Monitoring Predictions"]
+    ]
+    return ({"Correlation Pair Table": corr_final}, fig)
```
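The mechanism boils down to two `DataFrame.corr()` calls restricted to the prediction-probability column, merged by feature. A tiny standalone pandas sketch with synthetic data and made-up column names:

```python
# Compare feature-vs-prediction correlations across two periods.
# Synthetic data, hypothetical column names; not the validmind implementation.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

def make_df(slope):
    x1 = rng.normal(size=1000)
    x2 = rng.normal(size=1000)
    prob = 1 / (1 + np.exp(-(slope * x1 + 0.2 * x2)))
    return pd.DataFrame({"x1": x1, "x2": x2, "model_probabilities": prob})

ref = make_df(slope=1.0)   # reference period
mon = make_df(slope=0.1)   # monitoring period: x1's influence has weakened

corr_ref = ref.corr()[["model_probabilities"]].drop(index="model_probabilities")
corr_mon = mon.corr()[["model_probabilities"]].drop(index="model_probabilities")

table = corr_ref.merge(corr_mon, left_index=True, right_index=True)
table.columns = ["Reference Predictions", "Monitoring Predictions"]
print(table)  # a large gap for x1 flags potential drift
```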
validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py ADDED

```diff
@@ -0,0 +1,57 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from validmind import tags, tasks
+
+
+@tags("visualization")
+@tasks("monitoring")
+def TargetPredictionDistributionPlot(datasets, model):
+    """
+    **Purpose:**
+    This test provides the prediction distributions from the reference dataset and the new monitoring dataset. If there
+    are significant differences in the distributions, it might indicate different underlying data characteristics that
+    warrant further investigation into the root causes.
+
+    **Test Mechanism:**
+    The methodology involves generating Kernel Density Estimation (KDE) plots for the prediction probabilities from
+    both the reference and monitoring datasets. By comparing these KDE plots, one can visually assess any significant
+    differences in the prediction distributions between the two datasets.
+
+    **Signs of High Risk:**
+    - Significant divergence between the distribution curves of the reference and monitoring predictions
+    - Unusual shifts or bimodal distribution in the monitoring predictions compared to the reference predictions
+
+    **Strengths:**
+    - Visual representation makes it easy to spot differences in prediction distributions
+    - Useful for identifying potential data drift or changes in underlying data characteristics
+    - Simple and efficient to implement using standard plotting libraries
+
+    **Limitations:**
+    - Subjective interpretation of the visual plots
+    - Might not pinpoint the exact cause of distribution changes
+    - Less effective if the differences in distributions are subtle and not easily visible
+    """
+
+    pred_ref = datasets[0].y_prob_df(model)
+    pred_ref.columns = ["Reference Prediction"]
+    pred_monitor = datasets[1].y_prob_df(model)
+    pred_monitor.columns = ["Monitoring Prediction"]
+
+    fig = plt.figure()
+    plot = sns.kdeplot(
+        pred_ref["Reference Prediction"], shade=True, label="Reference Prediction"
+    )
+    plot = sns.kdeplot(
+        pred_monitor["Monitoring Prediction"], shade=True, label="Monitor Prediction"
+    )
+    plot.set(
+        xlabel="Prediction", title="Distribution of Reference & Monitor Predictions"
+    )
+    plot.legend()
+
+    return fig
```
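All four new `ongoing_monitoring` tests expect two datasets (reference first, monitoring second) plus a model. A hypothetical invocation through the ValidMind test runner is sketched below; the `run_test` call, the test ID string, and the input keys follow the framework's usual pattern but are assumptions to verify against the documentation for your version, and `vm_reference_ds`, `vm_monitoring_ds`, and `vm_model` are assumed to have been initialized beforehand with predictions assigned:

```python
# Hypothetical usage sketch for the new ongoing-monitoring tests.
# API details assumed; check the validmind docs for your version.
import validmind as vm

result = vm.tests.run_test(
    "validmind.ongoing_monitoring.TargetPredictionDistributionPlot",
    inputs={
        "datasets": [vm_reference_ds, vm_monitoring_ds],  # reference first, monitoring second
        "model": vm_model,
    },
)
result.log()  # assumed: pushes the result to the ValidMind platform
```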
validmind/unit_metrics/classification/sklearn/ROC_AUC.py CHANGED

```diff
@@ -2,7 +2,9 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+from numpy import unique
 from sklearn.metrics import roc_auc_score
+from sklearn.preprocessing import LabelBinarizer
 
 from validmind import tags, tasks
 
@@ -10,4 +12,23 @@ from validmind import tags, tasks
 @tags("classification", "sklearn", "unit_metric")
 @tasks("classification")
 def ROC_AUC(model, dataset, **kwargs):
-
+
+    y_true = dataset.y
+
+    if len(unique(y_true)) > 2:
+        y_pred = dataset.y_pred(model)
+        y_true = y_true.astype(y_pred.dtype)
+        roc_auc = _multiclass_roc_auc_score(y_true, y_pred, **kwargs)
+    else:
+        y_prob = dataset.y_prob(model)
+        y_true = y_true.astype(y_prob.dtype).flatten()
+        roc_auc = roc_auc_score(y_true, y_prob, **kwargs)
+
+    return roc_auc
+
+
+def _multiclass_roc_auc_score(y_test, y_pred, average="macro"):
+    lb = LabelBinarizer()
+    lb.fit(y_test)
+
+    return roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average=average)
```
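The `_multiclass_roc_auc_score` helper added here derives a one-vs-rest macro AUC from hard class labels by binarizing both the true labels and the predictions. A self-contained sklearn example of the same idea on synthetic labels:

```python
# Standalone illustration of label-binarized multiclass ROC AUC (one-vs-rest, macro).
# Synthetic labels; mirrors the approach of the _multiclass_roc_auc_score helper.
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

rng = np.random.default_rng(0)
y_test = rng.integers(0, 3, size=200)                # three classes: 0, 1, 2
y_pred = np.where(rng.random(200) < 0.8, y_test,     # mostly correct hard predictions
                  rng.integers(0, 3, size=200))

lb = LabelBinarizer()
lb.fit(y_test)
auc = roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average="macro")
print(f"macro one-vs-rest AUC from hard labels: {auc:.3f}")
```

Because hard labels carry only a single operating point per class, this is a coarser estimate than an AUC computed from per-class probabilities, which is why the binary branch of `ROC_AUC` keeps using `y_prob`.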
validmind/utils.py CHANGED

```diff
@@ -364,7 +364,7 @@ def get_model_info(model):
     if language is None:
         language = f"Python {python_version()}"
 
-    if framework_version is None:
+    if framework_version == "N/A" or framework_version is None:
         try:
             framework_version = sys.modules[framework].__version__
         except (KeyError, AttributeError):
```
validmind/vm_models/dataset/dataset.py CHANGED

```diff
@@ -393,7 +393,8 @@ class VMDataset(VMInput):
         assert self.target_column not in columns
         columns.append(self.target_column)
 
-        return
+        # return a copy to prevent accidental modification
+        return as_df(self._df[columns]).copy()
 
     @property
     def x(self) -> np.ndarray:
```
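Returning `.copy()` here means callers can mutate the frame they get back without silently altering the dataset's internal `_df`. A generic illustration of the hazard a defensive copy guards against (not the validmind classes themselves):

```python
# Why returning a copy matters: handing out the internal frame lets callers mutate it.
import pandas as pd

class Holder:
    def __init__(self):
        self._df = pd.DataFrame({"a": [1, 2, 3], "target": [0, 1, 0]})

    def df_unsafe(self):
        return self._df          # caller receives the internal object itself

    def df_safe(self):
        return self._df.copy()   # caller receives an independent copy

h = Holder()
h.df_unsafe().loc[0, "a"] = 999
print(h._df.loc[0, "a"])         # 999: internal state was modified

h2 = Holder()
h2.df_safe().loc[0, "a"] = 999
print(h2._df.loc[0, "a"])        # 1: internal state preserved
```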
|