validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
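Note that several tests are renamed in this release (`DFGLSArch` → `DickeyFullerGLS`, `AspectCritique` → `AspectCritic`, `AnswerRelevance` → `ResponseRelevancy`, `AnswerSimilarity` → `SemanticSimilarity`) and others such as `AutoSeasonality` and `ContextUtilization` are removed, so code or test-suite configs that reference tests by ID will need updating. A minimal sketch of the ID changes implied by the file list, assuming the usual dotted test-ID convention carries over to 2.6.x:

```python
# Mapping of 2.5.x test IDs to their 2.6.x replacements, inferred from the
# renamed files above (IDs shown are assumptions based on the module paths).
RENAMED_TEST_IDS = {
    "validmind.data_validation.DFGLSArch": "validmind.data_validation.DickeyFullerGLS",
    "validmind.model_validation.ragas.AspectCritique": "validmind.model_validation.ragas.AspectCritic",
    "validmind.model_validation.ragas.AnswerRelevance": "validmind.model_validation.ragas.ResponseRelevancy",
    "validmind.model_validation.ragas.AnswerSimilarity": "validmind.model_validation.ragas.SemanticSimilarity",
}

def migrate_test_id(test_id: str) -> str:
    """Return the 2.6.x test ID for a possibly-renamed 2.5.x test ID."""
    return RENAMED_TEST_IDS.get(test_id, test_id)
```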
validmind/tests/model_validation/sklearn/ROCCurve.py

@@ -2,19 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import numpy as np
 import plotly.graph_objects as go
 from sklearn.metrics import roc_auc_score, roc_curve

+from validmind import tags, tasks
 from validmind.errors import SkipTestError
-from validmind.
-
-
-
-
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_performance",
+    "visualization",
+)
+@tasks("classification", "text_classification")
+def ROCCurve(model: VMModel, dataset: VMDataset):
     """
     Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic
     (ROC) curve and calculating the Area Under Curve (AUC) score.

@@ -61,78 +66,39 @@ class ROCCurve(Metric):
     incorrect, provided that the model's ranking format is retained. This phenomenon is commonly termed the "Class
     Imbalance Problem".
     """
-
-
-
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-        "visualization",
-    ]
-
-    def run(self):
-        if isinstance(self.inputs.model, FoundationModel):
-            raise SkipTestError("Skipping ROCCurve for Foundation models")
-
-        y_true = self.inputs.dataset.y
-        y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-
-        # ROC curve is only supported for binary classification
-        if len(np.unique(y_true)) > 2:
-            raise SkipTestError(
-                "ROC Curve is only supported for binary classification models"
-            )
-
-        y_true = y_true.astype(y_prob.dtype).flatten()
-        assert np.all((y_prob >= 0) & (y_prob <= 1)), "Invalid probabilities in y_prob."
-
-        fpr, tpr, roc_thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
-
-        # Remove Inf values from roc_thresholds
-        valid_thresholds_mask = np.isfinite(roc_thresholds)
-        roc_thresholds = roc_thresholds[valid_thresholds_mask]
-        auc = roc_auc_score(y_true, y_prob)
-
-        trace0 = go.Scatter(
-            x=fpr,
-            y=tpr,
-            mode="lines",
-            name=f"ROC curve (AUC = {auc:.2f})",
-            line=dict(color="#DE257E"),
-        )
-        trace1 = go.Scatter(
-            x=[0, 1],
-            y=[0, 1],
-            mode="lines",
-            name="Random (AUC = 0.5)",
-            line=dict(color="grey", dash="dash"),
+    if len(np.unique(dataset.y)) > 2:
+        raise SkipTestError(
+            "ROC Curve is only supported for binary classification models"
         )

-
-
+    y_prob = dataset.y_prob(model)
+    y_true = dataset.y.astype(y_prob.dtype).flatten()
+
+    fpr, tpr, _ = roc_curve(y_true, y_prob, drop_intermediate=False)
+    auc = roc_auc_score(y_true, y_prob)
+
+    return go.Figure(
+        data=[
+            go.Scatter(
+                x=fpr,
+                y=tpr,
+                mode="lines",
+                name=f"ROC curve (AUC = {auc:.2f})",
+                line=dict(color="#DE257E"),
+            ),
+            go.Scatter(
+                x=[0, 1],
+                y=[0, 1],
+                mode="lines",
+                name="Random (AUC = 0.5)",
+                line=dict(color="grey", dash="dash"),
+            ),
+        ],
+        layout=go.Layout(
+            title=f"ROC Curve for {model.input_id} on {dataset.input_id}",
             xaxis=dict(title="False Positive Rate"),
             yaxis=dict(title="True Positive Rate"),
             width=700,
             height=500,
-        )
-
-        fig = go.Figure(data=[trace0, trace1], layout=layout)
-
-        return self.cache_results(
-            metric_value={
-                "auc": auc,
-                "fpr": fpr,
-                "tpr": tpr,
-                "thresholds": roc_thresholds,
-            },
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="roc_auc_curve",
-                    figure=fig,
-                )
-            ],
-        )
+        ),
+    )
validmind/tests/model_validation/sklearn/RegressionPerformance.py

@@ -2,52 +2,43 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import re
-from dataclasses import dataclass
-
 import numpy as np
 from sklearn.metrics import mean_absolute_error, mean_squared_error

-from validmind
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)


-@
-
+@tags("sklearn", "model_performance")
+@tasks("regression")
+def RegressionPerformance(model: VMModel, dataset: VMDataset):
     """
-
-    MAPE, and MBD.
+    Evaluates the performance of a regression model using five different metrics: MAE, MSE, RMSE, MAPE, and MBD.

     ### Purpose

-    The Regression Models Performance Comparison metric is used to measure
-
+    The Regression Models Performance Comparison metric is used to measure the performance of regression models. It
+    calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.

     ### Test Mechanism

-    The test
-
-    providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
-    performance of all models using these metrics. The results are then appended to a table for presenting a
-    comparative summary.
+    The test uses the sklearn library to calculate the MAE, MSE, RMSE, MAPE, and MBD. These calculations encapsulate both
+    the direction and the magnitude of error in predictions, thereby providing a multi-faceted view of model accuracy.

     ### Signs of High Risk

     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
     model's predictions from the true values.
     - A large value of MBD, which shows a consistent bias in the model’s predictions.
-    - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
-    evaluation process itself.

     ### Strengths

     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
-    - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.

     ### Limitations

@@ -55,82 +46,38 @@ class RegressionPerformance(Metric):
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
     handle pre-processing, feature selection, or other stages in the model lifecycle.
-    - It may fail to run if it doesn't receive valid models as inputs. The models are passed externally and the test
-    doesn't have an internal mechanism to verify their validity.
-    - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        results["Mean Squared Error (MSE)"] = mse_test
-        results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)
-
-        if np.any(y_true_test == 0):
-            logger.warning(
-                "y_true_test contains zero values. Skipping MAPE calculation to avoid division by zero."
-            )
-            results["Mean Absolute Percentage Error (MAPE)"] = None
-        else:
-            mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
-            results["Mean Absolute Percentage Error (MAPE)"] = mape_test
-
-        mbd_test = np.mean(y_pred_test - y_true_test)
-        results["Mean Bias Deviation (MBD)"] = mbd_test
-
-        return results
-
-    def summary(self, metric_value: dict):
-        """
-        This summary varies depending if we're evaluating a binary or multi-class model
-        """
-        results = []
-        metrics = metric_value[self.inputs.model.input_id].keys()
-        error_table = []
-        for metric_name in metrics:
-            errors_dict = {}
-            errors_dict["Errors"] = metric_name
-            for m, _ in metric_value.items():
-                for metric in metrics:
-                    res = re.findall(r"\(.*?\)", metric)
-                    res[0][1:-1]
-                    errors_dict[f"{res[0][1:-1]}-{m}"] = metric_value[m][metric]
-            error_table.append(errors_dict)
-
-        results.append(
-            ResultTable(
-                data=error_table,
-                metadata=ResultTableMetadata(title="Regression Errors Comparison"),
-            )
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # MAE calculation
+    metrics = {
+        "Mean Absolute Error (MAE)": mean_absolute_error(y_true, y_pred),
+    }
+
+    # MSE and RMSE calculations
+    mse = mean_squared_error(y_true, y_pred)
+    metrics["Mean Squared Error (MSE)"] = mse
+    metrics["Root Mean Squared Error (RMSE)"] = np.sqrt(mse)
+
+    # MAPE calculation
+    if np.any(y_true == 0):
+        logger.warning(
+            "y_true contains zero values. Skipping MAPE calculation to avoid division by zero."
        )
-
-
-
-
-        # Check models list is not empty
-        if not self.inputs.model:
-            raise SkipTestError(
-                "Model must be provided as a `models` parameter to compare performance"
-            )
-        results = {}
-
-        result = self.regression_errors(
-            y_true_test=self.inputs.dataset.y,
-            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        metrics["Mean Absolute Percentage Error (MAPE)"] = None
+    else:
+        metrics["Mean Absolute Percentage Error (MAPE)"] = (
+            np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        )
-        results[self.inputs.model.input_id] = result

-
+    # MBD calculation
+    metrics["Mean Bias Deviation (MBD)"] = np.mean(y_pred - y_true)
+
+    return [
+        {
+            "Metric": metric,
+            "Value": value,
+        }
+        for metric, value in metrics.items()
+    ]
validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py

@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

 from collections import defaultdict
-from dataclasses import dataclass
 from operator import add
 from typing import List, Tuple

@@ -15,16 +14,8 @@ from sklearn import metrics

 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.
-
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-    VMDataset,
-    VMModel,
-)
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)

@@ -222,32 +213,59 @@ def _plot_robustness(
     return fig


-
-
-def
-    model: VMModel,
+@tags("sklearn", "model_diagnosis", "visualization")
+@tasks("classification", "regression")
+def RobustnessDiagnosis(
     datasets: List[VMDataset],
+    model: VMModel,
     metric: str = None,
     scaling_factor_std_dev_list: List[float] = DEFAULT_STD_DEV_LIST,
     performance_decay_threshold: float = DEFAULT_DECAY_THRESHOLD,
 ):
+    """
+    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+    ### Purpose
+
+    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+    real-world scenarios where data may be imperfect or corrupted.
+
+    ### Test Mechanism
+
+    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+    deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+    - Adding Gaussian noise to numerical input features based on scaling factors.
+    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+    for regression tasks.
+    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+    ### Signs of High Risk
+
+    - A significant drop in performance metrics with minimal noise.
+    - Performance decay values exceeding the specified threshold.
+    - Consistent failure to meet performance standards across multiple perturbation scales.
+
+    ### Strengths
+
+    - Provides insights into the model's robustness against noisy or corrupted data.
+    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+    - Visualization helps in understanding the extent of performance degradation.
+
+    ### Limitations
+
+    - Gaussian noise might not adequately represent all types of real-world data perturbations.
+    - Performance thresholds are somewhat arbitrary and might need tuning.
+    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
+    """
+    # TODO: use single dataset
     if not metric:
         metric = (
             DEFAULT_CLASSIFICATION_METRIC
             if datasets[0].probability_column(model)
             else DEFAULT_REGRESSION_METRIC
         )
-        logger.info(f"Using default metric ({metric.upper()}) for robustness diagnosis")
-
-    if id(scaling_factor_std_dev_list) == id(DEFAULT_STD_DEV_LIST):
-        logger.info(
-            f"Using default scaling factors for the standard deviation of the noise: {DEFAULT_STD_DEV_LIST}"
-        )
-
-    if id(performance_decay_threshold) == id(DEFAULT_DECAY_THRESHOLD):
-        logger.info(
-            f"Using default performance decay threshold of {DEFAULT_DECAY_THRESHOLD}"
-        )

     results = [{} for _ in range(len(datasets))]

@@ -304,116 +322,9 @@ def robustness_diagnosis(
         columns=datasets[0].feature_columns_numeric,
         model=model.input_id,
     )
-
     # rename perturbation size for baseline
-    results_df[
-        results_df["Perturbation Size"] == 0.0
+    results_df.loc[
+        results_df["Perturbation Size"] == 0.0, "Perturbation Size"
     ] = "Baseline (0.0)"

-    return results_df, fig
-
-
-@dataclass
-class RobustnessDiagnosis(ThresholdTest):
-    """
-    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
-
-    ### Purpose
-
-    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
-    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
-    real-world scenarios where data may be imperfect or corrupted.
-
-    ### Test Mechanism
-
-    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
-    deviation. The performance of the model is then measured using a specified metric. The process includes:
-
-    - Adding Gaussian noise to numerical input features based on scaling factors.
-    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
-    for regression tasks.
-    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
-
-    ### Signs of High Risk
-
-    - A significant drop in performance metrics with minimal noise.
-    - Performance decay values exceeding the specified threshold.
-    - Consistent failure to meet performance standards across multiple perturbation scales.
-
-    ### Strengths
-
-    - Provides insights into the model's robustness against noisy or corrupted data.
-    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
-    - Visualization helps in understanding the extent of performance degradation.
-
-    ### Limitations
-
-    - Gaussian noise might not adequately represent all types of real-world data perturbations.
-    - Performance thresholds are somewhat arbitrary and might need tuning.
-    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
-    """
-
-    name = "robustness"
-    required_inputs = ["model", "datasets"]
-    default_params = {
-        "metric": None,
-        "scaling_factor_std_dev_list": DEFAULT_STD_DEV_LIST,
-        "performance_decay_threshold": DEFAULT_DECAY_THRESHOLD,
-    }
-    tasks = ["classification", "regression"]
-    tags = [
-        "sklearn",
-        "model_diagnosis",
-        "visualization",
-    ]
-
-    def run(self):
-        results, fig = robustness_diagnosis(
-            model=self.inputs.model,
-            datasets=self.inputs.datasets,
-            metric=self.params["metric"],
-            scaling_factor_std_dev_list=self.params["scaling_factor_std_dev_list"],
-            performance_decay_threshold=self.params["performance_decay_threshold"],
-        )
-
-        return self.cache_results(
-            passed=results["Passed"].all(),
-            test_results_list=[
-                ThresholdTestResult(
-                    test_name=self.params["metric"],
-                    passed=results["Passed"].all(),
-                    values=results.to_dict(orient="records"),
-                )
-            ],
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}:{self.params['metric']}",
-                    figure=fig,
-                )
-            ],
-        )
-
-    def summary(self, results: List[ThresholdTestResult], _):
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results[0].values,
-                    metadata=ResultTableMetadata(title="Robustness Diagnosis Results"),
-                )
-            ]
-        )
-
-    def test(self):
-        """Unit Test for Robustness Diagnosis Threshold Test"""
-        # Verify the result object is present
-        assert self.result is not None
-
-        # Verify test results and their type
-        assert isinstance(self.result.test_results.results, list)
-
-        # Check for presence and validity of 'values' and 'passed' flag in each result
-        for test_result in self.result.test_results.results:
-            assert "values" in test_result.__dict__
-            assert "passed" in test_result.__dict__
-            assert isinstance(test_result.values, list)
+    return results_df, fig, all(results_df["Passed"])