validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
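Note that several tests are renamed in this release (`DFGLSArch` → `DickeyFullerGLS`, `AspectCritique` → `AspectCritic`, `AnswerRelevance` → `ResponseRelevancy`, `AnswerSimilarity` → `SemanticSimilarity`) and others such as `AutoSeasonality` and `ContextUtilization` are removed, so code or test-suite configs that reference tests by ID will need updating. A minimal sketch of the ID changes implied by the file list, assuming the usual dotted test-ID convention carries over to 2.6.x:

```python
# Mapping of 2.5.x test IDs to their 2.6.x replacements, inferred from the
# renamed files above (IDs shown are assumptions based on the module paths).
RENAMED_TEST_IDS = {
    "validmind.data_validation.DFGLSArch": "validmind.data_validation.DickeyFullerGLS",
    "validmind.model_validation.ragas.AspectCritique": "validmind.model_validation.ragas.AspectCritic",
    "validmind.model_validation.ragas.AnswerRelevance": "validmind.model_validation.ragas.ResponseRelevancy",
    "validmind.model_validation.ragas.AnswerSimilarity": "validmind.model_validation.ragas.SemanticSimilarity",
}

def migrate_test_id(test_id: str) -> str:
    """Return the 2.6.x test ID for a possibly-renamed 2.5.x test ID."""
    return RENAMED_TEST_IDS.get(test_id, test_id)
```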
validmind/tests/model_validation/sklearn/ROCCurve.py

@@ -2,19 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import numpy as np
 import plotly.graph_objects as go
 from sklearn.metrics import roc_auc_score, roc_curve

+from validmind import tags, tasks
 from validmind.errors import SkipTestError
-from validmind.
-
-
-
-
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_performance",
+    "visualization",
+)
+@tasks("classification", "text_classification")
+def ROCCurve(model: VMModel, dataset: VMDataset):
     """
     Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic
     (ROC) curve and calculating the Area Under Curve (AUC) score.

@@ -61,78 +66,39 @@ class ROCCurve(Metric):
     incorrect, provided that the model's ranking format is retained. This phenomenon is commonly termed the "Class
     Imbalance Problem".
     """
-
-
-
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-        "visualization",
-    ]
-
-    def run(self):
-        if isinstance(self.inputs.model, FoundationModel):
-            raise SkipTestError("Skipping ROCCurve for Foundation models")
-
-        y_true = self.inputs.dataset.y
-        y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-
-        # ROC curve is only supported for binary classification
-        if len(np.unique(y_true)) > 2:
-            raise SkipTestError(
-                "ROC Curve is only supported for binary classification models"
-            )
-
-        y_true = y_true.astype(y_prob.dtype).flatten()
-        assert np.all((y_prob >= 0) & (y_prob <= 1)), "Invalid probabilities in y_prob."
-
-        fpr, tpr, roc_thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
-
-        # Remove Inf values from roc_thresholds
-        valid_thresholds_mask = np.isfinite(roc_thresholds)
-        roc_thresholds = roc_thresholds[valid_thresholds_mask]
-        auc = roc_auc_score(y_true, y_prob)
-
-        trace0 = go.Scatter(
-            x=fpr,
-            y=tpr,
-            mode="lines",
-            name=f"ROC curve (AUC = {auc:.2f})",
-            line=dict(color="#DE257E"),
-        )
-        trace1 = go.Scatter(
-            x=[0, 1],
-            y=[0, 1],
-            mode="lines",
-            name="Random (AUC = 0.5)",
-            line=dict(color="grey", dash="dash"),
+    if len(np.unique(dataset.y)) > 2:
+        raise SkipTestError(
+            "ROC Curve is only supported for binary classification models"
         )

-
-
+    y_prob = dataset.y_prob(model)
+    y_true = dataset.y.astype(y_prob.dtype).flatten()
+
+    fpr, tpr, _ = roc_curve(y_true, y_prob, drop_intermediate=False)
+    auc = roc_auc_score(y_true, y_prob)
+
+    return go.Figure(
+        data=[
+            go.Scatter(
+                x=fpr,
+                y=tpr,
+                mode="lines",
+                name=f"ROC curve (AUC = {auc:.2f})",
+                line=dict(color="#DE257E"),
+            ),
+            go.Scatter(
+                x=[0, 1],
+                y=[0, 1],
+                mode="lines",
+                name="Random (AUC = 0.5)",
+                line=dict(color="grey", dash="dash"),
+            ),
+        ],
+        layout=go.Layout(
+            title=f"ROC Curve for {model.input_id} on {dataset.input_id}",
             xaxis=dict(title="False Positive Rate"),
             yaxis=dict(title="True Positive Rate"),
             width=700,
             height=500,
-        )
-
-        fig = go.Figure(data=[trace0, trace1], layout=layout)
-
-        return self.cache_results(
-            metric_value={
-                "auc": auc,
-                "fpr": fpr,
-                "tpr": tpr,
-                "thresholds": roc_thresholds,
-            },
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="roc_auc_curve",
-                    figure=fig,
-                )
-            ],
-        )
+        ),
+    )
validmind/tests/model_validation/sklearn/RegressionPerformance.py

@@ -2,52 +2,43 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import re
-from dataclasses import dataclass
-
 import numpy as np
 from sklearn.metrics import mean_absolute_error, mean_squared_error

-from validmind
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)


-@
-
+@tags("sklearn", "model_performance")
+@tasks("regression")
+def RegressionPerformance(model: VMModel, dataset: VMDataset):
     """
-
-    MAPE, and MBD.
+    Evaluates the performance of a regression model using five different metrics: MAE, MSE, RMSE, MAPE, and MBD.

     ### Purpose

-    The Regression Models Performance Comparison metric is used to measure
-
+    The Regression Models Performance Comparison metric is used to measure the performance of regression models. It
+    calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.

     ### Test Mechanism

-    The test
-
-    providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
-    performance of all models using these metrics. The results are then appended to a table for presenting a
-    comparative summary.
+    The test uses the sklearn library to calculate the MAE, MSE, RMSE, MAPE, and MBD. These calculations encapsulate both
+    the direction and the magnitude of error in predictions, thereby providing a multi-faceted view of model accuracy.

     ### Signs of High Risk

     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
     model's predictions from the true values.
     - A large value of MBD, which shows a consistent bias in the model’s predictions.
-    - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
-    evaluation process itself.

     ### Strengths

     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
-    - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.

     ### Limitations

@@ -55,82 +46,38 @@ class RegressionPerformance(Metric):
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
     handle pre-processing, feature selection, or other stages in the model lifecycle.
-    - It may fail to run if it doesn't receive valid models as inputs. The models are passed externally and the test
-    doesn't have an internal mechanism to verify their validity.
-    - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        results["Mean Squared Error (MSE)"] = mse_test
-        results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)
-
-        if np.any(y_true_test == 0):
-            logger.warning(
-                "y_true_test contains zero values. Skipping MAPE calculation to avoid division by zero."
-            )
-            results["Mean Absolute Percentage Error (MAPE)"] = None
-        else:
-            mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
-            results["Mean Absolute Percentage Error (MAPE)"] = mape_test
-
-        mbd_test = np.mean(y_pred_test - y_true_test)
-        results["Mean Bias Deviation (MBD)"] = mbd_test
-
-        return results
-
-    def summary(self, metric_value: dict):
-        """
-        This summary varies depending if we're evaluating a binary or multi-class model
-        """
-        results = []
-        metrics = metric_value[self.inputs.model.input_id].keys()
-        error_table = []
-        for metric_name in metrics:
-            errors_dict = {}
-            errors_dict["Errors"] = metric_name
-            for m, _ in metric_value.items():
-                for metric in metrics:
-                    res = re.findall(r"\(.*?\)", metric)
-                    res[0][1:-1]
-                    errors_dict[f"{res[0][1:-1]}-{m}"] = metric_value[m][metric]
-            error_table.append(errors_dict)
-
-        results.append(
-            ResultTable(
-                data=error_table,
-                metadata=ResultTableMetadata(title="Regression Errors Comparison"),
-            )
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # MAE calculation
+    metrics = {
+        "Mean Absolute Error (MAE)": mean_absolute_error(y_true, y_pred),
+    }
+
+    # MSE and RMSE calculations
+    mse = mean_squared_error(y_true, y_pred)
+    metrics["Mean Squared Error (MSE)"] = mse
+    metrics["Root Mean Squared Error (RMSE)"] = np.sqrt(mse)
+
+    # MAPE calculation
+    if np.any(y_true == 0):
+        logger.warning(
+            "y_true contains zero values. Skipping MAPE calculation to avoid division by zero."
        )
-
-
-
-
-        # Check models list is not empty
-        if not self.inputs.model:
-            raise SkipTestError(
-                "Model must be provided as a `models` parameter to compare performance"
-            )
-        results = {}
-
-        result = self.regression_errors(
-            y_true_test=self.inputs.dataset.y,
-            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        metrics["Mean Absolute Percentage Error (MAPE)"] = None
+    else:
+        metrics["Mean Absolute Percentage Error (MAPE)"] = (
+            np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        )
-        results[self.inputs.model.input_id] = result

-
+    # MBD calculation
+    metrics["Mean Bias Deviation (MBD)"] = np.mean(y_pred - y_true)
+
+    return [
+        {
+            "Metric": metric,
+            "Value": value,
+        }
+        for metric, value in metrics.items()
+    ]
validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py

@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

 from collections import defaultdict
-from dataclasses import dataclass
 from operator import add
 from typing import List, Tuple

@@ -15,16 +14,8 @@ from sklearn import metrics

 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.
-
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-    VMDataset,
-    VMModel,
-)
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)

@@ -222,32 +213,59 @@ def _plot_robustness(
     return fig


-
-
-def
-    model: VMModel,
+@tags("sklearn", "model_diagnosis", "visualization")
+@tasks("classification", "regression")
+def RobustnessDiagnosis(
     datasets: List[VMDataset],
+    model: VMModel,
     metric: str = None,
     scaling_factor_std_dev_list: List[float] = DEFAULT_STD_DEV_LIST,
     performance_decay_threshold: float = DEFAULT_DECAY_THRESHOLD,
 ):
+    """
+    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+    ### Purpose
+
+    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+    real-world scenarios where data may be imperfect or corrupted.
+
+    ### Test Mechanism
+
+    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+    deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+    - Adding Gaussian noise to numerical input features based on scaling factors.
+    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+    for regression tasks.
+    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+    ### Signs of High Risk
+
+    - A significant drop in performance metrics with minimal noise.
+    - Performance decay values exceeding the specified threshold.
+    - Consistent failure to meet performance standards across multiple perturbation scales.
+
+    ### Strengths
+
+    - Provides insights into the model's robustness against noisy or corrupted data.
+    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+    - Visualization helps in understanding the extent of performance degradation.
+
+    ### Limitations
+
+    - Gaussian noise might not adequately represent all types of real-world data perturbations.
+    - Performance thresholds are somewhat arbitrary and might need tuning.
+    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
+    """
+    # TODO: use single dataset
     if not metric:
         metric = (
             DEFAULT_CLASSIFICATION_METRIC
             if datasets[0].probability_column(model)
             else DEFAULT_REGRESSION_METRIC
         )
-        logger.info(f"Using default metric ({metric.upper()}) for robustness diagnosis")
-
-    if id(scaling_factor_std_dev_list) == id(DEFAULT_STD_DEV_LIST):
-        logger.info(
-            f"Using default scaling factors for the standard deviation of the noise: {DEFAULT_STD_DEV_LIST}"
-        )
-
-    if id(performance_decay_threshold) == id(DEFAULT_DECAY_THRESHOLD):
-        logger.info(
-            f"Using default performance decay threshold of {DEFAULT_DECAY_THRESHOLD}"
-        )

     results = [{} for _ in range(len(datasets))]

@@ -304,116 +322,9 @@ def robustness_diagnosis(
         columns=datasets[0].feature_columns_numeric,
         model=model.input_id,
     )
-
     # rename perturbation size for baseline
-    results_df[
-        results_df["Perturbation Size"] == 0.0
+    results_df.loc[
+        results_df["Perturbation Size"] == 0.0, "Perturbation Size"
     ] = "Baseline (0.0)"

-    return results_df, fig
-
-
-@dataclass
-class RobustnessDiagnosis(ThresholdTest):
-    """
-    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
-
-    ### Purpose
-
-    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
-    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
-    real-world scenarios where data may be imperfect or corrupted.
-
-    ### Test Mechanism
-
-    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
-    deviation. The performance of the model is then measured using a specified metric. The process includes:
-
-    - Adding Gaussian noise to numerical input features based on scaling factors.
-    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
-    for regression tasks.
-    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
-
-    ### Signs of High Risk
-
-    - A significant drop in performance metrics with minimal noise.
-    - Performance decay values exceeding the specified threshold.
-    - Consistent failure to meet performance standards across multiple perturbation scales.
-
-    ### Strengths
-
-    - Provides insights into the model's robustness against noisy or corrupted data.
-    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
-    - Visualization helps in understanding the extent of performance degradation.
-
-    ### Limitations
-
-    - Gaussian noise might not adequately represent all types of real-world data perturbations.
-    - Performance thresholds are somewhat arbitrary and might need tuning.
-    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
-    """
-
-    name = "robustness"
-    required_inputs = ["model", "datasets"]
-    default_params = {
-        "metric": None,
-        "scaling_factor_std_dev_list": DEFAULT_STD_DEV_LIST,
-        "performance_decay_threshold": DEFAULT_DECAY_THRESHOLD,
-    }
-    tasks = ["classification", "regression"]
-    tags = [
-        "sklearn",
-        "model_diagnosis",
-        "visualization",
-    ]
-
-    def run(self):
-        results, fig = robustness_diagnosis(
-            model=self.inputs.model,
-            datasets=self.inputs.datasets,
-            metric=self.params["metric"],
-            scaling_factor_std_dev_list=self.params["scaling_factor_std_dev_list"],
-            performance_decay_threshold=self.params["performance_decay_threshold"],
-        )
-
-        return self.cache_results(
-            passed=results["Passed"].all(),
-            test_results_list=[
-                ThresholdTestResult(
-                    test_name=self.params["metric"],
-                    passed=results["Passed"].all(),
-                    values=results.to_dict(orient="records"),
-                )
-            ],
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}:{self.params['metric']}",
-                    figure=fig,
-                )
-            ],
-        )
-
-    def summary(self, results: List[ThresholdTestResult], _):
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results[0].values,
-                    metadata=ResultTableMetadata(title="Robustness Diagnosis Results"),
-                )
-            ]
-        )
-
-    def test(self):
-        """Unit Test for Robustness Diagnosis Threshold Test"""
-        # Verify the result object is present
-        assert self.result is not None
-
-        # Verify test results and their type
-        assert isinstance(self.result.test_results.results, list)
-
-        # Check for presence and validity of 'values' and 'passed' flag in each result
-        for test_result in self.result.test_results.results:
-            assert "values" in test_result.__dict__
-            assert "passed" in test_result.__dict__
-            assert isinstance(test_result.values, list)
+    return results_df, fig, all(results_df["Passed"])