validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
```diff
--- a/validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py
+++ b/validmind/tests/model_validation/sklearn/RegressionPerformance.py
@@ -16,25 +16,27 @@ logger = get_logger(__name__)
 
 
 @dataclass
-class RegressionModelsPerformanceComparison(Metric):
+class RegressionPerformance(Metric):
     """
     Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,
     MAPE, and MBD.
 
-
+    ### Purpose
+
     The Regression Models Performance Comparison metric is used to measure and compare the performance of regression
     models. It calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.
 
-
+    ### Test Mechanism
+
     The test starts by sourcing the true and predicted values from the models. It then computes the MAE, MSE, RMSE,
     MAPE, and MBD. These calculations encapsulate both the direction and the magnitude of error in predictions, thereby
     providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
     performance of all models using these metrics. The results are then appended to a table for presenting a
     comparative summary.
 
-
+    ### Signs of High Risk
 
     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
     model's predictions from the true values.
@@ -42,13 +44,13 @@ class RegressionModelsPerformanceComparison(Metric):
     - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
     evaluation process itself.
 
-
+    ### Strengths
 
     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
     - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.
 
-
+    ### Limitations
 
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
@@ -58,8 +60,8 @@ class RegressionModelsPerformanceComparison(Metric):
     - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
 
-    name = "
-    required_inputs = ["dataset", "
+    name = "regression_performance"
+    required_inputs = ["dataset", "model"]
 
     tasks = ["regression"]
     tags = [
@@ -96,7 +98,7 @@ class RegressionModelsPerformanceComparison(Metric):
         This summary varies depending if we're evaluating a binary or multi-class model
         """
         results = []
-        metrics = metric_value[
+        metrics = metric_value[self.inputs.model.input_id].keys()
         error_table = []
         for metric_name in metrics:
             errors_dict = {}
@@ -119,20 +121,16 @@ class RegressionModelsPerformanceComparison(Metric):
 
     def run(self):
         # Check models list is not empty
-        if not self.inputs.
+        if not self.inputs.model:
             raise SkipTestError(
-                "
+                "Model must be provided as a `models` parameter to compare performance"
             )
-
-        all_models = self.inputs.models
-
         results = {}
 
-        ... (5 lines not shown) ...
-            results["model_" + str(idx)] = result
+        result = self.regression_errors(
+            y_true_test=self.inputs.dataset.y,
+            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        )
+        results[self.inputs.model.input_id] = result
 
         return self.cache_results(results)
```
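For reference, the five error metrics named in the docstring above can be computed directly with NumPy. This is an illustrative sketch only: the `regression_errors` helper called by `run()` is not included in this diff, so the function name below and the MBD sign convention (predicted minus actual) are assumptions.

```python
import numpy as np


def regression_error_metrics(y_true, y_pred):
    """Stand-in for the five metrics described above; not the library's own helper."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_true - y_pred

    mae = np.mean(np.abs(errors))                  # Mean Absolute Error
    mse = np.mean(errors ** 2)                     # Mean Squared Error
    rmse = np.sqrt(mse)                            # Root Mean Squared Error
    mape = np.mean(np.abs(errors / y_true)) * 100  # Mean Absolute Percentage Error (assumes no zero targets)
    mbd = np.mean(y_pred - y_true)                 # Mean Bias Deviation (sign convention assumed)

    return {"MAE": mae, "MSE": mse, "RMSE": rmse, "MAPE": mape, "MBD": mbd}
```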
```diff
--- a/validmind/tests/model_validation/sklearn/RegressionR2Square.py
+++ b/validmind/tests/model_validation/sklearn/RegressionR2Square.py
@@ -2,105 +2,67 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-
+import pandas as pd
 
 from sklearn import metrics
 
 from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
-from validmind
+from validmind import tags, tasks
 
 
-@
-
+@tags("sklearn", "model_performance")
+@tasks("regression")
+def RegressionR2Square(dataset, model):
     """
-    ... (26 lines not shown) ...
+    Assesses the overall goodness-of-fit of a regression model by evaluating R-squared (R2) and Adjusted R-squared (Adj
+    R2) scores to determine the model's explanatory power over the dependent variable.
+
+    ### Purpose
+
+    The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a regression model.
+    Specifically, this Python-based test evaluates the R-squared (R2) and Adjusted R-squared (Adj R2) scores, which are
+    statistical measures used to assess the strength of the relationship between the model's predictors and the
+    response variable.
+
+    ### Test Mechanism
+
+    The test deploys the `r2_score` method from the Scikit-learn metrics module to measure the R2 score on both
+    training and test sets. This score reflects the proportion of the variance in the dependent variable that is
+    predictable from the independent variables. The test also calculates the Adjusted R2 score, which accounts for the
+    number of predictors in the model to penalize model complexity and reduce overfitting. The Adjusted R2 score will
+    be smaller if unnecessary predictors are included in the model.
+
+    ### Signs of High Risk
+
+    - Low R2 or Adjusted R2 scores, suggesting that the model does not explain much variation in the dependent variable.
+    - Significant discrepancy between R2 scores on the training set and test set, indicating overfitting and poor
+    generalization to unseen data.
+
+    ### Strengths
+
+    - Widely-used measure in regression analysis, providing a sound general indication of model performance.
+    - Easy to interpret and understand, as it represents the proportion of the dependent variable's variance explained
+    by the independent variables.
+    - Adjusted R2 score helps control overfitting by penalizing unnecessary predictors.
+
+    ### Limitations
+
+    - Sensitive to the inclusion of unnecessary predictors even though Adjusted R2 penalizes complexity.
+    - Less reliable in cases of non-linear relationships or when the underlying assumptions of linear regression are
+    violated.
+    - Does not provide insight on whether the correct regression model was used or if key assumptions have been met.
     """
 
-    ... (9 lines not shown) ...
-    """
-    ... (3 lines not shown) ...
-        for result in raw_results:
-            for key, _ in result.items():
-                table_records.append(
-                    {
-                        "Metric": key,
-                        "TRAIN": result[key]["train"],
-                        "TEST": result[key]["test"],
-                    }
-                )
-
-        return ResultSummary(results=[ResultTable(data=table_records)])
-
-    def run(self):
-        y_train_true = self.inputs.datasets[0].y
-        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        y_train_true = y_train_true.astype(y_train_pred.dtype)
-
-        y_test_true = self.inputs.datasets[1].y
-        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        y_test_true = y_test_true.astype(y_test_pred.dtype)
-
-        r2s_train = metrics.r2_score(y_train_true, y_train_pred)
-        r2s_test = metrics.r2_score(y_test_true, y_test_pred)
-
-        results = []
-        results.append(
-            {
-                "R-squared (R2) Score": {
-                    "train": r2s_train,
-                    "test": r2s_test,
-                }
-            }
-        )
-
-        X_columns = self.inputs.datasets[0].feature_columns
-        adj_r2_train = adj_r2_score(
-            y_train_true, y_train_pred, len(y_train_true), len(X_columns)
-        )
-        adj_r2_test = adj_r2_score(
-            y_test_true, y_test_pred, len(y_test_true), len(X_columns)
-        )
-        results.append(
-            {
-                "Adjusted R-squared (R2) Score": {
-                    "train": adj_r2_train,
-                    "test": adj_r2_test,
-                }
-            }
-        )
-        return self.cache_results(metric_value=results)
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    y_true = y_true.astype(y_pred.dtype)
+
+    r2s = metrics.r2_score(y_true, y_pred)
+    adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(dataset.feature_columns))
+
+    # Create dataframe with R2 and Adjusted R2 in one row
+    results_df = pd.DataFrame(
+        {"R-squared (R2) Score": [r2s], "Adjusted R-squared (R2) Score": [adj_r2]}
+    )
+
+    return results_df
```
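The adjusted R2 behaviour described in the docstring follows the standard formula Adj R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1). A minimal sketch, assuming `adj_r2_score` (whose body is not part of this diff) implements that formula:

```python
from sklearn.metrics import r2_score


def adjusted_r2(y_true, y_pred, n_samples, n_features):
    # Penalizes R2 for each additional predictor, so adding uninformative
    # features lowers the score instead of inflating it.
    r2 = r2_score(y_true, y_pred)
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
```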
```diff
--- a/validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py
+++ b/validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py
@@ -13,26 +13,45 @@ from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 @tasks("regression", "time_series_forecasting")
 def RegressionR2SquareComparison(datasets, models):
     """
-
-
+    Compares R-Squared and Adjusted R-Squared values for different regression models across multiple datasets to assess
+    model performance and relevance of features.
 
-
+    ### Purpose
 
-
+    The Regression R2 Square Comparison test aims to compare the R-Squared and Adjusted R-Squared values for different
+    regression models across various datasets. It helps in assessing how well each model explains the variability in
+    the dataset, and whether the models include irrelevant features.
 
-
-
-
+    ### Test Mechanism
+
+    This test operates by:
+
+    - Iterating through each dataset-model pair.
+    - Calculating the R-Squared values to measure how much of the variability in the dataset is explained by the model.
+    - Calculating the Adjusted R-Squared values, which adjust the R-Squared based on the number of predictors in the
+    model, making it more reliable when comparing models with different numbers of features.
+    - Generating a summary table containing these values for each combination of dataset and model.
+
+    ### Signs of High Risk
+
+    - If the R-Squared values are significantly low, it indicates the model isn't explaining much of the variability in
+    the dataset.
+    - A significant difference between R-Squared and Adjusted R-Squared values might indicate that the model includes
+    irrelevant features.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a quantitative measure of model performance in terms of variance explained.
-    - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models
+    - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models
+    with different numbers of features.
+    - Useful for time-series forecasting and regression tasks.
 
-
-    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
-    - The function relies on `adj_r2_score` from the `statsmodels.statsutils` module, which should be correctly implemented and imported.
-    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
+    ### Limitations
 
+    - Assumes the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
+    - Relies on `adj_r2_score` from the `statsmodels.statsutils` module, which needs to be correctly implemented and
+    imported.
+    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     """
     results_list = []
 
```
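A sketch of the per-pair summary table the docstring describes, assuming the dataset/model interfaces used elsewhere in this diff (`y`, `y_pred(model)`, `feature_columns`, `input_id`); the helper name, the positional pairing of datasets with models, and the column labels are illustrative, not taken from the package:

```python
import pandas as pd
from sklearn.metrics import r2_score


def r2_comparison_table(datasets, models):
    # One row per dataset-model pair, mirroring the summary table described above.
    rows = []
    for dataset, model in zip(datasets, models):
        y_true = dataset.y
        y_pred = dataset.y_pred(model)
        r2 = r2_score(y_true, y_pred)
        n, p = len(y_true), len(dataset.feature_columns)
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        rows.append(
            {
                "Model": model.input_id,
                "Dataset": dataset.input_id,
                "R-Squared": r2,
                "Adjusted R-Squared": adj_r2,
            }
        )
    return pd.DataFrame(rows)
```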
```diff
--- a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
+++ b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
@@ -7,9 +7,9 @@ from dataclasses import dataclass
 from operator import add
 from typing import List, Tuple
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import plotly.graph_objects as go
 import seaborn as sns
 from sklearn import metrics
 
@@ -132,24 +132,28 @@ def _combine_results(results: List[dict]):
 
 
 def _plot_robustness(
-    results: pd.DataFrame, metric: str, threshold: float, columns: List[str]
+    results: pd.DataFrame, metric: str, threshold: float, columns: List[str], model: str
 ):
-    fig
-    ... (15 lines not shown) ...
+    fig = go.Figure()
+
+    datasets = results["Dataset"].unique()
+    pallete = [
+        f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"
+        for r, g, b in sns.color_palette("husl", len(datasets))
+    ]
+
+    for i, dataset in enumerate(datasets):
+        dataset_results = results[results["Dataset"] == dataset]
+        fig.add_trace(
+            go.Scatter(
+                x=dataset_results["Perturbation Size"],
+                y=dataset_results[metric.upper()],
+                mode="lines+markers",
+                name=dataset,
+                line=dict(width=3, color=pallete[i]),
+                marker=dict(size=10),
+            )
+        )
 
     if PERFORMANCE_METRICS[metric]["is_lower_better"]:
         y_label = f"{metric.upper()} (lower is better)"
@@ -157,33 +161,64 @@ def _plot_robustness(
         threshold = -threshold
         y_label = f"{metric.upper()} (higher is better)"
 
-    # add
-    for i in
-        baseline = results[results["Dataset"] ==
-    ... (6 lines not shown) ...
+    # add threshold lines
+    for i, dataset in enumerate(datasets):
+        baseline = results[results["Dataset"] == dataset][metric.upper()].iloc[0]
+        fig.add_trace(
+            go.Scatter(
+                x=results["Perturbation Size"].unique(),
+                y=[baseline + threshold] * len(results["Perturbation Size"].unique()),
+                mode="lines",
+                name=f"threshold_{dataset}",
+                line=dict(dash="dash", width=2, color=pallete[i]),
+                showlegend=True,
+            )
         )
 
-    ... (11 lines not shown) ...
+    columns_lines = [""]
+    for column in columns:
+        # keep adding to the last line in list until character limit (40)
+        if len(columns_lines[-1]) + len(column) < 40:
+            columns_lines[-1] += f"{column}, "
+        else:
+            columns_lines.append(f"{column}, ")
+
+    fig.update_layout(
+        title=dict(
+            text=(
+                f"Model Robustness for '{model}'<br><sup>As determined by calculating "
+                f"{metric.upper()} decay in the presence of random gaussian noise</sup>"
+            ),
+            font=dict(size=20),
+            x=0.5,
+            xanchor="center",
+        ),
+        xaxis_title=dict(
+            text="Perturbation Size (X * Standard Deviation)",
+        ),
+        yaxis_title=dict(text=y_label),
+        plot_bgcolor="white",
+        margin=dict(t=60, b=80, r=20, l=60),
+        xaxis=dict(showgrid=True, gridcolor="lightgrey"),
+        yaxis=dict(showgrid=True, gridcolor="lightgrey"),
+        annotations=[
+            go.layout.Annotation(
+                text=f"Perturbed Features:<br><sup>{'<br>'.join(columns_lines)}</sup>",
+                align="left",
+                font=dict(size=14),
+                bordercolor="lightgrey",
+                borderwidth=1,
+                borderpad=4,
+                showarrow=False,
+                x=1.025,
+                xref="paper",
+                xanchor="left",
+                y=-0.15,
+                yref="paper",
+            )
+        ],
     )
 
-    # prevent the figure from being displayed
-    plt.close("all")
-
     return fig
 
 
@@ -267,6 +302,7 @@ def robustness_diagnosis(
         metric=metric,
         threshold=performance_decay_threshold,
         columns=datasets[0].feature_columns_numeric,
+        model=model.input_id,
     )
 
     # rename perturbation size for baseline
@@ -279,38 +315,42 @@ def robustness_diagnosis(
 
 @dataclass
 class RobustnessDiagnosis(ThresholdTest):
-    """
-    ... (31 lines not shown) ...
+    """
+    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+    ### Purpose
+
+    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+    real-world scenarios where data may be imperfect or corrupted.
+
+    ### Test Mechanism
+
+    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+    deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+    - Adding Gaussian noise to numerical input features based on scaling factors.
+    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+    for regression tasks.
+    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+    ### Signs of High Risk
+
+    - A significant drop in performance metrics with minimal noise.
+    - Performance decay values exceeding the specified threshold.
+    - Consistent failure to meet performance standards across multiple perturbation scales.
+
+    ### Strengths
+
+    - Provides insights into the model's robustness against noisy or corrupted data.
+    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+    - Visualization helps in understanding the extent of performance degradation.
+
+    ### Limitations
+
+    - Gaussian noise might not adequately represent all types of real-world data perturbations.
+    - Performance thresholds are somewhat arbitrary and might need tuning.
+    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
     """
 
     name = "robustness"
```
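The perturbation step described under Test Mechanism (Gaussian noise scaled by each numeric feature's standard deviation) can be sketched like this; the function name, seed handling, and example scales are illustrative rather than the package's defaults:

```python
import numpy as np
import pandas as pd


def perturb_numeric_features(df: pd.DataFrame, numeric_columns, scale: float, seed: int = 0) -> pd.DataFrame:
    """Add Gaussian noise with std = scale * column std to each numeric feature."""
    rng = np.random.default_rng(seed)
    perturbed = df.copy()
    for column in numeric_columns:
        std = df[column].std()
        perturbed[column] = df[column] + rng.normal(0.0, scale * std, size=len(df))
    return perturbed


# Evaluate the model on increasingly noisy copies of the features, e.g. at
# scales of 0.1, 0.2, 0.5 and 1.0 standard deviations, and track how the
# chosen metric (AUC, MSE, ...) decays relative to the unperturbed baseline.
```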
```diff
--- a/validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py
+++ b/validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py
@@ -22,13 +22,15 @@ class SHAPGlobalImportance(Metric):
     """
     Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.
 
-
+    ### Purpose
+
     The SHAP (SHapley Additive exPlanations) Global Importance metric aims to elucidate model outcomes by attributing
     them to the contributing features. It assigns a quantifiable global importance to each feature via their respective
     absolute Shapley values, thereby making it suitable for tasks like classification (both binary and multiclass).
     This metric forms an essential part of model risk management.
 
-
+    ### Test Mechanism
+
     The exam begins with the selection of a suitable explainer which aligns with the model's type. For tree-based
     models like XGBClassifier, RandomForestClassifier, CatBoostClassifier, TreeExplainer is used whereas for linear
     models like LogisticRegression, XGBRegressor, LinearRegression, it is the LinearExplainer. Once the explainer
@@ -44,20 +46,20 @@ class SHAPGlobalImportance(Metric):
     gradually changing from low to high. Features are systematically organized in accordance with their importance.
     These plots are generated by the function `_generate_shap_plot()`.
 
-
+    ### Signs of High Risk
 
     - Overemphasis on certain features in SHAP importance plots, thus hinting at the possibility of model overfitting
     - Anomalies such as unexpected or illogical features showing high importance, which might suggest that the model's
     decisions are rooted in incorrect or undesirable reasoning
     - A SHAP summary plot filled with high variability or scattered data points, indicating a cause for concern
 
-
+    ### Strengths
 
     - SHAP does more than just illustrating global feature significance, it offers a detailed perspective on how
     different features shape the model's decision-making logic for each instance.
     - It provides clear insights into model behavior.
 
-
+    ### Limitations
 
     - High-dimensional data can convolute interpretations.
     - Associating importance with tangible real-world impact still involves a certain degree of subjectivity.
```
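The explainer selection and the mean-absolute-SHAP notion of global importance described above can be sketched with the public `shap` API; the model-type check below is a simple stand-in, not the package's actual dispatch logic, and the returned shape of `shap_values` can vary across `shap` versions.

```python
import numpy as np
import shap


def global_shap_importance(model, X_background, X_explain):
    # Heuristic stand-in for the model-type check described in the docstring:
    # tree ensembles get TreeExplainer, everything else LinearExplainer.
    if hasattr(model, "feature_importances_"):
        explainer = shap.TreeExplainer(model)
    else:
        explainer = shap.LinearExplainer(model, X_background)

    shap_values = explainer.shap_values(X_explain)
    if isinstance(shap_values, list):
        # Multiclass explainers may return one array per class; average their magnitudes.
        abs_values = np.mean([np.abs(v) for v in shap_values], axis=0)
    else:
        abs_values = np.abs(shap_values)

    # Global importance per feature = mean absolute SHAP value across rows.
    return abs_values.mean(axis=0)
```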