validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py}

@@ -16,25 +16,27 @@ logger = get_logger(__name__)
 
 
 @dataclass
-class RegressionModelsPerformanceComparison(Metric):
+class RegressionPerformance(Metric):
     """
     Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,
     MAPE, and MBD.
 
-    …
+    ### Purpose
+
     The Regression Models Performance Comparison metric is used to measure and compare the performance of regression
     models. It calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.
 
-    …
+    ### Test Mechanism
+
     The test starts by sourcing the true and predicted values from the models. It then computes the MAE, MSE, RMSE,
     MAPE, and MBD. These calculations encapsulate both the direction and the magnitude of error in predictions, thereby
     providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
     performance of all models using these metrics. The results are then appended to a table for presenting a
     comparative summary.
 
-    …
+    ### Signs of High Risk
 
     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
     model's predictions from the true values.
@@ -42,13 +44,13 @@ class RegressionModelsPerformanceComparison(Metric):
     - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
     evaluation process itself.
 
-    …
+    ### Strengths
 
     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
     - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.
 
-    …
+    ### Limitations
 
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
@@ -58,8 +60,8 @@ class RegressionModelsPerformanceComparison(Metric):
     - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
 
-    name = "…
-    required_inputs = ["dataset", "…
+    name = "regression_performance"
+    required_inputs = ["dataset", "model"]
 
     tasks = ["regression"]
     tags = [
@@ -96,7 +98,7 @@ class RegressionModelsPerformanceComparison(Metric):
         This summary varies depending if we're evaluating a binary or multi-class model
         """
         results = []
-        metrics = metric_value[…
+        metrics = metric_value[self.inputs.model.input_id].keys()
         error_table = []
         for metric_name in metrics:
             errors_dict = {}
@@ -119,20 +121,16 @@
 
     def run(self):
         # Check models list is not empty
-        if not self.inputs.…
+        if not self.inputs.model:
             raise SkipTestError(
-                "…
+                "Model must be provided as a `models` parameter to compare performance"
             )
-
-        all_models = self.inputs.models
-
         results = {}
 
-        … (5 lines not shown)
-            results["model_" + str(idx)] = result
+        result = self.regression_errors(
+            y_true_test=self.inputs.dataset.y,
+            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        )
+        results[self.inputs.model.input_id] = result
 
         return self.cache_results(results)
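For readers unfamiliar with the five error metrics listed in the `RegressionPerformance` docstring above, a minimal sketch of how they are conventionally computed is shown below (illustrative only; the package's `regression_errors` helper may differ in naming and sign conventions):

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error


def regression_error_metrics(y_true, y_pred):
    """Conventional MAE, MSE, RMSE, MAPE and MBD computations (illustrative)."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    # MAPE is undefined when y_true contains zeros; this sketch assumes non-zero targets
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    # Mean Bias Deviation: sign indicates whether the model over- or under-predicts on average
    mbd = np.mean(y_pred - y_true)
    return {"MAE": mae, "MSE": mse, "RMSE": rmse, "MAPE": mape, "MBD": mbd}
```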
validmind/tests/model_validation/sklearn/RegressionR2Square.py

@@ -2,105 +2,67 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-…
+import pandas as pd
 
 from sklearn import metrics
 
 from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
-from validmind…
+from validmind import tags, tasks
 
 
-@…
-…
+@tags("sklearn", "model_performance")
+@tasks("regression")
+def RegressionR2Square(dataset, model):
     """
-    … (26 lines not shown)
+    Assesses the overall goodness-of-fit of a regression model by evaluating R-squared (R2) and Adjusted R-squared (Adj
+    R2) scores to determine the model's explanatory power over the dependent variable.
+
+    ### Purpose
+
+    The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a regression model.
+    Specifically, this Python-based test evaluates the R-squared (R2) and Adjusted R-squared (Adj R2) scores, which are
+    statistical measures used to assess the strength of the relationship between the model's predictors and the
+    response variable.
+
+    ### Test Mechanism
+
+    The test deploys the `r2_score` method from the Scikit-learn metrics module to measure the R2 score on both
+    training and test sets. This score reflects the proportion of the variance in the dependent variable that is
+    predictable from the independent variables. The test also calculates the Adjusted R2 score, which accounts for the
+    number of predictors in the model to penalize model complexity and reduce overfitting. The Adjusted R2 score will
+    be smaller if unnecessary predictors are included in the model.
+
+    ### Signs of High Risk
+
+    - Low R2 or Adjusted R2 scores, suggesting that the model does not explain much variation in the dependent variable.
+    - Significant discrepancy between R2 scores on the training set and test set, indicating overfitting and poor
+    generalization to unseen data.
+
+    ### Strengths
+
+    - Widely-used measure in regression analysis, providing a sound general indication of model performance.
+    - Easy to interpret and understand, as it represents the proportion of the dependent variable's variance explained
+    by the independent variables.
+    - Adjusted R2 score helps control overfitting by penalizing unnecessary predictors.
+
+    ### Limitations
+
+    - Sensitive to the inclusion of unnecessary predictors even though Adjusted R2 penalizes complexity.
+    - Less reliable in cases of non-linear relationships or when the underlying assumptions of linear regression are
+    violated.
+    - Does not provide insight on whether the correct regression model was used or if key assumptions have been met.
     """
 
-    … (9 lines not shown)
-        """
-    … (3 lines not shown)
-        for result in raw_results:
-            for key, _ in result.items():
-                table_records.append(
-                    {
-                        "Metric": key,
-                        "TRAIN": result[key]["train"],
-                        "TEST": result[key]["test"],
-                    }
-                )
-
-        return ResultSummary(results=[ResultTable(data=table_records)])
-
-    def run(self):
-        y_train_true = self.inputs.datasets[0].y
-        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        y_train_true = y_train_true.astype(y_train_pred.dtype)
-
-        y_test_true = self.inputs.datasets[1].y
-        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        y_test_true = y_test_true.astype(y_test_pred.dtype)
-
-        r2s_train = metrics.r2_score(y_train_true, y_train_pred)
-        r2s_test = metrics.r2_score(y_test_true, y_test_pred)
-
-        results = []
-        results.append(
-            {
-                "R-squared (R2) Score": {
-                    "train": r2s_train,
-                    "test": r2s_test,
-                }
-            }
-        )
-
-        X_columns = self.inputs.datasets[0].feature_columns
-        adj_r2_train = adj_r2_score(
-            y_train_true, y_train_pred, len(y_train_true), len(X_columns)
-        )
-        adj_r2_test = adj_r2_score(
-            y_test_true, y_test_pred, len(y_test_true), len(X_columns)
-        )
-        results.append(
-            {
-                "Adjusted R-squared (R2) Score": {
-                    "train": adj_r2_train,
-                    "test": adj_r2_test,
-                }
-            }
-        )
-        return self.cache_results(metric_value=results)
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    y_true = y_true.astype(y_pred.dtype)
+
+    r2s = metrics.r2_score(y_true, y_pred)
+    adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(dataset.feature_columns))
+
+    # Create dataframe with R2 and Adjusted R2 in one row
+    results_df = pd.DataFrame(
+        {"R-squared (R2) Score": [r2s], "Adjusted R-squared (R2) Score": [adj_r2]}
+    )
+
+    return results_df
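The new functional test above calls `adj_r2_score(y_true, y_pred, len(y_true), len(dataset.feature_columns))`. Assuming that helper implements the standard adjusted R-squared formula, the calculation is equivalent to the following sketch:

```python
from sklearn.metrics import r2_score


def adjusted_r2(y_true, y_pred, n_observations, n_features):
    """Standard adjusted R-squared; assumed to mirror validmind's adj_r2_score helper."""
    r2 = r2_score(y_true, y_pred)
    # Penalize the plain R2 for the number of predictors in the model
    return 1 - (1 - r2) * (n_observations - 1) / (n_observations - n_features - 1)
```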
validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py

@@ -13,26 +13,45 @@ from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 @tasks("regression", "time_series_forecasting")
 def RegressionR2SquareComparison(datasets, models):
     """
-    … (2 lines not shown)
+    Compares R-Squared and Adjusted R-Squared values for different regression models across multiple datasets to assess
+    model performance and relevance of features.
 
-    …
+    ### Purpose
 
-    …
+    The Regression R2 Square Comparison test aims to compare the R-Squared and Adjusted R-Squared values for different
+    regression models across various datasets. It helps in assessing how well each model explains the variability in
+    the dataset, and whether the models include irrelevant features.
 
-    … (3 lines not shown)
+    ### Test Mechanism
+
+    This test operates by:
+
+    - Iterating through each dataset-model pair.
+    - Calculating the R-Squared values to measure how much of the variability in the dataset is explained by the model.
+    - Calculating the Adjusted R-Squared values, which adjust the R-Squared based on the number of predictors in the
+    model, making it more reliable when comparing models with different numbers of features.
+    - Generating a summary table containing these values for each combination of dataset and model.
+
+    ### Signs of High Risk
+
+    - If the R-Squared values are significantly low, it indicates the model isn't explaining much of the variability in
+    the dataset.
+    - A significant difference between R-Squared and Adjusted R-Squared values might indicate that the model includes
+    irrelevant features.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a quantitative measure of model performance in terms of variance explained.
-    - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models…
+    - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models
+    with different numbers of features.
+    - Useful for time-series forecasting and regression tasks.
 
-    …
-    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
-    - The function relies on `adj_r2_score` from the `statsmodels.statsutils` module, which should be correctly implemented and imported.
-    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
+    ### Limitations
 
+    - Assumes the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
+    - Relies on `adj_r2_score` from the `statsmodels.statsutils` module, which needs to be correctly implemented and
+    imported.
+    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     """
     results_list = []
 
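The Test Mechanism above iterates over dataset-model pairs and collects both scores into a summary table. A rough sketch of that loop follows, assuming paired `datasets` and `models` inputs and the `y`, `y_pred()`, `feature_columns`, and `input_id` attributes mentioned in the docstring; it is not the package's implementation:

```python
import pandas as pd
from sklearn.metrics import r2_score


def r2_comparison_table(datasets, models):
    """Illustrative pairing of datasets and models into a comparison table."""
    rows = []
    for dataset, model in zip(datasets, models):
        y_true = dataset.y
        y_pred = dataset.y_pred(model)
        r2 = r2_score(y_true, y_pred)
        n, p = len(y_true), len(dataset.feature_columns)
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        rows.append(
            {
                "Model": model.input_id,
                "Dataset": dataset.input_id,
                "R-Squared": r2,
                "Adjusted R-Squared": adj_r2,
            }
        )
    return pd.DataFrame(rows)
```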
validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py

@@ -315,38 +315,42 @@ def robustness_diagnosis(
 
 @dataclass
 class RobustnessDiagnosis(ThresholdTest):
-    """
-    … (31 lines not shown)
+    """
+    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+    ### Purpose
+
+    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+    real-world scenarios where data may be imperfect or corrupted.
+
+    ### Test Mechanism
+
+    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+    deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+    - Adding Gaussian noise to numerical input features based on scaling factors.
+    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+    for regression tasks.
+    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+    ### Signs of High Risk
+
+    - A significant drop in performance metrics with minimal noise.
+    - Performance decay values exceeding the specified threshold.
+    - Consistent failure to meet performance standards across multiple perturbation scales.
+
+    ### Strengths
+
+    - Provides insights into the model's robustness against noisy or corrupted data.
+    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+    - Visualization helps in understanding the extent of performance degradation.
+
+    ### Limitations
+
+    - Gaussian noise might not adequately represent all types of real-world data perturbations.
+    - Performance thresholds are somewhat arbitrary and might need tuning.
+    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
     """
 
     name = "robustness"
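The Gaussian-noise perturbation that the `RobustnessDiagnosis` Test Mechanism describes can be sketched as follows. Scaling the noise by each feature's standard deviation is an assumption about the implementation, not a confirmed detail:

```python
import numpy as np
import pandas as pd


def add_gaussian_noise(features: pd.DataFrame, scale_factor: float, seed: int = 0) -> pd.DataFrame:
    """Perturb numeric columns with Gaussian noise proportional to each column's std (illustrative)."""
    rng = np.random.default_rng(seed)
    perturbed = features.copy()
    for col in perturbed.select_dtypes(include="number").columns:
        sigma = perturbed[col].std() * scale_factor
        perturbed[col] = perturbed[col] + rng.normal(0.0, sigma, size=len(perturbed))
    return perturbed
```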
validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py

@@ -22,13 +22,15 @@ class SHAPGlobalImportance(Metric):
     """
     Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.
 
-    …
+    ### Purpose
+
     The SHAP (SHapley Additive exPlanations) Global Importance metric aims to elucidate model outcomes by attributing
     them to the contributing features. It assigns a quantifiable global importance to each feature via their respective
     absolute Shapley values, thereby making it suitable for tasks like classification (both binary and multiclass).
     This metric forms an essential part of model risk management.
 
-    …
+    ### Test Mechanism
+
     The exam begins with the selection of a suitable explainer which aligns with the model's type. For tree-based
     models like XGBClassifier, RandomForestClassifier, CatBoostClassifier, TreeExplainer is used whereas for linear
     models like LogisticRegression, XGBRegressor, LinearRegression, it is the LinearExplainer. Once the explainer
@@ -44,20 +46,20 @@ class SHAPGlobalImportance(Metric):
     gradually changing from low to high. Features are systematically organized in accordance with their importance.
     These plots are generated by the function `_generate_shap_plot()`.
 
-    …
+    ### Signs of High Risk
 
     - Overemphasis on certain features in SHAP importance plots, thus hinting at the possibility of model overfitting
     - Anomalies such as unexpected or illogical features showing high importance, which might suggest that the model's
     decisions are rooted in incorrect or undesirable reasoning
     - A SHAP summary plot filled with high variability or scattered data points, indicating a cause for concern
 
-    …
+    ### Strengths
 
     - SHAP does more than just illustrating global feature significance, it offers a detailed perspective on how
     different features shape the model's decision-making logic for each instance.
     - It provides clear insights into model behavior.
 
-    …
+    ### Limitations
 
     - High-dimensional data can convolute interpretations.
     - Associating importance with tangible real-world impact still involves a certain degree of subjectivity.
|
|
20
20
|
@dataclass
|
21
21
|
class SilhouettePlot(Metric):
|
22
22
|
"""
|
23
|
-
Calculates and visualizes Silhouette Score, assessing degree of data point suitability to its cluster in ML
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
Silhouette Score
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
23
|
+
Calculates and visualizes Silhouette Score, assessing the degree of data point suitability to its cluster in ML
|
24
|
+
models.
|
25
|
+
|
26
|
+
### Purpose
|
27
|
+
|
28
|
+
This test calculates the Silhouette Score, which is a model performance metric used in clustering applications.
|
29
|
+
Primarily, the Silhouette Score evaluates how similar a data point is to its own cluster compared to other
|
30
|
+
clusters. The metric ranges between -1 and 1, where a high value indicates that the object is well matched to its
|
31
|
+
own cluster and poorly matched to neighboring clusters. Thus, the goal is to achieve a high Silhouette Score,
|
32
|
+
implying well-separated clusters.
|
33
|
+
|
34
|
+
### Test Mechanism
|
35
|
+
|
36
|
+
The test first extracts the true and predicted labels from the model's training data. The test runs the Silhouette
|
37
|
+
Score function, which takes as input the training dataset features and the predicted labels, subsequently
|
38
|
+
calculating the average score. This average Silhouette Score is printed for reference. The script then calculates
|
39
|
+
the silhouette coefficients for each data point, helping to form the Silhouette Plot. Each cluster is represented
|
40
|
+
in this plot, with color distinguishing between different clusters. A red dashed line indicates the average
|
41
|
+
Silhouette Score. The Silhouette Scores are also collected into a structured table, facilitating model performance
|
42
|
+
analysis and comparison.
|
43
|
+
|
44
|
+
### Signs of High Risk
|
45
|
+
|
40
46
|
- A low Silhouette Score, potentially indicating that the clusters are not well separated and that data points may
|
41
47
|
not be fitting well to their respective clusters.
|
42
48
|
- A Silhouette Plot displaying overlapping clusters or the absence of clear distinctions between clusters visually
|
43
49
|
also suggests poor clustering performance.
|
44
50
|
|
45
|
-
|
51
|
+
### Strengths
|
52
|
+
|
46
53
|
- The Silhouette Score provides a clear and quantitative measure of how well data points have been grouped into
|
47
54
|
clusters, offering insights into model performance.
|
48
55
|
- The Silhouette Plot provides an intuitive, graphical representation of the clustering mechanism, aiding visual
|
49
56
|
assessments of model performance.
|
50
57
|
- It does not require ground truth labels, so it's useful when true cluster assignments are not known.
|
51
58
|
|
52
|
-
|
59
|
+
### Limitations
|
60
|
+
|
53
61
|
- The Silhouette Score may be susceptible to the influence of outliers, which could impact its accuracy and
|
54
62
|
reliability.
|
55
63
|
- It assumes the clusters are convex and isotropic, which might not be the case with complex datasets.
|
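The average score and per-point coefficients referenced in the `SilhouettePlot` Test Mechanism come straight from scikit-learn; a minimal, self-contained example of those calls (on synthetic data, not the package's own plotting code):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

X, _ = make_blobs(n_samples=500, centers=4, random_state=0)
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)

avg_score = silhouette_score(X, labels)      # overall score in [-1, 1]
per_point = silhouette_samples(X, labels)    # one coefficient per data point, used for the plot
print(f"average silhouette: {avg_score:.3f}")
```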
validmind/tests/model_validation/sklearn/TrainingTestDegradation.py

@@ -32,33 +32,40 @@ class TrainingTestDegradation(ThresholdTest):
     """
     Tests if model performance degradation between training and test datasets exceeds a predefined threshold.
 
-    … (12 lines not shown)
+    ### Purpose
+
+    The `TrainingTestDegradation` class serves as a test to verify that the degradation in performance between the
+    training and test datasets does not exceed a predefined threshold. This test measures the model's ability to
+    generalize from its training data to unseen test data, assessing key classification metrics such as accuracy,
+    precision, recall, and f1 score to verify the model's robustness and reliability.
+
+    ### Test Mechanism
+
+    The code applies several predefined metrics, including accuracy, precision, recall, and f1 scores, to the model's
+    predictions for both the training and test datasets. It calculates the degradation as the difference between the
+    training score and test score divided by the training score. The test is considered successful if the degradation
+    for each metric is less than the preset maximum threshold of 10%. The results are summarized in a table showing
+    each metric's train score, test score, degradation percentage, and pass/fail status.
+
+    ### Signs of High Risk
+
     - A degradation percentage that exceeds the maximum allowed threshold of 10% for any of the evaluated metrics.
     - A high difference or gap between the metric scores on the training and the test datasets.
     - The 'Pass/Fail' column displaying 'Fail' for any of the evaluated metrics.
 
-    … (3 lines not shown)
+    ### Strengths
+
+    - Provides a quantitative measure of the model's ability to generalize to unseen data, which is key for predicting
+    its practical real-world performance.
     - By evaluating multiple metrics, it takes into account different facets of model performance and enables a more
     holistic evaluation.
     - The use of a variable predefined threshold allows the flexibility to adjust the acceptability criteria for
     different scenarios.
 
-    … (3 lines not shown)
+    ### Limitations
+
+    - The test compares raw performance on training and test data but does not factor in the nature of the data. Areas
+    with less representation in the training set might still perform poorly on unseen data.
     - It requires good coverage and balance in the test and training datasets to produce reliable results, which may
     not always be available.
     - The test is currently only designed for classification tasks.
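The degradation formula stated in the Test Mechanism above reduces to a one-liner; for example, a train score of 0.92 against a test score of 0.85 gives roughly 7.6% degradation and passes the default 10% threshold:

```python
def degradation(train_score: float, test_score: float, max_threshold: float = 0.10) -> dict:
    """Relative train-to-test degradation, as described above (illustrative)."""
    decay = (train_score - test_score) / train_score
    return {"degradation": decay, "passed": decay < max_threshold}


print(degradation(0.92, 0.85))  # {'degradation': 0.076..., 'passed': True}
```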
validmind/tests/model_validation/sklearn/VMeasure.py

@@ -14,42 +14,43 @@ class VMeasure(ClusterPerformance):
     """
     Evaluates homogeneity and completeness of a clustering model using the V Measure Score.
 
-    …
+    ### Purpose
+
     The purpose of this metric, V Measure Score (V Score), is to evaluate the performance of a clustering model. It
     measures the homogeneity and completeness of a set of cluster labels, where homogeneity refers to each cluster
     containing only members of a single class and completeness meaning all members of a given class are assigned to the
     same cluster.
 
-    … (2 lines not shown)
+    ### Test Mechanism
+
+    ClusterVMeasure is a class that inherits from another class, ClusterPerformance. It uses the `v_measure_score`
     function from the sklearn module's metrics package. The required inputs to perform this metric are the model, train
     dataset, and test dataset. The test is appropriate for models tasked with clustering.
 
-    …
+    ### Signs of High Risk
 
     - Low V Measure Score: A low V Measure Score indicates that the clustering model has poor homogeneity or
     completeness, or both. This might signal that the model is failing to correctly cluster the data.
 
-    …
+    ### Strengths
 
     - The V Measure Score is a harmonic mean between homogeneity and completeness. This ensures that both attributes
     are taken into account when evaluating the model, providing an overall measure of its cluster validity.
-
     - The metric does not require knowledge of the ground truth classes when measuring homogeneity and completeness,
     making it applicable in instances where such information is unavailable.
 
-    … (2 lines not shown)
-    - The V Score can be influenced by the number of clusters, which means that it might not always reflect the quality
-    of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
-    completeness, leading to a low V Score even if the clustering might be useful.
+    ### Limitations
 
+    - The V Measure Score can be influenced by the number of clusters, which means that it might not always reflect the
+    quality of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
+    completeness, leading to a low V Measure Score even if the clustering might be useful.
     - It assumes equal importance of homogeneity and completeness. In some applications, one may be more important than
-    the other. The V Score does not provide flexibility in assigning different weights to homogeneity and
+    the other. The V Measure Score does not provide flexibility in assigning different weights to homogeneity and
+    completeness.
     """
 
     name = "v_measure_score"
-    required_inputs = ["model", "…
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
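The V Measure Score described in the `VMeasure` docstring is scikit-learn's `v_measure_score`, which with its default beta of 1 is the harmonic mean of homogeneity and completeness. A small, self-contained check:

```python
from sklearn.metrics import completeness_score, homogeneity_score, v_measure_score

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

h = homogeneity_score(y_true, y_pred)
c = completeness_score(y_true, y_pred)
v = v_measure_score(y_true, y_pred)
# V Measure equals the harmonic mean of homogeneity and completeness (beta = 1)
assert abs(v - 2 * h * c / (h + c)) < 1e-9
```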