validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
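The most visible change in this release is the test API itself: the class-based `ThresholdTest`/`Metric` machinery under `validmind/vm_models/test/` is removed, and tests such as `MinimumF1Score`, `MinimumROCAUCScore`, `ModelsPerformanceComparison`, and `OverfitDiagnosis` are rewritten as plain functions decorated with `@tags`/`@tasks` that take `VMDataset`/`VMModel` inputs and return their tables, figures, and pass/fail result directly (see the hunks below). The sketch that follows only illustrates that shape; the test name, metric, and threshold are made up for illustration and are not the package's actual implementation of any test.

```python
from sklearn.metrics import accuracy_score

from validmind.tests import tags, tasks
from validmind.vm_models import VMDataset, VMModel


@tags("sklearn", "binary_classification", "model_performance")
@tasks("classification")
def MinimumAccuracyExample(dataset: VMDataset, model: VMModel, min_threshold: float = 0.7):
    """Hypothetical threshold test written in the new functional style."""
    # Inputs are resolved by the test runner; the function just computes and returns.
    score = accuracy_score(dataset.y, dataset.y_pred(model))

    # The return value replaces the old ResultSummary/cache_results plumbing:
    # a table (list of dicts) plus a pass/fail boolean.
    return [
        {
            "Score": score,
            "Threshold": min_threshold,
            "Pass/Fail": "Pass" if score > min_threshold else "Fail",
        }
    ], score > min_threshold
```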
validmind/tests/model_validation/sklearn/MinimumF1Score.py

```diff
@@ -2,24 +2,18 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-from numpy import unique
-from sklearn import metrics
-
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+import numpy as np
+from sklearn.metrics import f1_score
+
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel


-@dataclass
-class MinimumF1Score(ThresholdTest):
+@tags(
+    "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+)
+@tasks("classification", "text_classification")
+def MinimumF1Score(dataset: VMDataset, model: VMModel, min_threshold: float = 0.5):
     """
     Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced
     performance between precision and recall.
@@ -59,59 +53,15 @@ class MinimumF1Score(ThresholdTest):
     closely with specific requirements.
     """

-    ...
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The f1 score test returns results like these:
-        [{"values": {"score": 0.734375, "threshold": 0.7}, "passed": true}]
-        """
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(title="Minimum F1 Score Test"),
-                )
-            ]
-        )
-
-    def run(self):
-        y_true = self.inputs.dataset.y
-        class_pred = self.inputs.dataset.y_pred(self.inputs.model)
-        y_true = y_true.astype(class_pred.dtype)
-
-        if len(unique(y_true)) > 2:
-            f1_score = metrics.f1_score(y_true, class_pred, average="macro")
-        else:
-            f1_score = metrics.f1_score(y_true, class_pred)
-
-        passed = f1_score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": f1_score,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    if len(np.unique(dataset.y)) > 2:
+        score = f1_score(dataset.y, dataset.y_pred(model), average="macro")
+    else:
+        score = f1_score(dataset.y, dataset.y_pred(model))
+
+    return [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if score > min_threshold else "Fail",
+        }
+    ], score > min_threshold
```
validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py

```diff
@@ -2,24 +2,19 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-from typing import List
-
 import numpy as np
-import pandas as pd
-from sklearn import metrics
-
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+from sklearn.metrics import roc_auc_score
+from sklearn.preprocessing import LabelBinarizer
+
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel


-@dataclass
-class MinimumROCAUCScore(ThresholdTest):
+@tags(
+    "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+)
+@tasks("classification", "text_classification")
+def MinimumROCAUCScore(dataset: VMDataset, model: VMModel, min_threshold: float = 0.5):
     """
     Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.

@@ -61,69 +56,25 @@ class MinimumROCAUCScore(ThresholdTest):
     - The use of macro average for multiclass ROC AUC score implies equal weightage to each class, which might not be
     appropriate if the classes are imbalanced.
     """
+    y_true = dataset.y
+
+    if len(np.unique(y_true)) > 2:
+        lb = LabelBinarizer()
+        lb.fit(y_true)

-    ...
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-    ]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The roc auc score test returns results like these:
-        [{"values": {"score": 0.734375, "threshold": 0.7}, "passed": true}]
-        """
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(title="Minimum ROC AUC Score Test"),
-                )
-            ]
+        roc_auc = roc_auc_score(
+            y_true=lb.transform(y_true),
+            y_score=lb.transform(dataset.y_pred(model)),
+            average="macro",
         )

-        ...
-        if len(np.unique(y_true)) > 2:
-            class_pred = self.inputs.dataset.y_pred(self.inputs.model)
-            y_true = y_true.astype(class_pred.dtype)
-            roc_auc = self.multiclass_roc_auc_score(y_true, class_pred)
-        else:
-            y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-            y_true = y_true.astype(y_prob.dtype).flatten()
-            roc_auc = metrics.roc_auc_score(y_true, y_prob)
-
-        passed = roc_auc > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": roc_auc,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    else:
+        roc_auc = roc_auc_score(y_true=y_true, y_score=dataset.y_prob(model))
+
+    return [
+        {
+            "Score": roc_auc,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if roc_auc > min_threshold else "Fail",
+        }
+    ], roc_auc > min_threshold
```
validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py

```diff
@@ -2,19 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
-from numpy import unique
+import numpy as np
 from sklearn.metrics import classification_report

-from validmind.errors import SkipTestError
-from validmind.vm_models import ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel

-from .ClassifierPerformance import ClassifierPerformance
+from .ClassifierPerformance import multiclass_roc_auc_score


-@dataclass
-class ModelsPerformanceComparison(ClassifierPerformance):
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_performance",
+    "model_comparison",
+)
+@tasks("classification", "text_classification")
+def ModelsPerformanceComparison(dataset: VMDataset, models: list[VMModel]):
     """
     Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,
     precision, recall, and F1 score.
@@ -57,84 +62,49 @@ class ModelsPerformanceComparison(ClassifierPerformance):
     with unseen data or changes in the data distribution.
     - The ROC AUC score might not be as meaningful or easily interpretable for multilabel/multiclass tasks.
     """
+    y_true = dataset.y
+    classes = {str(i) for i in np.unique(y_true)}
+
+    prf_table = []
+    acc_roc_auc_table = []
+
+    for model in models:
+        y_pred = dataset.y_pred(model)

-    ...
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-        "model_comparison",
-    ]
-
-    def summary(self, metric_value: dict):
-        """
-        This summary varies depending if we're evaluating a binary or multi-class model
-        """
-        results = []
-        prf_table = []
-        classes = {str(i) for i in unique(self.inputs.dataset.y)}
+        report = classification_report(y_true, y_pred, output_dict=True)
+        report["roc_auc"] = multiclass_roc_auc_score(y_true, y_pred)

         for class_name in classes:
-            ...
-        avg_metrics = ["weighted avg", "macro avg"]
-        for class_name in avg_metrics:
-            avg_dict = {}
-            avg_dict["Class"] = class_name
-            for m, _ in metric_value.items():
-                avg_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
-                avg_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
-                avg_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
-            prf_table.append(avg_dict)
-        results.append(
-            ResultTable(
-                data=prf_table,
-                metadata=ResultTableMetadata(
-                    title="Precision, Recall, and F1 Comparison"
-                ),
+            prf_table.append(
+                {
+                    "Model": model.input_id,
+                    "Class": class_name,
+                    "Precision": report[class_name]["precision"],
+                    "Recall": report[class_name]["recall"],
+                    "F1-Score": report[class_name]["f1-score"],
+                }
             )
-            ...
-            acc_roc_auc_table.append(acc_roc_auc_dict)
-        results.append(
-            ResultTable(
-                data=acc_roc_auc_table,
-                metadata=ResultTableMetadata(title="Accuracy and ROC AUC Comparison"),
+        for avg_metric in ["weighted avg", "macro avg"]:
+            prf_table.append(
+                {
+                    "Model": model.input_id,
+                    "Class": avg_metric,
+                    "Precision": report[avg_metric]["precision"],
+                    "Recall": report[avg_metric]["recall"],
+                    "F1-Score": report[avg_metric]["f1-score"],
+                }
             )
-        )
-        return ResultSummary(results=results)
-
-    def run(self):
-        # Check models list is not empty
-        if not self.inputs.models:
-            raise SkipTestError(
-                "List of models must be provided as a `models` parameter to compare performance"
-            )
-
-        all_models = self.inputs.models

-        ...
+        for metric in ["accuracy", "roc_auc"]:
+            acc_roc_auc_table.append(
+                {
+                    "Model": model.input_id,
+                    "Metric": metric,
+                    "Value": report[metric],
+                }
+            )

-    ...
+    return {
+        "Precision, Recall, and F1 Comparison": prf_table,
+        "Accuracy and ROC AUC Comparison": acc_roc_auc_table,
+    }
```
validmind/tests/model_validation/sklearn/OverfitDiagnosis.py

```diff
@@ -2,7 +2,6 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
 from typing import List

 import matplotlib.pyplot as plt
@@ -11,17 +10,9 @@ import pandas as pd
 import seaborn as sns
 from sklearn import metrics

+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-    VMDataset,
-    VMModel,
-)
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)

@@ -173,56 +164,69 @@ def _plot_overfit_regions(
     return fig


-...
-def overfit_diagnosis(  # noqa: C901
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "linear_regression",
+    "model_diagnosis",
+)
+@tasks("classification", "regression")
+def OverfitDiagnosis(
     model: VMModel,
     datasets: List[VMDataset],
     metric: str = None,
     cut_off_threshold: float = DEFAULT_THRESHOLD,
 ):
-    """Identify overfit regions in a model's predictions.
-
-    This test compares the model's performance on training versus test data, grouped by
-    feature columns. It calculates the difference between the training and test performance
-    for each group and identifies regions where the difference exceeds a specified threshold.
-
-    ## Test Methodology
-
-    This test works for both classification and regression models and with a variety of
-    performance metrics. By default, it uses the AUC metric for classification models and
-    the MSE metric for regression models. The threshold for identifying overfit regions
-    defaults to 0.04 but should be adjusted based on the specific use case.
-
-    ## Inputs
-    - `model` (VMModel): The ValidMind model object to evaluate.
-    - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
-    is the training data and the second dataset is the test data.
-
-    ## Parameters
-    - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
-    'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
-    Defaults to 'auc' for classification models and 'mse' for regression models.
-    - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
-    Defaults to 0.04.
     """
+    Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
+    testing sets deviates significantly.
+
+    ### Purpose
+
+    The Overfit Diagnosis test aims to identify areas in a model's predictions where there is a significant difference
+    in performance between the training and testing sets. This test helps to pinpoint specific regions or feature
+    segments where the model may be overfitting.
+
+    ### Test Mechanism
+
+    This test compares the model's performance on training versus test data, grouped by feature columns. It calculates
+    the difference between the training and test performance for each group and identifies regions where this
+    difference exceeds a specified threshold:
+
+    - The test works for both classification and regression models.
+    - It defaults to using the AUC metric for classification models and the MSE metric for regression models.
+    - The threshold for identifying overfitting regions is set to 0.04 by default.
+    - The test calculates the performance metrics for each feature segment and plots regions where the performance gap
+    exceeds the threshold.
+
+    ### Signs of High Risk

+    - Significant gaps between training and test performance metrics for specific feature segments.
+    - Multiple regions with performance gaps exceeding the defined threshold.
+    - Higher than expected differences in predicted versus actual values in the test set compared to the training set.
+
+    ### Strengths
+
+    - Identifies specific areas where overfitting occurs.
+    - Supports multiple performance metrics, providing flexibility.
+    - Applicable to both classification and regression models.
+    - Visualization of overfitting segments aids in better understanding and debugging.
+
+    ### Limitations
+
+    - The default threshold may not be suitable for all use cases and requires tuning.
+    - May not capture more subtle forms of overfitting that do not exceed the threshold.
+    - Assumes that the binning of features adequately represents the data segments.
+    """
     is_classification = bool(datasets[0].probability_column(model))

-    # Set default metric if not provided
     if not metric:
         metric = (
             DEFAULT_CLASSIFICATION_METRIC
             if is_classification
             else DEFAULT_REGRESSION_METRIC
         )
-        logger.info(
-            f"Using default {'classification' if is_classification else 'regression'} metric: {metric}"
-        )
-
-    if id(cut_off_threshold) == id(DEFAULT_THRESHOLD):
-        logger.info("Using default cut-off threshold of 0.04")

     train_df = datasets[0].df
     test_df = datasets[1].df
@@ -279,18 +283,8 @@ def overfit_diagnosis( # noqa: C901
         )

         results = _prepare_results(results_train, results_test, metric)
-
-        fig = _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
         test_figures.append(
-            Figure(
-                key=f"overfit_diagnosis:{metric}:{feature_column}",
-                figure=fig,
-                metadata={
-                    "metric": metric,
-                    "cut_off_threshold": cut_off_threshold,
-                    "feature": feature_column,
-                },
-            )
+            _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
         )

         for _, row in results[results["gap"] > cut_off_threshold].iterrows():
@@ -306,91 +300,3 @@ def overfit_diagnosis( # noqa: C901
         )

     return {"Overfit Diagnosis": test_results}, *test_figures
-
-
-@dataclass
-class OverfitDiagnosis(ThresholdTest):
-    """
-    Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
-    testing sets deviates significantly.
-    ...
-    """
-
-    required_inputs = ["model", "datasets"]
-    default_params = {"metric": None, "cut_off_threshold": DEFAULT_THRESHOLD}
-    tasks = ["classification", "regression"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "linear_regression",
-        "model_diagnosis",
-    ]
-
-    def run(self):
-        func_result = overfit_diagnosis(
-            self.inputs.model,
-            self.inputs.datasets,
-            metric=self.params["metric"],
-            cut_off_threshold=self.params["cut_off_threshold"],
-        )
-
-        return self.cache_results(
-            test_results_list=[
-                ThresholdTestResult(
-                    test_name=self.params["metric"],
-                    column=row["Feature"],
-                    passed=False,
-                    values={k: v for k, v in row.items()},
-                )
-                for row in func_result[0]["Overfit Diagnosis"]
-            ],
-            passed=(not func_result[0]["Overfit Diagnosis"]),
-            figures=func_result[1:],
-        )
-
-    def summary(self, results, _):
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=[result.values for result in results],
-                    metadata=ResultTableMetadata(title="Overfit Diagnosis"),
-                )
-            ],
-        )
```