validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/OverfitDiagnosis.py

```diff
@@ -187,6 +187,8 @@ def overfit_diagnosis( # noqa: C901
     feature columns. It calculates the difference between the training and test performance
     for each group and identifies regions where the difference exceeds a specified threshold.
 
+    ## Test Methodology
+
     This test works for both classification and regression models and with a variety of
     performance metrics. By default, it uses the AUC metric for classification models and
     the MSE metric for regression models. The threshold for identifying overfit regions
@@ -308,28 +310,46 @@ def overfit_diagnosis( # noqa: C901
 
 @dataclass
 class OverfitDiagnosis(ThresholdTest):
-    """
+    """
+    Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
+    testing sets deviates significantly.
 
-
-    feature columns. It calculates the difference between the training and test performance
-    for each group and identifies regions where the difference exceeds a specified threshold.
+    ### Purpose
 
-    This test works for both classification and regression models and with a variety of
-    performance metrics. By default, it uses the AUC metric for classification models and
-    the MSE metric for regression models. The threshold for identifying overfit regions
-    defaults to 0.04 but should be adjusted based on the specific use case.
+    The Overfit Diagnosis test aims to identify areas in a model's predictions where there is a significant difference
+    in performance between the training and testing sets. This test helps to pinpoint specific regions or feature
+    segments where the model may be overfitting.
 
-
-    - `model` (VMModel): The ValidMind model object to evaluate.
-    - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
-      is the training data and the second dataset is the test data.
+    ### Test Mechanism
 
-    [... remaining lines of the original docstring were not captured in this diff view ...]
+    This test compares the model's performance on training versus test data, grouped by feature columns. It calculates
+    the difference between the training and test performance for each group and identifies regions where this
+    difference exceeds a specified threshold:
+
+    - The test works for both classification and regression models.
+    - It defaults to using the AUC metric for classification models and the MSE metric for regression models.
+    - The threshold for identifying overfitting regions is set to 0.04 by default.
+    - The test calculates the performance metrics for each feature segment and plots regions where the performance gap
+      exceeds the threshold.
+
+    ### Signs of High Risk
+
+    - Significant gaps between training and test performance metrics for specific feature segments.
+    - Multiple regions with performance gaps exceeding the defined threshold.
+    - Higher than expected differences in predicted versus actual values in the test set compared to the training set.
+
+    ### Strengths
+
+    - Identifies specific areas where overfitting occurs.
+    - Supports multiple performance metrics, providing flexibility.
+    - Applicable to both classification and regression models.
+    - Visualization of overfitting segments aids in better understanding and debugging.
+
+    ### Limitations
+
+    - The default threshold may not be suitable for all use cases and requires tuning.
+    - May not capture more subtle forms of overfitting that do not exceed the threshold.
+    - Assumes that the binning of features adequately represents the data segments.
     """
 
     required_inputs = ["model", "datasets"]
```
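For readers who want to see what the segment-level gap check described above looks like in practice, here is a minimal, self-contained sketch (not the package's implementation): it bins a single feature on training quantiles, computes train and test AUC per bin, and flags bins whose gap exceeds the 0.04 default threshold. The synthetic data, bin count, and column names are illustrative assumptions.

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = pd.DataFrame({"x1": rng.normal(size=2000), "x2": rng.normal(size=2000)})
y = (X["x1"] + 0.5 * rng.normal(size=2000) > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = LogisticRegression().fit(X_train, y_train)

# Bin one feature on training quantiles and compare train vs. test AUC per bin.
edges = np.quantile(X_train["x1"], np.linspace(0, 1, 11))
threshold = 0.04  # default gap threshold mentioned in the docstring

rows = []
for lo, hi in zip(edges[:-1], edges[1:]):
    tr = (X_train["x1"] >= lo) & (X_train["x1"] < hi)
    te = (X_test["x1"] >= lo) & (X_test["x1"] < hi)
    if y_train[tr].nunique() < 2 or y_test[te].nunique() < 2:
        continue  # AUC is undefined for single-class segments
    auc_tr = roc_auc_score(y_train[tr], model.predict_proba(X_train[tr])[:, 1])
    auc_te = roc_auc_score(y_test[te], model.predict_proba(X_test[te])[:, 1])
    gap = auc_tr - auc_te
    rows.append({"bin": f"[{lo:.2f}, {hi:.2f})", "gap": round(gap, 4), "flagged": abs(gap) > threshold})

print(pd.DataFrame(rows))
```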
validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py

```diff
@@ -20,34 +20,40 @@ class PermutationFeatureImportance(Metric):
     Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
     values are randomly rearranged.
 
-
-    feature used by the Machine Learning model. The significance is measured by evaluating the decrease in the model's
-    performance when the feature's values are randomly arranged.
+    ### Purpose
 
-    [... original docstring lines not captured in this diff view ...]
+    The Permutation Feature Importance (PFI) metric aims to assess the importance of each feature used by the Machine
+    Learning model. The significance is measured by evaluating the decrease in the model's performance when the
+    feature's values are randomly arranged.
+
+    ### Test Mechanism
+
+    PFI is calculated via the `permutation_importance` method from the `sklearn.inspection` module. This method
+    shuffles the columns of the feature dataset and measures the impact on the model's performance. A significant
+    decrease in performance after permutating a feature's values deems the feature as important. On the other hand, if
+    performance remains the same, the feature is likely not important. The output of the PFI metric is a figure
+    illustrating the importance of each feature.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
     - The model heavily relies on a feature with highly variable or easily permutable values, indicating instability.
-    - A feature […]
-    [... remaining original docstring lines not captured in this diff view ...]
+    - A feature deemed unimportant by the model but expected to have a significant effect on the outcome based on
+      domain knowledge is not influencing the model's predictions.
+
+    ### Strengths
+
+    - Provides insights into the importance of different features and may reveal underlying data structure.
+    - Can indicate overfitting if a particular feature or set of features overly impacts the model's predictions.
+    - Model-agnostic and can be used with any classifier that provides a measure of prediction accuracy before and
+      after feature permutation.
+
+    ### Limitations
+
+    - Does not imply causality; it only presents the amount of information that a feature provides for the prediction
+      task.
+    - Does not account for interactions between features. If features are correlated, the permutation importance may
+      allocate importance to one and not the other.
+    - Cannot interact with certain libraries like statsmodels, pytorch, catboost, etc., thus limiting its applicability.
     """
 
     name = "pfi"
```
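Since the updated docstring names `sklearn.inspection.permutation_importance` as the underlying mechanism, a stripped-down illustration of that call is shown below. The actual test renders the importances as a figure; this sketch simply prints them, and the model and data are placeholders.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=6, n_informative=3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Shuffle each feature column on the held-out set and measure the drop in score.
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=0)

for i in np.argsort(result.importances_mean)[::-1]:
    print(f"feature_{i}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")
```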
validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py

```diff
@@ -23,17 +23,19 @@ logger = get_logger(__name__)
 @dataclass
 class PopulationStabilityIndex(Metric):
     """
-
+    Assesses the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across
     different datasets.
 
-
+    ### Purpose
+
     The Population Stability Index (PSI) serves as a quantitative assessment for evaluating the stability of a machine
     learning model's output distributions when comparing two different datasets. Typically, these would be a
     development and a validation dataset or two datasets collected at different periods. The PSI provides a measurable
     indication of any significant shift in the model's performance over time or noticeable changes in the
     characteristics of the population the model is making predictions for.
 
-
+    ### Test Mechanism
+
     The implementation of the PSI in this script involves calculating the PSI for each feature between the training and
     test datasets. Data from both datasets is sorted and placed into either a predetermined number of bins or
     quantiles. The boundaries for these bins are initially determined based on the distribution of the training data.
@@ -42,14 +44,14 @@ class PopulationStabilityIndex(Metric):
     in the training and test datasets. The PSI, along with the proportions of data in each bin for both datasets, are
     displayed in a summary table, a grouped bar chart, and a scatter plot.
 
-
+    ### Signs of High Risk
 
     - A high PSI value is a clear indicator of high risk. Such a value suggests a significant shift in the model
       predictions or severe changes in the characteristics of the underlying population.
     - This ultimately suggests that the model may not be performing as well as expected and that it may be less
       reliable for making future predictions.
 
-
+    ### Strengths
 
     - The PSI provides a quantitative measure of the stability of a model over time or across different samples, making
       it an invaluable tool for evaluating changes in a model's performance.
@@ -58,7 +60,7 @@ class PopulationStabilityIndex(Metric):
     - The use of visual aids such as tables and charts further simplifies the comprehension and interpretation of the
       PSI.
 
-
+    ### Limitations
 
     - The PSI test does not account for the interdependence between features: features that are dependent on one
       another may show similar shifts in their distributions, which in turn may result in similar PSI values.
```
validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py

```diff
@@ -18,34 +18,41 @@ class PrecisionRecallCurve(Metric):
     """
     Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.
 
-
-
+    ### Purpose
+
+    The Precision Recall Curve metric is intended to evaluate the trade-off between precision and recall in
+    classification models, particularly binary classification models. It assesses the model's capacity to produce
     accurate results (high precision), as well as its ability to capture a majority of all positive instances (high
     recall).
 
-
-
-
-
-
+    ### Test Mechanism
+
+    The test extracts ground truth labels and prediction probabilities from the model's test dataset. It applies the
+    `precision_recall_curve` method from the sklearn metrics module to these extracted labels and predictions, which
+    computes a precision-recall pair for each possible threshold. This calculation results in an array of precision and
+    recall scores that can be plotted against each other to form the Precision-Recall Curve. This curve is then
+    visually represented by using Plotly's scatter plot.
+
+    ### Signs of High Risk
 
-
-
-    * This corresponds to a model yielding a high amount of false positives (low precision) and/or false negatives (low
+    - A lower area under the Precision-Recall Curve signifies high risk.
+    - This corresponds to a model yielding a high amount of false positives (low precision) and/or false negatives (low
       recall).
-
+    - If the curve is closer to the bottom left of the plot, rather than being closer to the top right corner, it can
       be a sign of high risk.
 
-
-
+    ### Strengths
+
+    - This metric aptly represents the balance between precision (minimizing false positives) and recall (minimizing
      false negatives), which is especially critical in scenarios where both values are significant.
-
+    - Through the graphic representation, it enables an intuitive understanding of the model's performance across
      different threshold levels.
 
-
-
+    ### Limitations
+
+    - This metric is only applicable to binary classification models - it raises errors for multiclass classification
      models or Foundation models.
-
+    - It may not fully represent the overall accuracy of the model if the cost of false positives and false negatives
      are extremely different, or if the dataset is heavily imbalanced.
     """
 
```
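A compact sketch of the mechanism described above, assuming an sklearn-style classifier with `predict_proba` and using Plotly for the plot as the docstring mentions; the dataset and model here are stand-ins.

```python
import plotly.graph_objects as go
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, weights=[0.8], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression().fit(X_train, y_train)

# One precision/recall pair per candidate decision threshold.
precision, recall, _ = precision_recall_curve(y_test, model.predict_proba(X_test)[:, 1])

fig = go.Figure(go.Scatter(x=recall, y=precision, mode="lines", name="PR curve"))
fig.update_layout(xaxis_title="Recall", yaxis_title="Precision")
fig.show()
```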
validmind/tests/model_validation/sklearn/ROCCurve.py

```diff
@@ -19,7 +19,8 @@ class ROCCurve(Metric):
     Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic
     (ROC) curve and calculating the Area Under Curve (AUC) score.
 
-
+    ### Purpose
+
     The Receiver Operating Characteristic (ROC) curve is designed to evaluate the performance of binary classification
     models. This curve illustrates the balance between the True Positive Rate (TPR) and False Positive Rate (FPR)
     across various threshold levels. In combination with the Area Under the Curve (AUC), the ROC curve aims to measure
@@ -27,28 +28,32 @@ class ROCCurve(Metric):
     default vs non-default). Ideally, a higher AUC score signifies superior model performance in accurately
     distinguishing between the positive and negative classes.
 
-
+    ### Test Mechanism
+
     First, this script selects the target model and datasets that require binary classification. It then calculates the
     predicted probabilities for the test set, and uses this data, along with the true outcomes, to generate and plot
-    the ROC curve. Additionally, it […]
+    the ROC curve. Additionally, it includes a line signifying randomness (AUC of 0.5). The AUC score for the model's
     ROC curve is also computed, presenting a numerical estimation of the model's performance. If any Infinite values
     are detected in the ROC threshold, these are effectively eliminated. The resulting ROC curve, AUC score, and
     thresholds are consequently saved for future reference.
 
-
+    ### Signs of High Risk
+
     - A high risk is potentially linked to the model's performance if the AUC score drops below or nears 0.5.
     - Another warning sign would be the ROC curve lying closer to the line of randomness, indicating no discriminative
       ability.
     - For the model to be deemed competent at its classification tasks, it is crucial that the AUC score is
       significantly above 0.5.
 
-
-
+    ### Strengths
+
+    - The ROC Curve offers an inclusive visual depiction of a model's discriminative power throughout all conceivable
      classification thresholds, unlike other metrics that solely disclose model performance at one fixed threshold.
     - Despite the proportions of the dataset, the AUC Score, which represents the entire ROC curve as a single data
       point, continues to be consistent, proving to be the ideal choice for such situations.
 
-
+    ### Limitations
+
     - The primary limitation is that this test is exclusively structured for binary classification tasks, thus limiting
       its application towards other model types.
     - Furthermore, its performance might be subpar with models that output probabilities highly skewed towards 0 or 1.
```
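And the equivalent sketch for the ROC curve test, including the removal of non-finite thresholds the docstring refers to (recent scikit-learn versions prepend an infinite threshold to the array returned by `roc_curve`); the data and model are again placeholders.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression().fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Drop any non-finite thresholds before plotting or caching the curve.
finite = np.isfinite(thresholds)
fpr, tpr, thresholds = fpr[finite], tpr[finite], thresholds[finite]

print(f"AUC: {roc_auc_score(y_test, y_prob):.3f}")
```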
validmind/tests/model_validation/sklearn/RegressionErrors.py

```diff
@@ -2,141 +2,85 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
+import pandas as pd
 from sklearn import metrics
 
-from validmind […]
+from validmind import tags, tasks
 
 
-@dataclass
-class RegressionErrors(Metric):
+@tags("sklearn", "model_performance")
+@tasks("regression", "classification")
+def RegressionErrors(model, dataset):
     """
-    [... original docstring not captured in this diff view ...]
+    Assesses the performance and error distribution of a regression model using various error metrics.
+
+    ### Purpose
+
+    The purpose of the Regression Errors test is to measure the performance of a regression model by calculating
+    several error metrics. This evaluation helps determine the model's accuracy and potential issues like overfitting
+    or bias by analyzing differences in error metrics between the training and testing datasets.
+
+    ### Test Mechanism
+
+    The test computes the following error metrics:
+    - **Mean Absolute Error (MAE)**: Average of the absolute differences between true values and predicted values.
+    - **Mean Squared Error (MSE)**: Average of the squared differences between true values and predicted values.
+    - **Root Mean Squared Error (RMSE)**: Square root of the mean squared error.
+    - **Mean Absolute Percentage Error (MAPE)**: Average of the absolute differences between true values and predicted
+      values, divided by the true values, and expressed as a percentage.
+    - **Mean Bias Deviation (MBD)**: Average bias between true values and predicted values.
+
+    These metrics are calculated separately for the training and testing datasets and compared to identify
+    discrepancies.
+
+    ### Signs of High Risk
+
+    - High values for MAE, MSE, RMSE, or MAPE indicating poor model performance.
+    - Large differences in error metrics between the training and testing datasets, suggesting overfitting.
+    - Significant deviation of MBD from zero, indicating systematic bias in model predictions.
+
+    ### Strengths
+
+    - Provides a comprehensive overview of model performance through multiple error metrics.
+    - Individual metrics offer specific insights, e.g., MAE for interpretability, MSE for emphasizing larger errors.
+    - RMSE is useful for being in the same unit as the target variable.
+    - MAPE allows the error to be expressed as a percentage.
+    - MBD detects systematic bias in model predictions.
+
+    ### Limitations
+
+    - MAE and MSE are sensitive to outliers.
+    - RMSE heavily penalizes larger errors, which might not always be desirable.
+    - MAPE can be misleading when actual values are near zero.
+    - MBD may not be suitable if bias varies with the magnitude of actual values.
+    - These metrics may not capture all nuances of model performance and should be interpreted with domain-specific
+      context.
     """
 
-    [... original class attributes and metadata not captured in this diff view ...]
-    def regression_errors(
-        self, y_true_train, class_pred_train, y_true_test, class_pred_test
-    ):
-        mae_train = metrics.mean_absolute_error(y_true_train, class_pred_train)
-        mae_test = metrics.mean_absolute_error(y_true_test, class_pred_test)
-
-        results = []
-        results.append(
-            {
-                "Mean Absolute Error (MAE)": {
-                    "train": mae_train,
-                    "test": mae_test,
-                }
-            }
-        )
-
-        mse_train = metrics.mean_squared_error(y_true_train, class_pred_train)
-        mse_test = metrics.mean_squared_error(y_true_test, class_pred_test)
-        results.append(
-            {
-                "Mean Squared Error (MSE)": {
-                    "train": mse_train,
-                    "test": mse_test,
-                }
-            }
-        )
-        results.append(
-            {
-                "Root Mean Squared Error (RMSE)": {
-                    "train": np.sqrt(mse_train),
-                    "test": np.sqrt(mse_test),
-                }
-            }
-        )
-
-        mape_train = (
-            np.mean(np.abs((y_true_train - class_pred_train) / y_true_train)) * 100
-        )
-        mape_test = np.mean(np.abs((y_true_test - class_pred_test) / y_true_test)) * 100
-        results.append(
-            {
-                "Mean Absolute Percentage Error (MAPE)": {
-                    "train": mape_train,
-                    "test": mape_test,
-                }
-            }
-        )
-
-        mbd_train = np.mean(class_pred_train - y_true_train)
-        mbd_test = np.mean(class_pred_test - y_true_test)
-        results.append(
-            {
-                "Mean Bias Deviation (MBD)": {
-                    "train": mbd_train,
-                    "test": mbd_test,
-                }
-            }
-        )
-        return results
-
-    def run(self):
-        y_train_true = self.inputs.datasets[0].y
-        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        y_train_true = y_train_true.astype(y_train_pred.dtype)
-
-        y_test_true = self.inputs.datasets[1].y
-        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        y_test_true = y_test_true.astype(y_test_pred.dtype)
-
-        results = self.regression_errors(
-            y_train_true, y_train_pred, y_test_true, y_test_pred
-        )
-
-        return self.cache_results(metric_value=results)
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    y_true = y_true.astype(y_pred.dtype)
+
+    return _regression_errors(y_true, y_pred)
+
+
+def _regression_errors(y_true, y_pred):
+    mae_train = metrics.mean_absolute_error(y_true, y_pred)
+    mse_train = metrics.mean_squared_error(y_true, y_pred)
+    rmse_train = np.sqrt(mse_train)
+    mape_train = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
+    mbd_train = np.mean(y_pred - y_true)
+
+    # Create dataframe with one row and each error metric as a column
+    results_df = pd.DataFrame(
+        {
+            "Mean Absolute Error (MAE)": [mae_train],
+            "Mean Squared Error (MSE)": [mse_train],
+            "Root Mean Squared Error (RMSE)": [rmse_train],
+            "Mean Absolute Percentage Error (MAPE)": [mape_train],
+            "Mean Bias Deviation (MBD)": [mbd_train],
+        }
+    )
+
+    return results_df
```
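Outside of the ValidMind test harness, the refactored `_regression_errors` helper boils down to the following standalone calculation; the synthetic data below is illustrative, with a deliberate upward bias so the MBD term is visibly non-zero.

```python
import numpy as np
from sklearn import metrics

rng = np.random.default_rng(0)
y_true = rng.uniform(10, 100, size=500)
y_pred = y_true + rng.normal(2.0, 5.0, size=500)  # noisy and biased upward

mae = metrics.mean_absolute_error(y_true, y_pred)
mse = metrics.mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
mbd = np.mean(y_pred - y_true)  # positive value -> systematic overprediction

print(f"MAE={mae:.2f}  MSE={mse:.2f}  RMSE={rmse:.2f}  MAPE={mape:.2f}%  MBD={mbd:.2f}")
```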
validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py

```diff
@@ -16,25 +16,40 @@ logger = get_logger(__name__)
 @tasks("regression", "time_series_forecasting")
 def RegressionErrorsComparison(datasets, models):
     """
-
-
+    Assesses multiple regression error metrics to compare model performance across different datasets, emphasizing
+    systematic overestimation or underestimation and large percentage errors.
 
-
+    ### Purpose
 
-
+    The purpose of this test is to compare regression errors for different models applied to various datasets. It aims
+    to examine model performance using multiple error metrics, thereby identifying areas where models may be
+    underperforming or exhibiting bias.
 
-
-
-
-
+    ### Test Mechanism
+
+    The function iterates through each dataset-model pair and calculates various error metrics, including Mean Absolute
+    Error (MAE), Mean Squared Error (MSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD). The
+    results are summarized in a table, which provides a comprehensive view of each model's performance on the datasets.
+
+    ### Signs of High Risk
+
+    - High Mean Absolute Error (MAE) or Mean Squared Error (MSE), indicating poor model performance.
+    - High Mean Absolute Percentage Error (MAPE), suggesting large percentage errors, especially problematic if the
+      true values are small.
+    - Mean Bias Deviation (MBD) significantly different from zero, indicating systematic overestimation or
+      underestimation by the model.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides multiple error metrics to assess model performance from different perspectives.
     - Includes a check to avoid division by zero when calculating MAPE.
 
-
-
-    - […]
+    ### Limitations
+
+    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns`
+      attributes.
+    - Relies on the `logger` from `validmind.logging` to warn about zero values in `y_true`, which should be correctly
+      implemented and imported.
     - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     """
     results_list = []
```