validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +80 -119
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +14 -15
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/JarqueBera.py +70 -0
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LJungBox.py +66 -0
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/RunsTest.py +72 -0
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +42 -40
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +39 -36
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +38 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +143 -158
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- validmind-2.5.18.dist-info/RECORD +324 -0
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
- validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
- validmind-2.5.8.dist-info/RECORD +0 -318
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py

@@ -27,21 +27,23 @@ class WeakspotsDiagnosis(ThresholdTest):
     Identifies and visualizes weak spots in a machine learning model's performance across various sections of the
     feature space.

-
+    ### Purpose
+
     The weak spots test is applied to evaluate the performance of a machine learning model within specific regions of
     its feature space. This test slices the feature space into various sections, evaluating the model's outputs within
     each section against specific performance metrics (e.g., accuracy, precision, recall, and F1 scores). The ultimate
     aim is to identify areas where the model's performance falls below the set thresholds, thereby exposing its
     possible weaknesses and limitations.

-
+    ### Test Mechanism
+
     The test mechanism adopts an approach of dividing the feature space of the training dataset into numerous bins. The
     model's performance metrics (accuracy, precision, recall, F1 scores) are then computed for each bin on both the
     training and test datasets. A "weak spot" is identified if any of the performance metrics fall below a
     predetermined threshold for a particular bin on the test dataset. The test results are visually plotted as bar
     charts for each performance metric, indicating the bins which fail to meet the established threshold.

-
+    ### Signs of High Risk

     - Any performance metric of the model dropping below the set thresholds.
     - Significant disparity in performance between the training and test datasets within a bin could be an indication
@@ -49,7 +51,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - Regions or slices with consistently low performance metrics. Such instances could mean that the model struggles
     to handle specific types of input data adequately, resulting in potentially inaccurate predictions.

-
+    ### Strengths

     - The test helps pinpoint precise regions of the feature space where the model's performance is below par, allowing
     for more targeted improvements to the model.
@@ -58,7 +60,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - The test exhibits flexibility, letting users set different thresholds for various performance metrics according
     to the specific requirements of the application.

-
+    ### Limitations

     - The binning system utilized for the feature space in the test could over-simplify the model's behavior within
     each bin. The granularity of this slicing depends on the chosen 'bins' parameter and can sometimes be arbitrary.
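For orientation, the per-bin slicing described under Test Mechanism can be sketched roughly as follows. This is a minimal, single-metric illustration, not the library's implementation; the function name, column names, bin count, and accuracy threshold are assumptions.

```python
import pandas as pd
from sklearn.metrics import accuracy_score

def find_weak_spots(df, feature, y_true_col, y_pred_col, bins=10, threshold=0.75):
    """Flag bins of a single feature where accuracy drops below a threshold."""
    # Slice the feature into equal-width bins.
    binned = df.assign(bin=pd.cut(df[feature], bins=bins))
    rows = []
    for interval, group in binned.groupby("bin", observed=True):
        if group.empty:
            continue
        acc = accuracy_score(group[y_true_col], group[y_pred_col])
        rows.append({"bin": str(interval), "accuracy": acc, "weak_spot": acc < threshold})
    return pd.DataFrame(rows)
```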
validmind/tests/model_validation/statsmodels/AutoARIMA.py

@@ -15,13 +15,16 @@ class AutoARIMA(Metric):
     """
     Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.

-
-
-
-    models
-
+    ### Purpose
+
+    The AutoARIMA validation test is designed to evaluate and rank AutoRegressive Integrated Moving Average (ARIMA)
+    models. These models are primarily used for forecasting time-series data. The validation test automatically fits
+    multiple ARIMA models, with varying parameters, to every variable within the given dataset. The models are then
+    ranked based on their Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC) values, which
+    provide a basis for the efficient model selection process.
+
+    ### Test Mechanism

-    **Test Mechanism**:
     This metric proceeds by generating an array of feasible combinations of ARIMA model parameters which are within a
     prescribed limit. These limits include `max_p`, `max_d`, `max_q`; they represent the autoregressive, differencing,
     and moving average components respectively. Upon applying these sets of parameters, the validation test fits each
@@ -31,28 +34,31 @@ class AutoARIMA(Metric):
     found to be non-stationary, a warning message is sent out, given that ARIMA models necessitate input series to be
     stationary.

-
-
+    ### Signs of High Risk
+
+    - If the p-value of the Augmented Dickey-Fuller test for a variable exceeds 0.05, a warning is logged. This warning
     indicates that the series might not be stationary, leading to potentially inaccurate results.
-
+    - Consistent failure in fitting ARIMA models (as made evident through logged errors) might disclose issues with
     either the data or model stability.

-
-
+    ### Strengths
+
+    - The AutoARIMA validation test simplifies the often complex task of selecting the most suitable ARIMA model based
     on BIC and AIC criteria.
-
+    - The mechanism incorporates a check for non-stationarity within the data, which is a critical prerequisite for
     ARIMA models.
-
+    - The exhaustive search through all possible combinations of model parameters enhances the likelihood of
     identifying the best-fit model.

-
-
+    ### Limitations
+
+    - This validation test can be computationally costly as it involves creating and fitting multiple ARIMA models for
     every variable.
-
+    - Although the test checks for non-stationarity and logs warnings where present, it does not apply any
     transformations to the data to establish stationarity.
-
+    - The selection of models leans solely on BIC and AIC criteria, which may not yield the best predictive model in
     all scenarios.
-
+    - The test is only applicable to regression tasks involving time-series data, and may not work effectively for
     other types of machine learning tasks.
     """

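The parameter sweep described under Test Mechanism amounts to fitting every (p, d, q) combination within the configured limits and ranking the fits by their information criteria. A rough standalone sketch, assuming `statsmodels` and illustrative defaults; the function name and limits are assumptions, not the package's own code.

```python
import itertools
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

def rank_arima_orders(series, max_p=3, max_d=2, max_q=3):
    """Fit ARIMA(p, d, q) over a small grid and rank the candidates by BIC, then AIC."""
    rows = []
    for p, d, q in itertools.product(range(max_p + 1), range(max_d + 1), range(max_q + 1)):
        try:
            fit = ARIMA(series, order=(p, d, q)).fit()
            rows.append({"order": (p, d, q), "aic": fit.aic, "bic": fit.bic})
        except Exception:
            continue  # some parameter combinations fail to converge; skip them
    return pd.DataFrame(rows).sort_values(["bic", "aic"]).reset_index(drop=True)
```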
validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py

@@ -2,138 +2,107 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import numpy as np
 import plotly.graph_objects as go
 from matplotlib import cm

-from validmind
+from validmind import tags, tasks


-@
-
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
     """
     Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
     regression models.

-
-
-
-
+    ### Purpose
+
+    This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
+    in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+    probabilities for positive and negative classes across both the training and test datasets.
+
+    ### Test Mechanism
+
+    The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+    the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
+    for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
+    distributions of these probabilities are created for both positive and negative classes across both training and
+    test datasets. These cumulative probabilities are represented visually in a plot, containing two subplots - one for
+    the training data and the other for the test data, with lines representing cumulative distributions of positive and
+    negative classes.

-
-    each instance in both the training and test datasets, which are then added as a new column in these sets. The
-    cumulative probabilities for positive and negative classes are subsequently calculated and sorted in ascending
-    order. Cumulative distributions of these probabilities are created for both positive and negative classes across
-    both training and test datasets. These cumulative probabilities are represented visually in a plot, containing two
-    subplots - one for the training data and the other for the test data, with lines representing cumulative
-    distributions of positive and negative classes.
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - Imbalanced distribution of probabilities for either positive or negative classes.
     - Notable discrepancies or significant differences between the cumulative probability distributions for the
     training data versus the test data.
     - Marked discrepancies or large differences between the cumulative probability distributions for positive and
     negative classes.

-
-
-    ease of understanding and interpreting the model's
+    ### Strengths
+
+    - Provides a visual illustration of data, which enhances the ease of understanding and interpreting the model's
+    behavior.
     - Allows for the comparison of model's behavior across training and testing datasets, providing insights about how
     well the model is generalized.
-
-
+    - Differentiates between positive and negative classes and their respective distribution patterns, aiding in
+    problem diagnosis.
+
+    ### Limitations

-    **Limitations**:
     - Exclusive to classification tasks and specifically to logistic regression models.
     - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
     detection.
-    - The method does not give a solitary quantifiable measure of model risk,
-    and broad distributional information.
+    - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
+    representation and broad distributional information.
     - If the training and test datasets are not representative of the overall data distribution, the metric could
     provide misleading results.
     """

-
-
-    tasks = ["classification"]
-    tags = ["logistic_regression", "visualization"]
-
-    default_params = {"title": "Cumulative Probabilities"}
-
-    @staticmethod
-    def plot_cumulative_prob(dataframes, dataset_titles, target_col, title):
-        figures = []
-
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-            for class_value in sorted(df[target_col].unique()):
-                # Calculate cumulative distribution for the current class
-                sorted_probs = np.sort(
-                    df[df[target_col] == class_value]["probabilities"]
-                )
-                cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
-
-                fig.add_trace(
-                    go.Scatter(
-                        x=sorted_probs,
-                        y=cumulative_probs,
-                        mode="lines",
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        line=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Probability",
-                yaxis_title="Cumulative Distribution",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-
-    def run(self):
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-        title = self.params.get("title", self.default_params["title"])
-
-        dataframes = []
-        metric_value = {"cum_prob": {}}
-        for dataset in self.inputs.datasets:
-            df = dataset.df.copy()
-            y_prob = dataset.y_prob(self.inputs.model)
-            df["probabilities"] = y_prob
-            dataframes.append(df)
-            metric_value["cum_prob"][dataset.input_id] = list(df["probabilities"])
-
-        figures = self.plot_cumulative_prob(
-            dataframes, dataset_titles, target_column, title
-        )
+    df = dataset.df
+    df["probabilities"] = dataset.y_prob(model)

-
-
-
-
-
+    fig = _plot_cumulative_prob(df, dataset.target_column, title)
+
+    return fig
+
+
+def _plot_cumulative_prob(df, target_col, title):
+
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+    for class_value in sorted(df[target_col].unique()):
+        # Calculate cumulative distribution for the current class
+        sorted_probs = np.sort(df[df[target_col] == class_value]["probabilities"])
+        cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
+
+        fig.add_trace(
+            go.Scatter(
+                x=sorted_probs,
+                y=cumulative_probs,
+                mode="lines",
+                name=f"{target_col} = {class_value}",
+                line=dict(
+                    color=color_dict[class_value],
+                ),
             )
-
-
+        )
+    fig.update_layout(
+        title_text=f"{title}",
+        xaxis_title="Probability",
+        yaxis_title="Cumulative Distribution",
+    )

-
+    return fig
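As a point of reference, the curve drawn per class by `_plot_cumulative_prob` is simply the normalized running sum of the sorted probabilities. A minimal standalone illustration with made-up values:

```python
import numpy as np

probs = np.array([0.05, 0.20, 0.35, 0.60, 0.80])              # predicted probabilities for one class
sorted_probs = np.sort(probs)                                  # x-axis: probabilities in ascending order
cumulative = np.cumsum(sorted_probs) / np.sum(sorted_probs)    # y-axis: cumulative distribution
```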
validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py

@@ -2,58 +2,85 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-
-
+import pandas as pd
 from statsmodels.stats.stattools import durbin_watson

-from validmind
+from validmind import tags, tasks


-@
-
+@tasks("regression")
+@tags("time_series_data", "forecasting", "statistical_test", "statsmodels")
+def DurbinWatsonTest(dataset, model, threshold=[1.5, 2.5]):
     """
     Assesses autocorrelation in time series data features using the Durbin-Watson statistic.

-
-
+    ### Purpose
+
+    The Durbin-Watson Test metric detects autocorrelation in time series data (where a set of data values influences
+    their predecessors). Autocorrelation is a crucial factor for regression tasks as these often assume the
     independence of residuals. A model with significant autocorrelation may give unreliable predictions.

-
-
-
-
-
+    ### Test Mechanism
+
+    Utilizing the `durbin_watson` function in the `statsmodels` Python library, the Durbin-Watson (DW) Test metric
+    generates a statistical value for each feature of the training dataset. The function is looped over all columns of
+    the dataset, calculating and caching the DW value for each column for further analysis. A DW metric value nearing 2
+    indicates no autocorrelation. Conversely, values approaching 0 suggest positive autocorrelation, and those leaning
+    towards 4 imply negative autocorrelation.
+
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - If a feature's DW value significantly deviates from 2, it could signal a high risk due to potential
     autocorrelation issues in the dataset.
-    - A value closer to
+    - A value closer to 0 could imply positive autocorrelation, while a value nearer to 4 could point to negative
     autocorrelation, both leading to potentially unreliable prediction models.

-
+    ### Strengths
+
     - The metric specializes in identifying autocorrelation in prediction model residuals.
     - Autocorrelation detection assists in diagnosing violation of various modeling technique assumptions, particularly
     in regression analysis and time-series data modeling.

-
+    ### Limitations
+
     - The Durbin-Watson Test mainly detects linear autocorrelation and could overlook other types of relationships.
     - The metric is highly sensitive to data points order. Shuffling the order could lead to notably different results.
     - The test only checks for first-order autocorrelation (between a variable and its immediate predecessor) and fails
-    to detect higher
+    to detect higher-order autocorrelation.
     """

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Validate threshold values
+    if not (0 < threshold[0] < threshold[1] < 4):
+        raise ValueError(
+            "Invalid threshold. It should be in the form [a, b] where 0 < a < b < 4."
+        )
+
+    # Check if threshold values are around 2
+    if abs(2 - threshold[0]) > 1 or abs(2 - threshold[1]) > 1:
+        raise ValueError(
+            "Threshold values should be around 2 for meaningful Durbin-Watson test results."
+        )
+
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    residuals = y_true - y_pred
+
+    dw_statistic = durbin_watson(residuals)
+
+    def get_autocorrelation(dw_value, threshold):
+        if dw_value < threshold[0]:
+            return "Positive autocorrelation"
+        elif dw_value > threshold[1]:
+            return "Negative autocorrelation"
+        else:
+            return "No autocorrelation"
+
+    results = pd.DataFrame(
+        {
+            "dw_statistic": [dw_statistic],
+            "threshold": [str(threshold)],
+            "autocorrelation": [get_autocorrelation(dw_statistic, threshold)],
+        }
+    )
+
+    return results
validmind/tests/model_validation/statsmodels/GINITable.py

@@ -2,34 +2,37 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score, roc_curve

-from validmind
+from validmind import tags, tasks


-@
-
+@tags("model_performance")
+@tasks("classification")
+def GINITable(dataset, model):
     """
     Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.

-
-
-
-
+    ### Purpose
+
+    The 'GINITable' metric is designed to evaluate the performance of a classification model by emphasizing its
+    discriminatory power. Specifically, it calculates and presents three important metrics - the Area under the ROC
+    Curve (AUC), the GINI coefficient, and the Kolmogorov-Smirnov (KS) statistic - for both training and test datasets.
+
+    ### Test Mechanism

-
-
-
-
+    Using a dictionary for storing performance metrics for both the training and test datasets, the 'GINITable' metric
+    calculates each of these metrics sequentially. The Area under the ROC Curve (AUC) is calculated via the
+    `roc_auc_score` function from the Scikit-Learn library. The GINI coefficient, a measure of statistical dispersion,
+    is then computed by doubling the AUC and subtracting 1. Finally, the Kolmogorov-Smirnov (KS) statistic is
     calculated via the `roc_curve` function from Scikit-Learn, with the False Positive Rate (FPR) subtracted from the
     True Positive Rate (TPR) and the maximum value taken from the resulting data. These metrics are then stored in a
     pandas DataFrame for convenient visualization.

-
+    ### Signs of High Risk
+
     - Low values for performance metrics may suggest a reduction in model performance, particularly a low AUC which
     indicates poor classification performance, or a low GINI coefficient, which could suggest a decreased ability to
     discriminate different classes.
@@ -38,7 +41,8 @@ class GINITable(Metric):
     - Significant discrepancies between the performance on the training dataset and the test dataset may present
     another signal of high risk.

-
+    ### Strengths
+
     - Offers three key performance metrics (AUC, GINI, and KS) in one test, providing a more comprehensive evaluation
     of the model.
     - Provides a direct comparison between the model's performance on training and testing datasets, which aids in
@@ -47,7 +51,8 @@ class GINITable(Metric):
     performance even when dealing with imbalanced datasets.
     - Presents the metrics in a user-friendly table format for easy comprehension and analysis.

-
+    ### Limitations
+
     - The GINI coefficient and KS statistic are both dependent on the AUC value. Therefore, any errors in the
     calculation of the latter will adversely impact the former metrics too.
     - Mainly suited for binary classification models and may require modifications for effective application in
@@ -57,64 +62,26 @@ class GINITable(Metric):
     lead to inaccuracies in the metrics if the data is not appropriately preprocessed.
     """

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                dataset.input_id
-            )  # Use input_id as the label for each dataset
-            metrics_dict["Dataset"].append(dataset_label)
-
-            # Retrieve y_true and y_pred for the current dataset
-            y_true = np.ravel(dataset.y)  # Flatten y_true to make it one-dimensional
-            y_prob = dataset.y_prob(self.inputs.model)
-
-            # Compute metrics
-            y_true = np.array(y_true, dtype=float)
-            y_prob = np.array(y_prob, dtype=float)
-
-            fpr, tpr, _ = roc_curve(y_true, y_prob)
-            ks = max(tpr - fpr)
-            auc = roc_auc_score(y_true, y_prob)
-            gini = 2 * auc - 1
-
-            # Add the metrics to the dictionary
-            metrics_dict["AUC"].append(auc)
-            metrics_dict["GINI"].append(gini)
-            metrics_dict["KS"].append(ks)
-
-        # Create a DataFrame to store and return the results
-        metrics_df = pd.DataFrame(metrics_dict)
-        return metrics_df
-
-    def summary(self, metric_value):
-        summary_metrics_table = metric_value["metrics_summary"]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_metrics_table,
-                    metadata=ResultTableMetadata(
-                        title="AUC, GINI and KS for train and test datasets"
-                    ),
-                )
-            ]
-        )
+    metrics_dict = {"AUC": [], "GINI": [], "KS": []}
+
+    # Retrieve y_true and y_pred for the current dataset
+    y_true = np.ravel(dataset.y)  # Flatten y_true to make it one-dimensional
+    y_prob = dataset.y_prob(model)
+
+    # Compute metrics
+    y_true = np.array(y_true, dtype=float)
+    y_prob = np.array(y_prob, dtype=float)
+
+    fpr, tpr, _ = roc_curve(y_true, y_prob)
+    ks = max(tpr - fpr)
+    auc = roc_auc_score(y_true, y_prob)
+    gini = 2 * auc - 1
+
+    # Add the metrics to the dictionary
+    metrics_dict["AUC"].append(auc)
+    metrics_dict["GINI"].append(gini)
+    metrics_dict["KS"].append(ks)
+
+    # Create a DataFrame to store and return the results
+    metrics_df = pd.DataFrame(metrics_dict)
+    return metrics_df