validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -20,36 +20,44 @@ from validmind.vm_models import (
 @dataclass
 class SilhouettePlot(Metric):
     """
-    Calculates and visualizes Silhouette Score, assessing degree of data point suitability to its cluster in ML
-    …
+    Calculates and visualizes Silhouette Score, assessing the degree of data point suitability to its cluster in ML
+    models.
+
+    ### Purpose
+
+    This test calculates the Silhouette Score, which is a model performance metric used in clustering applications.
+    Primarily, the Silhouette Score evaluates how similar a data point is to its own cluster compared to other
+    clusters. The metric ranges between -1 and 1, where a high value indicates that the object is well matched to its
+    own cluster and poorly matched to neighboring clusters. Thus, the goal is to achieve a high Silhouette Score,
+    implying well-separated clusters.
+
+    ### Test Mechanism
+
+    The test first extracts the true and predicted labels from the model's training data. The test runs the Silhouette
+    Score function, which takes as input the training dataset features and the predicted labels, subsequently
+    calculating the average score. This average Silhouette Score is printed for reference. The script then calculates
+    the silhouette coefficients for each data point, helping to form the Silhouette Plot. Each cluster is represented
+    in this plot, with color distinguishing between different clusters. A red dashed line indicates the average
+    Silhouette Score. The Silhouette Scores are also collected into a structured table, facilitating model performance
+    analysis and comparison.
+
+    ### Signs of High Risk
+
     - A low Silhouette Score, potentially indicating that the clusters are not well separated and that data points may
     not be fitting well to their respective clusters.
     - A Silhouette Plot displaying overlapping clusters or the absence of clear distinctions between clusters visually
     also suggests poor clustering performance.

-    …
+    ### Strengths
+
     - The Silhouette Score provides a clear and quantitative measure of how well data points have been grouped into
     clusters, offering insights into model performance.
     - The Silhouette Plot provides an intuitive, graphical representation of the clustering mechanism, aiding visual
     assessments of model performance.
     - It does not require ground truth labels, so it's useful when true cluster assignments are not known.

-    …
+    ### Limitations
+
     - The Silhouette Score may be susceptible to the influence of outliers, which could impact its accuracy and
     reliability.
     - It assumes the clusters are convex and isotropic, which might not be the case with complex datasets.
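For context, the Test Mechanism above maps onto scikit-learn's silhouette utilities. A minimal illustrative sketch (not the package's implementation; the synthetic data and KMeans model are assumptions):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Assumed toy data and clustering model, purely for illustration
X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

avg_score = silhouette_score(X, labels)    # average score (the red dashed line in the plot)
per_point = silhouette_samples(X, labels)  # one silhouette coefficient per data point
for cluster in np.unique(labels):
    print(cluster, round(per_point[labels == cluster].mean(), 3))
print("average silhouette:", round(avg_score, 3))
```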
@@ -32,33 +32,40 @@ class TrainingTestDegradation(ThresholdTest):
     """
     Tests if model performance degradation between training and test datasets exceeds a predefined threshold.

-    …
+    ### Purpose
+
+    The `TrainingTestDegradation` class serves as a test to verify that the degradation in performance between the
+    training and test datasets does not exceed a predefined threshold. This test measures the model's ability to
+    generalize from its training data to unseen test data, assessing key classification metrics such as accuracy,
+    precision, recall, and f1 score to verify the model's robustness and reliability.
+
+    ### Test Mechanism
+
+    The code applies several predefined metrics, including accuracy, precision, recall, and f1 scores, to the model's
+    predictions for both the training and test datasets. It calculates the degradation as the difference between the
+    training score and test score divided by the training score. The test is considered successful if the degradation
+    for each metric is less than the preset maximum threshold of 10%. The results are summarized in a table showing
+    each metric's train score, test score, degradation percentage, and pass/fail status.
+
+    ### Signs of High Risk
+
     - A degradation percentage that exceeds the maximum allowed threshold of 10% for any of the evaluated metrics.
     - A high difference or gap between the metric scores on the training and the test datasets.
     - The 'Pass/Fail' column displaying 'Fail' for any of the evaluated metrics.

-    …
+    ### Strengths
+
+    - Provides a quantitative measure of the model's ability to generalize to unseen data, which is key for predicting
+    its practical real-world performance.
     - By evaluating multiple metrics, it takes into account different facets of model performance and enables a more
     holistic evaluation.
     - The use of a variable predefined threshold allows the flexibility to adjust the acceptability criteria for
     different scenarios.

-    …
+    ### Limitations
+
+    - The test compares raw performance on training and test data but does not factor in the nature of the data. Areas
+    with less representation in the training set might still perform poorly on unseen data.
     - It requires good coverage and balance in the test and training datasets to produce reliable results, which may
     not always be available.
     - The test is currently only designed for classification tasks.
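The degradation formula in the Test Mechanism above is straightforward to reproduce outside the library. A hedged sketch (function and column names are illustrative, not the package's API):

```python
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

METRICS = {"accuracy": accuracy_score, "precision": precision_score,
           "recall": recall_score, "f1": f1_score}

def degradation_table(y_train, y_train_pred, y_test, y_test_pred, max_threshold=0.10):
    """Degradation = (train score - test score) / train score; pass if below the threshold."""
    rows = []
    for name, metric in METRICS.items():
        train_score = metric(y_train, y_train_pred)
        test_score = metric(y_test, y_test_pred)
        degradation = (train_score - test_score) / train_score
        rows.append({"Metric": name,
                     "Train Score": train_score,
                     "Test Score": test_score,
                     "Degradation (%)": 100 * degradation,
                     "Pass/Fail": "Pass" if degradation < max_threshold else "Fail"})
    return rows
```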
@@ -14,42 +14,43 @@ class VMeasure(ClusterPerformance):
     """
     Evaluates homogeneity and completeness of a clustering model using the V Measure Score.

-    …
+    ### Purpose
+
     The purpose of this metric, V Measure Score (V Score), is to evaluate the performance of a clustering model. It
     measures the homogeneity and completeness of a set of cluster labels, where homogeneity refers to each cluster
     containing only members of a single class and completeness meaning all members of a given class are assigned to the
     same cluster.

-    …
+    ### Test Mechanism
+
+    ClusterVMeasure is a class that inherits from another class, ClusterPerformance. It uses the `v_measure_score`
     function from the sklearn module's metrics package. The required inputs to perform this metric are the model, train
     dataset, and test dataset. The test is appropriate for models tasked with clustering.

-    …
+    ### Signs of High Risk

     - Low V Measure Score: A low V Measure Score indicates that the clustering model has poor homogeneity or
     completeness, or both. This might signal that the model is failing to correctly cluster the data.

-    …
+    ### Strengths

     - The V Measure Score is a harmonic mean between homogeneity and completeness. This ensures that both attributes
     are taken into account when evaluating the model, providing an overall measure of its cluster validity.
-    …
     - The metric does not require knowledge of the ground truth classes when measuring homogeneity and completeness,
     making it applicable in instances where such information is unavailable.

-    …
-    - The V Score can be influenced by the number of clusters, which means that it might not always reflect the quality
-    of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
-    completeness, leading to a low V Score even if the clustering might be useful.
+    ### Limitations

+    - The V Measure Score can be influenced by the number of clusters, which means that it might not always reflect the
+    quality of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
+    completeness, leading to a low V Measure Score even if the clustering might be useful.
     - It assumes equal importance of homogeneity and completeness. In some applications, one may be more important than
-    the other. The V Score does not provide flexibility in assigning different weights to homogeneity and
+    the other. The V Measure Score does not provide flexibility in assigning different weights to homogeneity and
+    completeness.
     """

     name = "v_measure_score"
-    required_inputs = ["model", "
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
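For reference, the sklearn call named in the Test Mechanism above; the toy label assignments below are illustrative only:

```python
from sklearn.metrics import completeness_score, homogeneity_score, v_measure_score

# Illustrative label assignments only
labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [0, 0, 1, 2, 2, 2]

print("homogeneity:", homogeneity_score(labels_true, labels_pred))
print("completeness:", completeness_score(labels_true, labels_pred))
# V measure is the harmonic mean of homogeneity and completeness
print("v-measure:", v_measure_score(labels_true, labels_pred))
```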
@@ -27,21 +27,23 @@ class WeakspotsDiagnosis(ThresholdTest):
     Identifies and visualizes weak spots in a machine learning model's performance across various sections of the
     feature space.

-    …
+    ### Purpose
+
     The weak spots test is applied to evaluate the performance of a machine learning model within specific regions of
     its feature space. This test slices the feature space into various sections, evaluating the model's outputs within
     each section against specific performance metrics (e.g., accuracy, precision, recall, and F1 scores). The ultimate
     aim is to identify areas where the model's performance falls below the set thresholds, thereby exposing its
     possible weaknesses and limitations.

-    …
+    ### Test Mechanism
+
     The test mechanism adopts an approach of dividing the feature space of the training dataset into numerous bins. The
     model's performance metrics (accuracy, precision, recall, F1 scores) are then computed for each bin on both the
     training and test datasets. A "weak spot" is identified if any of the performance metrics fall below a
     predetermined threshold for a particular bin on the test dataset. The test results are visually plotted as bar
     charts for each performance metric, indicating the bins which fail to meet the established threshold.

-    …
+    ### Signs of High Risk

     - Any performance metric of the model dropping below the set thresholds.
     - Significant disparity in performance between the training and test datasets within a bin could be an indication
@@ -49,7 +51,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - Regions or slices with consistently low performance metrics. Such instances could mean that the model struggles
     to handle specific types of input data adequately, resulting in potentially inaccurate predictions.

-    …
+    ### Strengths

     - The test helps pinpoint precise regions of the feature space where the model's performance is below par, allowing
     for more targeted improvements to the model.
@@ -58,7 +60,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - The test exhibits flexibility, letting users set different thresholds for various performance metrics according
     to the specific requirements of the application.

-    …
+    ### Limitations

     - The binning system utilized for the feature space in the test could over-simplify the model's behavior within
     each bin. The granularity of this slicing depends on the chosen 'bins' parameter and can sometimes be arbitrary.
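A rough sketch of the per-bin check described in the Test Mechanism above, binning a single feature and flagging bins whose test accuracy falls below a threshold; the helper name, column names, and default threshold are assumptions, not the package's API:

```python
import pandas as pd
from sklearn.metrics import accuracy_score

def weak_spots_for_feature(df, feature, target_col, pred_col, bins=10, threshold=0.75):
    """Bin one feature, score each bin, and flag bins below the threshold."""
    binned = pd.cut(df[feature], bins=bins)
    rows = []
    for interval, group in df.groupby(binned, observed=False):
        if group.empty:
            continue
        acc = accuracy_score(group[target_col], group[pred_col])
        rows.append({"bin": str(interval), "accuracy": acc, "weak_spot": acc < threshold})
    return pd.DataFrame(rows)
```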
@@ -15,13 +15,16 @@ class AutoARIMA(Metric):
     """
     Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.

-    …
+    ### Purpose
+
+    The AutoARIMA validation test is designed to evaluate and rank AutoRegressive Integrated Moving Average (ARIMA)
+    models. These models are primarily used for forecasting time-series data. The validation test automatically fits
+    multiple ARIMA models, with varying parameters, to every variable within the given dataset. The models are then
+    ranked based on their Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC) values, which
+    provide a basis for the efficient model selection process.
+
+    ### Test Mechanism

-    **Test Mechanism**:
     This metric proceeds by generating an array of feasible combinations of ARIMA model parameters which are within a
     prescribed limit. These limits include `max_p`, `max_d`, `max_q`; they represent the autoregressive, differencing,
     and moving average components respectively. Upon applying these sets of parameters, the validation test fits each
@@ -31,28 +34,31 @@ class AutoARIMA(Metric):
     found to be non-stationary, a warning message is sent out, given that ARIMA models necessitate input series to be
     stationary.

-    …
+    ### Signs of High Risk
+
+    - If the p-value of the Augmented Dickey-Fuller test for a variable exceeds 0.05, a warning is logged. This warning
     indicates that the series might not be stationary, leading to potentially inaccurate results.
-    …
+    - Consistent failure in fitting ARIMA models (as made evident through logged errors) might disclose issues with
     either the data or model stability.

-    …
+    ### Strengths
+
+    - The AutoARIMA validation test simplifies the often complex task of selecting the most suitable ARIMA model based
     on BIC and AIC criteria.
-    …
+    - The mechanism incorporates a check for non-stationarity within the data, which is a critical prerequisite for
     ARIMA models.
-    …
+    - The exhaustive search through all possible combinations of model parameters enhances the likelihood of
     identifying the best-fit model.

-    …
+    ### Limitations
+
+    - This validation test can be computationally costly as it involves creating and fitting multiple ARIMA models for
     every variable.
-    …
+    - Although the test checks for non-stationarity and logs warnings where present, it does not apply any
     transformations to the data to establish stationarity.
-    …
+    - The selection of models leans solely on BIC and AIC criteria, which may not yield the best predictive model in
     all scenarios.
-    …
+    - The test is only applicable to regression tasks involving time-series data, and may not work effectively for
     other types of machine learning tasks.
     """

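The mechanism described above (a grid of (p, d, q) orders, an ADF stationarity warning, and ranking by BIC/AIC) can be sketched with statsmodels as follows; the grid bounds and function name are illustrative assumptions:

```python
import itertools
import warnings

import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

def rank_arima_orders(series, max_p=2, max_d=1, max_q=2):
    if adfuller(series.dropna())[1] > 0.05:  # ADF p-value > 0.05: possibly non-stationary
        warnings.warn("Series may be non-stationary; ARIMA results may be unreliable")
    rows = []
    for order in itertools.product(range(max_p + 1), range(max_d + 1), range(max_q + 1)):
        try:
            result = ARIMA(series, order=order).fit()
            rows.append({"order": order, "AIC": result.aic, "BIC": result.bic})
        except Exception:
            continue  # skip parameter combinations that fail to fit
    return pd.DataFrame(rows).sort_values(["BIC", "AIC"]).reset_index(drop=True)
```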
@@ -11,31 +11,35 @@ class BoxPierce(Metric):
     """
     Detects autocorrelation in time-series data through the Box-Pierce test to validate model performance.

-    …
+    ### Purpose
+
+    The Box-Pierce test is utilized to detect the presence of autocorrelation in a time-series dataset.
     Autocorrelation, or serial correlation, refers to the degree of similarity between observations based on the
     temporal spacing between them. This test is essential for affirming the quality of a time-series model by ensuring
     that the error terms in the model are random and do not adhere to a specific pattern.

-    …
+    ### Test Mechanism
+
+    The implementation of the Box-Pierce test involves calculating a test statistic along with a corresponding p-value
+    derived from the dataset features. These quantities are used to test the null hypothesis that posits the data to be
+    independently distributed. This is achieved by iterating over every feature column in the time-series data and
+    applying the `acorr_ljungbox` function of the statsmodels library. The function yields the Box-Pierce test
+    statistic as well as the respective p-value, all of which are cached as test results.

-    …
+    ### Signs of High Risk

     - A low p-value, typically under 0.05 as per statistical convention, throws the null hypothesis of independence
     into question. This implies that the dataset potentially houses autocorrelations, thus indicating a high-risk
     scenario concerning model performance.
     - Large Box-Pierce test statistic values may indicate the presence of autocorrelation.

-    …
+    ### Strengths

     - Detects patterns in data that are supposed to be random, thereby ensuring no underlying autocorrelation.
     - Can be computed efficiently given its low computational complexity.
     - Can be widely applied to most regression problems, making it very versatile.

-    …
+    ### Limitations

     - Assumes homoscedasticity (constant variance) and normality of residuals, which may not always be the case in
     real-world datasets.
@@ -43,7 +47,7 @@ class BoxPierce(Metric):
     correlations.
     - It only provides a general indication of the existence of autocorrelation, without providing specific insights
     into the nature or patterns of the detected autocorrelation.
-    - In the presence of
+    - In the presence of trends or seasonal patterns, the Box-Pierce test may yield misleading results.
     - Applicability is limited to time-series data, which limits its overall utility.
     """

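A minimal sketch of the per-column Box-Pierce computation referenced above, using statsmodels' acorr_ljungbox with boxpierce=True (recent statsmodels versions return a DataFrame with bp_stat / bp_pvalue columns); the lag choice and helper name are assumptions:

```python
import pandas as pd
from statsmodels.stats.diagnostic import acorr_ljungbox

def box_pierce_per_column(df: pd.DataFrame, lags: int = 10) -> pd.DataFrame:
    results = {}
    for col in df.columns:
        out = acorr_ljungbox(df[col].dropna(), lags=[lags], boxpierce=True)
        results[col] = {"bp_stat": float(out["bp_stat"].iloc[0]),
                        "bp_pvalue": float(out["bp_pvalue"].iloc[0])}
    # A small p-value questions the null hypothesis of independently distributed data
    return pd.DataFrame(results).T
```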
@@ -2,138 +2,107 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-…
 import numpy as np
 import plotly.graph_objects as go
 from matplotlib import cm

-from validmind
+from validmind import tags, tasks


-…
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
     """
     Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
     regression models.

-    …
+    ### Purpose
+
+    This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
+    in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+    probabilities for positive and negative classes across both the training and test datasets.
+
+    ### Test Mechanism
+
+    The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+    the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
+    for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
+    distributions of these probabilities are created for both positive and negative classes across both training and
+    test datasets. These cumulative probabilities are represented visually in a plot, containing two subplots - one for
+    the training data and the other for the test data, with lines representing cumulative distributions of positive and
+    negative classes.

-    …
-    each instance in both the training and test datasets, which are then added as a new column in these sets. The
-    cumulative probabilities for positive and negative classes are subsequently calculated and sorted in ascending
-    order. Cumulative distributions of these probabilities are created for both positive and negative classes across
-    both training and test datasets. These cumulative probabilities are represented visually in a plot, containing two
-    subplots - one for the training data and the other for the test data, with lines representing cumulative
-    distributions of positive and negative classes.
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - Imbalanced distribution of probabilities for either positive or negative classes.
     - Notable discrepancies or significant differences between the cumulative probability distributions for the
     training data versus the test data.
     - Marked discrepancies or large differences between the cumulative probability distributions for positive and
     negative classes.

-    …
+    ### Strengths
+
+    - Provides a visual illustration of data, which enhances the ease of understanding and interpreting the model's
+    behavior.
     - Allows for the comparison of model's behavior across training and testing datasets, providing insights about how
     well the model is generalized.
-    …
+    - Differentiates between positive and negative classes and their respective distribution patterns, aiding in
+    problem diagnosis.
+
+    ### Limitations

-    **Limitations**:
     - Exclusive to classification tasks and specifically to logistic regression models.
     - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
     detection.
-    - The method does not give a solitary quantifiable measure of model risk,
-    and broad distributional information.
+    - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
+    representation and broad distributional information.
     - If the training and test datasets are not representative of the overall data distribution, the metric could
     provide misleading results.
     """

-    …
-    tasks = ["classification"]
-    tags = ["logistic_regression", "visualization"]
-    …
-    default_params = {"title": "Cumulative Probabilities"}
-    …
-    @staticmethod
-    def plot_cumulative_prob(dataframes, dataset_titles, target_col, title):
-        figures = []
-        …
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-        …
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-            …
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-            for class_value in sorted(df[target_col].unique()):
-                # Calculate cumulative distribution for the current class
-                sorted_probs = np.sort(
-                    df[df[target_col] == class_value]["probabilities"]
-                )
-                cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
-                …
-                fig.add_trace(
-                    go.Scatter(
-                        x=sorted_probs,
-                        y=cumulative_probs,
-                        mode="lines",
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        line=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Probability",
-                yaxis_title="Cumulative Distribution",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-    …
-    def run(self):
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-        title = self.params.get("title", self.default_params["title"])
-        …
-        dataframes = []
-        metric_value = {"cum_prob": {}}
-        for dataset in self.inputs.datasets:
-            df = dataset.df.copy()
-            y_prob = dataset.y_prob(self.inputs.model)
-            df["probabilities"] = y_prob
-            dataframes.append(df)
-            metric_value["cum_prob"][dataset.input_id] = list(df["probabilities"])
-        …
-        figures = self.plot_cumulative_prob(
-            dataframes, dataset_titles, target_column, title
-        )
+    df = dataset.df
+    df["probabilities"] = dataset.y_prob(model)

-    …
+    fig = _plot_cumulative_prob(df, dataset.target_column, title)
+
+    return fig
+
+
+def _plot_cumulative_prob(df, target_col, title):
+
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+    for class_value in sorted(df[target_col].unique()):
+        # Calculate cumulative distribution for the current class
+        sorted_probs = np.sort(df[df[target_col] == class_value]["probabilities"])
+        cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
+
+        fig.add_trace(
+            go.Scatter(
+                x=sorted_probs,
+                y=cumulative_probs,
+                mode="lines",
+                name=f"{target_col} = {class_value}",
+                line=dict(
+                    color=color_dict[class_value],
+                ),
             )
-    …
+        )
+    fig.update_layout(
+        title_text=f"{title}",
+        xaxis_title="Probability",
+        yaxis_title="Cumulative Distribution",
+    )

-    …
+    return fig
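Stripped of the plotting code, the cumulative curve the refactored function draws per class is just a sorted-probability running sum; a minimal sketch (column names mirror the diff, the helper name is illustrative):

```python
import numpy as np
import pandas as pd

def cumulative_curve(df: pd.DataFrame, target_col: str, class_value):
    """Return x (sorted probabilities) and y (normalized cumulative sum) for one class."""
    probs = df.loc[df[target_col] == class_value, "probabilities"].to_numpy()
    sorted_probs = np.sort(probs)
    cumulative = np.cumsum(sorted_probs) / np.sum(sorted_probs)
    return sorted_probs, cumulative
```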
@@ -14,32 +14,39 @@ class DurbinWatsonTest(Metric):
     """
     Assesses autocorrelation in time series data features using the Durbin-Watson statistic.

-    …
+    ### Purpose
+
+    The Durbin-Watson Test metric detects autocorrelation in time series data (where a set of data values influences
+    their predecessors). Autocorrelation is a crucial factor for regression tasks as these often assume the
     independence of residuals. A model with significant autocorrelation may give unreliable predictions.

-    …
+    ### Test Mechanism
+
+    Utilizing the `durbin_watson` function in the `statsmodels` Python library, the Durbin-Watson (DW) Test metric
+    generates a statistical value for each feature of the training dataset. The function is looped over all columns of
+    the dataset, calculating and caching the DW value for each column for further analysis. A DW metric value nearing 2
+    indicates no autocorrelation. Conversely, values approaching 0 suggest positive autocorrelation, and those leaning
+    towards 4 imply negative autocorrelation.
+
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - If a feature's DW value significantly deviates from 2, it could signal a high risk due to potential
     autocorrelation issues in the dataset.
-    - A value closer to
+    - A value closer to 0 could imply positive autocorrelation, while a value nearer to 4 could point to negative
     autocorrelation, both leading to potentially unreliable prediction models.

-    …
+    ### Strengths
+
     - The metric specializes in identifying autocorrelation in prediction model residuals.
     - Autocorrelation detection assists in diagnosing violation of various modeling technique assumptions, particularly
     in regression analysis and time-series data modeling.

-    …
+    ### Limitations
+
     - The Durbin-Watson Test mainly detects linear autocorrelation and could overlook other types of relationships.
     - The metric is highly sensitive to data points order. Shuffling the order could lead to notably different results.
     - The test only checks for first-order autocorrelation (between a variable and its immediate predecessor) and fails
-    to detect higher
+    to detect higher-order autocorrelation.
     """

     name = "durbin_watson"