validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/IQROutliersTable.py

```diff
@@ -12,42 +12,46 @@ from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 @dataclass
 class IQROutliersTable(Metric):
     """
-    Determines and summarizes outliers in numerical features using Interquartile Range method.
-    … (removed lines 16-50 not captured in this diff view)
+    Determines and summarizes outliers in numerical features using the Interquartile Range method.
+
+    ### Purpose
+
+    The "Interquartile Range Outliers Table" (IQROutliersTable) metric is designed to identify and summarize outliers
+    within numerical features of a dataset using the Interquartile Range (IQR) method. This exercise is crucial in the
+    pre-processing of data because outliers can substantially distort statistical analysis and impact the performance
+    of machine learning models.
+
+    ### Test Mechanism
+
+    The IQR, which is the range separating the first quartile (25th percentile) from the third quartile (75th
+    percentile), is calculated for each numerical feature within the dataset. An outlier is defined as a data point
+    falling below the "Q1 - 1.5 * IQR" or above "Q3 + 1.5 * IQR" range. The test computes the number of outliers and
+    their summary statistics (minimum, 25th percentile, median, 75th percentile, and maximum values) for each numerical
+    feature. If no specific features are chosen, the test applies to all numerical features in the dataset. The default
+    outlier threshold is set to 1.5 but can be customized by the user.
+
+    ### Signs of High Risk
+
+    - A large number of outliers in multiple features.
+    - Outliers significantly distanced from the mean value of variables.
+    - Extremely high or low outlier values indicative of data entry errors or other data quality issues.
+
+    ### Strengths
+
+    - Provides a comprehensive summary of outliers for each numerical feature, helping pinpoint features with potential
+    quality issues.
+    - The IQR method is robust to extremely high or low outlier values as it is based on quartile calculations.
+    - Can be customized to work on selected features and set thresholds for outliers.
+
+    ### Limitations
+
+    - Might cause false positives if the variable deviates from a normal or near-normal distribution, especially for
+    skewed distributions.
+    - Does not provide interpretation or recommendations for addressing outliers, relying on further analysis by users
+    or data scientists.
+    - Only applicable to numerical features, not categorical data.
+    - Default thresholds may not be optimal for data with heavy pre-processing, manipulation, or inherently high
+    kurtosis (heavy tails).
     """

     name = "iqr_outliers_table"
```
validmind/tests/data_validation/IsolationForestOutliers.py

```diff
@@ -17,24 +17,30 @@ class IsolationForestOutliers(Metric):
     """
     Detects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots.

-    … (removed lines 20-31 not captured in this diff view)
+    ### Purpose
+
+    The IsolationForestOutliers test is designed to identify anomalies or outliers in the model's dataset using the
+    isolation forest algorithm. This algorithm assumes that anomalous data points can be isolated more quickly due to
+    their distinctive properties. By creating isolation trees and identifying instances with shorter average path
+    lengths, the test is able to pick out data points that differ from the majority.
+
+    ### Test Mechanism
+
+    The test uses the isolation forest algorithm, which builds an ensemble of isolation trees by randomly selecting
+    features and splitting the data based on random thresholds. It isolates anomalies rather than focusing on normal
+    data points. For each pair of variables, a scatter plot is generated which distinguishes the identified outliers
+    from the inliers. The results of the test can be visualized using these scatter plots, illustrating the distinction
+    between outliers and inliers.
+
+    ### Signs of High Risk
+
     - The presence of high contamination, indicating a large number of anomalies
     - Inability to detect clusters of anomalies that are close in the feature space
     - Misclassifying normal instances as anomalies
     - Failure to detect actual anomalies

-    … (removed line 37 not captured)
+    ### Strengths
+
     - Ability to handle large, high-dimensional datasets
     - Efficiency in isolating anomalies instead of normal instances
     - Insensitivity to the underlying distribution of data
@@ -42,7 +48,8 @@ class IsolationForestOutliers(Metric):
     distinctive properties
     - Visually presents the test results for better understanding and interpretability

-    … (removed line 45 not captured)
+    ### Limitations
+
     - Difficult to detect anomalies that are close to each other or prevalent in datasets
     - Dependency on the contamination parameter which may need fine-tuning to be effective
     - Potential failure in detecting collective anomalies if they behave similarly to normal data
```
validmind/tests/data_validation/KPSS.py

```diff
@@ -16,35 +16,40 @@ logger = get_logger(__name__)
 @dataclass
 class KPSS(Metric):
     """
-    … (removed lines 19-47 not captured in this diff view)
+    Assesses the stationarity of time-series data in a machine learning model using the KPSS unit root test.
+
+    ### Purpose
+
+    The KPSS (Kwiatkowski-Phillips-Schmidt-Shin) unit root test is utilized to ensure the stationarity of data within a
+    machine learning model. It specifically works on time-series data to establish the order of integration, which is
+    essential for accurate forecasting. A fundamental requirement for any time series model is that the series should
+    be stationary.
+
+    ### Test Mechanism
+
+    This test calculates the KPSS score for each feature in the dataset. The KPSS score includes a statistic, a
+    p-value, a used lag, and critical values. The core principle behind the KPSS test is to evaluate the hypothesis
+    that an observable time series is stationary around a deterministic trend. If the computed statistic exceeds the
+    critical value, the null hypothesis (that the series is stationary) is rejected, indicating that the series is
+    non-stationary.
+
+    ### Signs of High Risk
+
+    - High KPSS score, particularly if the calculated statistic is higher than the critical value.
+    - Rejection of the null hypothesis, indicating that the series is recognized as non-stationary, can severely affect
+    the model's forecasting capability.
+
+    ### Strengths
+
+    - Directly measures the stationarity of a series, fulfilling a key prerequisite for many time-series models.
+    - The underlying logic of the test is intuitive and simple, making it easy to understand and accessible for both
+    developers and risk management teams.
+
+    ### Limitations
+
+    - Assumes the absence of a unit root in the series and doesn't differentiate between series that are stationary and
+    those border-lining stationarity.
+    - The test may have restricted power against certain alternatives.
     - The reliability of the test is contingent on the number of lags selected, which introduces potential bias in the
     measurement.
     """
```
validmind/tests/data_validation/LaggedCorrelationHeatmap.py

```diff
@@ -17,30 +17,37 @@ class LaggedCorrelationHeatmap(Metric):
     Assesses and visualizes correlation between target variable and lagged independent variables in a time-series
     dataset.

-    … (removed lines 20-32 not captured in this diff view)
+    ### Purpose
+
+    The LaggedCorrelationHeatmap metric is utilized to appraise and illustrate the correlation between the target
+    variable and delayed copies (lags) of independent variables in a time-series dataset. It assists in revealing
+    relationships in time-series data where the influence of an independent variable on the dependent variable is not
+    immediate but occurs after a period (lags).
+
+    ### Test Mechanism
+
+    To execute this test, Python's Pandas library pairs with Plotly to perform computations and present the
+    visualization in the form of a heatmap. The test begins by extracting the target variable and corresponding
+    independent variables from the dataset. Then, generation of lags of independent variables takes place, followed by
+    the calculation of correlation between these lagged variables and the target variable. The outcome is a correlation
+    matrix that gets recorded and illustrated as a heatmap, where different color intensities represent the strength of
+    the correlation, making patterns easier to identify.
+
+    ### Signs of High Risk
+
     - Insignificant correlations across the heatmap, indicating a lack of noteworthy relationships between variables.
     - Correlations that break intuition or previous understanding, suggesting potential issues with the dataset or the
     model.

-    … (removed line 37 not captured)
+    ### Strengths
+
     - This metric serves as an exceptional tool for exploring and visualizing time-dependent relationships between
     features and the target variable in a time-series dataset.
     - It aids in identifying delayed effects that might go unnoticed with other correlation measures.
     - The heatmap offers an intuitive visual representation of time-dependent correlations and influences.

-    … (removed line 43 not captured)
+    ### Limitations
+
     - The metric presumes linear relationships between variables, potentially ignoring non-linear relationships.
     - The correlation considered is linear; therefore, intricate non-linear interactions might be overlooked.
     - The metric is only applicable for time-series data, limiting its utility outside of this context.
```
validmind/tests/data_validation/MissingValues.py

```diff
@@ -19,34 +19,39 @@ class MissingValues(ThresholdTest):
     """
     Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold.

-    … (removed line 22 not captured)
+    ### Purpose
+
+    The Missing Values test is designed to evaluate the quality of a dataset by measuring the number of missing values
     across all features. The objective is to ensure that the ratio of missing data to total data is less than a
-    predefined threshold, defaulting to 1, to maintain the data quality necessary for reliable predictive
-    machine learning model.
-    … (removed lines 26-49 not captured in this diff view)
+    predefined threshold, defaulting to 1, in order to maintain the data quality necessary for reliable predictive
+    strength in a machine learning model.
+
+    ### Test Mechanism
+
+    The mechanism for this test involves iterating through each column of the dataset, counting missing values
+    (represented as NaNs), and calculating the percentage they represent against the total number of rows. The test
+    then checks if these missing value counts are less than the predefined `min_threshold`. The results are shown in a
+    table summarizing each column, the number of missing values, the percentage of missing values in each column, and a
+    Pass/Fail status based on the threshold comparison.
+
+    ### Signs of High Risk
+
+    - When the number of missing values in any column exceeds the `min_threshold` value.
+    - Presence of missing values across many columns, leading to multiple instances of failing the threshold.
+
+    ### Strengths
+
+    - Quick and granular identification of missing data across each feature in the dataset.
+    - Provides an effective and straightforward means of maintaining data quality, essential for constructing efficient
+    machine learning models.
+
+    ### Limitations
+
+    - Does not suggest the root causes of the missing values or recommend ways to impute or handle them.
+    - May overlook features with significant missing data but still less than the `min_threshold`, potentially
+    impacting the model.
+    - Does not account for data encoded as values like "-999" or "None," which might not technically classify as
+    missing but could bear similar implications.
     """

     name = "missing"
```
validmind/tests/data_validation/MissingValuesBarPlot.py

```diff
@@ -12,37 +12,41 @@ from validmind.vm_models import Figure, Metric
 @dataclass
 class MissingValuesBarPlot(Metric):
     """
-    … (removed lines 15-33 not captured in this diff view)
+    Assesses the percentage and distribution of missing values in the dataset via a bar plot, with emphasis on
+    identifying high-risk columns based on a user-defined threshold.
+
+    ### Purpose
+
+    The 'MissingValuesBarPlot' metric provides a color-coded visual representation of the percentage of missing values
+    for each column in an ML model's dataset. The primary purpose of this metric is to easily identify and quantify
+    missing data, which are essential steps in data preprocessing. The presence of missing data can potentially skew
+    the model's predictions and decrease its accuracy. Additionally, this metric uses a pre-set threshold to categorize
+    various columns into ones that contain missing data above the threshold (high risk) and below the threshold (less
+    risky).
+
+    ### Test Mechanism
+
+    The test mechanism involves scanning each column in the input dataset and calculating the percentage of missing
+    values. It then compares each column's missing data percentage with the predefined threshold, categorizing columns
+    with missing data above the threshold as high-risk. The test generates a bar plot in which columns with missing
+    data are represented on the y-axis and their corresponding missing data percentages are displayed on the x-axis.
+    The color of each bar reflects the missing data percentage in relation to the threshold: grey for values below the
+    threshold and light coral for those exceeding it. The user-defined threshold is represented by a red dashed line on
+    the plot.
+
+    ### Signs of High Risk

     - Columns with higher percentages of missing values beyond the threshold are high-risk. These are visually
     represented by light coral bars on the bar plot.

-    … (removed line 38 not captured)
+    ### Strengths

     - Helps in quickly identifying and quantifying missing data across all columns of the dataset.
     - Facilitates pattern recognition through visual representation.
     - Enables customization of the level of risk tolerance via a user-defined threshold.
     - Supports both classification and regression tasks, sharing its versatility.

-    … (removed line 45 not captured)
+    ### Limitations

     - It only considers the quantity of missing values, not differentiating between different types of missingness
     (Missing completely at random - MCAR, Missing at random - MAR, Not Missing at random - NMAR).
```
validmind/tests/data_validation/PearsonCorrelationMatrix.py

```diff
@@ -2,103 +2,90 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass

 import plotly.graph_objects as go

-from validmind …
+from validmind import tags, tasks


-@…
-… (removed line 13 not captured)
+@tags("tabular_data", "numerical_data", "correlation")
+@tasks("classification", "regression")
+def PearsonCorrelationMatrix(dataset):
     """
     Evaluates linear dependency between numerical variables in a dataset via a Pearson Correlation coefficient heat map.

-    … (removed line 17 not captured)
-    variables in the given dataset. It provides the Pearson Correlation coefficient, which reveals any high
-    correlations present. The purpose of doing this is to identify potential redundancy, as variables that are highly
-    correlated can often be removed to reduce the dimensionality of the dataset without significantly impacting the
-    model's performance.
+    ### Purpose

-    … (removed lines 23-28 not captured)
+    This test is intended to evaluate the extent of linear dependency between all pairs of numerical variables in the
+    given dataset. It provides the Pearson Correlation coefficient, which reveals any high correlations present. The
+    purpose of doing this is to identify potential redundancy, as variables that are highly correlated can often be
+    removed to reduce the dimensionality of the dataset without significantly impacting the model's performance.
+
+    ### Test Mechanism
+
+    This metric test generates a correlation matrix for all numerical variables in the dataset using the Pearson
+    correlation formula. A heat map is subsequently created to visualize this matrix effectively. The color of each
+    point on the heat map corresponds to the magnitude and direction (positive or negative) of the correlation, with a
+    range from -1 (perfect negative correlation) to 1 (perfect positive correlation). Any correlation coefficients
+    higher than 0.7 (in absolute terms) are indicated in white in the heat map, suggesting a high degree of correlation.
+
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - A large number of variables in the dataset showing a high degree of correlation (coefficients approaching ±1).
     This indicates redundancy within the dataset, suggesting that some variables may not be contributing new
     information to the model.
-    … (removed lines 34-35 not captured)
-    **Strengths**:
-    - The primary strength of this metric test is its ability to detect and quantify the linearity of relationships
-    between variables. This allows for the identification of redundant variables, which in turn can help in simplifying
-    models and potentially improving their performance.
-    - The visualization aspect (heatmap) is another strength as it offers an easy-to-understand overview of the
-    correlations, beneficial for those not comfortable navigating numerical matrices.
-
-    **Limitations**:
-    - The primary limitation of Pearson Correlation is its inability to detect non-linear relationships between
-    variables, which can lead to missed opportunities for dimensionality reduction.
-    - It only measures the degree of linear relationship and not the strength of effect of one variable on the other.
-    - The cutoff value of 0.7 for high correlation is a somewhat arbitrary choice and some valid dependencies might be
-    missed if they have a correlation coefficient less than this value.
-    """
+    - Potential risk of overfitting.

-    … (removed lines 51-64 not captured)
-        zmin=-1,
-        zmax=1,
-    )
-
-    annotations = []
-    for i, row in enumerate(corr_matrix.values):
-        for j, value in enumerate(row):
-            color = "#ffffff" if abs(value) > 0.7 else "#000000"
-            annotations.append(
-                go.layout.Annotation(
-                    text=str(round(value, 2)),
-                    x=corr_matrix.columns[j],
-                    y=corr_matrix.index[i],
-                    showarrow=False,
-                    font=dict(color=color),
-                )
-            )
+    ### Strengths
+
+    - Detects and quantifies the linearity of relationships between variables, aiding in identifying redundant
+    variables to simplify models and potentially improve performance.
+    - The heatmap visualization provides an easy-to-understand overview of correlations, beneficial for users not
+    comfortable with numerical matrices.
+
+    ### Limitations
+
+    - Limited to detecting linear relationships, potentially missing non-linear relationships which impede
+    opportunities for dimensionality reduction.
+    - Measures only the degree of linear relationship, not the strength of one variable's effect on another.
+    - The 0.7 correlation threshold is arbitrary and might exclude valid dependencies with lower coefficients.
+    """

-    … (removed lines 83-101 not captured)
+    corr_matrix = dataset.df.corr(numeric_only=True)
+    heatmap = go.Heatmap(
+        z=corr_matrix.values,
+        x=list(corr_matrix.columns),
+        y=list(corr_matrix.index),
+        colorscale="rdbu",
+        zmin=-1,
+        zmax=1,
+    )
+
+    annotations = []
+    for i, row in enumerate(corr_matrix.values):
+        for j, value in enumerate(row):
+            color = "#ffffff" if abs(value) > 0.7 else "#000000"
+            annotations.append(
+                go.layout.Annotation(
+                    text=str(round(value, 2)),
+                    x=corr_matrix.columns[j],
+                    y=corr_matrix.index[i],
+                    showarrow=False,
+                    font=dict(color=color),
                 )
-… (removed lines 103-104 not captured)
+            )
+
+    layout = go.Layout(
+        annotations=annotations,
+        xaxis=dict(side="top"),
+        yaxis=dict(scaleanchor="x", scaleratio=1),
+        width=800,
+        height=800,
+        autosize=True,
+        paper_bgcolor="rgba(0,0,0,0)",
+        plot_bgcolor="rgba(0,0,0,0)",
+    )
+
+    fig = go.Figure(data=[heatmap], layout=layout)
+
+    return fig
```