validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -16,38 +16,45 @@ class DescriptiveStatistics(Metric):
|
|
16
16
|
Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's
|
17
17
|
dataset.
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
19
|
+
### Purpose
|
20
|
+
|
21
|
+
The purpose of the Descriptive Statistics metric is to provide a comprehensive summary of both numerical and
|
22
|
+
categorical data within a dataset. This involves statistics such as count, mean, standard deviation, minimum and
|
23
|
+
maximum values for numerical data. For categorical data, it calculates the count, number of unique values, most
|
24
|
+
common value and its frequency, and the proportion of the most frequent value relative to the total. The goal is to
|
25
|
+
visualize the overall distribution of the variables in the dataset, aiding in understanding the model's behavior
|
26
|
+
and predicting its performance.
|
27
|
+
|
28
|
+
### Test Mechanism
|
29
|
+
|
30
|
+
The testing mechanism utilizes two in-built functions of pandas dataframes: `describe()` for numerical fields and
|
31
|
+
`value_counts()` for categorical fields. The `describe()` function pulls out several summary statistics, while
|
32
|
+
`value_counts()` accounts for unique values. The resulting data is formatted into two distinct tables, one for
|
33
|
+
numerical and another for categorical variable summaries. These tables provide a clear summary of the main
|
34
|
+
characteristics of the variables, which can be instrumental in assessing the model's performance.
|
35
|
+
|
36
|
+
### Signs of High Risk
|
37
|
+
|
33
38
|
- Skewed data or significant outliers can represent high risk. For numerical data, this may be reflected via a
|
34
39
|
significant difference between the mean and the median (50th percentile).
|
35
40
|
- For categorical data, a lack of diversity (low count of unique values), or overdominance of a single category
|
36
41
|
(high frequency of the top value) can indicate high risk.
|
37
42
|
|
38
|
-
|
39
|
-
|
40
|
-
|
43
|
+
### Strengths
|
44
|
+
|
45
|
+
- Provides a comprehensive summary of the dataset, shedding light on the distribution and characteristics of the
|
46
|
+
variables under consideration.
|
41
47
|
- It is a versatile and robust method, applicable to both numerical and categorical data.
|
42
|
-
-
|
48
|
+
- Helps highlight crucial anomalies such as outliers, extreme skewness, or lack of diversity, which are vital in
|
43
49
|
understanding model behavior during testing and validation.
|
44
50
|
|
45
|
-
|
51
|
+
### Limitations
|
52
|
+
|
46
53
|
- While this metric offers a high-level overview of the data, it may fail to detect subtle correlations or complex
|
47
54
|
patterns.
|
48
|
-
-
|
55
|
+
- Does not offer any insights on the relationship between variables.
|
49
56
|
- Alone, descriptive statistics cannot be used to infer properties about future unseen data.
|
50
|
-
-
|
57
|
+
- Should be used in conjunction with other statistical tests to provide a comprehensive understanding of the
|
51
58
|
model's data.
|
52
59
|
"""
|
53
60
|
|
@@ -21,35 +21,43 @@ class Duplicates(ThresholdTest):
|
|
21
21
|
"""
|
22
22
|
Tests dataset for duplicate entries, ensuring model reliability via data quality verification.
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
24
|
+
### Purpose
|
25
|
+
|
26
|
+
The 'Duplicates' test is designed to check for duplicate rows within the dataset provided to the model. It serves
|
27
|
+
as a measure of data quality, ensuring that the model isn't merely memorizing duplicate entries or being swayed by
|
28
|
+
redundant information. This is an important step in the pre-processing of data for both classification and
|
29
|
+
regression tasks.
|
30
|
+
|
31
|
+
### Test Mechanism
|
32
|
+
|
33
|
+
This test operates by checking each row for duplicates in the dataset. If a text column is specified in the
|
34
|
+
dataset, the test is conducted on this column; if not, the test is run on all feature columns. The number and
|
35
|
+
percentage of duplicates are calculated and returned in a DataFrame. Additionally, the test passes if the total
|
36
|
+
count of duplicates falls below a specified minimum threshold.
|
37
|
+
|
38
|
+
### Signs of High Risk
|
39
|
+
|
40
|
+
- A high number of duplicate rows in the dataset, which can lead to overfitting where the model performs well on
|
41
|
+
the training data but poorly on unseen data.
|
42
|
+
- A high percentage of duplicate rows in the dataset, indicating potential problems with data collection or
|
43
|
+
processing.
|
44
|
+
|
45
|
+
### Strengths
|
46
|
+
|
41
47
|
- Assists in improving the reliability of the model's training process by ensuring the training data is not
|
42
|
-
contaminated with duplicate entries which can distort statistical analyses.
|
43
|
-
- Provides both absolute
|
48
|
+
contaminated with duplicate entries, which can distort statistical analyses.
|
49
|
+
- Provides both absolute numbers and percentage values of duplicate rows, giving a thorough overview of data
|
50
|
+
quality.
|
44
51
|
- Highly customizable as it allows for setting a user-defined minimum threshold to determine if the test has been
|
45
52
|
passed.
|
46
53
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
54
|
+
### Limitations
|
55
|
+
|
56
|
+
- Does not distinguish between benign duplicates (i.e., coincidental identical entries in different rows) and
|
57
|
+
problematic duplicates originating from data collection or processing errors.
|
58
|
+
- The test becomes more computationally intensive as the size of the dataset increases, which might not be suitable
|
59
|
+
for very large datasets.
|
60
|
+
- Can only check for exact duplicates and may miss semantically similar information packaged differently.
|
53
61
|
"""
|
54
62
|
|
55
63
|
name = "duplicates"
|
@@ -10,41 +10,43 @@ from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableM
|
|
10
10
|
|
11
11
|
class EngleGrangerCoint(Metric):
|
12
12
|
"""
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
over time.
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
13
|
+
Assesses the degree of co-movement between pairs of time series data using the Engle-Granger cointegration test.
|
14
|
+
|
15
|
+
### Purpose
|
16
|
+
|
17
|
+
The intent of this Engle-Granger cointegration test is to explore and quantify the degree of co-movement between
|
18
|
+
pairs of time series variables in a dataset. This is particularly useful in enhancing the accuracy of predictive
|
19
|
+
regressions whenever the underlying variables are co-integrated, i.e., they move together over time.
|
20
|
+
|
21
|
+
### Test Mechanism
|
22
|
+
|
23
|
+
The test first drops any non-applicable values from the input dataset and then iterates over each pair of variables
|
24
|
+
to apply the Engle-Granger cointegration test. The test generates a 'p' value, which is then compared against a
|
25
|
+
pre-specified threshold (0.05 by default). The pair is labeled as 'Cointegrated' if the 'p' value is less than or
|
26
|
+
equal to the threshold or 'Not cointegrated' otherwise. A summary table is returned by the metric showing
|
27
|
+
cointegration results for each variable pair.
|
28
|
+
|
29
|
+
### Signs of High Risk
|
30
|
+
|
31
|
+
- A significant number of hypothesized cointegrated variables do not pass the test.
|
32
|
+
- A considerable number of 'p' values are close to the threshold, indicating minor data fluctuations can switch the
|
33
|
+
decision between 'Cointegrated' and 'Not cointegrated'.
|
34
|
+
|
35
|
+
### Strengths
|
36
|
+
|
37
|
+
- Provides an effective way to analyze relationships between time series, particularly in contexts where it's
|
38
|
+
essential to check if variables move together in a statistically significant manner.
|
39
|
+
- Useful in various domains, especially finance or economics, where predictive models often hinge on understanding
|
40
|
+
how different variables move together over time.
|
41
|
+
|
42
|
+
### Limitations
|
43
|
+
|
44
|
+
- Assumes that the time series are integrated of the same order, which isn't always true in multivariate time
|
45
|
+
series datasets.
|
43
46
|
- The presence of non-stationary characteristics in the series or structural breaks can result in false positive
|
44
47
|
or negative cointegration results.
|
45
|
-
-
|
46
|
-
|
47
|
-
evaluation.
|
48
|
+
- May not perform well for small sample sizes due to lack of statistical power and should be supplemented with
|
49
|
+
other predictive indicators for a more robust model evaluation.
|
48
50
|
"""
|
49
51
|
|
50
52
|
type = "dataset"
|
@@ -2,108 +2,96 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from dataclasses import dataclass
|
6
5
|
|
7
6
|
import numpy as np
|
8
7
|
import plotly.graph_objects as go
|
9
8
|
|
10
|
-
from validmind
|
9
|
+
from validmind import tags, tasks
|
11
10
|
|
12
11
|
|
13
|
-
@
|
14
|
-
|
12
|
+
@tags("tabular_data", "visualization", "correlation")
|
13
|
+
@tasks("classification", "regression")
|
14
|
+
def FeatureTargetCorrelationPlot(dataset, fig_height=600):
|
15
15
|
"""
|
16
|
-
Visualizes the correlation between input features and model's target output in a color-coded horizontal bar
|
16
|
+
Visualizes the correlation between input features and the model's target output in a color-coded horizontal bar
|
17
|
+
plot.
|
17
18
|
|
18
|
-
|
19
|
-
the target output of a Machine Learning model. Understanding how each feature influences the model's predictions is
|
20
|
-
crucial - a higher correlation indicates stronger influence of the feature on the target variable. This correlation
|
21
|
-
study is especially advantageous during feature selection and for comprehending the model's operation.
|
19
|
+
### Purpose
|
22
20
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
21
|
+
This test is designed to graphically illustrate the correlations between distinct input features and the target
|
22
|
+
output of a Machine Learning model. Understanding how each feature influences the model's predictions is crucial—a
|
23
|
+
higher correlation indicates a stronger influence of the feature on the target variable. This correlation study is
|
24
|
+
especially advantageous during feature selection and for comprehending the model's operation.
|
25
|
+
|
26
|
+
### Test Mechanism
|
27
|
+
|
28
|
+
This FeatureTargetCorrelationPlot test computes and presents the correlations between the features and the target
|
29
|
+
variable using a specific dataset. These correlations are calculated and are then graphically represented in a
|
30
|
+
horizontal bar plot, color-coded based on the strength of the correlation. A hovering template can also be utilized
|
31
|
+
for informative tooltips. It is possible to specify the features to be analyzed and adjust the graph's height
|
32
|
+
according to need.
|
33
|
+
|
34
|
+
### Signs of High Risk
|
28
35
|
|
29
|
-
**Signs of High Risk**:
|
30
36
|
- There are no strong correlations (either positive or negative) between features and the target variable. This
|
31
37
|
could suggest high risk as the supplied features do not appear to significantly impact the prediction output.
|
32
38
|
- The presence of duplicated correlation values might hint at redundancy in the feature set.
|
33
39
|
|
34
|
-
|
40
|
+
### Strengths
|
41
|
+
|
35
42
|
- Provides visual assistance for interpreting correlations more effectively.
|
36
43
|
- Gives a clear and simple overview of how each feature affects the model's target variable.
|
37
44
|
- Beneficial for feature selection and grasping the model's prediction nature.
|
38
45
|
- Precise correlation values for each feature are offered by the hover template, contributing to a granular-level
|
39
46
|
comprehension.
|
40
47
|
|
41
|
-
|
48
|
+
### Limitations
|
49
|
+
|
42
50
|
- The test only accepts numerical data, meaning variables of other types need to be prepared beforehand.
|
43
51
|
- The plot assumes all correlations to be linear, thus non-linear relationships might not be captured effectively.
|
44
52
|
- Not apt for models that employ complex feature interactions, like Decision Trees or Neural Networks, as the test
|
45
53
|
may not accurately reflect their importance.
|
46
54
|
"""
|
47
55
|
|
48
|
-
|
49
|
-
|
50
|
-
default_params = {"features": None, "fig_height": 600}
|
51
|
-
tasks = ["classification", "regression"]
|
52
|
-
tags = ["tabular_data", "visualization", "feature_importance", "correlation"]
|
56
|
+
# Filter DataFrame based on features and target_column
|
57
|
+
df = dataset.df[dataset.feature_columns + [dataset.target_column]]
|
53
58
|
|
54
|
-
|
55
|
-
fig_height = self.params["fig_height"]
|
59
|
+
fig = _visualize_feature_target_correlation(df, dataset.target_column, fig_height)
|
56
60
|
|
57
|
-
|
58
|
-
features = self.inputs.dataset.feature_columns
|
59
|
-
else:
|
60
|
-
features = self.params["features"]
|
61
|
+
return fig
|
61
62
|
|
62
|
-
target_column = self.inputs.dataset.target_column
|
63
63
|
|
64
|
-
|
65
|
-
|
64
|
+
def _visualize_feature_target_correlation(df, target_column, fig_height):
|
65
|
+
# Compute correlations with the target variable
|
66
|
+
correlations = (
|
67
|
+
df.corr(numeric_only=True)[target_column].drop(target_column).to_frame()
|
68
|
+
)
|
69
|
+
correlations = correlations.loc[:, ~correlations.columns.duplicated()]
|
66
70
|
|
67
|
-
|
68
|
-
df, target_column, fig_height
|
69
|
-
)
|
71
|
+
correlations = correlations.sort_values(by=target_column, ascending=True)
|
70
72
|
|
71
|
-
|
73
|
+
# Create a gradual color map from red (1) to blue (-1)
|
74
|
+
color_map = np.linspace(1, -1, len(correlations))
|
75
|
+
colors = [
|
76
|
+
f"rgb({int(255 * (1 - val))}, 0, {int(255 * (1 + val))})" for val in color_map
|
77
|
+
]
|
72
78
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
# Create a gradual color map from red (1) to blue (-1)
|
83
|
-
color_map = np.linspace(1, -1, len(correlations))
|
84
|
-
colors = [
|
85
|
-
f"rgb({int(255 * (1 - val))}, 0, {int(255 * (1 + val))})"
|
86
|
-
for val in color_map
|
87
|
-
]
|
88
|
-
|
89
|
-
# Create a horizontal bar plot with gradual color mapping
|
90
|
-
fig = go.Figure(
|
91
|
-
data=go.Bar(
|
92
|
-
x=correlations[target_column],
|
93
|
-
y=correlations.index,
|
94
|
-
orientation="h",
|
95
|
-
marker=dict(color=colors),
|
96
|
-
hovertemplate="Feature: %{y}<br>Correlation: %{x:.2f}<extra></extra>", # Hover template for tooltips
|
97
|
-
)
|
79
|
+
# Create a horizontal bar plot with gradual color mapping
|
80
|
+
fig = go.Figure(
|
81
|
+
data=go.Bar(
|
82
|
+
x=correlations[target_column],
|
83
|
+
y=correlations.index,
|
84
|
+
orientation="h",
|
85
|
+
marker=dict(color=colors),
|
86
|
+
hovertemplate="Feature: %{y}<br>Correlation: %{x:.2f}<extra></extra>", # Hover template for tooltips
|
98
87
|
)
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
return [figure]
|
88
|
+
)
|
89
|
+
|
90
|
+
# Set the title and axis labels
|
91
|
+
fig.update_layout(
|
92
|
+
title=f"Correlations of Features vs Target Variable ({target_column})",
|
93
|
+
xaxis_title="",
|
94
|
+
yaxis_title="",
|
95
|
+
height=fig_height, # Adjust the height value as needed
|
96
|
+
)
|
97
|
+
return fig
|
@@ -22,26 +22,33 @@ class HighCardinality(ThresholdTest):
|
|
22
22
|
"""
|
23
23
|
Assesses the number of unique values in categorical columns to detect high cardinality and potential overfitting.
|
24
24
|
|
25
|
-
|
26
|
-
columns of a dataset. In this context, high cardinality implies the presence of a large number of unique,
|
27
|
-
non-repetitive values in the dataset.
|
25
|
+
### Purpose
|
28
26
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
The “High Cardinality” test is used to evaluate the number of unique values present in the categorical columns of a
|
28
|
+
dataset. In this context, high cardinality implies the presence of a large number of unique, non-repetitive values
|
29
|
+
in the dataset.
|
30
|
+
|
31
|
+
### Test Mechanism
|
32
|
+
|
33
|
+
The test first infers the dataset's type and then calculates an initial numeric threshold based on the test
|
34
|
+
parameters. It only considers columns classified as "Categorical". For each of these columns, the number of
|
35
|
+
distinct values (n_distinct) and the percentage of distinct values (p_distinct) are calculated. The test will pass
|
36
|
+
if n_distinct is less than the calculated numeric threshold. Lastly, the results, which include details such as
|
37
|
+
column name, number of distinct values, and pass/fail status, are compiled into a table.
|
38
|
+
|
39
|
+
### Signs of High Risk
|
34
40
|
|
35
|
-
**Signs of High Risk**:
|
36
41
|
- A large number of distinct values (high cardinality) in one or more categorical columns implies a high risk.
|
37
42
|
- A column failing the test (n_distinct >= num_threshold) is another indicator of high risk.
|
38
43
|
|
39
|
-
|
44
|
+
### Strengths
|
45
|
+
|
40
46
|
- The High Cardinality test is effective in early detection of potential overfitting and unwanted noise.
|
41
47
|
- It aids in identifying potential outliers and inconsistencies, thereby improving data quality.
|
42
|
-
- The test can be applied to both
|
48
|
+
- The test can be applied to both classification and regression task types, demonstrating its versatility.
|
49
|
+
|
50
|
+
### Limitations
|
43
51
|
|
44
|
-
**Limitations**:
|
45
52
|
- The test is restricted to only "Categorical" data types and is thus not suitable for numerical or continuous
|
46
53
|
features, limiting its scope.
|
47
54
|
- The test does not consider the relevance or importance of unique values in categorical features, potentially
|
@@ -22,36 +22,41 @@ class HighPearsonCorrelation(ThresholdTest):
|
|
22
22
|
"""
|
23
23
|
Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.
|
24
24
|
|
25
|
-
|
26
|
-
the main goal of identifying high correlations that might indicate feature redundancy or multicollinearity.
|
27
|
-
Identification of such issue allows developers and risk management teams to properly deal with potential impacts on
|
28
|
-
the machine learning model's performance and interpretability.
|
25
|
+
### Purpose
|
29
26
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
The High Pearson Correlation test measures the linear relationship between features in a dataset, with the main
|
28
|
+
goal of identifying high correlations that might indicate feature redundancy or multicollinearity. Identification
|
29
|
+
of such issues allows developers and risk management teams to properly deal with potential impacts on the machine
|
30
|
+
learning model's performance and interpretability.
|
31
|
+
|
32
|
+
### Test Mechanism
|
33
|
+
|
34
|
+
The test works by generating pairwise Pearson correlations for all features in the dataset, then sorting and
|
35
|
+
eliminating duplicate and self-correlations. It assigns a Pass or Fail based on whether the absolute value of the
|
36
|
+
correlation coefficient surpasses a pre-set threshold (default 0.3). Finally, it returns the top ten strongest
|
37
|
+
correlations regardless of passing or failing status.
|
38
|
+
|
39
|
+
### Signs of High Risk
|
34
40
|
|
35
|
-
**Signs of High Risk**:
|
36
41
|
- A high risk indication would be the presence of correlation coefficients exceeding the threshold.
|
37
42
|
- If the features share a strong linear relationship, this could lead to potential multicollinearity and model
|
38
43
|
overfitting.
|
39
44
|
- Redundancy of variables can undermine the interpretability of the model due to uncertainty over the authenticity
|
40
45
|
of individual variables' predictive power.
|
41
46
|
|
42
|
-
|
43
|
-
|
44
|
-
pairs.
|
45
|
-
-
|
46
|
-
|
47
|
-
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
relationships or dependencies.
|
52
|
-
-
|
53
|
-
-
|
54
|
-
|
47
|
+
### Strengths
|
48
|
+
|
49
|
+
- Provides a quick and simple means of identifying relationships between feature pairs.
|
50
|
+
- Generates a transparent output that displays pairs of correlated variables, the Pearson correlation coefficient,
|
51
|
+
and a Pass or Fail status for each.
|
52
|
+
- Aids in early identification of potential multicollinearity issues that may disrupt model training.
|
53
|
+
|
54
|
+
### Limitations
|
55
|
+
|
56
|
+
- Can only delineate linear relationships, failing to shed light on nonlinear relationships or dependencies.
|
57
|
+
- Sensitive to outliers where a few outliers could notably affect the correlation coefficient.
|
58
|
+
- Limited to identifying redundancy only within feature pairs; may fail to spot more complex relationships among
|
59
|
+
three or more variables.
|
55
60
|
- The top 10 result filter might not fully capture the richness of the data; an option to configure the number of
|
56
61
|
retained results could be helpful.
|
57
62
|
"""
|
@@ -12,14 +12,18 @@ from validmind.vm_models import Figure, Metric
|
|
12
12
|
@dataclass
|
13
13
|
class IQROutliersBarPlot(Metric):
|
14
14
|
"""
|
15
|
-
Visualizes outlier distribution across percentiles in numerical data using Interquartile Range (IQR) method.
|
15
|
+
Visualizes outlier distribution across percentiles in numerical data using the Interquartile Range (IQR) method.
|
16
16
|
|
17
|
-
|
18
|
-
evaluate the extent of outliers in numeric variables based on percentiles. Its primary purpose is to clarify the
|
19
|
-
dataset's distribution, flag possible abnormalities in it and gauge potential risks associated with processing
|
20
|
-
potentially skewed data, which can affect the machine learning model's predictive prowess.
|
17
|
+
### Purpose
|
21
18
|
|
22
|
-
|
19
|
+
The Interquartile Range Outliers Bar Plot (IQROutliersBarPlot) metric aims to visually analyze and evaluate the
|
20
|
+
extent of outliers in numeric variables based on percentiles. Its primary purpose is to clarify the dataset's
|
21
|
+
distribution, flag possible abnormalities in it, and gauge potential risks associated with processing potentially
|
22
|
+
skewed data, which can affect the machine learning model's predictive prowess.
|
23
|
+
|
24
|
+
### Test Mechanism
|
25
|
+
|
26
|
+
The test proceeds through the following steps:
|
23
27
|
|
24
28
|
1. For every numeric feature in the dataset, the 25th percentile (Q1) and 75th percentile (Q3) are calculated
|
25
29
|
before deriving the Interquartile Range (IQR), the difference between Q3 and Q1 (Q3 - Q1).
|
@@ -31,8 +35,7 @@ class IQROutliersBarPlot(Metric):
|
|
31
35
|
5. These counts are employed to construct a bar plot for the feature, showcasing the distribution of outliers
|
32
36
|
across different percentiles.
|
33
37
|
|
34
|
-
|
35
|
-
following signs:
|
38
|
+
### Signs of High Risk
|
36
39
|
|
37
40
|
- A prevalence of outliers in the data, potentially skewing its distribution.
|
38
41
|
- Outliers dominating higher percentiles (75-100) which implies the presence of extreme values, capable of severely
|
@@ -40,7 +43,7 @@ class IQROutliersBarPlot(Metric):
|
|
40
43
|
- Certain features harboring most of their values as outliers, which signifies that these features might not
|
41
44
|
contribute positively to the model's forecasting ability.
|
42
45
|
|
43
|
-
|
46
|
+
### Strengths
|
44
47
|
|
45
48
|
- Effectively identifies outliers in the data through visual means, facilitating easier comprehension and offering
|
46
49
|
insights into the outliers' possible impact on the model.
|
@@ -48,7 +51,7 @@ class IQROutliersBarPlot(Metric):
|
|
48
51
|
- Task-agnostic in nature; it is viable for both classification and regression tasks.
|
49
52
|
- Can handle large datasets as its operation does not hinge on computationally heavy operations.
|
50
53
|
|
51
|
-
|
54
|
+
### Limitations
|
52
55
|
|
53
56
|
- Its application is limited to numerical variables and does not extend to categorical ones.
|
54
57
|
- Relies on a predefined threshold (default being 1.5) for outlier identification, which may not be suitable for
|