validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
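The four file diffs reproduced below show the release's central refactor in detail: test classes that subclassed `Metric` or `ThresholdTest` become plain functions registered with the new `@tags` and `@tasks` decorators, taking their inputs (a `VMDataset`) and parameters as function arguments and returning tables, figures, and pass/fail booleans directly instead of going through `self.cache_results(...)`. As a rough sketch of the new style (the test name and body here are illustrative, not taken from the diff):

    from validmind import tags, tasks
    from validmind.vm_models import VMDataset


    @tags("tabular_data", "data_quality")
    @tasks("classification", "regression")
    def MyCustomCheck(dataset: VMDataset, threshold: float = 0.5):
        """The docstring doubles as the test's description."""
        table = [
            {"Column": col, "Pass/Fail": "Pass"}
            for col in dataset.feature_columns
        ]
        # Returning (table, bool) mirrors the refactored threshold tests below.
        return table, True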
validmind/tests/data_validation/EngleGrangerCoint.py

@@ -5,10 +5,14 @@
 import pandas as pd
 from statsmodels.tsa.stattools import coint
 
-from validmind
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
+from validmind.vm_models import VMDataset
 
 
-
+@tags("time_series_data", "statistical_test", "forecasting")
+@tasks("regression")
+def EngleGrangerCoint(dataset: VMDataset, threshold: float = 0.05):
     """
     Assesses the degree of co-movement between pairs of time series data using the Engle-Granger cointegration test.
 
@@ -48,80 +52,54 @@ class EngleGrangerCoint(Metric):
     - May not perform well for small sample sizes due to lack of statistical power and should be supplemented with
     other predictive indicators for a more robust model evaluation.
     """
+    df = dataset.df
 
-
-
-
-
-    tasks = ["regression"]
-    tags = ["time_series_data", "statistical_test", "forecasting"]
-
-    def run(self):
-        threshold = self.params["threshold"]
-        df = self.inputs.dataset.df.dropna()
-
-        # Create an empty DataFrame to store the results
-        summary_cointegration = pd.DataFrame()
-
-        columns = df.columns
-        num_vars = len(columns)
-
-        for i in range(num_vars):
-            for j in range(i + 1, num_vars):
-                var1 = columns[i]
-                var2 = columns[j]
-
-                # Perform the Engle-Granger cointegration test
-                _, p_value, _ = coint(df[var1], df[var2])
-
-                # Determine the decision based on the p-value and the significance level
-                decision = (
-                    "Cointegrated" if p_value <= threshold else "Not cointegrated"
-                )
-                pass_fail = "Pass" if p_value <= threshold else "Fail"
-
-                # Append the result of each test directly into the DataFrame
-                summary_cointegration = pd.concat(
-                    [
-                        summary_cointegration,
-                        pd.DataFrame(
-                            [
-                                {
-                                    "Variable 1": var1,
-                                    "Variable 2": var2,
-                                    "Test": "Engle-Granger",
-                                    "p-value": p_value,
-                                    "Threshold": threshold,
-                                    "Pass/Fail": pass_fail,
-                                    "Decision": decision,
-                                }
-                            ]
-                        ),
-                    ],
-                    ignore_index=True,
-                )
-
-        return self.cache_results(
-            {
-                "cointegration_analysis": summary_cointegration.to_dict(
-                    orient="records"
-                ),
-            }
+    # Validate that the index is datetime
+    if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+        raise SkipTestError(
+            "Dataset index must be a datetime or period index for cointegration analysis."
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
+    df = dataset.df.dropna()
+
+    summary_cointegration = pd.DataFrame()
+
+    columns = df.columns
+    num_vars = len(columns)
+
+    for i in range(num_vars):
+        for j in range(i + 1, num_vars):
+            var1 = columns[i]
+            var2 = columns[j]
+
+            # Perform the Engle-Granger cointegration test
+            _, p_value, _ = coint(df[var1], df[var2])
+
+            # Determine the decision based on the p-value and the significance level
+            decision = "Cointegrated" if p_value <= threshold else "Not cointegrated"
+            pass_fail = "Pass" if p_value <= threshold else "Fail"
+
+            # Append the result of each test directly into the DataFrame
+            summary_cointegration = pd.concat(
+                [
+                    summary_cointegration,
+                    pd.DataFrame(
+                        [
+                            {
+                                "Variable 1": var1,
+                                "Variable 2": var2,
+                                "Test": "Engle-Granger",
+                                "p-value": p_value,
+                                "Threshold": threshold,
+                                "Pass/Fail": pass_fail,
+                                "Decision": decision,
+                            }
+                        ]
                     ),
-
-
-
+                ],
+                ignore_index=True,
+            )
+
+    return {
+        "Cointegration Analysis Results": summary_cointegration,
+    }
validmind/tests/data_validation/HighCardinality.py

@@ -2,23 +2,18 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from
-from
-
-
-
-
-
-
-
-
-
-    ThresholdTestResult,
-)
-
-
-@dataclass
-class HighCardinality(ThresholdTest):
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
+
+
+@tags("tabular_data", "data_quality", "categorical_data")
+@tasks("classification", "regression")
+def HighCardinality(
+    dataset: VMDataset,
+    num_threshold: int = 100,
+    percent_threshold: float = 0.1,
+    threshold_type: str = "percent",
+):
     """
     Assesses the number of unique values in categorical columns to detect high cardinality and potential overfitting.
 
@@ -56,72 +51,29 @@ class HighCardinality(ThresholdTest):
     - The threshold (both number and percent) used for the test is static and may not be optimal for diverse datasets
     and varied applications. Further mechanisms to adjust and refine this threshold could enhance its effectiveness.
     """
+    df = dataset.df
 
-
-
-    default_params = {
-        "num_threshold": 100,
-        "percent_threshold": 0.1,
-        "threshold_type": "percent",  # or "num"
-    }
-    tasks = ["classification", "regression"]
-    tags = ["tabular_data", "data_quality", "categorical_data"]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The high cardinality test returns results like these:
-        [{"values": {"n_distinct": 0, "p_distinct": 0.0}, "column": "Exited", "passed": true}]
-        """
-        results_table = [
-            {
-                "Column": result.column,
-                "Number of Distinct Values": result.values["n_distinct"],
-                "Percentage of Distinct Values (%)": result.values["p_distinct"] * 100,
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(
-                        title="High Cardinality Results for Dataset"
-                    ),
-                )
-            ]
-        )
+    if threshold_type == "percent":
+        num_threshold = int(percent_threshold * df.shape[0])
 
-
-
-        dataset_types = typeset.infer_type(self.inputs.dataset.df)
+    table = []
+    all_passed = True
 
-
-
+    for col in dataset.feature_columns_categorical:
+        n_distinct = df[col].nunique()
+        p_distinct = n_distinct / df.shape[0]
+        passed = n_distinct < num_threshold
 
-
-
-
-
-
-
-
-
-
-            n_distinct = self.inputs.dataset.df[col].nunique()
-            p_distinct = n_distinct / rows
-
-            passed = n_distinct < num_threshold
+        table.append(
+            {
+                "Column": col,
+                "Number of Distinct Values": n_distinct,
+                "Percentage of Distinct Values (%)": p_distinct * 100,
+                "Pass/Fail": "Pass" if passed else "Fail",
+            }
+        )
 
-
-
-                column=col,
-                passed=passed,
-                values={
-                    "n_distinct": n_distinct,
-                    "p_distinct": p_distinct,
-                },
-            )
-        )
+        if not passed:
+            all_passed = False
 
-
+    return table, all_passed
validmind/tests/data_validation/HighPearsonCorrelation.py

@@ -2,23 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from
-from
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-import numpy as np
-import pandas as pd
 
-
-
-
-
-
-    ThresholdTestResult,
-)
-
-
-@dataclass
-class HighPearsonCorrelation(ThresholdTest):
+@tags("tabular_data", "data_quality", "correlation")
+@tasks("classification", "regression")
+def HighPearsonCorrelation(
+    dataset: VMDataset, max_threshold: float = 0.3, top_n_correlations: int = 10
+):
     """
     Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.
 
@@ -33,8 +25,9 @@ class HighPearsonCorrelation(ThresholdTest):
 
     The test works by generating pairwise Pearson correlations for all features in the dataset, then sorting and
     eliminating duplicate and self-correlations. It assigns a Pass or Fail based on whether the absolute value of the
-    correlation coefficient surpasses a pre-set threshold (defaulted at 0.3). It lastly returns the top
-    correlations regardless of passing or failing status
+    correlation coefficient surpasses a pre-set threshold (defaulted at 0.3). It lastly returns the top n strongest
+    correlations regardless of passing or failing status (where n is 10 by default but can be configured by passing the
+    `top_n_correlations` parameter).
 
     ### Signs of High Risk
 
@@ -57,86 +50,25 @@ class HighPearsonCorrelation(ThresholdTest):
     - Sensitive to outliers where a few outliers could notably affect the correlation coefficient.
     - Limited to identifying redundancy only within feature pairs; may fail to spot more complex relationships among
     three or more variables.
-    - The top 10 result filter might not fully capture the richness of the data; an option to configure the number of
-    retained results could be helpful.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-    "
-
-            ]
-        },
-        "column": "Balance",
-        "passed": false,
-    }
-]
-        """
-        results_table = [
-            {
-                "Columns": f'({result.column}, {result.values["correlations"][0]["column"]})',
-                "Coefficient": result.values["correlations"][0]["correlation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(
-                        title="High Pearson Correlation Results for Dataset"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        corr = self.inputs.dataset.df.corr(numeric_only=True)
-
-        # Create a table of correlation coefficients and column pairs
-        corr_table = corr.unstack().sort_values(
-            kind="quicksort", key=abs, ascending=False
-        )
-        corr_df = pd.DataFrame(corr_table).reset_index()
-        corr_df.columns = ["Column1", "Column2", "Coefficient"]
-
-        # Remove duplicate correlations and self-correlations
-        corr_df = corr_df.loc[corr_df["Column1"] < corr_df["Column2"]]
-
-        # Assign Pass/Fail based on correlation coefficient
-        corr_df["Pass/Fail"] = np.where(
-            corr_df["Coefficient"].abs() <= self.params["max_threshold"], "Pass", "Fail"
-        )
-
-        # Only keep the top 10 correlations. TODO: configurable
-        corr_df = corr_df.head(10)
-
-        passed = corr_df["Pass/Fail"].eq("Pass").all()
-
-        results = [
-            ThresholdTestResult(
-                column=col1,
-                values={
-                    "correlations": [
-                        {
-                            "column": col2,
-                            "correlation": coeff,
-                        }
-                    ]
-                },
-                passed=pass_fail == "Pass",
+    # Get correlation matrix for numeric columns
+    corr = dataset.df.corr(numeric_only=True)
+
+    # Create table of correlation coefficients and column pairs
+    pairs = []
+    for i in range(len(corr.columns)):
+        for j in range(i + 1, len(corr.columns)):
+            coeff = corr.iloc[i, j]
+            pairs.append(
+                {
+                    "Columns": f"({corr.columns[i]}, {corr.columns[j]})",
+                    "Coefficient": coeff,
+                    "Pass/Fail": "Pass" if abs(coeff) <= max_threshold else "Fail",
+                }
             )
-            for _, (col1, col2, coeff, pass_fail) in corr_df.iterrows()
-        ]
 
-
+    # Sort by absolute coefficient and get top N
+    pairs.sort(key=lambda x: abs(x["Coefficient"]), reverse=True)
+    pairs = pairs[:top_n_correlations]
+
+    return pairs, all(p["Pass/Fail"] == "Pass" for p in pairs)
validmind/tests/data_validation/IQROutliersBarPlot.py

@@ -2,15 +2,27 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import plotly.graph_objects as go
 
-from validmind
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
+
+
+def compute_outliers(series, threshold):
+    Q1 = series.quantile(0.25)
+    Q3 = series.quantile(0.75)
+    IQR = Q3 - Q1
+    lower_bound = Q1 - threshold * IQR
+    upper_bound = Q3 + threshold * IQR
+
+    return series[(series < lower_bound) | (series > upper_bound)]
 
 
-@
-
+@tags("tabular_data", "visualization", "numerical_data")
+@tasks("classification", "regression")
+def IQROutliersBarPlot(
+    dataset: VMDataset, threshold: float = 1.5, fig_width: int = 800
+):
     """
     Visualizes outlier distribution across percentiles in numerical data using the Interquartile Range (IQR) method.
 
@@ -54,99 +66,56 @@ class IQROutliersBarPlot(Metric):
     ### Limitations
 
     - Its application is limited to numerical variables and does not extend to categorical ones.
-    - Relies on a predefined threshold (default being 1.5) for outlier identification, which may not be suitable for
-    all cases.
     - Only reveals the presence and distribution of outliers and does not provide insights into how these outliers
     might affect the model's predictive performance.
     - The assumption that data is unimodal and symmetric may not always hold true. In cases with non-normal
     distributions, the results can be misleading.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    df = dataset.df
+
+    figures = []
+
+    for col in dataset.feature_columns_numeric:
+        # Skip binary features
+        if len(df[col].unique()) <= 2:
+            continue
+
+        outliers = compute_outliers(df[col], threshold)
+        if outliers.empty:
+            continue
+
+        Q1_count = outliers[
+            (outliers >= 0) & (outliers < outliers.quantile(0.25))
+        ].count()
+        Q2_count = outliers[
+            (outliers >= outliers.quantile(0.25)) & (outliers < outliers.median())
+        ].count()
+        Q3_count = outliers[
+            (outliers >= outliers.median()) & (outliers < outliers.quantile(0.75))
+        ].count()
+        Q4_count = outliers[
+            (outliers >= outliers.quantile(0.75)) & (outliers <= outliers.max())
+        ].count()
+
+        bar_data = [Q1_count, Q2_count, Q3_count, Q4_count]
+        percentile_labels = [
+            "0-25",
+            "25-50",
+            "50-75",
+            "75-100",
         ]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return series[(series < lower_bound) | (series > upper_bound)]
-
-    def detect_and_visualize_outliers(self, df, threshold, fig_width):
-        num_cols = df.columns.tolist()
-        figures = []
-
-        for col in num_cols:
-            # Compute outliers
-            outliers = self.compute_outliers(df[col], threshold)
-
-            if outliers.empty:
-                continue  # Skip plotting if there are no outliers
-
-            Q1_count = outliers[
-                (outliers >= 0) & (outliers < outliers.quantile(0.25))
-            ].count()
-            Q2_count = outliers[
-                (outliers >= outliers.quantile(0.25)) & (outliers < outliers.median())
-            ].count()
-            Q3_count = outliers[
-                (outliers >= outliers.median()) & (outliers < outliers.quantile(0.75))
-            ].count()
-            Q4_count = outliers[
-                (outliers >= outliers.quantile(0.75)) & (outliers <= outliers.max())
-            ].count()
-
-            # Prepare data for bar plot
-            bar_data = [Q1_count, Q2_count, Q3_count, Q4_count]
-            percentile_labels = [
-                "0-25",
-                "25-50",
-                "50-75",
-                "75-100",
-            ]
-
-            # Create a bar plot
-            fig = go.Figure(
-                data=[go.Bar(x=percentile_labels, y=bar_data, marker_color="skyblue")]
-            )
-
-            # Set layout properties
-            fig.update_layout(
-                title_text=col,
-                width=fig_width,
-                height=400,
-                plot_bgcolor="white",
-                xaxis_title="Percentile",
-                yaxis_title="Outlier Count",
-            )
-
-            # Create a Figure object and append to figures list
-            figure = Figure(for_object=self, key=f"{self.key}:{col}", figure=fig)
-            figures.append(figure)
-
-            return self.cache_results(figures=figures)
+        fig = go.Figure(
+            data=[go.Bar(x=percentile_labels, y=bar_data, marker_color="skyblue")]
+        )
+        fig.update_layout(
+            title_text=col,
+            width=fig_width,
+            height=400,
+            plot_bgcolor="white",
+            xaxis_title="Percentile",
+            yaxis_title="Outlier Count",
+        )
+        figures.append(fig)
+
+    return tuple(figures)