validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
```diff
--- a/validmind/tests/data_validation/IQROutliersTable.py
+++ b/validmind/tests/data_validation/IQROutliersTable.py
@@ -2,15 +2,23 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from …
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-import pandas as pd
 
-…
+def compute_outliers(series, threshold=1.5):
+    Q1 = series.quantile(0.25)
+    Q3 = series.quantile(0.75)
+    IQR = Q3 - Q1
+    lower_bound = Q1 - threshold * IQR
+    upper_bound = Q3 + threshold * IQR
 
+    return series[(series < lower_bound) | (series > upper_bound)]
 
-…
+
+@tags("tabular_data", "numerical_data")
+@tasks("classification", "regression")
+def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
     """
     Determines and summarizes outliers in numerical features using the Interquartile Range method.
 
@@ -53,80 +61,32 @@ class IQROutliersTable(Metric):
     - Default thresholds may not be optimal for data with heavy pre-processing, manipulation, or inherently high
       kurtosis (heavy tails).
     """
-…
-            df, features, threshold
+    df = dataset.df
+
+    outliers_table = []
+
+    for col in dataset.feature_columns_numeric:
+        # Skip binary features
+        if len(df[col].unique()) <= 2:
+            continue
+
+        outliers = compute_outliers(df[col], threshold)
+        if outliers.empty:
+            continue
+
+        outliers_table.append(
+            {
+                "Variable": col,
+                "Total Count of Outliers": outliers.count(),
+                "Mean Value of Variable": df[col].mean(),
+                "Minimum Outlier Value": outliers.min(),
+                "Outlier Value at 25th Percentile": outliers.quantile(0.25),
+                "Outlier Value at 50th Percentile": outliers.median(),
+                "Outlier Value at 75th Percentile": outliers.quantile(0.75),
+                "Maximum Outlier Value": outliers.max(),
+            }
         )
 
-…
-    @staticmethod
-    def compute_outliers(series, threshold=1.5):
-        Q1 = series.quantile(0.25)
-        Q3 = series.quantile(0.75)
-        IQR = Q3 - Q1
-        lower_bound = Q1 - threshold * IQR
-        upper_bound = Q3 + threshold * IQR
-        return series[(series < lower_bound) | (series > upper_bound)]
-
-    def detect_and_analyze_outliers(self, df, features, threshold=1.5):
-
-        outliers_summary = []
-        for feature in features:
-            outliers_series = self.compute_outliers(df[feature], threshold)
-            if not outliers_series.empty:
-                outliers_summary.append(
-                    {
-                        "Variable": feature,
-                        "Total Count of Outliers": outliers_series.count(),
-                        "Mean Value of Variable": df[feature].mean(),
-                        "Minimum Outlier Value": outliers_series.min(),
-                        "Outlier Value at 25th Percentile": outliers_series.quantile(
-                            0.25
-                        ),
-                        "Outlier Value at 50th Percentile": outliers_series.median(),
-                        "Outlier Value at 75th Percentile": outliers_series.quantile(
-                            0.75
-                        ),
-                        "Maximum Outlier Value": outliers_series.max(),
-                    }
-                )
-        outliers_summary_table = pd.DataFrame(outliers_summary)
-        return outliers_summary_table
-
-    def summary(self, metric_value):
-        outliers_summary_table = pd.DataFrame(metric_value["outliers_summary_table"])
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=outliers_summary_table,
-                    metadata=ResultTableMetadata(
-                        title="Summary of Outliers Detected by IQR Method"
-                    ),
-                ),
-            ]
-        )
+    return {
+        "Summary of Outliers Detected by IQR Method": outliers_table,
+    }
```
```diff
--- a/validmind/tests/data_validation/IsolationForestOutliers.py
+++ b/validmind/tests/data_validation/IsolationForestOutliers.py
@@ -3,17 +3,23 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import itertools
-from dataclasses import dataclass
 
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.ensemble import IsolationForest
 
-from validmind …
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 
-@…
-…
+@tags("tabular_data", "anomaly_detection")
+@tasks("classification")
+def IsolationForestOutliers(
+    dataset: VMDataset,
+    random_state: int = 0,
+    contamination: float = 0.1,
+    feature_columns: list = None,
+):
     """
     Detects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots.
 
@@ -55,64 +61,36 @@ class IsolationForestOutliers(Metric):
     - Potential failure in detecting collective anomalies if they behave similarly to normal data
     - Potential lack of precision in identifying which features contribute most to the anomalous behavior
     """
-…
-        "random_state": 0,
-        "contamination": 0.1,
-        "features_columns": None,
-    }
-    tasks = ["classification"]
-    tags = ["tabular_data", "anomaly_detection"]
-
-    required_inputs = ["dataset"]
-
-    def run(self):
-        if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.feature_columns_numeric
-        else:
-            features_list = self.params["features_columns"]
-
-        # Check if all elements from features_list are present in the feature columns
-        all_present = all(
-            elem in self.inputs.dataset.feature_columns for elem in features_list
+    if feature_columns and not all(elem in dataset.columns for elem in feature_columns):
+        raise ValueError(
+            "The list of feature columns provided do not match with training dataset feature columns"
         )
-…
+
+    feature_columns = feature_columns or dataset.feature_columns_numeric
+
+    df = dataset.df[feature_columns]
+
+    clf = IsolationForest(
+        random_state=random_state,
+        contamination=contamination,
+    )
+    clf.fit(df)
+    y_pred = clf.predict(df)
+
+    figures = []
+
+    for feature1, feature2 in itertools.combinations(feature_columns, 2):
+        fig = plt.figure()
+        ax = sns.scatterplot(
+            data=df, x=feature1, y=feature2, hue=y_pred, palette="bright"
        )
-…
-            handles, labels = ax.get_legend_handles_labels()
-            labels = list(map(lambda x: x.replace("-1", "Outliers"), labels))
-            labels = list(map(lambda x: x.replace("1", "Inliers"), labels))
-            ax.legend(handles, labels)
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
-
-            test_figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}:{feature1}_{feature2}",
-                    figure=fig,
-                )
-            )
-
-        return self.cache_results(figures=test_figures)
+        handles, labels = ax.get_legend_handles_labels()
+        labels = list(map(lambda x: x.replace("-1", "Outliers"), labels))
+        labels = list(map(lambda x: x.replace("1", "Inliers"), labels))
+        ax.legend(handles, labels)
+
+        figures.append(fig)
+
+    plt.close()
+
+    return tuple(figures)
```
```diff
--- a/validmind/tests/data_validation/KPSS.py
+++ b/validmind/tests/data_validation/KPSS.py
@@ -2,19 +2,20 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 from statsmodels.tsa.stattools import kpss
 
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
-from validmind.vm_models import …
+from validmind.vm_models import VMDataset
 
 logger = get_logger(__name__)
 
 
-@…
-…
+@tags("time_series_data", "stationarity", "unit_root_test", "statsmodels")
+@tasks("data_validation")
+def KPSS(dataset: VMDataset):
     """
     Assesses the stationarity of time-series data in a machine learning model using the KPSS unit root test.
 
@@ -53,81 +54,32 @@ class KPSS(Metric):
     - The reliability of the test is contingent on the number of lags selected, which introduces potential bias in the
       measurement.
     """
+    df = dataset.df.dropna()
 
-…
-    tags = [
-        "time_series_data",
-        "forecasting",
-        "stationarity",
-        "unit_root_test",
-        "statsmodels",
-    ]
-
-    def run(self):
-        """
-        Calculates KPSS for each of the dataset features
-        """
-        dataset = self.inputs.dataset.df
-
-        # Check if the dataset is a time series
-        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
-            raise ValueError(
-                "Dataset index must be a datetime or period index for time series analysis."
-            )
-
-        # Preprocessing: Drop rows with any NaN values
-        if dataset.isnull().values.any():
-            logger.warning(
-                "Dataset contains missing values. Rows with NaNs will be dropped."
-            )
-            dataset = dataset.dropna()
-
-        # Convert to numeric and handle non-numeric data
-        dataset = dataset.apply(pd.to_numeric, errors="coerce")
-
-        # Initialize a list to store KPSS results
-        kpss_values = []
-
-        for col in dataset.columns:
-            try:
-                kpss_stat, pvalue, usedlag, critical_values = kpss(dataset[col].values)
-                kpss_values.append(
-                    {
-                        "Variable": col,
-                        "stat": kpss_stat,
-                        "pvalue": pvalue,
-                        "usedlag": usedlag,
-                        "critical_values": critical_values,
-                    }
-                )
-            except Exception as e:
-                logger.error(f"Error processing column '{col}': {e}")
-                kpss_values.append(
-                    {
-                        "Variable": col,
-                        "stat": None,
-                        "pvalue": None,
-                        "usedlag": None,
-                        "critical_values": None,
-                        "error": str(e),
-                    }
-                )
-
-        return self.cache_results({"kpss_results": kpss_values})
-
-    def summary(self, metric_value):
-        """
-        Build a table for summarizing the KPSS results
-        """
-        kpss_results = metric_value["kpss_results"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=kpss_results,
-                    metadata=ResultTableMetadata(title="KPSS Test Results"),
-                )
-            ]
+    if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+        raise SkipTestError(
+            "Dataset index must be a datetime or period index for time series analysis."
         )
+
+    df = df.apply(pd.to_numeric, errors="coerce")
+
+    kpss_table = []
+
+    for col in dataset.columns:
+        kpss_stat, pvalue, usedlag, critical_values = kpss(df[col].values)
+        kpss_table.append(
+            {
+                "Variable": col,
+                "stat": kpss_stat,
+                "pvalue": pvalue,
+                "usedlag": usedlag,
+                "critical_values": critical_values,
+            }
+        )
+
+    if not kpss_table:
+        raise SkipTestError(f"No KPSS results found for dataset: {dataset.input_id}")
+
+    return {
+        "KPSS Test Results": kpss_table,
+    }
```
```diff
--- a/validmind/tests/data_validation/LaggedCorrelationHeatmap.py
+++ b/validmind/tests/data_validation/LaggedCorrelationHeatmap.py
@@ -6,13 +6,16 @@ import numpy as np
 import pandas as pd
 import plotly.figure_factory as ff
 
-from validmind …
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 # Define the 'coolwarm' color scale manually
 COOLWARM = [[0, "rgb(95,5,255)"], [0.5, "rgb(255,255,255)"], [1, "rgb(255,5,0)"]]
 
 
-…
+@tags("time_series_data", "visualization")
+@tasks("regression")
+def LaggedCorrelationHeatmap(dataset: VMDataset, num_lags: int = 10):
     """
     Assesses and visualizes correlation between target variable and lagged independent variables in a time-series
     dataset.
@@ -55,98 +58,47 @@ class LaggedCorrelationHeatmap(Metric):
       to interpret, while too few might overlook delayed effects.
     - This metric does not take into account any causal relationships, but merely demonstrates correlation.
     """
-…
-                        target_col: df[target_col],
-                        f"{ind_var_col}_lag{lag}": df[ind_var_col].shift(lag),
-                    }
-                )
-
-                temp_df = temp_df.dropna()
-
-                corr = temp_df[target_col].corr(temp_df[f"{ind_var_col}_lag{lag}"])
-
-                correlations[i, lag] = corr
-
-        return correlations
-
-    def _plot_heatmap(self, correlations, independent_vars, target_col, num_lags):
-        correlation_df = pd.DataFrame(
-            correlations,
-            columns=[f"{i}" for i in range(num_lags + 1)],
-            index=independent_vars,
-        )
-
-        # Create heatmap using Plotly
-        fig = ff.create_annotated_heatmap(
-            z=correlation_df.values,
-            x=list(correlation_df.columns),
-            y=list(correlation_df.index),
-            colorscale=COOLWARM,
-            annotation_text=correlation_df.round(2).values,
-            showscale=True,
-        )
-
-        fig.update_layout(
-            title={
-                "text": f"Correlations between {target_col} and Lags of Features",
-                "y": 0.95,
-                "x": 0.5,
-                "xanchor": "center",
-                "yanchor": "top",
-            },
-            font=dict(size=14),
-            xaxis_title="Lags",
-        )
-
-        return fig
-
-    def run(self):
-        if isinstance(self.inputs.dataset.target_column, list):
-            target_col = self.inputs.dataset.target_column[
-                0
-            ]  # take the first item from the list
-        else:
-            target_col = self.inputs.dataset.target_column
-
-        independent_vars = list(self.inputs.dataset.feature_columns)
-        num_lags = self.params.get("num_lags", 10)
-
-        if isinstance(target_col, list) and len(target_col) == 1:
-            target_col = target_col[0]
-
-        if not isinstance(target_col, str):
-            raise ValueError(
-                "The 'target_col' must be a single string or a list containing a single string"
-            )
-
-        df = self.inputs.dataset.df
-
-        correlations = self._compute_correlations(
-            df, target_col, independent_vars, num_lags
-        )
-        fig = self._plot_heatmap(correlations, independent_vars, target_col, num_lags)
-
-        figures = []
-        figures.append(
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
+    correlations = np.zeros((len(dataset.feature_columns), num_lags + 1))
+
+    for i, ind_var_col in enumerate(dataset.feature_columns):
+        for lag in range(num_lags + 1):
+            temp_df = pd.DataFrame(
+                {
+                    dataset.target_column: dataset.df[dataset.target_column],
+                    f"{ind_var_col}_lag{lag}": dataset.df[ind_var_col].shift(lag),
+                }
+            ).dropna()
+
+            corr = temp_df[dataset.target_column].corr(
+                temp_df[f"{ind_var_col}_lag{lag}"]
             )
-        )
 
-…
+            correlations[i, lag] = corr
+
+    correlation_df = pd.DataFrame(
+        correlations,
+        columns=[f"{i}" for i in range(num_lags + 1)],
+        index=dataset.feature_columns,
+    )
+
+    fig = ff.create_annotated_heatmap(
+        z=correlation_df.values,
+        x=list(correlation_df.columns),
+        y=list(correlation_df.index),
+        colorscale=COOLWARM,
+        annotation_text=correlation_df.round(2).values,
+        showscale=True,
+    )
+    fig.update_layout(
+        title={
+            "text": f"Correlations between {dataset.target_column} and Lags of Features",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        font=dict(size=14),
+        xaxis_title="Lags",
+    )
+
+    return fig
```
```diff
--- a/validmind/tests/data_validation/MissingValues.py
+++ b/validmind/tests/data_validation/MissingValues.py
@@ -2,20 +2,13 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from …
-from …
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
-
-@…
-…
+@tags("tabular_data", "data_quality")
+@tasks("classification", "regression")
+def MissingValues(dataset: VMDataset, min_threshold: int = 1):
     """
     Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold.
 
@@ -53,49 +46,15 @@ class MissingValues(ThresholdTest):
    - Does not account for data encoded as values like "-999" or "None," which might not technically classify as
      missing but could bear similar implications.
    """
-…
-        results_table = [
-            {
-                "Column": result.column,
-                "Number of Missing Values": result.values["n_missing"],
-                "Percentage of Missing Values (%)": result.values["p_missing"] * 100,
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(
-                        title="Missing Values Results for Dataset"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        rows = self.inputs.dataset.df.shape[0]
-
-        missing = self.inputs.dataset.df.isna().sum()
-        results = [
-            ThresholdTestResult(
-                column=col,
-                passed=missing[col] < self.params["min_threshold"],
-                values={"n_missing": missing[col], "p_missing": missing[col] / rows},
-            )
-            for col in missing.index
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    df = dataset.df
+    missing = df.isna().sum()
+
+    return [
+        {
+            "Column": col,
+            "Number of Missing Values": missing[col],
+            "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
+            "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
+        }
+        for col in missing.index
+    ], all(missing[col] < min_threshold for col in missing.index)
```