validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
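Across this release, built-in tests migrate from class-based implementations (subclasses of `Metric` or `ThresholdTest` with `run`, `summary`, and `cache_results` methods) to plain functions decorated with `@tags` and `@tasks` that take their inputs, such as `VMDataset` objects and parameters, as arguments and return their tables directly; threshold-style tests return a `(tables, passed)` tuple. The hunks below for `DatasetSplit`, `DescriptiveStatistics`, `DickeyFullerGLS`, and `Duplicates` all follow this pattern. A minimal sketch of the new style (the test name and table contents here are hypothetical; the decorators, input type, and return conventions come from the hunks below):

```python
from validmind import tags, tasks
from validmind.vm_models import VMDataset


@tags("tabular_data", "data_quality")
@tasks("classification", "regression")
def RowCountCheck(dataset: VMDataset, min_rows: int = 100):
    """Checks that the dataset has at least `min_rows` rows (hypothetical test)."""
    n_rows = len(dataset.df)
    table = [{"Metric": "Row Count", "Value": n_rows}]
    # Threshold-style tests return (tables, passed); metric-style tests such as
    # DatasetSplit and DescriptiveStatistics return just their table(s).
    return {"Row Count Results": table}, n_rows >= min_rows
```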
```diff
--- a/validmind/tests/data_validation/DatasetSplit.py
+++ b/validmind/tests/data_validation/DatasetSplit.py
@@ -2,10 +2,22 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from
+from typing import List
 
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-
+DATASET_LABELS = {
+    "train_ds": "Training",
+    "test_ds": "Test",
+    "validation_ds": "Validation",
+    "total": "Total",
+}
+
+
+@tags("tabular_data", "time_series_data", "text_data")
+@tasks("classification", "regression", "text_classification", "text_summarization")
+def DatasetSplit(datasets: List[VMDataset]):
     """
     Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML
     model.
@@ -47,80 +59,43 @@ class DatasetSplit(Metric):
     - Potential lack of compatibility with more complex modes of data splitting (for example, stratified or time-based
     splits) could limit the applicability of this test.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        """
-        table_records = []
-        for key, value in raw_results.items():
-            if key.endswith("_size"):
-                dataset_name = key.replace("_size", "")
-                if dataset_name == "total":
-                    table_records.append(
-                        {
-                            "Dataset": "Total",
-                            "Size": value,
-                            "Proportion": "100%",
-                        }
-                    )
-                    continue
-
-                proportion = raw_results[f"{dataset_name}_proportion"] * 100
-                table_records.append(
+    results = {}
+    total_size = 0
+
+    # First calculate the total size of the dataset
+    for dataset in datasets:
+        if dataset is not None:
+            total_size += len(dataset.df)
+
+    # Then calculate the proportion of each dataset
+    for dataset in datasets:
+        if dataset is not None:
+            results[f"{dataset.input_id}_size"] = len(dataset.df)
+            results[f"{dataset.input_id}_proportion"] = len(dataset.df) / total_size
+
+    results["total_size"] = total_size
+
+    table = []
+    for key, value in results.items():
+        if key.endswith("_size"):
+            dataset_name = key.replace("_size", "")
+            if dataset_name == "total":
+                table.append(
                     {
-                        "Dataset":
+                        "Dataset": "Total",
                         "Size": value,
-                        "Proportion":
+                        "Proportion": "100%",
                     }
                 )
-
-
-
-
-
-
-
-
-
-
-
-
-                total_size += len(dataset.df)
-
-        # Then calculate the proportion of each dataset
-        for dataset in available_datasets:
-            if dataset is not None:
-                results[f"{dataset.input_id}_size"] = len(dataset.df)
-                results[f"{dataset.input_id}_proportion"] = len(dataset.df) / total_size
-
-        results["total_size"] = total_size
-
-        return self.cache_results(results)
-
-    def test(self):
-        """Unit Test for DatasetSplit Metric"""
-        assert self.result is not None
-
-        assert self.result.metric is not None
-        assert isinstance(self.result.metric.value, dict)
-
-        assert self.result.figures is None
+                continue
+
+            proportion = results[f"{dataset_name}_proportion"] * 100
+            table.append(
+                {
+                    "Dataset": dataset_name,  # DatasetSplit.dataset_labels[dataset_name],
+                    "Size": value,
+                    "Proportion": f"{proportion:.2f}%",
+                }
+            )
+
+    return table
```
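Since `DatasetSplit` now takes a list of `VMDataset` objects directly, it can be invoked through the test runner with an explicit `inputs` mapping. A hedged usage sketch: it assumes the `run_test` entry point in `validmind/tests/run.py` keeps its test-ID-plus-inputs signature and that `vm.init_dataset` wraps raw DataFrames into `VMDataset` objects.

```python
import pandas as pd
import validmind as vm
from validmind.tests import run_test

train_df = pd.DataFrame({"x": range(80), "y": [0, 1] * 40})
test_df = pd.DataFrame({"x": range(20), "y": [0, 1] * 10})

# input_id feeds the f"{dataset.input_id}_size" keys seen in the new code
train_ds = vm.init_dataset(dataset=train_df, input_id="train_ds")
test_ds = vm.init_dataset(dataset=test_df, input_id="test_ds")

result = run_test(
    "validmind.data_validation.DatasetSplit",
    inputs={"datasets": [train_ds, test_ds]},
)
```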
```diff
--- a/validmind/tests/data_validation/DescriptiveStatistics.py
+++ b/validmind/tests/data_validation/DescriptiveStatistics.py
@@ -2,16 +2,51 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
 from validmind.utils import format_records
-from validmind.vm_models import
+from validmind.vm_models import VMDataset
+
+
+def get_summary_statistics_numerical(df, numerical_fields):
+    percentiles = [0.25, 0.5, 0.75, 0.90, 0.95]
+
+    summary_stats = df[numerical_fields].describe(percentiles=percentiles).T
+    summary_stats = summary_stats[
+        ["count", "mean", "std", "min", "25%", "50%", "75%", "90%", "95%", "max"]
+    ]
+    summary_stats.columns = summary_stats.columns.str.title()
+    summary_stats.reset_index(inplace=True)
+    summary_stats.rename(columns={"index": "Name"}, inplace=True)
+
+    return summary_stats
+
+
+def get_summary_statistics_categorical(df, categorical_fields):
+    summary_stats = pd.DataFrame()
 
+    for column in df[categorical_fields].columns:
+        top_value = df[column].value_counts().idxmax()
+        top_freq = df[column].value_counts().max()
+        summary_stats.loc[column, "Count"] = df[column].count()
+        summary_stats.loc[column, "Number of Unique Values"] = df[column].nunique()
+        summary_stats.loc[column, "Top Value"] = top_value
+        summary_stats.loc[column, "Top Value Frequency"] = top_freq
+        summary_stats.loc[column, "Top Value Frequency %"] = (
+            top_freq / df[column].count()
+        ) * 100
 
-
-
+    summary_stats.reset_index(inplace=True)
+    summary_stats.rename(columns={"index": "Name"}, inplace=True)
+
+    return summary_stats
+
+
+@tags("tabular_data", "time_series_data")
+@tasks("classification", "regression")
+def DescriptiveStatistics(dataset: VMDataset):
     """
     Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's
     dataset.
@@ -57,84 +92,23 @@ class DescriptiveStatistics(Metric):
     - Should be used in conjunction with other statistical tests to provide a comprehensive understanding of the
     model's data.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-        ]
-
-
-
-
-        return format_records(summary_stats)
-
-    def get_summary_statistics_categorical(self, df, categorical_fields):
-        summary_stats = pd.DataFrame()
-
-        for column in df[categorical_fields].columns:
-            top_value = df[column].value_counts().idxmax()
-            top_freq = df[column].value_counts().max()
-            summary_stats.loc[column, "Count"] = df[column].count()
-            summary_stats.loc[column, "Number of Unique Values"] = df[column].nunique()
-            summary_stats.loc[column, "Top Value"] = top_value
-            summary_stats.loc[column, "Top Value Frequency"] = top_freq
-            summary_stats.loc[column, "Top Value Frequency %"] = (
-                top_freq / df[column].count()
-            ) * 100
-
-        summary_stats.reset_index(inplace=True)
-        summary_stats.rename(columns={"index": "Name"}, inplace=True)
-
-        return format_records(summary_stats)
-
-    def summary(self, metric_value):
-        """
-        Build two tables: one for summarizing numerical variables and one for categorical variables
-        """
-        summary_stats_numerical = metric_value["numerical"]
-        summary_stats_categorical = metric_value["categorical"]
-        results = []
-        if len(summary_stats_numerical) != 0:
-            results.append(
-                ResultTable(
-                    data=summary_stats_numerical,
-                    metadata=ResultTableMetadata(title="Numerical Variables"),
-                )
-            )
-        if len(summary_stats_categorical) != 0:
-            results.append(
-                ResultTable(
-                    data=summary_stats_categorical,
-                    metadata=ResultTableMetadata(title="Categorical Variables"),
-                )
-            )
-
-        return ResultSummary(results=results)
-
-    def run(self):
-        feature_columns = self.inputs.dataset.feature_columns
-        numerical_feature_columns = self.inputs.dataset.feature_columns_numeric
-        categorical_feature_columns = self.inputs.dataset.feature_columns_categorical
-
-        df = self.inputs.dataset.df[feature_columns]
-
-        summary_stats_numerical = self.get_summary_statistics_numerical(
-            df, numerical_feature_columns
-        )
-        summary_stats_categorical = self.get_summary_statistics_categorical(
-            df, categorical_feature_columns
-        )
-        return self.cache_results(
-            {
-                "numerical": summary_stats_numerical,
-                "categorical": summary_stats_categorical,
-            }
+    tables = {}
+
+    summary_stats_numerical = get_summary_statistics_numerical(
+        dataset.df, dataset.feature_columns_numeric
+    )
+    if not summary_stats_numerical.empty:
+        tables["Numerical Variables"] = format_records(summary_stats_numerical)
+
+    summary_stats_categorical = get_summary_statistics_categorical(
+        dataset.df, dataset.feature_columns_categorical
+    )
+    if not summary_stats_categorical.empty:
+        tables["Categorical Variables"] = format_records(summary_stats_categorical)
+
+    if not tables:
+        raise SkipTestError(
+            "No numerical or categorical variables found in the dataset."
        )
+
+    return tables
```
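With the `run`/`summary`/`cache_results` plumbing gone, the decorated function can also be exercised directly. A sketch under the assumption that `@tags`/`@tasks` only attach metadata (leaving the function callable) and that a `VMDataset` built via `vm.init_dataset` exposes the `df`, `feature_columns_numeric`, and `feature_columns_categorical` attributes the function reads:

```python
import pandas as pd
import validmind as vm
from validmind.tests.data_validation.DescriptiveStatistics import DescriptiveStatistics

df = pd.DataFrame({"age": [25, 32, 47, 51], "segment": ["a", "b", "a", "c"]})
ds = vm.init_dataset(dataset=df, feature_columns=["age", "segment"])

# Returns {"Numerical Variables": [...], "Categorical Variables": [...]},
# or raises SkipTestError when neither table could be built.
tables = DescriptiveStatistics(ds)
```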
```diff
--- a/validmind/tests/data_validation/DFGLSArch.py
+++ b/validmind/tests/data_validation/DickeyFullerGLS.py
@@ -2,20 +2,21 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 from arch.unitroot import DFGLS
 from numpy.linalg import LinAlgError
 
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
-from validmind.vm_models import
+from validmind.vm_models import VMDataset
 
 logger = get_logger(__name__)
 
 
-@
-
+@tags("time_series_data", "forecasting", "unit_root_test")
+@tasks("regression")
+def DickeyFullerGLS(dataset: VMDataset):
     """
     Assesses stationarity in time series data using the Dickey-Fuller GLS test to determine the order of integration.
 
@@ -56,77 +57,44 @@ class DFGLSArch(Metric):
     - The test also presents challenges when dealing with shorter time series data or volatile data, not producing
     reliable results in these cases.
     """
+    df = dataset.df.dropna()
 
-
-
-
-
-
-    def run(self):
-        """
-        Calculates Dickey-Fuller GLS metric for each of the dataset features
-        """
-        dataset = self.inputs.dataset.df
-
-        # Check if the dataset is a time series
-        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
-            raise ValueError(
-                "Dataset index must be a datetime or period index for time series analysis."
-            )
+    if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+        raise SkipTestError(
+            "Dataset index must be a datetime or period index for time series analysis."
+        )
 
-
-
-
-
+    df = df.apply(pd.to_numeric, errors="coerce")
+
+    dfgls_values = []
+
+    for col in df.columns:
+        try:
+            dfgls_out = DFGLS(df[col].values)
+            dfgls_values.append(
+                {
+                    "Variable": col,
+                    "stat": dfgls_out.stat,
+                    "pvalue": dfgls_out.pvalue,
+                    "usedlag": dfgls_out.lags,
+                    "nobs": dfgls_out.nobs,
+                }
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                }
-            )
-        except LinAlgError as e:
-            logger.error(
-                f"SVD did not converge while processing column '{col}'. This could be due to numerical instability or multicollinearity. Error details: {e}"
-            )
-            dfgls_values.append(
-                {
-                    "Variable": col,
-                    "stat": None,
-                    "pvalue": None,
-                    "usedlag": None,
-                    "nobs": None,
-                    "error": str(e),
-                }
-            )
-
-        return self.cache_results({"dfgls_results": dfgls_values})
-
-    def summary(self, metric_value):
-        """
-        Build a table for summarizing the DFGLS results
-        """
-        dfgls_results = metric_value["dfgls_results"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=dfgls_results,
-                    metadata=ResultTableMetadata(title="DFGLS Test Results"),
-                )
-            ]
-        )
+        except LinAlgError as e:
+            logger.error(
+                f"SVD did not converge while processing column '{col}'. This could be due to numerical instability or multicollinearity. Error details: {e}"
+            )
+            dfgls_values.append(
+                {
+                    "Variable": col,
+                    "stat": None,
+                    "pvalue": None,
+                    "usedlag": None,
+                    "nobs": None,
+                    "error": str(e),
+                }
+            )
+
+    return {
+        "DFGLS Test Results": dfgls_values,
+    }
```
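For reference, the `arch` call at the heart of the rewritten test can be run standalone; `DFGLS` exposes the `stat`, `pvalue`, `lags`, and `nobs` attributes the test collects per column, as the hunk above shows. A self-contained sketch (the random-walk input is purely illustrative):

```python
import numpy as np
from arch.unitroot import DFGLS

rng = np.random.default_rng(0)
random_walk = np.cumsum(rng.normal(size=250))  # unit root by construction

res = DFGLS(random_walk)
# Expect a high p-value: the GLS-detrended Dickey-Fuller test should fail to
# reject the unit-root null for a random walk.
print(res.stat, res.pvalue, res.lags, res.nobs)
```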
```diff
--- a/validmind/tests/data_validation/Duplicates.py
+++ b/validmind/tests/data_validation/Duplicates.py
@@ -2,22 +2,14 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
 import pandas as pd
 
-from validmind
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+from validmind import tags, tasks
 
 
-@
-
+@tags("tabular_data", "data_quality", "text_data")
+@tasks("classification", "regression")
+def Duplicates(dataset, min_threshold=1):
     """
     Tests dataset for duplicate entries, ensuring model reliability via data quality verification.
 
@@ -59,82 +51,21 @@ class Duplicates(ThresholdTest):
     for very large datasets.
     - Can only check for exact duplicates and may miss semantically similar information packaged differently.
     """
+    df = dataset.df[dataset.text_column or dataset.feature_columns]
+
+    duplicate_rows_count = df.duplicated().sum()
+    percentage_duplicate_rows = (duplicate_rows_count / len(df)) * 100
+
+    result_df = pd.DataFrame(
+        {
+            "Number of Duplicates": [duplicate_rows_count],
+            "Percentage of Rows (%)": [percentage_duplicate_rows],
+        }
+    )
+
+    # test has passed if the total sum of duplicates is less than the threshold
+    passed = result_df["Number of Duplicates"].sum() < min_threshold
 
-
-
-
-    tasks = ["classification", "regression"]
-    tags = ["tabular_data", "data_quality", "text_data"]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The duplicates test returns results like these:
-        [{"values": {"n_duplicates": 0, "p_duplicates": 0.0}, "passed": true}]
-        So we build a table with 1 row and show number of duplicates and percentage of duplicates.
-        """
-        result = results[0]
-        results_table = [{k: v for k, v in row.items()} for row in result.values]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(
-                        title="Duplicate Rows Results for Dataset"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if self.inputs.dataset.text_column:
-            columns = self.inputs.dataset.text_column
-        else:
-            columns = self.inputs.dataset.feature_columns
-
-        df = self.inputs.dataset.df[columns]
-        # Find duplicate rows
-        duplicate_rows = df.duplicated()
-
-        # Calculate number of duplicate rows
-        duplicate_rows_count = duplicate_rows.sum()
-
-        # Calculate total number of rows
-        total_rows = len(df)
-
-        # Calculate percentage of duplicate rows
-        percentage_duplicate_rows = (duplicate_rows_count / total_rows) * 100
-
-        # Create a DataFrame with results
-        result_df = pd.DataFrame(
-            {
-                "Number of Duplicates": [duplicate_rows_count],
-                "Percentage of Rows (%)": [percentage_duplicate_rows],
-            }
-        )
-
-        # test has passed if the total sum of duplicates is less than the threshold
-        n_duplicates = result_df["Number of Duplicates"].sum()
-        passed = n_duplicates < self.params["min_threshold"]
-
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values=result_df.to_dict(orient="records"),
-            )
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
-
-    def test(self):
-        # Check that result object is not None
-        assert self.result is not None
-        # Check that we have a list of test results
-        assert isinstance(self.result.test_results.results, list)
-        # Check if the 'passed' variable in results reflects the test correctly
-        for result in self.result.test_results.results[1:]:
-            assert result.passed == (
-                result.values["n_duplicates"] < self.params["min_threshold"]
-            )
-        expected_results_count = 1
-        assert len(self.result.test_results.results) == expected_results_count
+    return {
+        "Duplicate Rows Results for Dataset": result_df.to_dict(orient="records")
+    }, passed
```
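The streamlined `Duplicates` body reduces to two pandas primitives: `DataFrame.duplicated()` flags every repeat of an earlier row, and the sum of those flags is compared against `min_threshold`. A quick illustration:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})

duplicate_rows_count = df.duplicated().sum()                      # 1 (row 1 repeats row 0)
percentage_duplicate_rows = duplicate_rows_count / len(df) * 100  # ~33.33
passed = duplicate_rows_count < 1                                 # False with min_threshold=1
```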