validmind 2.3.5__py3-none-any.whl → 2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +8 -1
- validmind/ai/utils.py +2 -1
- validmind/client.py +1 -0
- validmind/tests/__init__.py +14 -468
- validmind/tests/_store.py +102 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +7 -9
- validmind/tests/data_validation/ADF.py +8 -10
- validmind/tests/data_validation/ANOVAOneWayTable.py +8 -10
- validmind/tests/data_validation/AutoAR.py +2 -4
- validmind/tests/data_validation/AutoMA.py +2 -4
- validmind/tests/data_validation/AutoSeasonality.py +8 -10
- validmind/tests/data_validation/AutoStationarity.py +8 -10
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +8 -10
- validmind/tests/data_validation/BivariateHistograms.py +8 -10
- validmind/tests/data_validation/BivariateScatterPlots.py +8 -10
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +8 -10
- validmind/tests/data_validation/ClassImbalance.py +2 -4
- validmind/tests/data_validation/DFGLSArch.py +2 -4
- validmind/tests/data_validation/DatasetDescription.py +7 -9
- validmind/tests/data_validation/DatasetSplit.py +8 -9
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +2 -4
- validmind/tests/data_validation/EngleGrangerCoint.py +2 -4
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +2 -4
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +2 -4
- validmind/tests/data_validation/HighCardinality.py +2 -4
- validmind/tests/data_validation/HighPearsonCorrelation.py +2 -4
- validmind/tests/data_validation/IQROutliersBarPlot.py +2 -4
- validmind/tests/data_validation/IQROutliersTable.py +2 -4
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -4
- validmind/tests/data_validation/KPSS.py +8 -10
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +2 -4
- validmind/tests/data_validation/MissingValues.py +2 -4
- validmind/tests/data_validation/MissingValuesBarPlot.py +2 -4
- validmind/tests/data_validation/MissingValuesRisk.py +2 -4
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +2 -4
- validmind/tests/data_validation/PhillipsPerronArch.py +7 -9
- validmind/tests/data_validation/RollingStatsPlot.py +2 -4
- validmind/tests/data_validation/ScatterPlot.py +2 -4
- validmind/tests/data_validation/SeasonalDecompose.py +2 -4
- validmind/tests/data_validation/Skewness.py +2 -4
- validmind/tests/data_validation/SpreadPlot.py +2 -4
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +2 -4
- validmind/tests/data_validation/TabularDateTimeHistograms.py +2 -4
- validmind/tests/data_validation/TabularDescriptionTables.py +2 -4
- validmind/tests/data_validation/TabularNumericalHistograms.py +2 -4
- validmind/tests/data_validation/TargetRateBarPlots.py +2 -4
- validmind/tests/data_validation/TimeSeriesFrequency.py +2 -4
- validmind/tests/data_validation/TimeSeriesLinePlot.py +2 -4
- validmind/tests/data_validation/TimeSeriesMissingValues.py +2 -4
- validmind/tests/data_validation/TimeSeriesOutliers.py +2 -4
- validmind/tests/data_validation/TooManyZeroValues.py +2 -4
- validmind/tests/data_validation/UniqueRows.py +2 -4
- validmind/tests/data_validation/WOEBinPlots.py +2 -4
- validmind/tests/data_validation/WOEBinTable.py +2 -4
- validmind/tests/data_validation/ZivotAndrewsArch.py +2 -4
- validmind/tests/data_validation/nlp/CommonWords.py +2 -4
- validmind/tests/data_validation/nlp/Hashtags.py +2 -4
- validmind/tests/data_validation/nlp/Mentions.py +2 -4
- validmind/tests/data_validation/nlp/Punctuations.py +2 -4
- validmind/tests/data_validation/nlp/StopWords.py +2 -4
- validmind/tests/data_validation/nlp/TextDescription.py +2 -4
- validmind/tests/decorator.py +10 -8
- validmind/tests/load.py +264 -0
- validmind/tests/metadata.py +59 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +5 -7
- validmind/tests/model_validation/FeaturesAUC.py +6 -8
- validmind/tests/model_validation/ModelMetadata.py +8 -9
- validmind/tests/model_validation/RegressionResidualsPlot.py +2 -6
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +2 -4
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +2 -4
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -4
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +2 -4
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +2 -4
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +5 -7
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +5 -7
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +7 -9
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -7
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +5 -7
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +2 -7
- validmind/tests/model_validation/sklearn/CompletenessScore.py +5 -7
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +19 -10
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +5 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +5 -7
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +2 -7
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +4 -7
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +7 -9
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +7 -9
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +7 -9
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +8 -10
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +7 -9
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -10
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +7 -9
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +8 -10
- validmind/tests/model_validation/sklearn/ROCCurve.py +10 -11
- validmind/tests/model_validation/sklearn/RegressionErrors.py +5 -7
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +5 -7
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +5 -7
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +10 -14
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +8 -10
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -7
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +8 -10
- validmind/tests/model_validation/sklearn/VMeasure.py +5 -7
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +8 -10
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +2 -4
- validmind/tests/model_validation/statsmodels/BoxPierce.py +2 -4
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +3 -4
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +2 -4
- validmind/tests/model_validation/statsmodels/GINITable.py +2 -4
- validmind/tests/model_validation/statsmodels/JarqueBera.py +7 -9
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +7 -9
- validmind/tests/model_validation/statsmodels/LJungBox.py +2 -4
- validmind/tests/model_validation/statsmodels/Lilliefors.py +7 -9
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +7 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -4
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +6 -8
- validmind/tests/model_validation/statsmodels/RunsTest.py +2 -4
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +3 -4
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +2 -4
- validmind/tests/prompt_validation/Bias.py +2 -4
- validmind/tests/prompt_validation/Clarity.py +2 -4
- validmind/tests/prompt_validation/Conciseness.py +2 -4
- validmind/tests/prompt_validation/Delimitation.py +2 -4
- validmind/tests/prompt_validation/NegativeInstruction.py +2 -4
- validmind/tests/prompt_validation/Robustness.py +2 -4
- validmind/tests/prompt_validation/Specificity.py +2 -4
- validmind/tests/run.py +394 -0
- validmind/tests/test_providers.py +12 -0
- validmind/tests/utils.py +16 -0
- validmind/unit_metrics/__init__.py +12 -4
- validmind/unit_metrics/composite.py +3 -0
- validmind/vm_models/test/metric.py +8 -5
- validmind/vm_models/test/result_wrapper.py +2 -1
- validmind/vm_models/test/test.py +14 -11
- validmind/vm_models/test/threshold_test.py +1 -0
- validmind/vm_models/test_suite/runner.py +1 -0
- {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/METADATA +1 -1
- {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/RECORD +148 -143
- {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/LICENSE +0 -0
- {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/WHEEL +0 -0
- {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/entry_points.txt +0 -0
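Every hunk below makes the same mechanical change: the test class drops its old dictionary-style metadata declaration and instead declares tasks and tags as plain class attributes. As a rough sketch of the new style (illustrative only; the import path and the abbreviated class body are assumptions, not copied verbatim from the package), a migrated test now reads roughly like this:

from validmind.vm_models import Metric  # assumed import path


class ADF(Metric):
    # Identification and inputs are unchanged from 2.3.5.
    name = "adf"
    required_inputs = ["dataset"]

    # New in 2.4.0: tasks and tags are plain class attributes rather than
    # entries in a metadata dictionary (values taken from the ADF hunk below).
    tasks = ["regression"]
    tags = [
        "time_series_data",
        "statsmodels",
        "forecasting",
        "statistical_test",
        "stationarity",
    ]

The same substitution repeats across the test modules shown in the hunks below; only the task and tag values differ from class to class.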
@@ -47,16 +47,14 @@ class ADF(Metric):
 
     name = "adf"
     required_inputs = ["dataset"]
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["regression"]
+    tags = [
+        "time_series_data",
+        "statsmodels",
+        "forecasting",
+        "statistical_test",
+        "stationarity",
+    ]
 
     def summary(self, metric_value: dict):
         table = pd.DataFrame.from_dict(metric_value, orient="index")
@@ -57,16 +57,14 @@ class ANOVAOneWayTable(Metric):
     name = "anova_one_way_table"
     required_inputs = ["dataset"]
     default_params = {"features": None, "p_threshold": 0.05}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["classification"]
+    tags = [
+        "tabular_data",
+        "statistical_test",
+        "multiclass_classification",
+        "binary_classification",
+        "numerical_data",
+    ]
 
     def run(self):
         features = self.params["features"]
@@ -61,10 +61,8 @@ class AutoAR(Metric):
     name = "auto_ar"
     required_inputs = ["dataset"]
     default_params = {"max_ar_order": 3}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "statsmodels", "forecasting", "statistical_test"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "statsmodels", "forecasting", "statistical_test"]
 
     def run(self):
         if "max_ar_order" not in self.params:
@@ -57,10 +57,8 @@ class AutoMA(Metric):
     name = "auto_ma"
     required_inputs = ["dataset"]
     default_params = {"max_ma_order": 3}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "statsmodels", "forecasting", "statistical_test"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "statsmodels", "forecasting", "statistical_test"]
 
     def run(self):
         if "max_ma_order" not in self.params:
@@ -61,16 +61,14 @@ class AutoSeasonality(Metric):
     name = "auto_seasonality"
     required_inputs = ["dataset"]
     default_params = {"min_period": 1, "max_period": 4}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["regression"]
+    tags = [
+        "time_series_data",
+        "forecasting",
+        "statistical_test",
+        "statsmodels",
+        "seasonality",
+    ]
 
     def evaluate_seasonal_periods(self, series, min_period, max_period):
         seasonal_periods = []
@@ -54,16 +54,14 @@ class AutoStationarity(Metric):
     name = "auto_stationarity"
     required_inputs = ["dataset"]
    default_params = {"max_order": 5, "threshold": 0.05}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["regression"]
+    tags = [
+        "time_series_data",
+        "statsmodels",
+        "forecasting",
+        "statistical_test",
+        "stationarity",
+    ]
 
     def run(self):
         if "max_order" not in self.params:
@@ -56,16 +56,14 @@ class BivariateFeaturesBarPlots(Metric):
     name = "bivariate_features_bar_plots"
     required_inputs = ["dataset"]
     default_params = {"features_pairs": None}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["classification"]
+    tags = [
+        "tabular_data",
+        "categorical_data",
+        "binary_classification",
+        "multiclass_classification",
+        "visualization",
+    ]
 
     def run(self):
         features_pairs = self.params["features_pairs"]
@@ -55,16 +55,14 @@ class BivariateHistograms(Metric):
     name = "bivariate_histograms"
     required_inputs = ["dataset"]
     default_params = {"features_pairs": None, "target_filter": None}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["classification"]
+    tags = [
+        "tabular_data",
+        "categorical_data",
+        "binary_classification",
+        "multiclass_classification",
+        "visualization",
+    ]
 
     def plot_bivariate_histogram(self, features_pairs, target_filter):
         status_var = self.inputs.dataset.target_column
@@ -54,16 +54,14 @@ class BivariateScatterPlots(Metric):
     name = "bivariate_scatter_plots"
     required_inputs = ["dataset"]
     default_params = {"selected_columns": None}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["classification"]
+    tags = [
+        "tabular_data",
+        "categorical_data",
+        "binary_classification",
+        "multiclass_classification",
+        "visualization",
+    ]
 
     def plot_bivariate_scatter(self, columns):
         figures = []
@@ -54,16 +54,14 @@ class ChiSquaredFeaturesTable(Metric):
     name = "chi_squared_features_table"
     required_inputs = ["dataset"]
     default_params = {"cat_features": None, "p_threshold": 0.05}
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["classification"]
+    tags = [
+        "tabular_data",
+        "categorical_data",
+        "statistical_test",
+        "binary_classification",
+        "multiclass_classification",
+    ]
 
     def run(self):
         target_column = self.inputs.dataset.target_column
@@ -73,10 +73,8 @@ class ClassImbalance(ThresholdTest):
     name = "class_imbalance"
     required_inputs = ["dataset"]
     default_params = {"min_percent_threshold": 10}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "binary_classification", "multiclass_classification"],
-    }
+    tasks = ["classification"]
+    tags = ["tabular_data", "binary_classification", "multiclass_classification"]
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         return ResultSummary(
@@ -53,10 +53,8 @@ class DFGLSArch(Metric):
 
     name = "dickey_fuller_gls"
     required_inputs = ["dataset"]
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "forecasting", "unit_root_test"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "forecasting", "unit_root_test"]
 
     def run(self):
         """
@@ -69,15 +69,13 @@ class DatasetDescription(Metric):
 
     name = "dataset_description"
     required_inputs = ["dataset"]
-   [7 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "time_series_data", "text_data"],
-    }
+    tasks = [
+        "classification",
+        "regression",
+        "text_classification",
+        "text_summarization",
+    ]
+    tags = ["tabular_data", "time_series_data", "text_data"]
 
     def summary(self, metric_value):
         """
@@ -48,15 +48,14 @@ class DatasetSplit(Metric):
 
     name = "dataset_split"
     required_inputs = ["datasets"]
-   [9 lines removed; contents truncated in the original diff view]
+    tasks = [
+        "classification",
+        "regression",
+        "text_classification",
+        "text_summarization",
+    ]
+
+    tags = ["tabular_data", "time_series_data", "text_data"]
 
     dataset_labels = {
         "train_ds": "Training",
@@ -53,10 +53,8 @@ class DescriptiveStatistics(Metric):
 
     name = "descriptive_statistics"
     required_inputs = ["dataset"]
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "time_series_data"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "time_series_data"]
 
     def get_summary_statistics_numerical(self, df, numerical_fields):
         percentiles = [0.25, 0.5, 0.75, 0.90, 0.95]
@@ -55,10 +55,8 @@ class Duplicates(ThresholdTest):
     name = "duplicates"
     required_inputs = ["dataset"]
     default_params = {"min_threshold": 1}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "data_quality", "text_data"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality", "text_data"]
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         """
@@ -51,10 +51,8 @@ class EngleGrangerCoint(Metric):
     name = "engle_granger_coint"
     required_inputs = ["dataset"]
     default_params = {"threshold": 0.05}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "statistical_test", "forecasting"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "statistical_test", "forecasting"]
 
     def run(self):
         threshold = self.params["threshold"]
@@ -48,10 +48,8 @@ class FeatureTargetCorrelationPlot(Metric):
     name = "feature_target_correlation_plot"
     required_inputs = ["dataset"]
     default_params = {"features": None, "fig_height": 600}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "visualization", "feature_importance", "correlation"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "visualization", "feature_importance", "correlation"]
 
     def run(self):
         fig_height = self.params["fig_height"]
@@ -56,10 +56,8 @@ class HeatmapFeatureCorrelations(Metric):
     name = "heatmap_feature_correlations"
     required_inputs = ["dataset"]
     default_params = {"declutter": None, "fontsize": None, "num_features": None}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "visualization", "correlation"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "visualization", "correlation"]
 
     def run(self):
         features = self.params.get("features")
@@ -57,10 +57,8 @@ class HighCardinality(ThresholdTest):
         "percent_threshold": 0.1,
         "threshold_type": "percent",  # or "num"
     }
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "data_quality", "categorical_data"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality", "categorical_data"]
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         """
@@ -59,10 +59,8 @@ class HighPearsonCorrelation(ThresholdTest):
     name = "pearson_correlation"
     required_inputs = ["dataset"]
     default_params = {"max_threshold": 0.3}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "data_quality", "correlation"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality", "correlation"]
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         """The high pearson correlation test returns results like these:
@@ -63,10 +63,8 @@ class IQROutliersBarPlot(Metric):
     name = "iqr_outliers_bar_plot"
     required_inputs = ["dataset"]
     default_params = {"threshold": 1.5, "num_features": None, "fig_width": 800}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "visualization", "numerical_data"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "visualization", "numerical_data"]
 
     def run(self):
         df = self.inputs.dataset.df
@@ -54,10 +54,8 @@ class IQROutliersTable(Metric):
     name = "iqr_outliers_table"
     required_inputs = ["dataset"]
     default_params = {"features": None, "threshold": 1.5}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "numerical_data"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "numerical_data"]
 
     def run(self):
         features = self.params["features"]
@@ -55,10 +55,8 @@ class IsolationForestOutliers(Metric):
         "contamination": 0.1,
         "features_columns": None,
     }
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "anomaly_detection"],
-    }
+    tasks = ["classification"]
+    tags = ["tabular_data", "anomaly_detection"]
 
     required_inputs = ["dataset"]
 
@@ -51,16 +51,14 @@ class KPSS(Metric):
 
     name = "kpss"
     required_inputs = ["dataset"]
-   [10 lines removed; contents truncated in the original diff view]
+    tasks = ["regression"]
+    tags = [
+        "time_series_data",
+        "forecasting",
+        "stationarity",
+        "unit_root_test",
+        "statsmodels",
+    ]
 
     def run(self):
         """
@@ -51,10 +51,8 @@ class LaggedCorrelationHeatmap(Metric):
 
     name = "lagged_correlation_heatmap"
     required_inputs = ["dataset"]
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "visualization"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "visualization"]
 
     def _compute_correlations(self, df, target_col, independent_vars, num_lags):
         correlations = np.zeros((len(independent_vars), num_lags + 1))
@@ -52,10 +52,8 @@ class MissingValues(ThresholdTest):
     name = "missing"
     required_inputs = ["dataset"]
     default_params = {"min_threshold": 1}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "data_quality"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality"]
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         """
@@ -55,10 +55,8 @@ class MissingValuesBarPlot(Metric):
     name = "missing_values_bar_plot"
    required_inputs = ["dataset"]
     default_params = {"threshold": 80, "fig_height": 600}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "data_quality", "visualization"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality", "visualization"]
 
     def run(self):
         threshold = self.params["threshold"]
@@ -52,10 +52,8 @@ class MissingValuesRisk(Metric):
 
     name = "missing_values_risk"
     required_inputs = ["dataset"]
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "data_quality", "risk_analysis"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality", "risk_analysis"]
 
     def run(self):
         total_cells = self.inputs.dataset.df.size
@@ -50,10 +50,8 @@ class PearsonCorrelationMatrix(Metric):
 
     name = "pearson_correlation_matrix"
     required_inputs = ["dataset"]
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "numerical_data", "correlation"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "numerical_data", "correlation"]
 
     def run(self):
         columns = self.params.get("columns", list(self.inputs.dataset.df.columns))
@@ -51,15 +51,13 @@ class PhillipsPerronArch(Metric):
 
     name = "phillips_perron"
     required_inputs = ["dataset"]
-   [9 lines removed; contents truncated in the original diff view]
+    tasks = ["regression"]
+    tags = [
+        "time_series_data",
+        "forecasting",
+        "statistical_test",
+        "unit_root_test",
+    ]
 
     def run(self):
         """
@@ -54,10 +54,8 @@ class RollingStatsPlot(Metric):
     name = "rolling_stats_plot"
     required_inputs = ["dataset"]
     default_params = {"window_size": 12}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "visualization", "stationarity"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "visualization", "stationarity"]
 
     def plot_rolling_statistics(self, col, window_size=12):
         """
@@ -52,10 +52,8 @@ class ScatterPlot(Metric):
 
     name = "scatter_plot"
     required_inputs = ["dataset"]
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["tabular_data", "visualization"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "visualization"]
 
     def run(self):
         columns = list(self.inputs.dataset.df.columns)
@@ -59,10 +59,8 @@ class SeasonalDecompose(Metric):
     name = "seasonal_decompose"
     required_inputs = ["dataset"]
     default_params = {"seasonal_model": "additive"}
-   [2 lines removed; contents truncated in the original diff view]
-        "tags": ["time_series_data", "seasonality", "statsmodels"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "seasonality", "statsmodels"]
 
     def store_seasonal_decompose(self, column, sd_one_column):
         """