validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/ClassImbalance.py

@@ -5,26 +5,20 @@
 """
 Threshold based tests
 """
-from
-from typing import List
+from typing import Any, Dict, Tuple

-import pandas as pd
 import plotly.graph_objs as go

+from validmind import tags, tasks
 from validmind.errors import SkipTestError
-from validmind.vm_models import
-
-
-
-
-
-
-)
-
-
-@dataclass
-class ClassImbalance(ThresholdTest):
+from validmind.vm_models import VMDataset
+
+
+@tags("tabular_data", "binary_classification", "multiclass_classification")
+@tasks("classification")
+def ClassImbalance(
+    dataset: VMDataset, min_percent_threshold: int = 10
+) -> Tuple[Dict[str, Any], go.Figure, bool]:
     """
     Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.

@@ -71,106 +65,43 @@ class ClassImbalance(ThresholdTest):
     these imbalances.
     - The test is only applicable for classification operations and unsuitable for regression or clustering tasks.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def run(self):
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("ClassImbalance requires a validmind Dataset object")
-
-        if self.inputs.dataset.target_column is None:
-            print("Skipping class_imbalance test because no target column is defined")
-            return
-
-        target_column = self.inputs.dataset.target_column
-        imbalance_percentages = self.inputs.dataset.df[target_column].value_counts(
-            normalize=True
+    if not dataset.target_column:
+        raise SkipTestError("No target column provided")
+
+    imbalance_percentages = dataset.df[dataset.target_column].value_counts(
+        normalize=True
+    )
+    if len(imbalance_percentages) > 10:
+        raise SkipTestError("Skipping target column with more than 10 classes")
+
+    classes = list(imbalance_percentages.index)
+
+    imbalanced_classes = []
+    for i, percentage in enumerate(imbalance_percentages.values):
+        proportion = percentage * 100
+        imbalanced_classes.append(
+            {
+                dataset.target_column: classes[i],
+                "Percentage of Rows (%)": f"{proportion:.2f}%",
+                "Pass/Fail": "Pass" if proportion > min_percent_threshold else "Fail",
+            }
         )
-        if len(imbalance_percentages) > 10:
-            raise SkipTestError(
-                f"Skipping {self.__class__.__name__} test as"
-                "target column as more than 10 classes"
-            )
-
-        classes = list(imbalance_percentages.index)
-        percentages = list(imbalance_percentages.values)
-
-        # Checking class imbalance
-        imbalanced_classes = []
-        for i, percentage in enumerate(percentages):
-            class_label = classes[i]
-            proportion = percentage * 100
-            passed = proportion > self.params["min_percent_threshold"]
-
-            imbalanced_classes.append(
-                {
-                    target_column: class_label,
-                    "Percentage of Rows (%) ": f"{proportion:.2f}%",
-                    "Pass/Fail": "Pass" if passed else "Fail",
-                }
-            )
-
-        resultset = pd.DataFrame(imbalanced_classes)
-        tests_failed = all(resultset["Pass/Fail"] == "Pass")
-        results = [
-            ThresholdTestResult(
-                column=target_column,
-                passed=tests_failed,
-                values=resultset.to_dict(orient="records"),
-            )
-        ]
-
-        # Create a bar chart trace
-        trace = go.Bar(
-            x=imbalance_percentages.index,
-            y=imbalance_percentages.values,
-        )
-
-        # Create a layout for the chart
-        layout = go.Layout(
-            title=f"Class Imbalance Results for Target Column {self.inputs.dataset.target_column}",
-            xaxis=dict(title="Class"),
-            yaxis=dict(title="Percentage"),
-        )
-
-        # Create a figure and add the trace and layout
-        fig = go.Figure(data=[trace], layout=layout)
-
-        return self.cache_results(
-            results,
-            passed=tests_failed,
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}",
-                    figure=fig,
-                )
-            ],
-        )
-
-    def test(self):
-        """Unit test for ClassImbalance"""
-        assert self.result is not None
-
-        assert self.result.test_results is not None
-        assert self.result.test_results.passed

-
+    trace = go.Bar(
+        x=imbalance_percentages.index,
+        y=imbalance_percentages.values,
+    )
+
+    layout = go.Layout(
+        title=f"{dataset.target_column} Class Imbalance",
+        xaxis=dict(title="Class"),
+        yaxis=dict(title="Percentage"),
+    )
+
+    return (
+        {
+            f"{dataset.target_column} Class Imbalance": imbalanced_classes,
+        },
+        go.Figure(data=[trace], layout=layout),
+        all(row["Pass/Fail"] == "Pass" for row in imbalanced_classes),
+    )
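The hunk above converts ClassImbalance from a ThresholdTest subclass into a plain decorated function. As a point of reference only (not part of the diff), a minimal sketch of calling the refactored test directly, assuming a hypothetical `vm_dataset` VMDataset with a target column set:

    # Hypothetical usage sketch based on the new signature shown in the diff above.
    tables, figure, passed = ClassImbalance(dataset=vm_dataset, min_percent_threshold=10)
    # `tables` maps "<target_column> Class Imbalance" to a list of per-class rows,
    # `figure` is a plotly Figure, and `passed` is True only when every class
    # exceeds the minimum percentage threshold.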
validmind/tests/data_validation/DatasetDescription.py

@@ -4,15 +4,15 @@

 import re
 from collections import Counter
-from dataclasses import dataclass

 import numpy as np
 from ydata_profiling.config import Settings
 from ydata_profiling.model.typeset import ProfilingTypeSet

+from validmind import tags, tasks
 from validmind.errors import UnsupportedColumnTypeError
 from validmind.logging import get_logger
-from validmind.vm_models import
+from validmind.vm_models import VMDataset

 DEFAULT_HISTOGRAM_BINS = 10
 DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]
@@ -20,37 +20,179 @@ DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]
 logger = get_logger(__name__)


-
-
+def infer_datatypes(df):
+    column_type_mappings = {}
+    typeset = ProfilingTypeSet(Settings())
+    variable_types = typeset.infer_type(df)
+
+    for column, type in variable_types.items():
+        if str(type) == "Unsupported":
+            if df[column].isnull().all():
+                column_type_mappings[column] = {"id": column, "type": "Null"}
+            else:
+                raise UnsupportedColumnTypeError(
+                    f"Unsupported type for column {column}. Please review all values in this dataset column."
+                )
+        else:
+            column_type_mappings[column] = {"id": column, "type": str(type)}
+
+    return list(column_type_mappings.values())
+
+
+def get_numerical_histograms(df, column):
+    """
+    Returns a collection of histograms for a numerical column, each one
+    with a different bin size
     """
-
+    values = df[column].to_numpy()
+    values_cleaned = values[~np.isnan(values)]
+
+    # bins='sturges'. Cannot use 'auto' until we review and fix its performance
+    # on datasets with too many unique values
+    #
+    # 'sturges': R’s default method, only accounts for data size. Only optimal
+    # for gaussian data and underestimates number of bins for large non-gaussian datasets.
+    default_hist = np.histogram(values_cleaned, bins="sturges")
+
+    histograms = {
+        "default": {
+            "bin_size": len(default_hist[0]),
+            "histogram": {
+                "bin_edges": default_hist[1].tolist(),
+                "counts": default_hist[0].tolist(),
+            },
+        }
+    }
+
+    for bin_size in DEFAULT_HISTOGRAM_BIN_SIZES:
+        hist = np.histogram(values_cleaned, bins=bin_size)
+        histograms[f"bins_{bin_size}"] = {
+            "bin_size": bin_size,
+            "histogram": {
+                "bin_edges": hist[1].tolist(),
+                "counts": hist[0].tolist(),
+            },
+        }
+
+    return histograms
+
+
+def get_column_histograms(df, column, type_):
+    """
+    Returns a collection of histograms for a numerical or categorical column.
+    We store different combinations of bin sizes to allow analyzing the data better
+
+    Will be used in favor of _get_histogram in the future
+    """
+    # Set the minimum number of bins to nunique if it's less than the default
+    if type_ == "Numeric":
+        return get_numerical_histograms(df, column)
+    elif type_ == "Categorical" or type_ == "Boolean":
+        value_counts = df[column].value_counts()
+        return {
+            "default": {
+                "bin_size": len(value_counts),
+                "histogram": value_counts.to_dict(),
+            }
+        }
+    elif type_ == "Text":
+        # Combine all the text in the specified column
+        text_data = " ".join(df[column].astype(str))
+        # Split the text into words (tokens) using a regular expression
+        words = re.findall(r"\w+", text_data)
+        # Use Counter to count the frequency of each word
+        word_counts = Counter(words)
+
+        return {
+            "default": {
+                "bin_size": len(word_counts),
+                "histogram": dict(word_counts),
+            }
+        }
+    elif type_ == "Null":
+        logger.info(f"Ignoring histogram generation for null column {column}")
+    else:
+        raise ValueError(
+            f"Unsupported column type found when computing its histogram: {type_}"
+        )
+
+
+def describe_column(df, column):
+    """
+    Gets descriptive statistics for a single column in a Pandas DataFrame.
+    """
+    column_type = column["type"]
+
+    # Initialize statistics with count for all column types
+    column["statistics"] = {
+        "count": df[column["id"]].count(),
+        "n_missing": df[column["id"]].isna().sum(),
+        "missing": df[column["id"]].isna().sum() / len(df[column["id"]]),
+        "n_distinct": df[column["id"]].nunique(),
+        "distinct": df[column["id"]].nunique() / len(df[column["id"]]),
+    }
+
+    # Boolean (binary) columns should be reported as categorical
+    if column_type == "Boolean" or df[column["id"]].nunique() == 2:
+        column["type"] = "Categorical"  # Change the type to Categorical
+        top_value = df[column["id"]].value_counts().nlargest(1)
+        column["statistics"].update(
+            {
+                "unique": df[column["id"]].nunique(),
+                "top": top_value.index[0],
+                "freq": top_value.values[0],
+            }
+        )
+    elif column_type == "Numeric":
+        column["statistics"].update(
+            df[column["id"]]
+            .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95])
+            .to_dict()
+        )
+    elif column_type == "Categorical" or column_type == "Text":
+        column["statistics"].update(
+            df[column["id"]].astype("category").describe().to_dict()
+        )
+
+    column["histograms"] = get_column_histograms(
+        df, column["id"], column["type"]
+    )  # Use updated type
+
+    return column
+
+
+@tags("tabular_data", "time_series_data", "text_data")
+@tasks("classification", "regression", "text_classification", "text_summarization")
+def DatasetDescription(dataset: VMDataset):
+    """
+    Provides comprehensive analysis and statistical summaries of each column in a machine learning model's dataset.

     ### Purpose

     The test depicted in the script is meant to run a comprehensive analysis on a Machine Learning model's datasets.
-    The test or metric is implemented to obtain a complete summary of the
-    statistics of each
-    boolean, and text
+    The test or metric is implemented to obtain a complete summary of the columns in the dataset, including vital
+    statistics of each column such as count, distinct values, missing values, histograms for numerical, categorical,
+    boolean, and text columns. This summary gives a comprehensive overview of the dataset to better understand the
     characteristics of the data that the model is trained on or evaluates.

     ### Test Mechanism

     The DatasetDescription class accomplishes the purpose as follows: firstly, the test method "run" infers the data
-    type of each column in the dataset and stores the details (id, column type). For each
-    "
+    type of each column in the dataset and stores the details (id, column type). For each column, the
+    "describe_column" method is invoked to collect statistical information about the column, including count,
     missing value count and its proportion to the total, unique value count, and its proportion to the total. Depending
-    on the data type of a
-    Numerical
-    categorical, boolean and text
+    on the data type of a column, histograms are generated that reflect the distribution of data within the column.
+    Numerical columns use the "get_numerical_histograms" method to calculate histogram distribution, whereas for
+    categorical, boolean and text columns, a histogram is computed with frequencies of each unique value in the
     datasets. For unsupported types, an error is raised. Lastly, a summary table is built to aggregate all the
-    statistical insights and histograms of the
+    statistical insights and histograms of the columns in a dataset.

     ### Signs of High Risk

-    - High ratio of missing values to total values in one or more
+    - High ratio of missing values to total values in one or more columns which may impact the quality of the
     predictions.
-    - Unsupported data types in dataset
-    - Large number of unique values in the dataset's
+    - Unsupported data types in dataset columns.
+    - Large number of unique values in the dataset's columns which might make it harder for the model to establish
     patterns.
     - Extreme skewness or irregular distribution of data as reflected in the histograms.

@@ -65,201 +207,30 @@ class DatasetDescription(Metric):

     ### Limitations

-    - The computation can be expensive from a resource standpoint, particularly for large datasets with numerous
+    - The computation can be expensive from a resource standpoint, particularly for large datasets with numerous columns.
     - The histograms use an arbitrary number of bins which may not be the optimal number of bins for specific data
     distribution.
     - Unsupported data types for columns will raise an error which may limit evaluating the dataset.
-    -
+    - Columns with all null or missing values are not included in histogram computation.
     - This test only validates the quality of the dataset but doesn't address the model's performance directly.
     """
-
-
-
-
-
-
-
-        "
-
-
-
-
-
-
-
-
-        the following keys: count, mean, std, min, 25%, 50%, 75%, 90%, 95%, max. For
-        categorical fields, it has the following keys: count, unique, top, freq.
-        """
-        results_table = []
-        for field in metric_value:
-            field_id = field["id"]
-            field_type = field["type"]
-            field_statistics = field["statistics"]
-
-            results_table.append(
-                {
-                    "Name": field_id,
-                    "Type": field_type,
-                    "Count": field_statistics["count"],
-                    "Missing": field_statistics["n_missing"],
-                    "Missing %": field_statistics["missing"],
-                    "Distinct": field_statistics["n_distinct"],
-                    "Distinct %": field_statistics["distinct"],
-                }
-            )
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(title="Dataset Description"),
-                )
-            ]
-        )
-
-    def run(self):
-        results = []
-        for ds_field in self.infer_datatype(self.inputs.dataset.df):
-            self.describe_dataset_field(self.inputs.dataset.df, ds_field)
-            results.append(ds_field)
-        return self.cache_results(results)
-
-    def infer_datatype(self, df):
-        vm_dataset_variables = {}
-        typeset = ProfilingTypeSet(Settings())
-        variable_types = typeset.infer_type(df)
-
-        for column, type in variable_types.items():
-            if str(type) == "Unsupported":
-                if df[column].isnull().all():
-                    vm_dataset_variables[column] = {"id": column, "type": "Null"}
-                else:
-                    raise UnsupportedColumnTypeError(
-                        f"Unsupported type for column {column}. Please review all values in this dataset column."
-                    )
-            else:
-                vm_dataset_variables[column] = {"id": column, "type": str(type)}
-
-        return list(vm_dataset_variables.values())
-
-    def describe_dataset_field(self, df, field):
-        """
-        Gets descriptive statistics for a single field in a Pandas DataFrame.
-        """
-        field_type = field["type"]
-
-        # - When we call describe on one field at a time, Pandas will
-        # know better if it needs to report on numerical or categorical statistics
-        # - Boolean (binary) fields should be reported as categorical
-        # (force to categorical when nunique == 2)
-        if field_type == ["Boolean"] or df[field["id"]].nunique() == 2:
-            top_value = df[field["id"]].value_counts().nlargest(1)
-
-            field["statistics"] = {
-                "count": df[field["id"]].count(),
-                "unique": df[field["id"]].nunique(),
-                "top": top_value.index[0],
-                "freq": top_value.values[0],
-            }
-        elif field_type == "Numeric":
-            field["statistics"] = (
-                df[field["id"]]
-                .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95])
-                .to_dict()
-            )
-        elif field_type == "Categorical" or field_type == "Text":
-            field["statistics"] = (
-                df[field["id"]].astype("category").describe().to_dict()
-            )
-
-        # Initialize statistics object for non-numeric or categorical fields
-        if "statistics" not in field:
-            field["statistics"] = {}
-
-        field["statistics"]["n_missing"] = df[field["id"]].isna().sum()
-        field["statistics"]["missing"] = field["statistics"]["n_missing"] / len(
-            df[field["id"]]
-        )
-        field["statistics"]["n_distinct"] = df[field["id"]].nunique()
-        field["statistics"]["distinct"] = field["statistics"]["n_distinct"] / len(
-            df[field["id"]]
-        )
-
-        field["histograms"] = self.get_field_histograms(df, field["id"], field_type)
-
-    def get_field_histograms(self, df, field, type_):
-        """
-        Returns a collection of histograms for a numerical or categorical field.
-        We store different combinations of bin sizes to allow analyzing the data better
-
-        Will be used in favor of _get_histogram in the future
-        """
-        # Set the minimum number of bins to nunique if it's less than the default
-        if type_ == "Numeric":
-            return self.get_numerical_histograms(df, field)
-        elif type_ == "Categorical" or type_ == "Boolean":
-            value_counts = df[field].value_counts()
-            return {
-                "default": {
-                    "bin_size": len(value_counts),
-                    "histogram": value_counts.to_dict(),
-                }
-            }
-        elif type_ == "Text":
-            # Combine all the text in the specified field
-            text_data = " ".join(df[field].astype(str))
-            # Split the text into words (tokens) using a regular expression
-            words = re.findall(r"\w+", text_data)
-            # Use Counter to count the frequency of each word
-            word_counts = Counter(words)
-
-            return {
-                "default": {
-                    "bin_size": len(word_counts),
-                    "histogram": dict(word_counts),
-                }
+    df = dataset.df
+
+    results = []
+    for column in infer_datatypes(df):
+        results.append(describe_column(df, column))
+
+    return {
+        "Dataset Description": [
+            {
+                "Name": column["id"],
+                "Type": column["type"],
+                "Count": column["statistics"]["count"],
+                "Missing": column["statistics"]["n_missing"],
+                "Missing %": column["statistics"]["missing"],
+                "Distinct": column["statistics"]["n_distinct"],
+                "Distinct %": column["statistics"]["distinct"],
             }
-
-
-
-            raise ValueError(
-                f"Unsupported field type found when computing its histogram: {type_}"
-            )
-
-    def get_numerical_histograms(self, df, field):
-        """
-        Returns a collection of histograms for a numerical field, each one
-        with a different bin size
-        """
-        values = df[field].to_numpy()
-        values_cleaned = values[~np.isnan(values)]
-
-        # bins='sturges'. Cannot use 'auto' until we review and fix its performance
-        # on datasets with too many unique values
-        #
-        # 'sturges': R’s default method, only accounts for data size. Only optimal
-        # for gaussian data and underestimates number of bins for large non-gaussian datasets.
-        default_hist = np.histogram(values_cleaned, bins="sturges")
-
-        histograms = {
-            "default": {
-                "bin_size": len(default_hist[0]),
-                "histogram": {
-                    "bin_edges": default_hist[1].tolist(),
-                    "counts": default_hist[0].tolist(),
-                },
-            }
-        }
-
-        for bin_size in DEFAULT_HISTOGRAM_BIN_SIZES:
-            hist = np.histogram(values_cleaned, bins=bin_size)
-            histograms[f"bins_{bin_size}"] = {
-                "bin_size": bin_size,
-                "histogram": {
-                    "bin_edges": hist[1].tolist(),
-                    "counts": hist[0].tolist(),
-                },
-            }
-
-        return histograms
+            for column in results
+        ]
+    }
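DatasetDescription undergoes the same refactor: the Metric subclass is replaced by a module-level function plus free helpers (infer_datatypes, get_numerical_histograms, get_column_histograms, describe_column). As a point of reference only (not part of the diff), a minimal sketch of calling the new function directly, assuming a hypothetical `vm_dataset` VMDataset:

    # Hypothetical usage sketch based on the new signature shown in the diff above.
    summary = DatasetDescription(dataset=vm_dataset)
    for row in summary["Dataset Description"]:
        # Each row carries Name, Type, Count, Missing, Missing %, Distinct, Distinct %.
        print(row["Name"], row["Type"], row["Missing %"])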