validmind 2.8.29__py3-none-any.whl → 2.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +1 -9
- validmind/ai/utils.py +4 -24
- validmind/api_client.py +6 -17
- validmind/logging.py +48 -0
- validmind/tests/__init__.py +2 -0
- validmind/tests/__types__.py +18 -0
- validmind/tests/output.py +9 -2
- validmind/tests/plots/BoxPlot.py +260 -0
- validmind/tests/plots/CorrelationHeatmap.py +235 -0
- validmind/tests/plots/HistogramPlot.py +233 -0
- validmind/tests/plots/ViolinPlot.py +125 -0
- validmind/tests/plots/__init__.py +0 -0
- validmind/tests/stats/CorrelationAnalysis.py +251 -0
- validmind/tests/stats/DescriptiveStats.py +197 -0
- validmind/tests/stats/NormalityTests.py +147 -0
- validmind/tests/stats/OutlierDetection.py +173 -0
- validmind/tests/stats/__init__.py +0 -0
- validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
- validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
- validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
- validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
- validmind/unit_metrics/classification/individual/Confidence.py +52 -0
- validmind/unit_metrics/classification/individual/Correctness.py +41 -0
- validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
- validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
- validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
- validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
- validmind/unit_metrics/classification/individual/__init__.py +0 -0
- validmind/vm_models/dataset/dataset.py +147 -1
- validmind/vm_models/result/result.py +26 -4
- {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/METADATA +2 -2
- {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/RECORD +36 -15
- {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/LICENSE +0 -0
- {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/WHEEL +0 -0
- {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,251 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
import pandas as pd
|
9
|
+
from scipy import stats
|
10
|
+
|
11
|
+
from validmind import tags, tasks
|
12
|
+
from validmind.errors import SkipTestError
|
13
|
+
from validmind.utils import format_records
|
14
|
+
from validmind.vm_models import VMDataset
|
15
|
+
|
16
|
+
|
17
|
+
def _validate_and_prepare_data(dataset: VMDataset, columns: Optional[List[str]]):
    """Validate inputs and prepare data for correlation analysis.

    Args:
        dataset: Dataset providing ``feature_columns_numeric`` and ``df``.
        columns: Optional subset of column names. Names that are not among the
            dataset's numeric feature columns are silently dropped. When
            ``None``, all numeric feature columns are used.

    Returns:
        A DataFrame restricted to the selected columns, with rows containing
        any missing value removed and zero-variance columns removed.

    Raises:
        SkipTestError: If fewer than two usable columns remain, or fewer than
            two complete rows remain after dropping missing values.
    """
    numeric_columns = dataset.feature_columns_numeric
    if columns is None:
        columns = numeric_columns
    else:
        available_columns = set(numeric_columns)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for correlation analysis")

    if len(columns) < 2:
        raise SkipTestError(
            "At least 2 numerical columns required for correlation analysis"
        )

    # Row-wise dropna: a row is discarded if ANY selected column is missing.
    data = dataset.df[columns].dropna()

    # With fewer than two complete rows, variance is NaN (so the constant-column
    # filter below would keep everything) and the downstream significance tests
    # cannot run (e.g. scipy.stats.pearsonr raises ValueError). Skip explicitly.
    if len(data) < 2:
        raise SkipTestError(
            "At least 2 complete rows (without missing values) required for "
            "correlation analysis"
        )

    # Remove constant columns: their correlation with anything is undefined.
    data = data.loc[:, data.var() != 0]

    if data.shape[1] < 2:
        raise SkipTestError(
            "Insufficient non-constant columns for correlation analysis"
        )

    return data
|
43
|
+
|
44
|
+
|
45
|
+
def _compute_correlation_matrices(data, method: str):
|
46
|
+
"""Compute correlation and p-value matrices based on method."""
|
47
|
+
if method == "pearson":
|
48
|
+
return _compute_pearson_with_pvalues(data)
|
49
|
+
elif method == "spearman":
|
50
|
+
return _compute_spearman_with_pvalues(data)
|
51
|
+
elif method == "kendall":
|
52
|
+
return _compute_kendall_with_pvalues(data)
|
53
|
+
else:
|
54
|
+
raise ValueError(f"Unsupported correlation method: {method}")
|
55
|
+
|
56
|
+
|
57
|
+
def _create_correlation_pairs(
|
58
|
+
corr_matrix, p_matrix, significance_level: float, min_correlation: float
|
59
|
+
):
|
60
|
+
"""Create correlation pairs table."""
|
61
|
+
correlation_pairs = []
|
62
|
+
|
63
|
+
for i, col1 in enumerate(corr_matrix.columns):
|
64
|
+
for j, col2 in enumerate(corr_matrix.columns):
|
65
|
+
if i < j: # Only upper triangle to avoid duplicates
|
66
|
+
corr_val = corr_matrix.iloc[i, j]
|
67
|
+
p_val = p_matrix.iloc[i, j]
|
68
|
+
|
69
|
+
if abs(corr_val) >= min_correlation:
|
70
|
+
pair_info = {
|
71
|
+
"Feature 1": col1,
|
72
|
+
"Feature 2": col2,
|
73
|
+
"Correlation": corr_val,
|
74
|
+
"Abs Correlation": abs(corr_val),
|
75
|
+
"p-value": p_val,
|
76
|
+
"Significant": "Yes" if p_val < significance_level else "No",
|
77
|
+
"Strength": _correlation_strength(abs(corr_val)),
|
78
|
+
"Direction": "Positive" if corr_val > 0 else "Negative",
|
79
|
+
}
|
80
|
+
correlation_pairs.append(pair_info)
|
81
|
+
|
82
|
+
# Sort by absolute correlation value
|
83
|
+
correlation_pairs.sort(key=lambda x: x["Abs Correlation"], reverse=True)
|
84
|
+
return correlation_pairs
|
85
|
+
|
86
|
+
|
87
|
+
def _create_summary_statistics(corr_matrix, correlation_pairs):
|
88
|
+
"""Create summary statistics table."""
|
89
|
+
all_correlations = []
|
90
|
+
for i in range(len(corr_matrix.columns)):
|
91
|
+
for j in range(i + 1, len(corr_matrix.columns)):
|
92
|
+
all_correlations.append(abs(corr_matrix.iloc[i, j]))
|
93
|
+
|
94
|
+
significant_count = sum(
|
95
|
+
1 for pair in correlation_pairs if pair["Significant"] == "Yes"
|
96
|
+
)
|
97
|
+
high_corr_count = sum(
|
98
|
+
1 for pair in correlation_pairs if pair["Abs Correlation"] > 0.7
|
99
|
+
)
|
100
|
+
very_high_corr_count = sum(
|
101
|
+
1 for pair in correlation_pairs if pair["Abs Correlation"] > 0.9
|
102
|
+
)
|
103
|
+
|
104
|
+
return {
|
105
|
+
"Total Feature Pairs": len(all_correlations),
|
106
|
+
"Pairs Above Threshold": len(correlation_pairs),
|
107
|
+
"Significant Correlations": significant_count,
|
108
|
+
"High Correlations (>0.7)": high_corr_count,
|
109
|
+
"Very High Correlations (>0.9)": very_high_corr_count,
|
110
|
+
"Mean Absolute Correlation": np.mean(all_correlations),
|
111
|
+
"Max Absolute Correlation": np.max(all_correlations),
|
112
|
+
"Median Absolute Correlation": np.median(all_correlations),
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
@tags("tabular_data", "statistics", "correlation")
@tasks("classification", "regression", "clustering")
def CorrelationAnalysis(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    method: str = "pearson",
    significance_level: float = 0.05,
    min_correlation: float = 0.1,
) -> Dict[str, Any]:
    """
    Performs comprehensive correlation analysis with significance testing for numerical features.

    ### Purpose

    This test conducts detailed correlation analysis between numerical features, including
    correlation coefficients, significance testing, and identification of significant
    relationships. It helps identify multicollinearity, feature relationships, and
    potential redundancies in the dataset.

    ### Test Mechanism

    The test computes correlation coefficients using the specified method and performs
    statistical significance testing for each correlation pair. It provides:
    - Correlation matrix with significance indicators
    - List of significant correlations above threshold
    - Summary statistics about correlation patterns
    - Identification of highly correlated feature pairs

    ### Signs of High Risk

    - Very high correlations (>0.9) indicating potential multicollinearity
    - Many significant correlations suggesting complex feature interactions
    - Features with no significant correlations to others (potential isolation)
    - Unexpected correlation patterns contradicting domain knowledge

    ### Strengths

    - Provides statistical significance testing for correlations
    - Supports multiple correlation methods (Pearson, Spearman, Kendall)
    - Identifies potentially problematic high correlations
    - Filters results by minimum correlation threshold
    - Comprehensive summary of correlation patterns

    ### Limitations

    - Limited to numerical features only
    - Cannot detect non-linear relationships (except with Spearman)
    - Significance testing assumes certain distributional properties
    - Correlation does not imply causation
    """
    # Stage 1: select usable numeric columns and complete rows.
    prepared = _validate_and_prepare_data(dataset, columns)

    # Stage 2: coefficients plus matching p-values for the chosen method.
    corr_matrix, p_matrix = _compute_correlation_matrices(prepared, method)

    # Stage 3: per-pair rows (filtered by min_correlation) and the overall summary.
    pairs = _create_correlation_pairs(
        corr_matrix, p_matrix, significance_level, min_correlation
    )
    summary = _create_summary_statistics(corr_matrix, pairs)

    # The pairs table is emitted only when at least one pair passed the filter;
    # the summary table is always present.
    results = {}
    if pairs:
        results["Correlation Pairs"] = format_records(pd.DataFrame(pairs))
    results["Summary Statistics"] = format_records(pd.DataFrame([summary]))
    return results
|
187
|
+
|
188
|
+
|
189
|
+
def _compute_pearson_with_pvalues(data):
|
190
|
+
"""Compute Pearson correlation with p-values"""
|
191
|
+
n_vars = data.shape[1]
|
192
|
+
corr_matrix = data.corr(method="pearson")
|
193
|
+
p_matrix = pd.DataFrame(
|
194
|
+
np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
|
195
|
+
)
|
196
|
+
|
197
|
+
for i, col1 in enumerate(data.columns):
|
198
|
+
for j, col2 in enumerate(data.columns):
|
199
|
+
if i != j:
|
200
|
+
_, p_val = stats.pearsonr(data[col1], data[col2])
|
201
|
+
p_matrix.iloc[i, j] = p_val
|
202
|
+
|
203
|
+
return corr_matrix, p_matrix
|
204
|
+
|
205
|
+
|
206
|
+
def _compute_spearman_with_pvalues(data):
|
207
|
+
"""Compute Spearman correlation with p-values"""
|
208
|
+
n_vars = data.shape[1]
|
209
|
+
corr_matrix = data.corr(method="spearman")
|
210
|
+
p_matrix = pd.DataFrame(
|
211
|
+
np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
|
212
|
+
)
|
213
|
+
|
214
|
+
for i, col1 in enumerate(data.columns):
|
215
|
+
for j, col2 in enumerate(data.columns):
|
216
|
+
if i != j:
|
217
|
+
_, p_val = stats.spearmanr(data[col1], data[col2])
|
218
|
+
p_matrix.iloc[i, j] = p_val
|
219
|
+
|
220
|
+
return corr_matrix, p_matrix
|
221
|
+
|
222
|
+
|
223
|
+
def _compute_kendall_with_pvalues(data):
|
224
|
+
"""Compute Kendall correlation with p-values"""
|
225
|
+
n_vars = data.shape[1]
|
226
|
+
corr_matrix = data.corr(method="kendall")
|
227
|
+
p_matrix = pd.DataFrame(
|
228
|
+
np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
|
229
|
+
)
|
230
|
+
|
231
|
+
for i, col1 in enumerate(data.columns):
|
232
|
+
for j, col2 in enumerate(data.columns):
|
233
|
+
if i != j:
|
234
|
+
_, p_val = stats.kendalltau(data[col1], data[col2])
|
235
|
+
p_matrix.iloc[i, j] = p_val
|
236
|
+
|
237
|
+
return corr_matrix, p_matrix
|
238
|
+
|
239
|
+
|
240
|
+
def _correlation_strength(abs_corr):
|
241
|
+
"""Classify correlation strength"""
|
242
|
+
if abs_corr >= 0.9:
|
243
|
+
return "Very Strong"
|
244
|
+
elif abs_corr >= 0.7:
|
245
|
+
return "Strong"
|
246
|
+
elif abs_corr >= 0.5:
|
247
|
+
return "Moderate"
|
248
|
+
elif abs_corr >= 0.3:
|
249
|
+
return "Weak"
|
250
|
+
else:
|
251
|
+
return "Very Weak"
|
@@ -0,0 +1,197 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
import pandas as pd
|
9
|
+
from scipy import stats
|
10
|
+
|
11
|
+
from validmind import tags, tasks
|
12
|
+
from validmind.errors import SkipTestError
|
13
|
+
from validmind.utils import format_records
|
14
|
+
from validmind.vm_models import VMDataset
|
15
|
+
|
16
|
+
|
17
|
+
def _validate_columns(dataset: VMDataset, columns: Optional[List[str]]):
    """Select the integer/float feature columns to describe.

    Args:
        dataset: Dataset providing ``feature_columns_numeric`` and ``df``.
        columns: Optional subset of column names; names not among the
            dataset's numeric feature columns are dropped. ``None`` selects
            every numeric feature column.

    Returns:
        The list of retained column names, in their original order.

    Raises:
        SkipTestError: If no integer or float column remains.
    """
    if columns is None:
        candidates = dataset.feature_columns_numeric
    else:
        allowed = set(dataset.feature_columns_numeric)
        candidates = [name for name in columns if name in allowed]

    # Keep only true integer/float dtypes; this drops boolean columns, which
    # pass neither dtype check.
    dtype_checks = (pd.api.types.is_integer_dtype, pd.api.types.is_float_dtype)
    columns = [
        name
        for name in candidates
        if any(check(dataset.df[name].dtype) for check in dtype_checks)
    ]

    if columns:
        return columns

    raise SkipTestError(
        "No numerical columns (integer/float) found for descriptive statistics"
    )
|
40
|
+
|
41
|
+
|
42
|
+
def _compute_basic_stats(column: str, data, total_count: int):
|
43
|
+
"""Compute basic statistics for a column."""
|
44
|
+
return {
|
45
|
+
"Feature": column,
|
46
|
+
"Count": len(data),
|
47
|
+
"Missing": total_count - len(data),
|
48
|
+
"Missing %": ((total_count - len(data)) / total_count) * 100,
|
49
|
+
"Mean": data.mean(),
|
50
|
+
"Median": data.median(),
|
51
|
+
"Std": data.std(),
|
52
|
+
"Min": data.min(),
|
53
|
+
"Max": data.max(),
|
54
|
+
"Q1": data.quantile(0.25),
|
55
|
+
"Q3": data.quantile(0.75),
|
56
|
+
"IQR": data.quantile(0.75) - data.quantile(0.25),
|
57
|
+
}
|
58
|
+
|
59
|
+
|
60
|
+
def _compute_advanced_stats(column: str, data, confidence_level: float):
|
61
|
+
"""Compute advanced statistics for a column."""
|
62
|
+
try:
|
63
|
+
# Distribution measures
|
64
|
+
skewness = stats.skew(data)
|
65
|
+
kurtosis_val = stats.kurtosis(data)
|
66
|
+
cv = (data.std() / data.mean()) * 100 if data.mean() != 0 else np.nan
|
67
|
+
|
68
|
+
# Confidence interval for mean
|
69
|
+
ci_lower, ci_upper = stats.t.interval(
|
70
|
+
confidence_level,
|
71
|
+
len(data) - 1,
|
72
|
+
loc=data.mean(),
|
73
|
+
scale=data.std() / np.sqrt(len(data)),
|
74
|
+
)
|
75
|
+
|
76
|
+
# Normality test
|
77
|
+
if len(data) <= 5000:
|
78
|
+
normality_stat, normality_p = stats.shapiro(data)
|
79
|
+
normality_test = "Shapiro-Wilk"
|
80
|
+
else:
|
81
|
+
ad_result = stats.anderson(data, dist="norm")
|
82
|
+
normality_stat = ad_result.statistic
|
83
|
+
normality_p = 0.05 if normality_stat > ad_result.critical_values[2] else 0.1
|
84
|
+
normality_test = "Anderson-Darling"
|
85
|
+
|
86
|
+
# Outlier detection using IQR method
|
87
|
+
iqr = data.quantile(0.75) - data.quantile(0.25)
|
88
|
+
lower_bound = data.quantile(0.25) - 1.5 * iqr
|
89
|
+
upper_bound = data.quantile(0.75) + 1.5 * iqr
|
90
|
+
outliers = data[(data < lower_bound) | (data > upper_bound)]
|
91
|
+
outlier_count = len(outliers)
|
92
|
+
outlier_pct = (outlier_count / len(data)) * 100
|
93
|
+
|
94
|
+
return {
|
95
|
+
"Feature": column,
|
96
|
+
"Skewness": skewness,
|
97
|
+
"Kurtosis": kurtosis_val,
|
98
|
+
"CV %": cv,
|
99
|
+
f"CI Lower ({confidence_level*100:.0f}%)": ci_lower,
|
100
|
+
f"CI Upper ({confidence_level*100:.0f}%)": ci_upper,
|
101
|
+
"Normality Test": normality_test,
|
102
|
+
"Normality Stat": normality_stat,
|
103
|
+
"Normality p-value": normality_p,
|
104
|
+
"Normal Distribution": "Yes" if normality_p > 0.05 else "No",
|
105
|
+
"Outliers (IQR)": outlier_count,
|
106
|
+
"Outliers %": outlier_pct,
|
107
|
+
}
|
108
|
+
except Exception:
|
109
|
+
return None
|
110
|
+
|
111
|
+
|
112
|
+
@tags("tabular_data", "statistics", "data_quality")
@tasks("classification", "regression", "clustering")
def DescriptiveStats(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    include_advanced: bool = True,
    confidence_level: float = 0.95,
) -> Dict[str, Any]:
    """
    Provides comprehensive descriptive statistics for numerical features in a dataset.

    ### Purpose

    This test generates detailed descriptive statistics for numerical features, including
    basic statistics, distribution measures, confidence intervals, and normality tests.
    It provides a comprehensive overview of data characteristics essential for
    understanding data quality and distribution properties.

    ### Test Mechanism

    The test computes various statistical measures for each numerical column:
    - Basic statistics: count, mean, median, std, min, max, quartiles
    - Distribution measures: skewness, kurtosis, coefficient of variation
    - Confidence intervals for the mean
    - Normality tests (Shapiro-Wilk for small samples, Anderson-Darling for larger)
    - Missing value analysis

    ### Signs of High Risk

    - High skewness or kurtosis indicating non-normal distributions
    - Large coefficients of variation suggesting high data variability
    - Significant results in normality tests when normality is expected
    - High percentage of missing values
    - Extreme outliers based on IQR analysis

    ### Strengths

    - Comprehensive statistical analysis in a single test
    - Includes advanced statistical measures beyond basic descriptives
    - Provides confidence intervals for uncertainty quantification
    - Handles missing values appropriately
    - Suitable for both exploratory and confirmatory analysis

    ### Limitations

    - Limited to numerical features only
    - Normality tests may not be meaningful for all data types
    - Large datasets may make some tests computationally expensive
    - Interpretation requires statistical knowledge
    """
    columns = _validate_columns(dataset, columns)

    basic_stats, advanced_stats = [], []

    for column in columns:
        data = dataset.df[column].dropna()
        total_count = len(dataset.df[column])

        # Columns with no observed values contribute no rows at all.
        if data.empty:
            continue

        basic_stats.append(_compute_basic_stats(column, data, total_count))

        # Advanced measures are attempted only with more than two observations;
        # the helper returns None when its computation fails.
        if not (include_advanced and len(data) > 2):
            continue
        advanced_row = _compute_advanced_stats(column, data, confidence_level)
        if advanced_row is not None:
            advanced_stats.append(advanced_row)

    # advanced_stats can only be non-empty when include_advanced was truthy.
    results = {}
    if basic_stats:
        results["Basic Statistics"] = format_records(pd.DataFrame(basic_stats))
    if advanced_stats:
        results["Advanced Statistics"] = format_records(pd.DataFrame(advanced_stats))

    if results:
        return results

    raise SkipTestError("Unable to compute statistics for any columns")
|
@@ -0,0 +1,147 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
from scipy import stats
|
9
|
+
|
10
|
+
from validmind import tags, tasks
|
11
|
+
from validmind.errors import SkipTestError
|
12
|
+
from validmind.utils import format_records
|
13
|
+
from validmind.vm_models import VMDataset
|
14
|
+
|
15
|
+
|
16
|
+
def _validate_columns(dataset: VMDataset, columns: Optional[List[str]]):
    """Resolve the numeric feature columns to run normality tests on.

    Args:
        dataset: Dataset providing ``feature_columns_numeric``.
        columns: Optional subset of column names; names not among the
            dataset's numeric feature columns are dropped. ``None`` selects
            every numeric feature column.

    Returns:
        The selected column names.

    Raises:
        SkipTestError: If the selection is empty.
    """
    numeric_columns = dataset.feature_columns_numeric

    if columns is None:
        selected = numeric_columns
    else:
        # Set lookup keeps the filter linear while preserving the caller's order.
        allowed = set(numeric_columns)
        selected = [name for name in columns if name in allowed]

    if selected:
        return selected

    raise SkipTestError("No numerical columns found for normality testing")
|
28
|
+
|
29
|
+
|
30
|
+
def _run_shapiro_test(data, tests: List[str], alpha: float):
|
31
|
+
"""Run Shapiro-Wilk test if requested and data size is appropriate."""
|
32
|
+
results = {}
|
33
|
+
if "shapiro" in tests and len(data) <= 5000:
|
34
|
+
try:
|
35
|
+
stat, p_value = stats.shapiro(data)
|
36
|
+
results["Shapiro-Wilk Stat"] = stat
|
37
|
+
results["Shapiro-Wilk p-value"] = p_value
|
38
|
+
results["Shapiro-Wilk Normal"] = "Yes" if p_value > alpha else "No"
|
39
|
+
except Exception:
|
40
|
+
results["Shapiro-Wilk Normal"] = "Test Failed"
|
41
|
+
return results
|
42
|
+
|
43
|
+
|
44
|
+
def _run_anderson_test(data, tests: List[str]):
|
45
|
+
"""Run Anderson-Darling test if requested."""
|
46
|
+
results = {}
|
47
|
+
if "anderson" in tests:
|
48
|
+
try:
|
49
|
+
ad_result = stats.anderson(data, dist="norm")
|
50
|
+
critical_value = ad_result.critical_values[2] # 5% level
|
51
|
+
results["Anderson-Darling Stat"] = ad_result.statistic
|
52
|
+
results["Anderson-Darling Critical"] = critical_value
|
53
|
+
results["Anderson-Darling Normal"] = (
|
54
|
+
"Yes" if ad_result.statistic < critical_value else "No"
|
55
|
+
)
|
56
|
+
except Exception:
|
57
|
+
results["Anderson-Darling Normal"] = "Test Failed"
|
58
|
+
return results
|
59
|
+
|
60
|
+
|
61
|
+
def _run_ks_test(data, tests: List[str], alpha: float):
|
62
|
+
"""Run Kolmogorov-Smirnov test if requested."""
|
63
|
+
results = {}
|
64
|
+
if "kstest" in tests:
|
65
|
+
try:
|
66
|
+
standardized = (data - data.mean()) / data.std()
|
67
|
+
stat, p_value = stats.kstest(standardized, "norm")
|
68
|
+
results["KS Test Stat"] = stat
|
69
|
+
results["KS Test p-value"] = p_value
|
70
|
+
results["KS Test Normal"] = "Yes" if p_value > alpha else "No"
|
71
|
+
except Exception:
|
72
|
+
results["KS Test Normal"] = "Test Failed"
|
73
|
+
return results
|
74
|
+
|
75
|
+
|
76
|
+
def _process_column_tests(column: str, data, tests: List[str], alpha: float):
    """Build one result row holding every requested normality test for a column.

    Args:
        column: Column name, reported under ``"Feature"``.
        data: The column's observations.
        tests: Names of the requested tests, forwarded to each runner.
        alpha: Significance threshold forwarded to the Shapiro-Wilk and
            Kolmogorov-Smirnov runners (the Anderson-Darling runner takes
            no threshold).

    Returns:
        A dict starting with the feature name and sample size, followed by
        the keys contributed by each runner in Shapiro, Anderson, KS order.
    """
    row = {"Feature": column, "Sample Size": len(data)}

    # Each runner returns an empty dict when its test was not requested.
    partial_results = (
        _run_shapiro_test(data, tests, alpha),
        _run_anderson_test(data, tests),
        _run_ks_test(data, tests, alpha),
    )
    for partial in partial_results:
        row.update(partial)

    return row
|
86
|
+
|
87
|
+
|
88
|
+
@tags("tabular_data", "statistics", "normality")
@tasks("classification", "regression", "clustering")
def NormalityTests(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    alpha: float = 0.05,
    # NOTE: list default kept for signature stability; it is only read here.
    tests: List[str] = ["shapiro", "anderson", "kstest"],
) -> Dict[str, Any]:
    """
    Performs multiple normality tests on numerical features to assess distribution normality.

    ### Purpose

    This test evaluates whether numerical features follow a normal distribution using
    various statistical tests. Understanding distribution normality is crucial for
    selecting appropriate statistical methods and model assumptions.

    ### Test Mechanism

    The test applies multiple normality tests:
    - Shapiro-Wilk test: Best for small to medium samples
    - Anderson-Darling test: More sensitive to deviations in tails
    - Kolmogorov-Smirnov test: General goodness-of-fit test

    ### Signs of High Risk

    - Multiple normality tests failing consistently
    - Very low p-values indicating strong evidence against normality
    - Conflicting results between different normality tests

    ### Strengths

    - Multiple statistical tests for robust assessment
    - Clear pass/fail indicators for each test
    - Suitable for different sample sizes

    ### Limitations

    - Limited to numerical features only
    - Some tests sensitive to sample size
    - Perfect normality is rare in real data
    """
    # Validate inputs
    columns = _validate_columns(dataset, columns)

    # Process each column; columns with fewer than 3 non-missing values are
    # left out of the table.
    normality_results = []
    for column in columns:
        data = dataset.df[column].dropna()

        if len(data) >= 3:
            normality_results.append(
                _process_column_tests(column, data, tests, alpha)
            )

    # Skip explicitly instead of returning an empty result when no column
    # had enough observations.
    if not normality_results:
        raise SkipTestError(
            "No columns with at least 3 non-missing values for normality testing"
        )

    return {"Normality Tests": format_records(pd.DataFrame(normality_results))}
|