validmind 2.8.29__py3-none-any.whl → 2.10.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +16 -5
- validmind/__version__.py +1 -1
- validmind/ai/utils.py +4 -24
- validmind/api_client.py +6 -17
- validmind/datasets/credit_risk/lending_club.py +13 -1
- validmind/datasets/nlp/cnn_dailymail.py +15 -1
- validmind/logging.py +48 -0
- validmind/tests/__init__.py +2 -0
- validmind/tests/__types__.py +18 -0
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
- validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
- validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
- validmind/tests/data_validation/SeasonalDecompose.py +14 -2
- validmind/tests/data_validation/ShapiroWilk.py +14 -1
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
- validmind/tests/data_validation/WOEBinPlots.py +14 -1
- validmind/tests/data_validation/WOEBinTable.py +13 -2
- validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
- validmind/tests/data_validation/nlp/CommonWords.py +14 -2
- validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
- validmind/tests/data_validation/nlp/Sentiment.py +13 -1
- validmind/tests/data_validation/nlp/StopWords.py +14 -2
- validmind/tests/data_validation/nlp/TextDescription.py +14 -2
- validmind/tests/data_validation/nlp/Toxicity.py +13 -1
- validmind/tests/model_validation/BertScore.py +13 -2
- validmind/tests/model_validation/BleuScore.py +13 -2
- validmind/tests/model_validation/ContextualRecall.py +13 -1
- validmind/tests/model_validation/MeteorScore.py +13 -2
- validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
- validmind/tests/model_validation/RegardScore.py +13 -2
- validmind/tests/model_validation/RougeScore.py +14 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
- validmind/tests/model_validation/ToxicityScore.py +13 -1
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
- validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +14 -1
- validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +14 -1
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
- validmind/tests/output.py +9 -2
- validmind/tests/plots/BoxPlot.py +260 -0
- validmind/tests/plots/CorrelationHeatmap.py +235 -0
- validmind/tests/plots/HistogramPlot.py +233 -0
- validmind/tests/plots/ViolinPlot.py +125 -0
- validmind/tests/plots/__init__.py +0 -0
- validmind/tests/stats/CorrelationAnalysis.py +251 -0
- validmind/tests/stats/DescriptiveStats.py +197 -0
- validmind/tests/stats/NormalityTests.py +147 -0
- validmind/tests/stats/OutlierDetection.py +173 -0
- validmind/tests/stats/__init__.py +0 -0
- validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
- validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
- validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
- validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
- validmind/unit_metrics/classification/individual/Confidence.py +52 -0
- validmind/unit_metrics/classification/individual/Correctness.py +41 -0
- validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
- validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
- validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
- validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
- validmind/unit_metrics/classification/individual/__init__.py +0 -0
- validmind/vm_models/dataset/dataset.py +147 -1
- validmind/vm_models/result/result.py +30 -6
- validmind-2.10.0rc1.dist-info/METADATA +845 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/RECORD +70 -49
- validmind-2.8.29.dist-info/METADATA +0 -137
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/LICENSE +0 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/WHEEL +0 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/entry_points.txt +0 -0
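Usage sketch (illustrative, not part of the package diff): one way the newly added plots/ and stats/ tests could be invoked through the ValidMind test harness. The test IDs are inferred from the new module paths (validmind/tests/plots, validmind/tests/stats) and may not match the registered IDs exactly; the lending_club demo module, its load_data() helper, and target_column attribute are assumed from the existing validmind/datasets/credit_risk package.

import validmind as vm
from validmind.datasets.credit_risk import lending_club

# vm.init(...) with API credentials is assumed to have been called already.
df = lending_club.load_data()
vm_dataset = vm.init_dataset(dataset=df, target_column=lending_club.target_column)

vm.tests.run_test(
    "validmind.plots.HistogramPlot",        # assumed ID for the new histogram test
    inputs={"dataset": vm_dataset},
    params={"n_cols": 3, "show_kde": True},
)
vm.tests.run_test(
    "validmind.stats.CorrelationAnalysis",  # assumed ID for the new correlation test
    inputs={"dataset": vm_dataset},
    params={"method": "spearman", "min_correlation": 0.3},
)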
validmind/tests/plots/HistogramPlot.py (new file, 233 lines added)
@@ -0,0 +1,233 @@

# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import List, Optional, Union

import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

from validmind import tags, tasks
from validmind.errors import SkipTestError
from validmind.vm_models import VMDataset


def _validate_columns(dataset: VMDataset, columns: Optional[List[str]]):
    """Validate and return numerical columns."""
    if columns is None:
        columns = dataset.feature_columns_numeric
    else:
        available_columns = set(dataset.feature_columns_numeric)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for histogram plotting")

    return columns


def _process_column_data(data, log_scale: bool, column: str):
    """Process column data and return plot data and xlabel."""
    plot_data = data
    xlabel = column
    if log_scale and (data > 0).all():
        plot_data = np.log10(data)
        xlabel = f"log10({column})"
    return plot_data, xlabel


def _add_histogram_trace(
    fig, plot_data, bins, color, opacity, normalize, column, row, col
):
    """Add histogram trace to figure."""
    histnorm = "probability density" if normalize else None

    fig.add_trace(
        go.Histogram(
            x=plot_data,
            nbinsx=bins if isinstance(bins, int) else None,
            name=f"Histogram - {column}",
            marker_color=color,
            opacity=opacity,
            histnorm=histnorm,
            showlegend=False,
        ),
        row=row,
        col=col,
    )


def _add_kde_trace(fig, plot_data, bins, normalize, column, row, col):
    """Add KDE trace to figure if possible."""
    try:
        kde = stats.gaussian_kde(plot_data)
        x_range = np.linspace(plot_data.min(), plot_data.max(), 100)
        kde_values = kde(x_range)

        if not normalize:
            hist_max = (
                len(plot_data) / bins if isinstance(bins, int) else len(plot_data) / 30
            )
            kde_values = kde_values * hist_max / kde_values.max()

        fig.add_trace(
            go.Scatter(
                x=x_range,
                y=kde_values,
                mode="lines",
                name=f"KDE - {column}",
                line=dict(color="red", width=2),
                showlegend=False,
            ),
            row=row,
            col=col,
        )
    except Exception:
        pass


def _add_stats_annotation(fig, data, idx, row, col):
    """Add statistics annotation to subplot."""
    stats_text = f"Mean: {data.mean():.3f}<br>Std: {data.std():.3f}<br>N: {len(data)}"
    fig.add_annotation(
        text=stats_text,
        x=0.02,
        y=0.98,
        xref=f"x{idx+1} domain" if idx > 0 else "x domain",
        yref=f"y{idx+1} domain" if idx > 0 else "y domain",
        showarrow=False,
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="black",
        borderwidth=1,
        row=row,
        col=col,
    )


@tags("tabular_data", "visualization", "data_quality")
@tasks("classification", "regression", "clustering")
def HistogramPlot(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    bins: Union[int, str, List] = 30,
    color: str = "steelblue",
    opacity: float = 0.7,
    show_kde: bool = True,
    normalize: bool = False,
    log_scale: bool = False,
    title_prefix: str = "Histogram of",
    width: int = 1200,
    height: int = 800,
    n_cols: int = 2,
    vertical_spacing: float = 0.15,
    horizontal_spacing: float = 0.1,
) -> go.Figure:
    """
    Generates customizable histogram plots for numerical features in a dataset using Plotly.

    ### Purpose

    This test provides a flexible way to visualize the distribution of numerical features in a dataset.
    It allows for extensive customization of the histogram appearance and behavior through parameters,
    making it suitable for various exploratory data analysis tasks.

    ### Test Mechanism

    The test creates histogram plots for specified numerical columns (or all numerical columns if none specified).
    It supports various customization options including:
    - Number of bins or bin edges
    - Color and opacity
    - Kernel density estimation overlay
    - Logarithmic scaling
    - Normalization options
    - Configurable subplot layout (columns and spacing)

    ### Signs of High Risk

    - Highly skewed distributions that may indicate data quality issues
    - Unexpected bimodal or multimodal distributions
    - Presence of extreme outliers
    - Empty or sparse distributions

    ### Strengths

    - Highly customizable visualization options
    - Interactive Plotly plots with zoom, pan, and hover capabilities
    - Supports both single and multiple column analysis
    - Provides insights into data distribution patterns
    - Can handle different data types and scales
    - Configurable subplot layout for better visualization

    ### Limitations

    - Limited to numerical features only
    - Visual interpretation may be subjective
    - May not be suitable for high-dimensional datasets
    - Performance may degrade with very large datasets
    """
    # Validate inputs
    columns = _validate_columns(dataset, columns)

    # Calculate subplot layout
    n_cols = min(n_cols, len(columns))
    n_rows = (len(columns) + n_cols - 1) // n_cols

    # Create subplots
    subplot_titles = [f"{title_prefix} {col}" for col in columns]
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=subplot_titles,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=horizontal_spacing,
    )

    for idx, column in enumerate(columns):
        row = (idx // n_cols) + 1
        col = (idx % n_cols) + 1
        data = dataset.df[column].dropna()

        if len(data) == 0:
            fig.add_annotation(
                text=f"No data available<br>for {column}",
                x=0.5,
                y=0.5,
                xref=f"x{idx+1}" if idx > 0 else "x",
                yref=f"y{idx+1}" if idx > 0 else "y",
                showarrow=False,
                row=row,
                col=col,
            )
            continue

        # Process data
        plot_data, xlabel = _process_column_data(data, log_scale, column)

        # Add histogram
        _add_histogram_trace(
            fig, plot_data, bins, color, opacity, normalize, column, row, col
        )

        # Add KDE if requested
        if show_kde and len(data) > 1:
            _add_kde_trace(fig, plot_data, bins, normalize, column, row, col)

        # Update axes and add annotations
        fig.update_xaxes(title_text=xlabel, row=row, col=col)
        ylabel = "Density" if normalize else "Frequency"
        fig.update_yaxes(title_text=ylabel, row=row, col=col)
        _add_stats_annotation(fig, data, idx, row, col)

    # Update layout
    fig.update_layout(
        title_text="Dataset Feature Distributions",
        showlegend=False,
        width=width,
        height=height,
        template="plotly_white",
    )

    return fig
validmind/tests/plots/ViolinPlot.py (new file, 125 lines added)
@@ -0,0 +1,125 @@

# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import List, Optional

import plotly.express as px

from validmind import tags, tasks
from validmind.errors import SkipTestError
from validmind.vm_models import VMDataset


@tags("tabular_data", "visualization", "distribution")
@tasks("classification", "regression", "clustering")
def ViolinPlot(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    group_by: Optional[str] = None,
    width: int = 800,
    height: int = 600,
) -> px.violin:
    """
    Generates interactive violin plots for numerical features using Plotly.

    ### Purpose

    This test creates violin plots to visualize the distribution of numerical features,
    showing both the probability density and summary statistics. Violin plots combine
    aspects of box plots and kernel density estimation for rich distribution visualization.

    ### Test Mechanism

    The test creates violin plots for specified numerical columns, with optional
    grouping by categorical variables. Each violin shows the distribution shape,
    quartiles, and median values.

    ### Signs of High Risk

    - Multimodal distributions that might indicate mixed populations
    - Highly skewed distributions suggesting data quality issues
    - Large differences in distribution shapes across groups
    - Unusual distribution patterns that contradict domain expectations

    ### Strengths

    - Shows detailed distribution shape information
    - Interactive Plotly visualization with hover details
    - Effective for comparing distributions across groups
    - Combines density estimation with quartile information

    ### Limitations

    - Limited to numerical features only
    - Requires sufficient data points for meaningful density estimation
    - May not be suitable for discrete variables
    - Can be misleading with very small sample sizes
    """
    # Get numerical columns
    if columns is None:
        columns = dataset.feature_columns_numeric
    else:
        available_columns = set(dataset.feature_columns_numeric)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for violin plot")

    # For violin plots, we'll melt the data to long format
    data = dataset.df[columns].dropna()

    if len(data) == 0:
        raise SkipTestError("No valid data available for violin plot")

    # Melt the dataframe to long format
    melted_data = data.melt(var_name="Feature", value_name="Value")

    # Add group column if specified
    if group_by and group_by in dataset.df.columns:
        # Repeat group values for each feature
        group_values = []
        for column in columns:
            column_data = dataset.df[[column, group_by]].dropna()
            group_values.extend(column_data[group_by].tolist())

        if len(group_values) == len(melted_data):
            melted_data["Group"] = group_values
        else:
            group_by = None  # Disable grouping if lengths don't match

    # Create violin plot
    if group_by and "Group" in melted_data.columns:
        fig = px.violin(
            melted_data,
            x="Feature",
            y="Value",
            color="Group",
            box=True,
            title=f"Distribution of Features by {group_by}",
            width=width,
            height=height,
        )
    else:
        fig = px.violin(
            melted_data,
            x="Feature",
            y="Value",
            box=True,
            title="Feature Distributions",
            width=width,
            height=height,
        )

    # Update layout
    fig.update_layout(
        template="plotly_white",
        title_x=0.5,
        xaxis_title="Features",
        yaxis_title="Values",
    )

    # Rotate x-axis labels for better readability
    fig.update_xaxes(tickangle=45)

    return fig
validmind/tests/plots/__init__.py (new file, empty — no content changes)
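The wide-to-long reshape that ViolinPlot performs before calling plotly.express.violin can be seen on a toy frame. The sketch below is illustrative, not part of the diff; the column names and values are made up.

import pandas as pd
import plotly.express as px

df = pd.DataFrame(
    {
        "age": [23, 35, 41, 29],
        "income": [40_000, 62_000, 75_000, 48_000],
    }
)

# Wide format (one column per feature) -> long format (Feature/Value pairs),
# mirroring data.melt(var_name="Feature", value_name="Value") in the test.
melted = df.melt(var_name="Feature", value_name="Value")
print(melted.head())

fig = px.violin(melted, x="Feature", y="Value", box=True, title="Feature Distributions")
# fig.show()  # uncomment to render interactively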
validmind/tests/stats/CorrelationAnalysis.py (new file, 251 lines added)
@@ -0,0 +1,251 @@

# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from scipy import stats

from validmind import tags, tasks
from validmind.errors import SkipTestError
from validmind.utils import format_records
from validmind.vm_models import VMDataset


def _validate_and_prepare_data(dataset: VMDataset, columns: Optional[List[str]]):
    """Validate inputs and prepare data for correlation analysis."""
    if columns is None:
        columns = dataset.feature_columns_numeric
    else:
        available_columns = set(dataset.feature_columns_numeric)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for correlation analysis")

    if len(columns) < 2:
        raise SkipTestError(
            "At least 2 numerical columns required for correlation analysis"
        )

    # Get data and remove constant columns
    data = dataset.df[columns].dropna()
    data = data.loc[:, data.var() != 0]

    if data.shape[1] < 2:
        raise SkipTestError(
            "Insufficient non-constant columns for correlation analysis"
        )

    return data


def _compute_correlation_matrices(data, method: str):
    """Compute correlation and p-value matrices based on method."""
    if method == "pearson":
        return _compute_pearson_with_pvalues(data)
    elif method == "spearman":
        return _compute_spearman_with_pvalues(data)
    elif method == "kendall":
        return _compute_kendall_with_pvalues(data)
    else:
        raise ValueError(f"Unsupported correlation method: {method}")


def _create_correlation_pairs(
    corr_matrix, p_matrix, significance_level: float, min_correlation: float
):
    """Create correlation pairs table."""
    correlation_pairs = []

    for i, col1 in enumerate(corr_matrix.columns):
        for j, col2 in enumerate(corr_matrix.columns):
            if i < j:  # Only upper triangle to avoid duplicates
                corr_val = corr_matrix.iloc[i, j]
                p_val = p_matrix.iloc[i, j]

                if abs(corr_val) >= min_correlation:
                    pair_info = {
                        "Feature 1": col1,
                        "Feature 2": col2,
                        "Correlation": corr_val,
                        "Abs Correlation": abs(corr_val),
                        "p-value": p_val,
                        "Significant": "Yes" if p_val < significance_level else "No",
                        "Strength": _correlation_strength(abs(corr_val)),
                        "Direction": "Positive" if corr_val > 0 else "Negative",
                    }
                    correlation_pairs.append(pair_info)

    # Sort by absolute correlation value
    correlation_pairs.sort(key=lambda x: x["Abs Correlation"], reverse=True)
    return correlation_pairs


def _create_summary_statistics(corr_matrix, correlation_pairs):
    """Create summary statistics table."""
    all_correlations = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            all_correlations.append(abs(corr_matrix.iloc[i, j]))

    significant_count = sum(
        1 for pair in correlation_pairs if pair["Significant"] == "Yes"
    )
    high_corr_count = sum(
        1 for pair in correlation_pairs if pair["Abs Correlation"] > 0.7
    )
    very_high_corr_count = sum(
        1 for pair in correlation_pairs if pair["Abs Correlation"] > 0.9
    )

    return {
        "Total Feature Pairs": len(all_correlations),
        "Pairs Above Threshold": len(correlation_pairs),
        "Significant Correlations": significant_count,
        "High Correlations (>0.7)": high_corr_count,
        "Very High Correlations (>0.9)": very_high_corr_count,
        "Mean Absolute Correlation": np.mean(all_correlations),
        "Max Absolute Correlation": np.max(all_correlations),
        "Median Absolute Correlation": np.median(all_correlations),
    }


@tags("tabular_data", "statistics", "correlation")
@tasks("classification", "regression", "clustering")
def CorrelationAnalysis(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    method: str = "pearson",
    significance_level: float = 0.05,
    min_correlation: float = 0.1,
) -> Dict[str, Any]:
    """
    Performs comprehensive correlation analysis with significance testing for numerical features.

    ### Purpose

    This test conducts detailed correlation analysis between numerical features, including
    correlation coefficients, significance testing, and identification of significant
    relationships. It helps identify multicollinearity, feature relationships, and
    potential redundancies in the dataset.

    ### Test Mechanism

    The test computes correlation coefficients using the specified method and performs
    statistical significance testing for each correlation pair. It provides:
    - Correlation matrix with significance indicators
    - List of significant correlations above threshold
    - Summary statistics about correlation patterns
    - Identification of highly correlated feature pairs

    ### Signs of High Risk

    - Very high correlations (>0.9) indicating potential multicollinearity
    - Many significant correlations suggesting complex feature interactions
    - Features with no significant correlations to others (potential isolation)
    - Unexpected correlation patterns contradicting domain knowledge

    ### Strengths

    - Provides statistical significance testing for correlations
    - Supports multiple correlation methods (Pearson, Spearman, Kendall)
    - Identifies potentially problematic high correlations
    - Filters results by minimum correlation threshold
    - Comprehensive summary of correlation patterns

    ### Limitations

    - Limited to numerical features only
    - Cannot detect non-linear relationships (except with Spearman)
    - Significance testing assumes certain distributional properties
    - Correlation does not imply causation
    """
    # Validate and prepare data
    data = _validate_and_prepare_data(dataset, columns)

    # Compute correlation matrices
    corr_matrix, p_matrix = _compute_correlation_matrices(data, method)

    # Create correlation pairs
    correlation_pairs = _create_correlation_pairs(
        corr_matrix, p_matrix, significance_level, min_correlation
    )

    # Build results
    results = {}
    if correlation_pairs:
        results["Correlation Pairs"] = format_records(pd.DataFrame(correlation_pairs))

    # Create summary statistics
    summary_stats = _create_summary_statistics(corr_matrix, correlation_pairs)
    results["Summary Statistics"] = format_records(pd.DataFrame([summary_stats]))

    return results


def _compute_pearson_with_pvalues(data):
    """Compute Pearson correlation with p-values"""
    n_vars = data.shape[1]
    corr_matrix = data.corr(method="pearson")
    p_matrix = pd.DataFrame(
        np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
    )

    for i, col1 in enumerate(data.columns):
        for j, col2 in enumerate(data.columns):
            if i != j:
                _, p_val = stats.pearsonr(data[col1], data[col2])
                p_matrix.iloc[i, j] = p_val

    return corr_matrix, p_matrix


def _compute_spearman_with_pvalues(data):
    """Compute Spearman correlation with p-values"""
    n_vars = data.shape[1]
    corr_matrix = data.corr(method="spearman")
    p_matrix = pd.DataFrame(
        np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
    )

    for i, col1 in enumerate(data.columns):
        for j, col2 in enumerate(data.columns):
            if i != j:
                _, p_val = stats.spearmanr(data[col1], data[col2])
                p_matrix.iloc[i, j] = p_val

    return corr_matrix, p_matrix


def _compute_kendall_with_pvalues(data):
    """Compute Kendall correlation with p-values"""
    n_vars = data.shape[1]
    corr_matrix = data.corr(method="kendall")
    p_matrix = pd.DataFrame(
        np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
    )

    for i, col1 in enumerate(data.columns):
        for j, col2 in enumerate(data.columns):
            if i != j:
                _, p_val = stats.kendalltau(data[col1], data[col2])
                p_matrix.iloc[i, j] = p_val

    return corr_matrix, p_matrix


def _correlation_strength(abs_corr):
    """Classify correlation strength"""
    if abs_corr >= 0.9:
        return "Very Strong"
    elif abs_corr >= 0.7:
        return "Strong"
    elif abs_corr >= 0.5:
        return "Moderate"
    elif abs_corr >= 0.3:
        return "Weak"
    else:
        return "Very Weak"