validmind 2.8.29__py3-none-any.whl → 2.10.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +16 -5
- validmind/__version__.py +1 -1
- validmind/ai/utils.py +4 -24
- validmind/api_client.py +6 -17
- validmind/datasets/credit_risk/lending_club.py +13 -1
- validmind/datasets/nlp/cnn_dailymail.py +15 -1
- validmind/logging.py +48 -0
- validmind/tests/__init__.py +2 -0
- validmind/tests/__types__.py +18 -0
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
- validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
- validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
- validmind/tests/data_validation/SeasonalDecompose.py +14 -2
- validmind/tests/data_validation/ShapiroWilk.py +14 -1
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
- validmind/tests/data_validation/WOEBinPlots.py +14 -1
- validmind/tests/data_validation/WOEBinTable.py +13 -2
- validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
- validmind/tests/data_validation/nlp/CommonWords.py +14 -2
- validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
- validmind/tests/data_validation/nlp/Sentiment.py +13 -1
- validmind/tests/data_validation/nlp/StopWords.py +14 -2
- validmind/tests/data_validation/nlp/TextDescription.py +14 -2
- validmind/tests/data_validation/nlp/Toxicity.py +13 -1
- validmind/tests/model_validation/BertScore.py +13 -2
- validmind/tests/model_validation/BleuScore.py +13 -2
- validmind/tests/model_validation/ContextualRecall.py +13 -1
- validmind/tests/model_validation/MeteorScore.py +13 -2
- validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
- validmind/tests/model_validation/RegardScore.py +13 -2
- validmind/tests/model_validation/RougeScore.py +14 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
- validmind/tests/model_validation/ToxicityScore.py +13 -1
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
- validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +14 -1
- validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +14 -1
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
- validmind/tests/output.py +9 -2
- validmind/tests/plots/BoxPlot.py +260 -0
- validmind/tests/plots/CorrelationHeatmap.py +235 -0
- validmind/tests/plots/HistogramPlot.py +233 -0
- validmind/tests/plots/ViolinPlot.py +125 -0
- validmind/tests/plots/__init__.py +0 -0
- validmind/tests/stats/CorrelationAnalysis.py +251 -0
- validmind/tests/stats/DescriptiveStats.py +197 -0
- validmind/tests/stats/NormalityTests.py +147 -0
- validmind/tests/stats/OutlierDetection.py +173 -0
- validmind/tests/stats/__init__.py +0 -0
- validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
- validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
- validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
- validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
- validmind/unit_metrics/classification/individual/Confidence.py +52 -0
- validmind/unit_metrics/classification/individual/Correctness.py +41 -0
- validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
- validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
- validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
- validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
- validmind/unit_metrics/classification/individual/__init__.py +0 -0
- validmind/vm_models/dataset/dataset.py +147 -1
- validmind/vm_models/result/result.py +30 -6
- validmind-2.10.0rc1.dist-info/METADATA +845 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/RECORD +70 -49
- validmind-2.8.29.dist-info/METADATA +0 -137
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/LICENSE +0 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/WHEEL +0 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,260 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
from typing import List, Optional
|
6
|
+
|
7
|
+
import plotly.graph_objects as go
|
8
|
+
from plotly.subplots import make_subplots
|
9
|
+
|
10
|
+
from validmind import tags, tasks
|
11
|
+
from validmind.errors import SkipTestError
|
12
|
+
from validmind.vm_models import VMDataset
|
13
|
+
|
14
|
+
|
15
|
+
def _validate_inputs(
    dataset: VMDataset, columns: Optional[List[str]], group_by: Optional[str]
):
    """Resolve the numeric feature columns that should be plotted.

    Args:
        dataset: Dataset whose ``feature_columns_numeric`` and ``df`` are read.
        columns: Requested column names, or ``None`` to use every numeric
            feature column. Requested names that are not numeric feature
            columns are silently dropped.
        group_by: Optional name of the grouping column. It must be a column of
            ``dataset.df`` and is excluded from the returned columns.

    Returns:
        A new list of column names to plot. Neither the caller's list nor the
        dataset's own column list is modified.

    Raises:
        SkipTestError: If no numeric column is left to plot, or if ``group_by``
            is not a column of ``dataset.df``.
    """
    numeric_columns = dataset.feature_columns_numeric
    if columns is None:
        # Copy: the result is filtered below and must not alias dataset state.
        columns = list(numeric_columns)
    else:
        allowed = set(numeric_columns)
        columns = [col for col in columns if col in allowed]

    if not columns:
        raise SkipTestError("No numerical columns found for box plotting")

    if group_by is not None:
        if group_by not in dataset.df.columns:
            raise SkipTestError(f"Group column '{group_by}' not found in dataset")
        # The grouping column is the split key, not a feature to plot.
        columns = [col for col in columns if col != group_by]
        if not columns:
            # Only the grouping column was numeric: nothing is left to draw.
            raise SkipTestError("No numerical columns found for box plotting")

    return columns
|
35
|
+
|
36
|
+
|
37
|
+
def _create_grouped_boxplot(
    dataset, columns, group_by, colors, show_outliers, title_prefix, width, height
):
    """Create one figure with a box per (feature column, group value) pair.

    Args:
        dataset: Dataset whose ``df`` frame supplies the data.
        columns: Feature column names; each becomes a category on the x-axis.
        group_by: Column whose distinct non-null values define the groups.
        colors: Non-empty sequence of colors, cycled over the groups.
        show_outliers: Draw outlier points when true, no points otherwise.
        title_prefix: Text placed at the start of the figure title.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.

    Returns:
        The ``go.Figure`` holding the grouped box traces. Pairs without any
        non-missing value contribute no trace.
    """
    frame = dataset.df
    group_values = frame[group_by].dropna().unique()
    # The row mask depends only on the group, so build each one once instead
    # of once per (column, group) pair.
    group_masks = [frame[group_by] == value for value in group_values]
    boxpoints = "outliers" if show_outliers else False

    fig = go.Figure()
    # Groups that already own a legend entry. Tracking this per group, rather
    # than tying the legend to the first column, keeps a group in the legend
    # even when it has no data for the first column.
    groups_in_legend = set()

    for column in columns:
        for group_idx, group_value in enumerate(group_values):
            data_subset = frame.loc[group_masks[group_idx], column].dropna()
            if len(data_subset) == 0:
                continue

            label = f"{group_value}"
            fig.add_trace(
                go.Box(
                    y=data_subset,
                    name=label,
                    marker_color=colors[group_idx % len(colors)],
                    boxpoints=boxpoints,
                    jitter=0.3,
                    pointpos=-1.8,
                    # One shared legend item toggles the group in every column.
                    legendgroup=label,
                    showlegend=group_idx not in groups_in_legend,
                    offsetgroup=group_idx,
                    # Repeating the column name places the box in that
                    # column's x-axis category.
                    x=[column] * len(data_subset),
                )
            )
            groups_in_legend.add(group_idx)

    fig.update_layout(
        title=f"{title_prefix} Features by {group_by}",
        xaxis_title="Features",
        yaxis_title="Values",
        boxmode="group",
        width=width,
        height=height,
        template="plotly_white",
    )
    return fig
|
77
|
+
|
78
|
+
|
79
|
+
def _create_single_boxplot(
    dataset, column, colors, show_outliers, title_prefix, width, height
):
    """Build a figure containing the box plot of a single column.

    Missing values are dropped first; a column with nothing left raises
    ``SkipTestError``. The box uses the first entry of ``colors``, and the
    figure is titled ``"<title_prefix> <column>"`` with the legend hidden.
    """
    values = dataset.df[column].dropna()
    if not len(values):
        raise SkipTestError(f"No data available for column {column}")

    figure = go.Figure()

    point_mode = "outliers" if show_outliers else False
    box_trace = go.Box(
        y=values,
        name=column,
        marker_color=colors[0],
        boxpoints=point_mode,
        jitter=0.3,
        pointpos=-1.8,
    )
    figure.add_trace(box_trace)

    layout_options = dict(
        title=f"{title_prefix} {column}",
        yaxis_title=column,
        width=width,
        height=height,
        template="plotly_white",
        showlegend=False,
    )
    figure.update_layout(**layout_options)
    return figure
|
108
|
+
|
109
|
+
|
110
|
+
def _create_multiple_boxplots(
    dataset, columns, colors, show_outliers, title_prefix, width, height
):
    """Create a grid of subplots holding one box plot per column.

    Columns are laid out row by row, at most three subplots per row. A column
    without any non-missing value gets a "No data available" note in its
    subplot instead of a box.

    Args:
        dataset: Dataset whose ``df`` frame supplies the data.
        columns: Column names to plot, one subplot each.
        colors: Non-empty sequence of colors, cycled over the columns.
        show_outliers: Draw outlier points when true, no points otherwise.
        title_prefix: Text placed before the column name in each subplot title.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.

    Returns:
        The ``go.Figure`` containing the subplot grid.
    """
    n_cols = min(3, len(columns))
    n_rows = (len(columns) + n_cols - 1) // n_cols  # ceiling division

    # make_subplots rejects a vertical spacing above 1 / (rows - 1), and at
    # that limit the subplots have no height left. Cap the spacing so the gaps
    # never take more than half of the figure height; grids of up to six rows
    # keep the original 0.1.
    if n_rows > 1:
        vertical_spacing = min(0.1, 0.5 / (n_rows - 1))
    else:
        vertical_spacing = 0.1

    subplot_titles = [f"{title_prefix} {col}" for col in columns]
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=subplot_titles,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=0.1,
    )
    boxpoints = "outliers" if show_outliers else False

    for idx, column in enumerate(columns):
        # 1-based grid position, filling each row before moving to the next.
        row = (idx // n_cols) + 1
        col = (idx % n_cols) + 1
        data = dataset.df[column].dropna()

        if len(data) > 0:
            fig.add_trace(
                go.Box(
                    y=data,
                    name=column,
                    marker_color=colors[idx % len(colors)],
                    boxpoints=boxpoints,
                    jitter=0.3,
                    pointpos=-1.8,
                    showlegend=False,
                ),
                row=row,
                col=col,
            )
            fig.update_yaxes(title_text=column, row=row, col=col)
        else:
            # Centre the note inside the subplot: (0.5, 0.5) in axis-domain
            # coordinates. The first subplot's axes carry no numeric suffix.
            fig.add_annotation(
                text=f"No data available<br>for {column}",
                x=0.5,
                y=0.5,
                xref=f"x{idx+1} domain" if idx > 0 else "x domain",
                yref=f"y{idx+1} domain" if idx > 0 else "y domain",
                showarrow=False,
                row=row,
                col=col,
            )

    fig.update_layout(
        title="Dataset Feature Distributions",
        width=width,
        height=height,
        template="plotly_white",
        showlegend=False,
    )
    return fig
|
167
|
+
|
168
|
+
|
169
|
+
@tags("tabular_data", "visualization", "data_quality")
@tasks("classification", "regression", "clustering")
def BoxPlot(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    group_by: Optional[str] = None,
    width: int = 1200,
    height: int = 600,
    colors: Optional[List[str]] = None,
    show_outliers: bool = True,
    title_prefix: str = "Box Plot of",
) -> go.Figure:
    """
    Generates customizable box plots for numerical features in a dataset with optional grouping using Plotly.

    ### Purpose

    This test provides a flexible way to visualize the distribution of numerical features
    through interactive box plots, with optional grouping by categorical variables. Box plots are
    effective for identifying outliers, comparing distributions across groups, and
    understanding the spread and central tendency of the data.

    ### Test Mechanism

    The test creates interactive box plots for specified numerical columns (or all numerical columns
    if none specified). It supports various customization options including:
    - Grouping by categorical variables
    - Customizable colors and styling
    - Outlier display options
    - Interactive hover information
    - Zoom and pan capabilities

    ### Signs of High Risk

    - Presence of many outliers indicating data quality issues
    - Highly skewed distributions
    - Large differences in variance across groups
    - Unexpected patterns in grouped data

    ### Strengths

    - Clear visualization of distribution statistics (median, quartiles, outliers)
    - Interactive Plotly plots with hover information and zoom capabilities
    - Effective for comparing distributions across groups
    - Handles missing values appropriately
    - Highly customizable appearance

    ### Limitations

    - Limited to numerical features only
    - May not be suitable for continuous variables with many unique values
    - Visual interpretation may be subjective
    - Less effective with very large datasets
    """
    # Parameters:
    #   dataset       - dataset to plot.
    #   columns       - columns to plot; None selects every numeric feature.
    #   group_by      - optional column used to split each feature into groups.
    #   width/height  - figure size passed to the Plotly layout.
    #   colors        - palette cycled over groups/columns; None or an empty
    #                   list selects the default palette below.
    #   show_outliers - draw outlier points when true.
    #   title_prefix  - text placed at the start of plot titles.
    # Returns the Plotly figure built by one of the three helpers below.

    # Resolve the numeric columns to plot; this raises SkipTestError when
    # nothing can be plotted.
    columns = _validate_inputs(dataset, columns, group_by)

    # Fall back to the default palette. An empty palette is treated like None
    # because the helpers index into it (colors[0]) and cycle over it
    # (idx % len(colors)), both of which fail on an empty sequence.
    if colors is None or len(colors) == 0:
        colors = [
            "steelblue",
            "orange",
            "green",
            "red",
            "purple",
            "brown",
            "pink",
            "gray",
            "olive",
            "cyan",
        ]

    # Grouping takes precedence; otherwise pick the single- or multi-column layout.
    if group_by is not None:
        return _create_grouped_boxplot(
            dataset,
            columns,
            group_by,
            colors,
            show_outliers,
            title_prefix,
            width,
            height,
        )
    elif len(columns) == 1:
        return _create_single_boxplot(
            dataset, columns[0], colors, show_outliers, title_prefix, width, height
        )
    else:
        return _create_multiple_boxplots(
            dataset, columns, colors, show_outliers, title_prefix, width, height
        )
|
@@ -0,0 +1,235 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
from typing import List, Optional
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
import plotly.graph_objects as go
|
9
|
+
|
10
|
+
from validmind import tags, tasks
|
11
|
+
from validmind.errors import SkipTestError
|
12
|
+
from validmind.vm_models import VMDataset
|
13
|
+
|
14
|
+
|
15
|
+
def _validate_and_prepare_data(
    dataset: VMDataset, columns: Optional[List[str]], method: str
):
    """Select the usable numeric columns and return their correlation matrix.

    Args:
        dataset: Dataset whose ``feature_columns_numeric`` and ``df`` are read.
        columns: Requested column names, or ``None`` to use every numeric
            feature column. Requested names that are not numeric feature
            columns are silently dropped.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        The square correlation matrix of the remaining columns.

    Raises:
        SkipTestError: If fewer than two numeric columns are selected, or
            fewer than two of them carry any variation.
    """
    if columns is None:
        columns = dataset.feature_columns_numeric
    else:
        available_columns = set(dataset.feature_columns_numeric)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for correlation analysis")

    if len(columns) < 2:
        raise SkipTestError(
            "At least 2 numerical columns required for correlation analysis"
        )

    data = dataset.df[columns]
    # Keep only columns with at least two distinct non-missing values. A
    # variance test (`var() != 0`) lets all-NaN and single-observation columns
    # through, because their variance is NaN and NaN != 0 is true; such
    # columns only add all-NaN rows to the correlation matrix.
    data = data.loc[:, data.nunique(dropna=True) > 1]

    if data.shape[1] < 2:
        raise SkipTestError(
            "Insufficient non-constant columns for correlation analysis"
        )

    return data.corr(method=method)
|
43
|
+
|
44
|
+
|
45
|
+
def _apply_filters(corr_matrix, threshold: Optional[float], mask_upper: bool):
    """Blank out correlation cells that should not be displayed.

    When ``threshold`` is given, cells whose absolute value is below it become
    NaN. When ``mask_upper`` is true, the upper triangle including the diagonal
    becomes NaN. The (possibly new) matrix is returned.
    """
    filtered = corr_matrix

    if threshold is not None:
        too_weak = filtered.abs() < threshold
        filtered = filtered.where(~too_weak)

    if mask_upper:
        upper_cells = np.triu(np.ones(filtered.shape, dtype=bool))
        filtered = filtered.where(~upper_cells)

    return filtered
|
56
|
+
|
57
|
+
|
58
|
+
def _create_annotation_text(z_values, y_labels, x_labels, show_values: bool):
    """Return the per-cell label grid for the heatmap, or ``None``.

    With ``show_values`` false nothing is built. Otherwise the result is a list
    of rows (one per entry of ``y_labels``), each holding one string per entry
    of ``x_labels``: the value formatted to three decimals, or an empty string
    where the value is NaN.
    """
    if not show_values:
        return None

    def _cell_label(cell):
        # NaN cells (masked or undefined correlations) stay blank.
        return "" if np.isnan(cell) else f"{cell:.3f}"

    row_count, col_count = len(y_labels), len(x_labels)
    return [
        [_cell_label(z_values[r][c]) for c in range(col_count)]
        for r in range(row_count)
    ]
|
74
|
+
|
75
|
+
|
76
|
+
def _calculate_adaptive_font_size(n_features: int) -> int:
    """Pick a cell-label font size that shrinks as the feature count grows.

    Sizes are 12 for up to 10 features, 10 for up to 20, 8 for up to 30, and 6
    beyond that.
    """
    # (inclusive upper bound on the feature count, font size), smallest first.
    size_steps = ((10, 12), (20, 10), (30, 8))
    for upper_bound, font_size in size_steps:
        if n_features <= upper_bound:
            return font_size
    return 6
|
86
|
+
|
87
|
+
|
88
|
+
def _calculate_stats_and_update_layout(
    fig, corr_matrix, method: str, title: str, width: int, height: int
):
    """Summarise the pairwise correlations and apply the figure layout.

    Args:
        fig: Figure to update in place through ``fig.update_layout``.
        corr_matrix: Square correlation matrix, possibly with NaN cells left by
            threshold or upper-triangle masking.
        method: Correlation method name, shown capitalized in the title.
        title: Leading text of the figure title.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.

    Returns:
        None. The layout, including a small statistics box listing the feature
        count and, when any coefficient is available, the mean and maximum
        absolute correlation, is written to ``fig``.
    """
    n_features = corr_matrix.shape[0]
    values = corr_matrix.values
    # Read each feature pair once from the strictly lower triangle. A
    # correlation matrix is symmetric, so this covers the same pairs as the
    # upper triangle, but it still holds values after upper-triangle masking
    # has replaced the upper half with NaN.
    pairwise = values[np.tril_indices_from(values, k=-1)]
    pairwise = pairwise[~np.isnan(pairwise)]

    if len(pairwise) > 0:
        abs_corr = np.abs(pairwise)
        mean_corr = abs_corr.mean()
        max_corr = abs_corr.max()
        stats_text = f"Features: {n_features}<br>Mean |r|: {mean_corr:.3f}<br>Max |r|: {max_corr:.3f}"
    else:
        # Every pair is masked or undefined: report the feature count only.
        stats_text = f"Features: {n_features}"

    fig.update_layout(
        title={
            "text": f"{title} ({method.capitalize()} Correlation)",
            "x": 0.5,
            "xanchor": "center",
        },
        width=width,
        height=height,
        template="plotly_white",
        xaxis=dict(tickangle=45, side="bottom"),
        # Reversed so the first row of the matrix is drawn at the top.
        yaxis=dict(tickmode="linear", autorange="reversed"),
        annotations=[
            dict(
                text=stats_text,
                # Top-left corner, in figure ("paper") coordinates.
                x=0.02,
                y=0.98,
                xref="paper",
                yref="paper",
                showarrow=False,
                align="left",
                bgcolor="rgba(255,255,255,0.8)",
                bordercolor="black",
                borderwidth=1,
            )
        ],
    )
|
129
|
+
|
130
|
+
|
131
|
+
@tags("tabular_data", "visualization", "correlation")
@tasks("classification", "regression", "clustering")
def CorrelationHeatmap(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    method: str = "pearson",
    show_values: bool = True,
    colorscale: str = "RdBu",
    width: int = 800,
    height: int = 600,
    mask_upper: bool = False,
    threshold: Optional[float] = None,
    title: str = "Correlation Heatmap",
) -> go.Figure:
    """
    Generates customizable correlation heatmap plots for numerical features in a dataset using Plotly.

    ### Purpose

    This test provides a flexible way to visualize correlations between numerical features
    in a dataset using interactive Plotly heatmaps. It supports different correlation methods
    and extensive customization options for the heatmap appearance, making it suitable for
    exploring feature relationships in data analysis.

    ### Test Mechanism

    The test computes correlation coefficients between specified numerical columns
    (or all numerical columns if none specified) using the specified method.
    It then creates an interactive heatmap visualization with customizable appearance options including:
    - Different correlation methods (pearson, spearman, kendall)
    - Color schemes and annotations
    - Masking options for upper triangle
    - Threshold filtering for significant correlations
    - Interactive hover information

    ### Signs of High Risk

    - Very high correlations (>0.9) between features indicating multicollinearity
    - Unexpected correlation patterns that contradict domain knowledge
    - Features with no correlation to any other variables
    - Strong correlations with the target variable that might indicate data leakage

    ### Strengths

    - Supports multiple correlation methods
    - Interactive Plotly plots with hover information and zoom capabilities
    - Highly customizable visualization options
    - Can handle missing values appropriately
    - Provides clear visual representation of feature relationships
    - Optional thresholding to focus on significant correlations

    ### Limitations

    - Limited to numerical features only
    - Cannot capture non-linear relationships effectively
    - May be difficult to interpret with many features
    - Correlation does not imply causation
    """
    # Correlation matrix of the usable numeric columns, followed by the
    # optional threshold / upper-triangle masking.
    raw_corr = _validate_and_prepare_data(dataset, columns, method)
    shown_corr = _apply_filters(raw_corr, threshold, mask_upper)

    # Pieces the heatmap trace needs: cell values, axis labels, cell labels.
    cell_values = shown_corr.values
    column_labels = shown_corr.columns.tolist()
    row_labels = shown_corr.index.tolist()
    cell_text = _create_annotation_text(
        cell_values, row_labels, column_labels, show_values
    )

    # Label font shrinks as the number of features grows.
    label_font_size = _calculate_adaptive_font_size(len(column_labels))

    method_label = method.capitalize()
    heatmap_options = dict(
        z=cell_values,
        x=column_labels,
        y=row_labels,
        colorscale=colorscale,
        # Fix the color range to the full span of a correlation coefficient.
        zmin=-1,
        zmax=1,
        colorbar=dict(title=f"{method_label} Correlation"),
        hoverongaps=False,
        hovertemplate=(
            f"<b>%{{y}}</b> vs <b>%{{x}}</b><br>{method_label} Correlation: "
            f"%{{z:.3f}}<br><extra></extra>"
        ),
    )

    # Cell labels are optional and only attached when they were built.
    if show_values and cell_text is not None:
        heatmap_options["text"] = cell_text
        heatmap_options["texttemplate"] = "%{text}"
        heatmap_options["textfont"] = {"size": label_font_size, "color": "black"}

    fig = go.Figure(data=go.Heatmap(**heatmap_options))

    # Title, sizing and the summary-statistics box are applied in place.
    _calculate_stats_and_update_layout(fig, shown_corr, method, title, width, height)

    return fig
|