validmind 2.8.29__py3-none-any.whl → 2.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +1 -9
  3. validmind/ai/utils.py +4 -24
  4. validmind/api_client.py +6 -17
  5. validmind/logging.py +48 -0
  6. validmind/tests/__init__.py +2 -0
  7. validmind/tests/__types__.py +18 -0
  8. validmind/tests/output.py +9 -2
  9. validmind/tests/plots/BoxPlot.py +260 -0
  10. validmind/tests/plots/CorrelationHeatmap.py +235 -0
  11. validmind/tests/plots/HistogramPlot.py +233 -0
  12. validmind/tests/plots/ViolinPlot.py +125 -0
  13. validmind/tests/plots/__init__.py +0 -0
  14. validmind/tests/stats/CorrelationAnalysis.py +251 -0
  15. validmind/tests/stats/DescriptiveStats.py +197 -0
  16. validmind/tests/stats/NormalityTests.py +147 -0
  17. validmind/tests/stats/OutlierDetection.py +173 -0
  18. validmind/tests/stats/__init__.py +0 -0
  19. validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
  20. validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
  21. validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
  22. validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
  23. validmind/unit_metrics/classification/individual/Confidence.py +52 -0
  24. validmind/unit_metrics/classification/individual/Correctness.py +41 -0
  25. validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
  26. validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
  27. validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
  28. validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
  29. validmind/unit_metrics/classification/individual/__init__.py +0 -0
  30. validmind/vm_models/dataset/dataset.py +147 -1
  31. validmind/vm_models/result/result.py +26 -4
  32. {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/METADATA +2 -2
  33. {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/RECORD +36 -15
  34. {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/LICENSE +0 -0
  35. {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/WHEEL +0 -0
  36. {validmind-2.8.29.dist-info → validmind-2.9.2.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,235 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from typing import List, Optional
6
+
7
+ import numpy as np
8
+ import plotly.graph_objects as go
9
+
10
+ from validmind import tags, tasks
11
+ from validmind.errors import SkipTestError
12
+ from validmind.vm_models import VMDataset
13
+
14
+
15
def _validate_and_prepare_data(
    dataset: VMDataset, columns: Optional[List[str]], method: str
):
    """Select the usable numeric columns and return their correlation matrix.

    Args:
        dataset: Dataset whose ``feature_columns_numeric`` and ``df`` are read.
        columns: Requested column names, or ``None`` to use every numeric
            feature column. Requested names that are not numeric feature
            columns are ignored.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        The correlation matrix of the columns that survive the filters.

    Raises:
        SkipTestError: If no numeric column is left, fewer than two are left,
            or fewer than two pass the ``var() != 0`` filter.
    """
    numeric_features = dataset.feature_columns_numeric
    if columns is None:
        selected = numeric_features
    else:
        allowed = set(numeric_features)
        selected = [name for name in columns if name in allowed]

    if not selected:
        raise SkipTestError("No numerical columns found for correlation analysis")
    if len(selected) < 2:
        raise SkipTestError(
            "At least 2 numerical columns required for correlation analysis"
        )

    # Columns whose variance is exactly zero are dropped before correlating.
    frame = dataset.df[selected]
    has_spread = frame.var() != 0
    frame = frame.loc[:, has_spread]

    if frame.shape[1] < 2:
        raise SkipTestError(
            "Insufficient non-constant columns for correlation analysis"
        )

    return frame.corr(method=method)
43
+
44
+
45
+ def _apply_filters(corr_matrix, threshold: Optional[float], mask_upper: bool):
46
+ """Apply threshold and masking filters to correlation matrix."""
47
+ if threshold is not None:
48
+ mask = np.abs(corr_matrix) < threshold
49
+ corr_matrix = corr_matrix.mask(mask)
50
+
51
+ if mask_upper:
52
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
53
+ corr_matrix = corr_matrix.mask(mask)
54
+
55
+ return corr_matrix
56
+
57
+
58
+ def _create_annotation_text(z_values, y_labels, x_labels, show_values: bool):
59
+ """Create text annotations for heatmap cells."""
60
+ if not show_values:
61
+ return None
62
+
63
+ text = []
64
+ for i in range(len(y_labels)):
65
+ text_row = []
66
+ for j in range(len(x_labels)):
67
+ value = z_values[i][j]
68
+ if np.isnan(value):
69
+ text_row.append("")
70
+ else:
71
+ text_row.append(f"{value:.3f}")
72
+ text.append(text_row)
73
+ return text
74
+
75
+
76
+ def _calculate_adaptive_font_size(n_features: int) -> int:
77
+ """Calculate adaptive font size based on number of features."""
78
+ if n_features <= 10:
79
+ return 12
80
+ elif n_features <= 20:
81
+ return 10
82
+ elif n_features <= 30:
83
+ return 8
84
+ else:
85
+ return 6
86
+
87
+
88
+ def _calculate_stats_and_update_layout(
89
+ fig, corr_matrix, method: str, title: str, width: int, height: int
90
+ ):
91
+ """Calculate statistics and update figure layout."""
92
+ n_features = corr_matrix.shape[0]
93
+ upper_triangle = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]
94
+ upper_triangle = upper_triangle[~np.isnan(upper_triangle)]
95
+
96
+ if len(upper_triangle) > 0:
97
+ mean_corr = np.abs(upper_triangle).mean()
98
+ max_corr = np.abs(upper_triangle).max()
99
+ stats_text = f"Features: {n_features}<br>Mean |r|: {mean_corr:.3f}<br>Max |r|: {max_corr:.3f}"
100
+ else:
101
+ stats_text = f"Features: {n_features}"
102
+
103
+ fig.update_layout(
104
+ title={
105
+ "text": f"{title} ({method.capitalize()} Correlation)",
106
+ "x": 0.5,
107
+ "xanchor": "center",
108
+ },
109
+ width=width,
110
+ height=height,
111
+ template="plotly_white",
112
+ xaxis=dict(tickangle=45, side="bottom"),
113
+ yaxis=dict(tickmode="linear", autorange="reversed"),
114
+ annotations=[
115
+ dict(
116
+ text=stats_text,
117
+ x=0.02,
118
+ y=0.98,
119
+ xref="paper",
120
+ yref="paper",
121
+ showarrow=False,
122
+ align="left",
123
+ bgcolor="rgba(255,255,255,0.8)",
124
+ bordercolor="black",
125
+ borderwidth=1,
126
+ )
127
+ ],
128
+ )
129
+
130
+
131
@tags("tabular_data", "visualization", "correlation")
@tasks("classification", "regression", "clustering")
def CorrelationHeatmap(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    method: str = "pearson",
    show_values: bool = True,
    colorscale: str = "RdBu",
    width: int = 800,
    height: int = 600,
    mask_upper: bool = False,
    threshold: Optional[float] = None,
    title: str = "Correlation Heatmap",
) -> go.Figure:
    """
    Generates customizable correlation heatmap plots for numerical features in a dataset using Plotly.

    ### Purpose

    This test provides a flexible way to visualize correlations between numerical features
    in a dataset using interactive Plotly heatmaps. It supports different correlation methods
    and extensive customization options for the heatmap appearance, making it suitable for
    exploring feature relationships in data analysis.

    ### Test Mechanism

    The test computes correlation coefficients between specified numerical columns
    (or all numerical columns if none specified) using the specified method.
    It then creates an interactive heatmap visualization with customizable appearance options including:
    - Different correlation methods (pearson, spearman, kendall)
    - Color schemes and annotations
    - Masking options for upper triangle
    - Threshold filtering for significant correlations
    - Interactive hover information

    ### Signs of High Risk

    - Very high correlations (>0.9) between features indicating multicollinearity
    - Unexpected correlation patterns that contradict domain knowledge
    - Features with no correlation to any other variables
    - Strong correlations with the target variable that might indicate data leakage

    ### Strengths

    - Supports multiple correlation methods
    - Interactive Plotly plots with hover information and zoom capabilities
    - Highly customizable visualization options
    - Can handle missing values appropriately
    - Provides clear visual representation of feature relationships
    - Optional thresholding to focus on significant correlations

    ### Limitations

    - Limited to numerical features only
    - Cannot capture non-linear relationships effectively
    - May be difficult to interpret with many features
    - Correlation does not imply causation
    """
    # Correlate the selected columns, then blank the cells the caller filtered out.
    raw_corr = _validate_and_prepare_data(dataset, columns, method)
    corr_matrix = _apply_filters(raw_corr, threshold, mask_upper)

    z_values = corr_matrix.values
    x_labels = corr_matrix.columns.tolist()
    y_labels = corr_matrix.index.tolist()
    method_label = method.capitalize()

    trace_options = dict(
        z=z_values,
        x=x_labels,
        y=y_labels,
        colorscale=colorscale,
        zmin=-1,
        zmax=1,
        colorbar=dict(title=f"{method_label} Correlation"),
        hoverongaps=False,
        hovertemplate=(
            "<b>%{y}</b> vs <b>%{x}</b><br>"
            f"{method_label} Correlation: %{{z:.3f}}<br>"
            "<extra></extra>"
        ),
    )

    # Cell labels are optional; their font shrinks as the matrix grows.
    cell_text = _create_annotation_text(z_values, y_labels, x_labels, show_values)
    if show_values and cell_text is not None:
        trace_options["text"] = cell_text
        trace_options["texttemplate"] = "%{text}"
        trace_options["textfont"] = {
            "size": _calculate_adaptive_font_size(len(x_labels)),
            "color": "black",
        }

    fig = go.Figure(data=go.Heatmap(**trace_options))

    # Title, sizing and the summary-statistics annotation.
    _calculate_stats_and_update_layout(fig, corr_matrix, method, title, width, height)

    return fig
@@ -0,0 +1,233 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from typing import List, Optional, Union
6
+
7
+ import numpy as np
8
+ import plotly.graph_objects as go
9
+ from plotly.subplots import make_subplots
10
+ from scipy import stats
11
+
12
+ from validmind import tags, tasks
13
+ from validmind.errors import SkipTestError
14
+ from validmind.vm_models import VMDataset
15
+
16
+
17
def _validate_columns(dataset: VMDataset, columns: Optional[List[str]]):
    """Resolve the list of numeric columns to plot.

    Args:
        dataset: Dataset whose ``feature_columns_numeric`` is read.
        columns: Requested column names, or ``None`` to use every numeric
            feature column. Requested names that are not numeric feature
            columns are ignored.

    Returns:
        The non-empty list of column names to plot.

    Raises:
        SkipTestError: If no numeric column is left.
    """
    numeric_features = dataset.feature_columns_numeric
    if columns is None:
        selected = numeric_features
    else:
        allowed = set(numeric_features)
        selected = [name for name in columns if name in allowed]

    if selected:
        return selected
    raise SkipTestError("No numerical columns found for histogram plotting")
29
+
30
+
31
+ def _process_column_data(data, log_scale: bool, column: str):
32
+ """Process column data and return plot data and xlabel."""
33
+ plot_data = data
34
+ xlabel = column
35
+ if log_scale and (data > 0).all():
36
+ plot_data = np.log10(data)
37
+ xlabel = f"log10({column})"
38
+ return plot_data, xlabel
39
+
40
+
41
def _add_histogram_trace(
    fig, plot_data, bins, color, opacity, normalize, column, row, col
):
    """Add one column's histogram to the subplot at (row, col).

    ``bins`` is forwarded as ``nbinsx`` only when it is an ``int``; any other
    value leaves the bin count unset. With ``normalize`` the bars use the
    ``"probability density"`` normalisation, otherwise raw counts.
    """
    bin_limit = bins if isinstance(bins, int) else None
    normalisation = "probability density" if normalize else None

    histogram = go.Histogram(
        x=plot_data,
        nbinsx=bin_limit,
        name=f"Histogram - {column}",
        marker_color=color,
        opacity=opacity,
        histnorm=normalisation,
        showlegend=False,
    )
    fig.add_trace(histogram, row=row, col=col)
60
+
61
+
62
+ def _add_kde_trace(fig, plot_data, bins, normalize, column, row, col):
63
+ """Add KDE trace to figure if possible."""
64
+ try:
65
+ kde = stats.gaussian_kde(plot_data)
66
+ x_range = np.linspace(plot_data.min(), plot_data.max(), 100)
67
+ kde_values = kde(x_range)
68
+
69
+ if not normalize:
70
+ hist_max = (
71
+ len(plot_data) / bins if isinstance(bins, int) else len(plot_data) / 30
72
+ )
73
+ kde_values = kde_values * hist_max / kde_values.max()
74
+
75
+ fig.add_trace(
76
+ go.Scatter(
77
+ x=x_range,
78
+ y=kde_values,
79
+ mode="lines",
80
+ name=f"KDE - {column}",
81
+ line=dict(color="red", width=2),
82
+ showlegend=False,
83
+ ),
84
+ row=row,
85
+ col=col,
86
+ )
87
+ except Exception:
88
+ pass
89
+
90
+
91
+ def _add_stats_annotation(fig, data, idx, row, col):
92
+ """Add statistics annotation to subplot."""
93
+ stats_text = f"Mean: {data.mean():.3f}<br>Std: {data.std():.3f}<br>N: {len(data)}"
94
+ fig.add_annotation(
95
+ text=stats_text,
96
+ x=0.02,
97
+ y=0.98,
98
+ xref=f"x{idx+1} domain" if idx > 0 else "x domain",
99
+ yref=f"y{idx+1} domain" if idx > 0 else "y domain",
100
+ showarrow=False,
101
+ align="left",
102
+ bgcolor="rgba(255,255,255,0.8)",
103
+ bordercolor="black",
104
+ borderwidth=1,
105
+ row=row,
106
+ col=col,
107
+ )
108
+
109
+
110
@tags("tabular_data", "visualization", "data_quality")
@tasks("classification", "regression", "clustering")
def HistogramPlot(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    bins: Union[int, str, List] = 30,
    color: str = "steelblue",
    opacity: float = 0.7,
    show_kde: bool = True,
    normalize: bool = False,
    log_scale: bool = False,
    title_prefix: str = "Histogram of",
    width: int = 1200,
    height: int = 800,
    n_cols: int = 2,
    vertical_spacing: float = 0.15,
    horizontal_spacing: float = 0.1,
) -> go.Figure:
    """
    Generates customizable histogram plots for numerical features in a dataset using Plotly.

    ### Purpose

    This test provides a flexible way to visualize the distribution of numerical features in a dataset.
    It allows for extensive customization of the histogram appearance and behavior through parameters,
    making it suitable for various exploratory data analysis tasks.

    ### Test Mechanism

    The test creates histogram plots for specified numerical columns (or all numerical columns if none specified).
    It supports various customization options including:
    - Number of bins or bin edges
    - Color and opacity
    - Kernel density estimation overlay
    - Logarithmic scaling
    - Normalization options
    - Configurable subplot layout (columns and spacing)

    ### Signs of High Risk

    - Highly skewed distributions that may indicate data quality issues
    - Unexpected bimodal or multimodal distributions
    - Presence of extreme outliers
    - Empty or sparse distributions

    ### Strengths

    - Highly customizable visualization options
    - Interactive Plotly plots with zoom, pan, and hover capabilities
    - Supports both single and multiple column analysis
    - Provides insights into data distribution patterns
    - Can handle different data types and scales
    - Configurable subplot layout for better visualization

    ### Limitations

    - Limited to numerical features only
    - Visual interpretation may be subjective
    - May not be suitable for high-dimensional datasets
    - Performance may degrade with very large datasets
    """
    # Validate inputs
    columns = _validate_columns(dataset, columns)

    # Never use more grid columns than there are plots, and never fewer than
    # one: a non-positive n_cols would otherwise divide by zero below.
    n_cols = max(1, min(n_cols, len(columns)))
    n_rows = (len(columns) + n_cols - 1) // n_cols

    # Create subplots
    subplot_titles = [f"{title_prefix} {col}" for col in columns]
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=subplot_titles,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=horizontal_spacing,
    )

    ylabel = "Density" if normalize else "Frequency"

    for idx, column in enumerate(columns):
        row = (idx // n_cols) + 1
        col = (idx % n_cols) + 1
        data = dataset.df[column].dropna()

        if len(data) == 0:
            # Axis-domain references make (0.5, 0.5) the centre of the
            # subplot; data-coordinate references would place the note at
            # the data point (0.5, 0.5) instead.
            axis_suffix = str(idx + 1) if idx > 0 else ""
            fig.add_annotation(
                text=f"No data available<br>for {column}",
                x=0.5,
                y=0.5,
                xref=f"x{axis_suffix} domain",
                yref=f"y{axis_suffix} domain",
                showarrow=False,
                row=row,
                col=col,
            )
            continue

        # Process data
        plot_data, xlabel = _process_column_data(data, log_scale, column)

        # Add histogram
        _add_histogram_trace(
            fig, plot_data, bins, color, opacity, normalize, column, row, col
        )

        # Add KDE if requested
        if show_kde and len(data) > 1:
            _add_kde_trace(fig, plot_data, bins, normalize, column, row, col)

        # Update axes and add annotations
        fig.update_xaxes(title_text=xlabel, row=row, col=col)
        fig.update_yaxes(title_text=ylabel, row=row, col=col)
        _add_stats_annotation(fig, data, idx, row, col)

    # Update layout
    fig.update_layout(
        title_text="Dataset Feature Distributions",
        showlegend=False,
        width=width,
        height=height,
        template="plotly_white",
    )

    return fig
@@ -0,0 +1,125 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from typing import List, Optional
6
+
7
+ import plotly.express as px
8
+
9
+ from validmind import tags, tasks
10
+ from validmind.errors import SkipTestError
11
+ from validmind.vm_models import VMDataset
12
+
13
+
14
@tags("tabular_data", "visualization", "distribution")
@tasks("classification", "regression", "clustering")
def ViolinPlot(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    group_by: Optional[str] = None,
    width: int = 800,
    height: int = 600,
) -> px.violin:
    """
    Generates interactive violin plots for numerical features using Plotly.

    ### Purpose

    This test creates violin plots to visualize the distribution of numerical features,
    showing both the probability density and summary statistics. Violin plots combine
    aspects of box plots and kernel density estimation for rich distribution visualization.

    ### Test Mechanism

    The test creates violin plots for specified numerical columns, with optional
    grouping by categorical variables. Each violin shows the distribution shape,
    quartiles, and median values.

    ### Signs of High Risk

    - Multimodal distributions that might indicate mixed populations
    - Highly skewed distributions suggesting data quality issues
    - Large differences in distribution shapes across groups
    - Unusual distribution patterns that contradict domain expectations

    ### Strengths

    - Shows detailed distribution shape information
    - Interactive Plotly visualization with hover details
    - Effective for comparing distributions across groups
    - Combines density estimation with quartile information

    ### Limitations

    - Limited to numerical features only
    - Requires sufficient data points for meaningful density estimation
    - May not be suitable for discrete variables
    - Can be misleading with very small sample sizes
    """
    # Get numerical columns
    if columns is None:
        columns = dataset.feature_columns_numeric
    else:
        available_columns = set(dataset.feature_columns_numeric)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for violin plot")

    frame = dataset.df
    # An unknown group_by column is ignored and the plot is drawn ungrouped.
    use_groups = bool(group_by) and group_by in frame.columns

    # One row filter is shared by the values and the group labels, so the two
    # cannot get out of step when some rows contain missing values.
    complete_rows = frame[columns].notna().all(axis=1)
    if use_groups:
        complete_rows = complete_rows & frame[group_by].notna()
    keep = complete_rows.to_numpy()

    data = frame.loc[keep, columns]

    if len(data) == 0:
        raise SkipTestError("No valid data available for violin plot")

    # Melt the dataframe to long format
    melted_data = data.melt(var_name="Feature", value_name="Value")

    plot_options = dict(
        x="Feature",
        y="Value",
        box=True,
        title="Feature Distributions",
        width=width,
        height=height,
    )

    if use_groups:
        # melt stacks the feature columns one after another, each in row
        # order, so repeating the per-row labels once per feature lines up.
        row_groups = frame.loc[keep, group_by].tolist()
        melted_data["Group"] = row_groups * len(columns)
        plot_options["color"] = "Group"
        plot_options["title"] = f"Distribution of Features by {group_by}"

    # Create violin plot
    fig = px.violin(melted_data, **plot_options)

    # Update layout
    fig.update_layout(
        template="plotly_white",
        title_x=0.5,
        xaxis_title="Features",
        yaxis_title="Values",
    )

    # Rotate x-axis labels for better readability
    fig.update_xaxes(tickangle=45)

    return fig
File without changes