validmind 2.8.29__py3-none-any.whl → 2.10.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. validmind/__init__.py +16 -5
  2. validmind/__version__.py +1 -1
  3. validmind/ai/utils.py +4 -24
  4. validmind/api_client.py +6 -17
  5. validmind/datasets/credit_risk/lending_club.py +13 -1
  6. validmind/datasets/nlp/cnn_dailymail.py +15 -1
  7. validmind/logging.py +48 -0
  8. validmind/tests/__init__.py +2 -0
  9. validmind/tests/__types__.py +18 -0
  10. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
  11. validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
  12. validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
  13. validmind/tests/data_validation/SeasonalDecompose.py +14 -2
  14. validmind/tests/data_validation/ShapiroWilk.py +14 -1
  15. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
  16. validmind/tests/data_validation/WOEBinPlots.py +14 -1
  17. validmind/tests/data_validation/WOEBinTable.py +13 -2
  18. validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
  19. validmind/tests/data_validation/nlp/CommonWords.py +14 -2
  20. validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
  21. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
  22. validmind/tests/data_validation/nlp/Sentiment.py +13 -1
  23. validmind/tests/data_validation/nlp/StopWords.py +14 -2
  24. validmind/tests/data_validation/nlp/TextDescription.py +14 -2
  25. validmind/tests/data_validation/nlp/Toxicity.py +13 -1
  26. validmind/tests/model_validation/BertScore.py +13 -2
  27. validmind/tests/model_validation/BleuScore.py +13 -2
  28. validmind/tests/model_validation/ContextualRecall.py +13 -1
  29. validmind/tests/model_validation/MeteorScore.py +13 -2
  30. validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
  31. validmind/tests/model_validation/RegardScore.py +13 -2
  32. validmind/tests/model_validation/RougeScore.py +14 -1
  33. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
  34. validmind/tests/model_validation/ToxicityScore.py +13 -1
  35. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
  36. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
  37. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
  38. validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
  39. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +14 -1
  40. validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +14 -1
  41. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
  42. validmind/tests/output.py +9 -2
  43. validmind/tests/plots/BoxPlot.py +260 -0
  44. validmind/tests/plots/CorrelationHeatmap.py +235 -0
  45. validmind/tests/plots/HistogramPlot.py +233 -0
  46. validmind/tests/plots/ViolinPlot.py +125 -0
  47. validmind/tests/plots/__init__.py +0 -0
  48. validmind/tests/stats/CorrelationAnalysis.py +251 -0
  49. validmind/tests/stats/DescriptiveStats.py +197 -0
  50. validmind/tests/stats/NormalityTests.py +147 -0
  51. validmind/tests/stats/OutlierDetection.py +173 -0
  52. validmind/tests/stats/__init__.py +0 -0
  53. validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
  54. validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
  55. validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
  56. validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
  57. validmind/unit_metrics/classification/individual/Confidence.py +52 -0
  58. validmind/unit_metrics/classification/individual/Correctness.py +41 -0
  59. validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
  60. validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
  61. validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
  62. validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
  63. validmind/unit_metrics/classification/individual/__init__.py +0 -0
  64. validmind/vm_models/dataset/dataset.py +147 -1
  65. validmind/vm_models/result/result.py +30 -6
  66. validmind-2.10.0rc1.dist-info/METADATA +845 -0
  67. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/RECORD +70 -49
  68. validmind-2.8.29.dist-info/METADATA +0 -137
  69. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/LICENSE +0 -0
  70. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/WHEEL +0 -0
  71. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,260 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from typing import List, Optional
6
+
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
+
10
+ from validmind import tags, tasks
11
+ from validmind.errors import SkipTestError
12
+ from validmind.vm_models import VMDataset
13
+
14
+
15
def _validate_inputs(
    dataset: VMDataset, columns: Optional[List[str]], group_by: Optional[str]
):
    """Resolve the list of numeric columns that should be box-plotted.

    Args:
        dataset: Dataset exposing ``feature_columns_numeric`` and ``df``.
        columns: Requested columns, or ``None`` to use every numeric feature
            column. Requested names that are not numeric feature columns are
            dropped silently.
        group_by: Optional grouping column. It must exist in ``dataset.df``
            and is excluded from the returned columns.

    Returns:
        A new list of column names; the caller may modify it freely.

    Raises:
        SkipTestError: If no numeric column is left to plot, or if
            ``group_by`` is not a column of ``dataset.df``.
    """
    numeric_columns = dataset.feature_columns_numeric

    if columns is None:
        # Copy: if the attribute hands back the dataset's own list, dropping
        # the group column below must not alter the dataset itself.
        selected = list(numeric_columns)
    else:
        available_columns = set(numeric_columns)
        selected = [col for col in columns if col in available_columns]

    if not selected:
        raise SkipTestError("No numerical columns found for box plotting")

    if group_by is not None:
        if group_by not in dataset.df.columns:
            raise SkipTestError(f"Group column '{group_by}' not found in dataset")

        # The grouping column is not plotted as a feature of itself.
        selected = [col for col in selected if col != group_by]
        if not selected:
            # Only the grouping column was numeric: nothing is left to draw.
            raise SkipTestError("No numerical columns found for box plotting")

    return selected
35
+
36
+
37
def _create_grouped_boxplot(
    dataset, columns, group_by, colors, show_outliers, title_prefix, width, height
):
    """Build one figure with a box per (feature, group) combination.

    Args:
        dataset: Object whose ``df`` attribute holds the data to plot.
        columns: Feature columns, shown as categories on the x axis.
        group_by: Column whose distinct non-null values define the groups.
        colors: Palette; group ``i`` uses ``colors[i % len(colors)]``.
        show_outliers: Draw outlier points when true, no points otherwise.
        title_prefix: Text placed before "Features by <group_by>" in the title.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.

    Returns:
        The populated ``go.Figure``.
    """
    fig = go.Figure()

    # Read the frame and build the per-group row masks once instead of once
    # per feature column.
    df = dataset.df
    group_values = df[group_by].dropna().unique()
    group_masks = [df[group_by] == value for value in group_values]

    boxpoints = "outliers" if show_outliers else False
    # Groups that already own a legend entry. Keying the legend on "first
    # column only" would hide a group whose first column has no data.
    groups_in_legend = set()

    for column in columns:
        for group_idx, (group_value, mask) in enumerate(
            zip(group_values, group_masks)
        ):
            data_subset = df.loc[mask, column].dropna()
            if data_subset.empty:
                continue

            fig.add_trace(
                go.Box(
                    y=data_subset,
                    name=f"{group_value}",
                    marker_color=colors[group_idx % len(colors)],
                    boxpoints=boxpoints,
                    jitter=0.3,
                    pointpos=-1.8,
                    legendgroup=f"{group_value}",
                    showlegend=group_idx not in groups_in_legend,
                    offsetgroup=group_idx,
                    # Repeat the feature name so every value lands in that
                    # feature's x category.
                    x=[column] * len(data_subset),
                )
            )
            groups_in_legend.add(group_idx)

    fig.update_layout(
        title=f"{title_prefix} Features by {group_by}",
        xaxis_title="Features",
        yaxis_title="Values",
        boxmode="group",
        width=width,
        height=height,
        template="plotly_white",
    )
    return fig
77
+
78
+
79
def _create_single_boxplot(
    dataset, column, colors, show_outliers, title_prefix, width, height
):
    """Build a figure holding a single box for ``column``.

    Null values are dropped before plotting and the first palette entry is
    used as the box color.

    Raises:
        SkipTestError: If the column has no non-null values.
    """
    values = dataset.df[column].dropna()
    if values.empty:
        raise SkipTestError(f"No data available for column {column}")

    box_trace = go.Box(
        y=values,
        name=column,
        marker_color=colors[0],
        boxpoints="outliers" if show_outliers else False,
        jitter=0.3,
        pointpos=-1.8,
    )

    fig = go.Figure()
    fig.add_trace(box_trace)
    fig.update_layout(
        title=f"{title_prefix} {column}",
        yaxis_title=column,
        width=width,
        height=height,
        template="plotly_white",
        showlegend=False,
    )
    return fig
108
+
109
+
110
def _create_multiple_boxplots(
    dataset, columns, colors, show_outliers, title_prefix, width, height
):
    """Lay out one box plot per column on a subplot grid up to three wide.

    Columns without any non-null value get a "No data available" note in
    their grid cell instead of a box.

    Returns:
        The populated ``go.Figure``.
    """
    total = len(columns)
    n_cols = min(3, total)
    n_rows = -(-total // n_cols)  # ceiling division

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[f"{title_prefix} {name}" for name in columns],
        vertical_spacing=0.1,
        horizontal_spacing=0.1,
    )

    for idx, column in enumerate(columns):
        # Fill the grid row by row; plotly cells are 1-based.
        grid_row, grid_col = divmod(idx, n_cols)
        grid_row += 1
        grid_col += 1

        values = dataset.df[column].dropna()

        if values.empty:
            fig.add_annotation(
                text=f"No data available<br>for {column}",
                x=0.5,
                y=0.5,
                xref=f"x{idx+1} domain" if idx > 0 else "x domain",
                yref=f"y{idx+1} domain" if idx > 0 else "y domain",
                showarrow=False,
                row=grid_row,
                col=grid_col,
            )
            continue

        box_trace = go.Box(
            y=values,
            name=column,
            marker_color=colors[idx % len(colors)],
            boxpoints="outliers" if show_outliers else False,
            jitter=0.3,
            pointpos=-1.8,
            showlegend=False,
        )
        fig.add_trace(box_trace, row=grid_row, col=grid_col)
        fig.update_yaxes(title_text=column, row=grid_row, col=grid_col)

    fig.update_layout(
        title="Dataset Feature Distributions",
        width=width,
        height=height,
        template="plotly_white",
        showlegend=False,
    )
    return fig
167
+
168
+
169
@tags("tabular_data", "visualization", "data_quality")
@tasks("classification", "regression", "clustering")
def BoxPlot(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    group_by: Optional[str] = None,
    width: int = 1200,
    height: int = 600,
    colors: Optional[List[str]] = None,
    show_outliers: bool = True,
    title_prefix: str = "Box Plot of",
) -> go.Figure:
    """
    Generates customizable box plots for numerical features in a dataset with optional grouping using Plotly.

    ### Purpose

    This test provides a flexible way to visualize the distribution of numerical features
    through interactive box plots, with optional grouping by categorical variables. Box plots are
    effective for identifying outliers, comparing distributions across groups, and
    understanding the spread and central tendency of the data.

    ### Test Mechanism

    The test creates interactive box plots for specified numerical columns (or all numerical columns
    if none specified). It supports various customization options including:
    - Grouping by categorical variables
    - Customizable colors and styling
    - Outlier display options
    - Interactive hover information
    - Zoom and pan capabilities

    ### Signs of High Risk

    - Presence of many outliers indicating data quality issues
    - Highly skewed distributions
    - Large differences in variance across groups
    - Unexpected patterns in grouped data

    ### Strengths

    - Clear visualization of distribution statistics (median, quartiles, outliers)
    - Interactive Plotly plots with hover information and zoom capabilities
    - Effective for comparing distributions across groups
    - Handles missing values appropriately
    - Highly customizable appearance

    ### Limitations

    - Limited to numerical features only
    - May not be suitable for continuous variables with many unique values
    - Visual interpretation may be subjective
    - Less effective with very large datasets
    """
    # NOTE(review): the docstring above follows a fixed section layout and is
    # presumably surfaced as the test description, so it is kept verbatim.

    # Resolve the numeric columns to draw (raises SkipTestError when none).
    columns = _validate_inputs(dataset, columns, group_by)

    # Fall back to the default palette for None *and* for an empty list: the
    # plot builders index the palette (some modulo its length), so an empty
    # palette would fail with ZeroDivisionError or IndexError.
    if not colors:
        colors = [
            "steelblue",
            "orange",
            "green",
            "red",
            "purple",
            "brown",
            "pink",
            "gray",
            "olive",
            "cyan",
        ]

    # Grouping takes precedence; otherwise pick the single-figure or the
    # subplot-grid layout depending on how many columns remain.
    if group_by is not None:
        return _create_grouped_boxplot(
            dataset,
            columns,
            group_by,
            colors,
            show_outliers,
            title_prefix,
            width,
            height,
        )

    if len(columns) == 1:
        return _create_single_boxplot(
            dataset, columns[0], colors, show_outliers, title_prefix, width, height
        )

    return _create_multiple_boxplots(
        dataset, columns, colors, show_outliers, title_prefix, width, height
    )
@@ -0,0 +1,235 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from typing import List, Optional
6
+
7
+ import numpy as np
8
+ import plotly.graph_objects as go
9
+
10
+ from validmind import tags, tasks
11
+ from validmind.errors import SkipTestError
12
+ from validmind.vm_models import VMDataset
13
+
14
+
15
def _validate_and_prepare_data(
    dataset: VMDataset, columns: Optional[List[str]], method: str
):
    """Select the usable numeric columns and return their correlation matrix.

    Args:
        dataset: Dataset exposing ``feature_columns_numeric`` and ``df``.
        columns: Requested columns, or ``None`` for every numeric feature
            column. Names that are not numeric feature columns are ignored.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        The square correlation ``DataFrame`` of the retained columns.

    Raises:
        SkipTestError: If fewer than two columns are available, or fewer than
            two remain after dropping columns that carry no variation.
    """
    if columns is None:
        columns = dataset.feature_columns_numeric
    else:
        available_columns = set(dataset.feature_columns_numeric)
        columns = [col for col in columns if col in available_columns]

    if not columns:
        raise SkipTestError("No numerical columns found for correlation analysis")

    if len(columns) < 2:
        raise SkipTestError(
            "At least 2 numerical columns required for correlation analysis"
        )

    data = dataset.df[columns]

    # Keep only columns with at least two distinct non-null values. Testing
    # ``var() != 0`` is unreliable here: a constant float column can have a
    # tiny non-zero variance from rounding, and an all-NaN or single-value
    # column has NaN variance (NaN != 0 is True), so both slipped through.
    data = data.loc[:, data.nunique() > 1]

    if data.shape[1] < 2:
        raise SkipTestError(
            "Insufficient non-constant columns for correlation analysis"
        )

    return data.corr(method=method)
43
+
44
+
45
def _apply_filters(corr_matrix, threshold: Optional[float], mask_upper: bool):
    """Blank out (set to NaN) the cells of a correlation matrix to be hidden.

    Cells whose absolute value is below ``threshold`` are hidden when a
    threshold is given. When ``mask_upper`` is true, the upper triangle
    including the diagonal is hidden as well. The input is not modified.
    """
    filtered = corr_matrix

    if threshold is not None:
        too_weak = np.abs(filtered) < threshold
        filtered = filtered.mask(too_weak)

    if mask_upper:
        upper_with_diagonal = np.triu(np.ones(filtered.shape, dtype=bool))
        filtered = filtered.mask(upper_with_diagonal)

    return filtered
56
+
57
+
58
def _create_annotation_text(z_values, y_labels, x_labels, show_values: bool):
    """Return the per-cell label grid for the heatmap, or None when disabled.

    The grid has one row per y label and one entry per x label. NaN cells get
    an empty string; every other cell is formatted to three decimals.
    """
    if not show_values:
        return None

    def _label(cell):
        return "" if np.isnan(cell) else f"{cell:.3f}"

    row_count, col_count = len(y_labels), len(x_labels)
    return [
        [_label(z_values[r][c]) for c in range(col_count)]
        for r in range(row_count)
    ]
74
+
75
+
76
def _calculate_adaptive_font_size(n_features: int) -> int:
    """Pick a cell-label font size that shrinks as the feature count grows."""
    # (largest feature count, font size) pairs, checked in ascending order.
    breakpoints = ((10, 12), (20, 10), (30, 8))
    for max_features, size in breakpoints:
        if n_features <= max_features:
            return size
    return 6
86
+
87
+
88
def _calculate_stats_and_update_layout(
    fig, corr_matrix, method: str, title: str, width: int, height: int
):
    """Apply the heatmap layout to ``fig``, including a summary-stats box.

    The summary shows the feature count and, when any off-diagonal
    correlation is still visible, the mean and maximum absolute correlation.

    Args:
        fig: Figure to update in place via ``update_layout``.
        corr_matrix: Square correlation ``DataFrame``; hidden cells are NaN.
        method: Correlation method name, capitalized into the title.
        title: Title text placed before the method name.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.
    """
    n_features = corr_matrix.shape[0]
    values = corr_matrix.values

    # A correlation matrix is symmetric, so a pair hidden in one triangle
    # (e.g. by upper-triangle masking) is recovered from its mirror cell.
    # Reading the upper triangle alone would drop the stats in that case.
    mirrored = np.where(np.isnan(values), values.T, values)
    pair_values = mirrored[np.triu_indices_from(mirrored, k=1)]
    pair_values = pair_values[~np.isnan(pair_values)]

    if len(pair_values) > 0:
        abs_values = np.abs(pair_values)
        mean_corr = abs_values.mean()
        max_corr = abs_values.max()
        stats_text = f"Features: {n_features}<br>Mean |r|: {mean_corr:.3f}<br>Max |r|: {max_corr:.3f}"
    else:
        # Every pair is hidden (e.g. by the threshold): report the count only.
        stats_text = f"Features: {n_features}"

    fig.update_layout(
        title={
            "text": f"{title} ({method.capitalize()} Correlation)",
            "x": 0.5,
            "xanchor": "center",
        },
        width=width,
        height=height,
        template="plotly_white",
        xaxis=dict(tickangle=45, side="bottom"),
        # Reversed y axis puts the first feature at the top.
        yaxis=dict(tickmode="linear", autorange="reversed"),
        annotations=[
            dict(
                text=stats_text,
                x=0.02,
                y=0.98,
                xref="paper",
                yref="paper",
                showarrow=False,
                align="left",
                bgcolor="rgba(255,255,255,0.8)",
                bordercolor="black",
                borderwidth=1,
            )
        ],
    )
129
+
130
+
131
@tags("tabular_data", "visualization", "correlation")
@tasks("classification", "regression", "clustering")
def CorrelationHeatmap(
    dataset: VMDataset,
    columns: Optional[List[str]] = None,
    method: str = "pearson",
    show_values: bool = True,
    colorscale: str = "RdBu",
    width: int = 800,
    height: int = 600,
    mask_upper: bool = False,
    threshold: Optional[float] = None,
    title: str = "Correlation Heatmap",
) -> go.Figure:
    """
    Generates customizable correlation heatmap plots for numerical features in a dataset using Plotly.

    ### Purpose

    This test provides a flexible way to visualize correlations between numerical features
    in a dataset using interactive Plotly heatmaps. It supports different correlation methods
    and extensive customization options for the heatmap appearance, making it suitable for
    exploring feature relationships in data analysis.

    ### Test Mechanism

    The test computes correlation coefficients between specified numerical columns
    (or all numerical columns if none specified) using the specified method.
    It then creates an interactive heatmap visualization with customizable appearance options including:
    - Different correlation methods (pearson, spearman, kendall)
    - Color schemes and annotations
    - Masking options for upper triangle
    - Threshold filtering for significant correlations
    - Interactive hover information

    ### Signs of High Risk

    - Very high correlations (>0.9) between features indicating multicollinearity
    - Unexpected correlation patterns that contradict domain knowledge
    - Features with no correlation to any other variables
    - Strong correlations with the target variable that might indicate data leakage

    ### Strengths

    - Supports multiple correlation methods
    - Interactive Plotly plots with hover information and zoom capabilities
    - Highly customizable visualization options
    - Can handle missing values appropriately
    - Provides clear visual representation of feature relationships
    - Optional thresholding to focus on significant correlations

    ### Limitations

    - Limited to numerical features only
    - Cannot capture non-linear relationships effectively
    - May be difficult to interpret with many features
    - Correlation does not imply causation
    """
    # Correlate the selected columns, then hide the cells the caller excluded.
    corr_matrix = _apply_filters(
        _validate_and_prepare_data(dataset, columns, method),
        threshold,
        mask_upper,
    )

    # Split the matrix into the pieces the heatmap trace needs.
    z_values = corr_matrix.values
    x_labels = corr_matrix.columns.tolist()
    y_labels = corr_matrix.index.tolist()
    cell_text = _create_annotation_text(z_values, y_labels, x_labels, show_values)

    # Smaller labels for larger matrices.
    font_size = _calculate_adaptive_font_size(len(x_labels))

    method_label = f"{method.capitalize()} Correlation"
    trace_options = dict(
        z=z_values,
        x=x_labels,
        y=y_labels,
        colorscale=colorscale,
        zmin=-1,
        zmax=1,
        colorbar=dict(title=method_label),
        hoverongaps=False,
        hovertemplate=(
            f"<b>%{{y}}</b> vs <b>%{{x}}</b><br>{method_label}: %{{z:.3f}}<br>"
            "<extra></extra>"
        ),
    )

    # Cell labels are optional; add them only when they were produced.
    if show_values and cell_text is not None:
        trace_options["text"] = cell_text
        trace_options["texttemplate"] = "%{text}"
        trace_options["textfont"] = {"size": font_size, "color": "black"}

    fig = go.Figure(data=go.Heatmap(**trace_options))

    # Title, sizing, axes and the summary-stats box.
    _calculate_stats_and_update_layout(fig, corr_matrix, method, title, width, height)

    return fig