validmind 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +20 -4
  3. validmind/ai/test_result_description/user.jinja +5 -0
  4. validmind/datasets/credit_risk/lending_club.py +444 -14
  5. validmind/tests/data_validation/MutualInformation.py +129 -0
  6. validmind/tests/data_validation/ScoreBandDefaultRates.py +139 -0
  7. validmind/tests/data_validation/TooManyZeroValues.py +6 -5
  8. validmind/tests/data_validation/UniqueRows.py +3 -1
  9. validmind/tests/decorator.py +18 -16
  10. validmind/tests/model_validation/sklearn/CalibrationCurve.py +116 -0
  11. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +261 -0
  12. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +1 -0
  13. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +144 -56
  14. validmind/tests/model_validation/sklearn/ModelParameters.py +74 -0
  15. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +130 -0
  16. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -6
  17. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -3
  18. validmind/tests/run.py +43 -72
  19. validmind/utils.py +23 -7
  20. validmind/vm_models/result/result.py +18 -17
  21. {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/METADATA +1 -1
  22. {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/RECORD +25 -19
  23. {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/LICENSE +0 -0
  24. {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/WHEEL +0 -0
  25. {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/HyperParametersTuning.py
@@ -2,73 +2,161 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from typing import Union
-
+from typing import Union, Dict, List
 from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import make_scorer, recall_score

 from validmind import tags, tasks
-from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset, VMModel


 @tags("sklearn", "model_performance")
 @tasks("classification", "clustering")
+def custom_recall(y_true, y_pred_proba, threshold=0.5):
+    y_pred = (y_pred_proba >= threshold).astype(int)
+    return recall_score(y_true, y_pred)
+
+
+def _get_metrics(scoring):
+    """Convert scoring parameter to list of metrics."""
+    if scoring is None:
+        return ["accuracy"]
+    return (
+        scoring
+        if isinstance(scoring, list)
+        else list(scoring.keys()) if isinstance(scoring, dict) else [scoring]
+    )
+
+
+def _get_thresholds(thresholds):
+    """Convert thresholds parameter to list."""
+    if thresholds is None:
+        return [0.5]
+    return [thresholds] if isinstance(thresholds, (int, float)) else thresholds
+
+
+def _create_scoring_dict(scoring, metrics, threshold):
+    """Create scoring dictionary for GridSearchCV."""
+    if scoring is None:
+        return None
+
+    scoring_dict = {}
+    for metric in metrics:
+        if metric == "recall":
+            scoring_dict[metric] = make_scorer(
+                custom_recall, needs_proba=True, threshold=threshold
+            )
+        elif metric == "roc_auc":
+            scoring_dict[metric] = "roc_auc"
+        else:
+            scoring_dict[metric] = metric
+    return scoring_dict
+
+
+@tags("sklearn", "model_performance")
+@tasks("clustering", "classification")
 def HyperParametersTuning(
     model: VMModel,
     dataset: VMDataset,
-    param_grid: Union[dict, None] = None,
-    scoring: Union[str, None] = None,
+    param_grid: dict,
+    scoring: Union[str, List, Dict] = None,
+    thresholds: Union[float, List[float]] = None,
+    fit_params: dict = None,
 ):
     """
-    Exerts exhaustive grid search to identify optimal hyperparameters for the model, improving performance.
-
-    ### Purpose:
-
-    The "HyperParametersTuning" metric aims to find the optimal set of hyperparameters for a given model. The test is
-    designed to enhance the performance of the model by determining the best configuration of hyperparameters. The
-    parameters that are being optimized are defined by the parameter grid provided to the metric.
-
-    ### Test Mechanism:
-
-    The HyperParametersTuning test employs a grid search mechanism using the GridSearchCV function from the
-    scikit-learn library. The grid search algorithm systematically works through multiple combinations of parameter
-    values, cross-validating to determine which combination gives the best model performance. The chosen model and the
-    parameter grid passed for tuning are necessary inputs. Once the grid search is complete, the test caches and
-    returns details of the best model and its associated parameters.
-
-    ### Signs of High Risk:
-
-    - The test raises a SkipTestError if the param_grid is not supplied, indicating a lack of specific parameters to
-    optimize, which can be risky for certain model types reliant on parameter tuning.
-    - Poorly chosen scoring metrics that do not align well with the specific model or problem at hand could reflect
-    potential risks or failures in achieving optimal performance.
-
-    ### Strengths:
-
-    - Provides a comprehensive exploration mechanism to identify the best set of hyperparameters for the supplied
-    model, thereby enhancing its performance.
-    - Implements GridSearchCV, simplifying and automating the time-consuming task of hyperparameter tuning.
-
-    ### Limitations:
-
-    - The grid search algorithm can be computationally expensive, especially with large datasets or complex models, and
-    can be time-consuming as it tests all possible combinations within the specified parameter grid.
-    - The effectiveness of the tuning is heavily dependent on the quality of data and only accepts datasets with
-    numerical or ordered categories.
-    - Assumes that the same set of hyperparameters is optimal for all problem sets, which may not be true in every
-    scenario.
-    - There's a potential risk of overfitting the model if the training set is not representative of the data that the
-    model will be applied to.
+    Performs exhaustive grid search over specified parameter ranges to find optimal model configurations
+    across different metrics and decision thresholds.
+
+    ### Purpose
+
+    The Hyperparameter Tuning test systematically explores the model's parameter space to identify optimal
+    configurations. It supports multiple optimization metrics and decision thresholds, providing a comprehensive
+    view of how different parameter combinations affect various aspects of model performance.
+
+    ### Test Mechanism
+
+    The test uses scikit-learn's GridSearchCV to perform cross-validation for each parameter combination.
+    For each specified threshold and optimization metric, it creates a scoring dictionary with
+    threshold-adjusted metrics, performs grid search with cross-validation, records best parameters and
+    corresponding scores, and combines results into a comparative table. This process is repeated for each
+    optimization metric to provide a comprehensive view of model performance under different configurations.
+
+    ### Signs of High Risk
+
+    - Large performance variations across different parameter combinations
+    - Significant discrepancies between different optimization metrics
+    - Best parameters at the edges of the parameter grid
+    - Unstable performance across different thresholds
+    - Overly complex model configurations (risk of overfitting)
+    - Very different optimal parameters for different metrics
+    - Cross-validation scores showing high variance
+    - Extreme parameter values in best configurations
+
+    ### Strengths
+
+    - Comprehensive exploration of parameter space
+    - Supports multiple optimization metrics
+    - Allows threshold optimization
+    - Provides comparative view across different configurations
+    - Uses cross-validation for robust evaluation
+    - Helps understand trade-offs between different metrics
+    - Enables systematic parameter selection
+    - Supports both classification and clustering tasks
+
+    ### Limitations
+
+    - Computationally expensive for large parameter grids
+    - May not find global optimum (limited to grid points)
+    - Cannot handle dependencies between parameters
+    - Memory intensive for large datasets
+    - Limited to scikit-learn compatible models
+    - Cross-validation splits may not preserve time series structure
+    - Grid search may miss optimal values between grid points
+    - Resource intensive for high-dimensional parameter spaces
     """
-    if not param_grid:
-        raise SkipTestError("'param_grid' dictionary must be provided to run this test")
-
-    estimators = GridSearchCV(model.model, param_grid=param_grid, scoring=scoring)
-    estimators.fit(dataset.x, dataset.y)
-
-    return [
-        {
-            "Best Model": estimators.best_estimator_,
-            "Best Parameters": estimators.best_params_,
-        }
-    ]
+    fit_params = fit_params or {}
+
+    # Simple case: no scoring and no thresholds
+    if scoring is None and thresholds is None:
+        estimators = GridSearchCV(model.model, param_grid=param_grid, scoring=None)
+        estimators.fit(dataset.x_df(), dataset.y, **fit_params)
+        return [
+            {
+                "Best Model": estimators.best_estimator_,
+                "Best Parameters": estimators.best_params_,
+            }
+        ]
+
+    # Complex case: with scoring or thresholds
+    results = []
+    metrics = _get_metrics(scoring)
+    thresholds = _get_thresholds(thresholds)
+
+    for threshold in thresholds:
+        scoring_dict = _create_scoring_dict(scoring, metrics, threshold)
+
+        for optimize_for in metrics:
+            estimators = GridSearchCV(
+                model.model,
+                param_grid=param_grid,
+                scoring=scoring_dict,
+                refit=optimize_for if scoring is not None else True,
+            )
+
+            estimators.fit(dataset.x_df(), dataset.y, **fit_params)
+
+            best_index = estimators.best_index_
+            row_result = {
+                "Optimized for": optimize_for,
+                "Threshold": threshold,
+                "Best Parameters": estimators.best_params_,
+            }
+
+            score_key = (
+                "mean_test_score" if scoring is None else f"mean_test_{optimize_for}"
+            )
+            row_result[optimize_for] = estimators.cv_results_[score_key][best_index]
+
+            results.append(row_result)
+
+    return results
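The reworked test keeps its old single-metric behaviour when neither scoring nor thresholds is passed, and otherwise returns one result row per metric/threshold pair. A minimal usage sketch, not taken from the diff: it assumes the usual validmind.tests.run_test entry point, a test ID that mirrors the module path, and placeholder vm_model / vm_train_ds inputs registered earlier via init_model / init_dataset.

import validmind as vm

# Hypothetical invocation of the updated test; `vm_model` and `vm_train_ds` are
# placeholders for previously initialized VMModel / VMDataset objects.
vm.tests.run_test(
    "validmind.model_validation.sklearn.HyperParametersTuning",
    inputs={"model": vm_model, "dataset": vm_train_ds},
    params={
        "param_grid": {"n_estimators": [50, 100], "max_depth": [3, 5]},
        "scoring": ["recall", "roc_auc"],  # one grid search per requested metric
        "thresholds": [0.3, 0.5],          # recall is re-scored at each cut-off
    },
)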
validmind/tests/model_validation/sklearn/ModelParameters.py (new file)
@@ -0,0 +1,74 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+from validmind import tags, tasks
+
+
+@tags("model_training", "metadata")
+@tasks("classification", "regression")
+def ModelParameters(model, model_params=None):
+    """
+    Extracts and displays model parameters in a structured format for transparency and reproducibility.
+
+    ### Purpose
+
+    The Model Parameters test is designed to provide transparency into model configuration and ensure
+    reproducibility of machine learning models. It accomplishes this by extracting and presenting all
+    relevant parameters that define the model's behavior, making it easier to audit, validate, and
+    reproduce model training.
+
+    ### Test Mechanism
+
+    The test leverages scikit-learn's API convention of get_params() to extract model parameters. It
+    produces a structured DataFrame containing parameter names and their corresponding values. For models
+    that follow scikit-learn's API (including XGBoost, RandomForest, and other estimators), all
+    parameters are automatically extracted and displayed.
+
+    ### Signs of High Risk
+
+    - Missing crucial parameters that should be explicitly set
+    - Extreme parameter values that could indicate overfitting (e.g., unlimited tree depth)
+    - Inconsistent parameters across different versions of the same model type
+    - Parameter combinations known to cause instability or poor performance
+    - Default values used for critical parameters that should be tuned
+
+    ### Strengths
+
+    - Universal compatibility with scikit-learn API-compliant models
+    - Ensures transparency in model configuration
+    - Facilitates model reproducibility and version control
+    - Enables systematic parameter auditing
+    - Supports both classification and regression models
+    - Helps identify potential configuration issues
+
+    ### Limitations
+
+    - Only works with models implementing scikit-learn's get_params() method
+    - Cannot capture dynamic parameters set during model training
+    - Does not validate parameter values for model-specific appropriateness
+    - Parameter meanings and impacts may vary across different model types
+    - Cannot detect indirect parameter interactions or their effects on model performance
+    """
+    # Check if model implements get_params()
+    if not hasattr(model.model, "get_params"):
+        return pd.DataFrame()
+
+    # Get all model parameters
+    params = model.model.get_params()
+
+    # If model_params is None, use all parameters from get_params()
+    if model_params is None:
+        model_params = sorted(params.keys())  # Sort for consistent ordering
+
+    # Create DataFrame with parameters and their values
+    param_df = pd.DataFrame(
+        [
+            {"Parameter": param, "Value": str(params.get(param, "Not specified"))}
+            for param in model_params
+            if params.get(param) is not None
+        ]
+    )
+
+    return param_df
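ModelParameters is a thin wrapper around scikit-learn's get_params() convention: it sorts parameter names, drops values that are None, and stringifies the rest into a two-column table. A standalone sketch of that same extraction applied directly to a plain estimator (outside the validmind model.model wrapper); the estimator and its settings are illustrative only.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=100, max_depth=5)

# Mirror the test's logic: sort parameter names, skip unset (None) values, stringify the rest
params = estimator.get_params()
param_df = pd.DataFrame(
    [
        {"Parameter": name, "Value": str(params[name])}
        for name in sorted(params)
        if params[name] is not None
    ]
)
print(param_df)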
validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py (new file)
@@ -0,0 +1,130 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+import plotly.graph_objects as go
+from validmind import tags, tasks
+from validmind.vm_models import VMModel, VMDataset
+
+
+@tags("visualization", "credit_risk", "calibration")
+@tasks("classification")
+def ScoreProbabilityAlignment(
+    model: VMModel, dataset: VMDataset, score_column: str = "score", n_bins: int = 10
+):
+    """
+    Analyzes the alignment between credit scores and predicted probabilities.
+
+    ### Purpose
+
+    The Score-Probability Alignment test evaluates how well credit scores align with
+    predicted default probabilities. This helps validate score scaling, identify potential
+    calibration issues, and ensure scores reflect risk appropriately.
+
+    ### Test Mechanism
+
+    The test:
+    1. Groups scores into bins
+    2. Calculates average predicted probability per bin
+    3. Tests monotonicity of relationship
+    4. Analyzes probability distribution within score bands
+
+    ### Signs of High Risk
+
+    - Non-monotonic relationship between scores and probabilities
+    - Large probability variations within score bands
+    - Unexpected probability jumps between adjacent bands
+    - Poor alignment with expected odds-to-score relationship
+    - Inconsistent probability patterns across score ranges
+    - Clustering of probabilities at extreme values
+    - Score bands with similar probability profiles
+    - Unstable probability estimates in key decision bands
+
+    ### Strengths
+
+    - Direct validation of score-to-probability relationship
+    - Identifies potential calibration issues
+    - Supports score band validation
+    - Helps understand model behavior
+    - Useful for policy setting
+    - Visual and numerical results
+    - Easy to interpret
+    - Supports regulatory documentation
+
+    ### Limitations
+
+    - Sensitive to bin selection
+    - Requires sufficient data per bin
+    - May mask within-bin variations
+    - Point-in-time analysis only
+    - Cannot detect all forms of miscalibration
+    - Assumes scores should align with probabilities
+    - May oversimplify complex relationships
+    - Limited to binary outcomes
+    """
+    if score_column not in dataset.df.columns:
+        raise ValueError(f"Score column '{score_column}' not found in dataset")
+
+    # Get predicted probabilities
+    y_prob = dataset.y_prob(model)
+
+    # Create score bins
+    df = dataset.df.copy()
+    df["probability"] = y_prob
+
+    # Create score bins with equal width
+    df["score_bin"] = pd.qcut(df[score_column], n_bins, duplicates="drop")
+
+    # Calculate statistics per bin
+    results = []
+    for bin_name, group in df.groupby("score_bin"):
+        bin_stats = {
+            "Score Range": f"{bin_name.left:.0f}-{bin_name.right:.0f}",
+            "Mean Score": group[score_column].mean(),
+            "Population Count": len(group),
+            "Population (%)": len(group) / len(df) * 100,
+            "Mean Probability (%)": group["probability"].mean() * 100,
+            "Min Probability (%)": group["probability"].min() * 100,
+            "Max Probability (%)": group["probability"].max() * 100,
+            "Probability Std": group["probability"].std() * 100,
+        }
+        results.append(bin_stats)
+
+    results_df = pd.DataFrame(results)
+
+    # Create visualization
+    fig = go.Figure()
+
+    # Add probability range
+    fig.add_trace(
+        go.Scatter(
+            x=results_df["Mean Score"],
+            y=results_df["Mean Probability (%)"],
+            mode="lines+markers",
+            name="Mean Probability",
+            line=dict(color="blue"),
+            error_y=dict(
+                type="data",
+                symmetric=False,
+                array=results_df["Max Probability (%)"]
+                - results_df["Mean Probability (%)"],
+                arrayminus=results_df["Mean Probability (%)"]
+                - results_df["Min Probability (%)"],
+                color="gray",
+            ),
+        )
+    )
+
+    # Update layout
+    fig.update_layout(
+        title="Score-Probability Alignment",
+        xaxis_title="Score",
+        yaxis_title="Default Probability (%)",
+        showlegend=True,
+        template="plotly_white",
+        width=800,
+        height=600,
+    )
+
+    return results_df, fig
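The core of this test is the binning and per-bin aggregation step; note that pd.qcut produces quantile (roughly equal-count) score bins. A self-contained sketch of that step on synthetic data, illustrative only: the column names and the toy score-to-probability mapping below are invented for the example.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"score": rng.normal(650, 50, size=1000)})
# Toy monotone mapping from score to default probability, for demonstration only
df["probability"] = 1 / (1 + np.exp((df["score"] - 650) / 40))

# Quantile-based score bins, as in the test above
df["score_bin"] = pd.qcut(df["score"], 10, duplicates="drop")
summary = df.groupby("score_bin", observed=True).agg(
    mean_score=("score", "mean"),
    count=("score", "size"),
    mean_probability_pct=("probability", lambda p: p.mean() * 100),
)
print(summary)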
validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py
@@ -9,22 +9,21 @@ from matplotlib import cm
 from validmind import tags, tasks


-@tags("visualization", "credit_risk", "logistic_regression")
+@tags("visualization", "credit_risk")
 @tasks("classification")
 def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
     """
-    Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
-    regression models.
+    Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.

     ### Purpose

     This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
-    in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+    in a classification model. It provides a visual assessment of the model's behavior by plotting the cumulative
     probabilities for positive and negative classes across both the training and test datasets.

     ### Test Mechanism

-    The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+    The classification model is evaluated by first computing the predicted probabilities for each instance in both
     the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
     for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
     distributions of these probabilities are created for both positive and negative classes across both training and
@@ -51,7 +50,7 @@ def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabil

     ### Limitations

-    - Exclusive to classification tasks and specifically to logistic regression models.
+    - Exclusive to classification tasks and specifically to classification models.
     - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
     detection.
     - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py
@@ -9,7 +9,7 @@ from matplotlib import cm
 from validmind import tags, tasks


-@tags("visualization", "credit_risk", "logistic_regression")
+@tags("visualization", "credit_risk")
 @tasks("classification")
 def PredictionProbabilitiesHistogram(
     dataset, model, title="Histogram of Predictive Probabilities"
@@ -22,7 +22,7 @@ def PredictionProbabilitiesHistogram(

     The Prediction Probabilities Histogram test is designed to generate histograms displaying the Probability of
     Default (PD) predictions for both positive and negative classes in training and testing datasets. This helps in
-    evaluating the performance of a logistic regression model, particularly for credit risk prediction.
+    evaluating the performance of a classification model.

     ### Test Mechanism

@@ -52,7 +52,6 @@
     ### Limitations

     - Specifically tailored for binary classification scenarios and not suited for multi-class classification tasks.
-    - Mainly applicable to logistic regression models, and may not be effective for other model types.
     - Provides a robust visual representation but lacks a quantifiable measure to assess model performance.
     """