validmind 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +20 -4
- validmind/ai/test_result_description/user.jinja +5 -0
- validmind/datasets/credit_risk/lending_club.py +444 -14
- validmind/tests/data_validation/MutualInformation.py +129 -0
- validmind/tests/data_validation/ScoreBandDefaultRates.py +139 -0
- validmind/tests/data_validation/TooManyZeroValues.py +6 -5
- validmind/tests/data_validation/UniqueRows.py +3 -1
- validmind/tests/decorator.py +18 -16
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +116 -0
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +261 -0
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +1 -0
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +144 -56
- validmind/tests/model_validation/sklearn/ModelParameters.py +74 -0
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +130 -0
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -6
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -3
- validmind/tests/run.py +43 -72
- validmind/utils.py +23 -7
- validmind/vm_models/result/result.py +18 -17
- {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/METADATA +1 -1
- {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/RECORD +25 -19
- {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/LICENSE +0 -0
- {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/WHEEL +0 -0
- {validmind-2.7.2.dist-info → validmind-2.7.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/HyperParametersTuning.py

@@ -2,73 +2,161 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from typing import Union
-
+from typing import Union, Dict, List
 from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import make_scorer, recall_score
 
 from validmind import tags, tasks
-from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset, VMModel
 
 
 @tags("sklearn", "model_performance")
 @tasks("classification", "clustering")
+def custom_recall(y_true, y_pred_proba, threshold=0.5):
+    y_pred = (y_pred_proba >= threshold).astype(int)
+    return recall_score(y_true, y_pred)
+
+
+def _get_metrics(scoring):
+    """Convert scoring parameter to list of metrics."""
+    if scoring is None:
+        return ["accuracy"]
+    return (
+        scoring
+        if isinstance(scoring, list)
+        else list(scoring.keys()) if isinstance(scoring, dict) else [scoring]
+    )
+
+
+def _get_thresholds(thresholds):
+    """Convert thresholds parameter to list."""
+    if thresholds is None:
+        return [0.5]
+    return [thresholds] if isinstance(thresholds, (int, float)) else thresholds
+
+
+def _create_scoring_dict(scoring, metrics, threshold):
+    """Create scoring dictionary for GridSearchCV."""
+    if scoring is None:
+        return None
+
+    scoring_dict = {}
+    for metric in metrics:
+        if metric == "recall":
+            scoring_dict[metric] = make_scorer(
+                custom_recall, needs_proba=True, threshold=threshold
+            )
+        elif metric == "roc_auc":
+            scoring_dict[metric] = "roc_auc"
+        else:
+            scoring_dict[metric] = metric
+    return scoring_dict
+
+
+@tags("sklearn", "model_performance")
+@tasks("clustering", "classification")
 def HyperParametersTuning(
     model: VMModel,
     dataset: VMDataset,
-    param_grid:
-    scoring: Union[str,
+    param_grid: dict,
+    scoring: Union[str, List, Dict] = None,
+    thresholds: Union[float, List[float]] = None,
+    fit_params: dict = None,
 ):
     """
-    [39 lines of the previous docstring (old lines 23-61), not captured in this diff view]
+    Performs exhaustive grid search over specified parameter ranges to find optimal model configurations
+    across different metrics and decision thresholds.
+
+    ### Purpose
+
+    The Hyperparameter Tuning test systematically explores the model's parameter space to identify optimal
+    configurations. It supports multiple optimization metrics and decision thresholds, providing a comprehensive
+    view of how different parameter combinations affect various aspects of model performance.
+
+    ### Test Mechanism
+
+    The test uses scikit-learn's GridSearchCV to perform cross-validation for each parameter combination.
+    For each specified threshold and optimization metric, it creates a scoring dictionary with
+    threshold-adjusted metrics, performs grid search with cross-validation, records best parameters and
+    corresponding scores, and combines results into a comparative table. This process is repeated for each
+    optimization metric to provide a comprehensive view of model performance under different configurations.
+
+    ### Signs of High Risk
+
+    - Large performance variations across different parameter combinations
+    - Significant discrepancies between different optimization metrics
+    - Best parameters at the edges of the parameter grid
+    - Unstable performance across different thresholds
+    - Overly complex model configurations (risk of overfitting)
+    - Very different optimal parameters for different metrics
+    - Cross-validation scores showing high variance
+    - Extreme parameter values in best configurations
+
+    ### Strengths
+
+    - Comprehensive exploration of parameter space
+    - Supports multiple optimization metrics
+    - Allows threshold optimization
+    - Provides comparative view across different configurations
+    - Uses cross-validation for robust evaluation
+    - Helps understand trade-offs between different metrics
+    - Enables systematic parameter selection
+    - Supports both classification and clustering tasks
+
+    ### Limitations
+
+    - Computationally expensive for large parameter grids
+    - May not find global optimum (limited to grid points)
+    - Cannot handle dependencies between parameters
+    - Memory intensive for large datasets
+    - Limited to scikit-learn compatible models
+    - Cross-validation splits may not preserve time series structure
+    - Grid search may miss optimal values between grid points
+    - Resource intensive for high-dimensional parameter spaces
     """
-    [12 lines of the previous implementation (old lines 63-74), not captured in this diff view]
+    fit_params = fit_params or {}
+
+    # Simple case: no scoring and no thresholds
+    if scoring is None and thresholds is None:
+        estimators = GridSearchCV(model.model, param_grid=param_grid, scoring=None)
+        estimators.fit(dataset.x_df(), dataset.y, **fit_params)
+        return [
+            {
+                "Best Model": estimators.best_estimator_,
+                "Best Parameters": estimators.best_params_,
+            }
+        ]
+
+    # Complex case: with scoring or thresholds
+    results = []
+    metrics = _get_metrics(scoring)
+    thresholds = _get_thresholds(thresholds)
+
+    for threshold in thresholds:
+        scoring_dict = _create_scoring_dict(scoring, metrics, threshold)
+
+        for optimize_for in metrics:
+            estimators = GridSearchCV(
+                model.model,
+                param_grid=param_grid,
+                scoring=scoring_dict,
+                refit=optimize_for if scoring is not None else True,
+            )
+
+            estimators.fit(dataset.x_df(), dataset.y, **fit_params)
+
+            best_index = estimators.best_index_
+            row_result = {
+                "Optimized for": optimize_for,
+                "Threshold": threshold,
+                "Best Parameters": estimators.best_params_,
+            }
+
+            score_key = (
+                "mean_test_score" if scoring is None else f"mean_test_{optimize_for}"
+            )
+            row_result[optimize_for] = estimators.cv_results_[score_key][best_index]
+
+            results.append(row_result)
+
+    return results
validmind/tests/model_validation/sklearn/ModelParameters.py (new file)

@@ -0,0 +1,74 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+from validmind import tags, tasks
+
+
+@tags("model_training", "metadata")
+@tasks("classification", "regression")
+def ModelParameters(model, model_params=None):
+    """
+    Extracts and displays model parameters in a structured format for transparency and reproducibility.
+
+    ### Purpose
+
+    The Model Parameters test is designed to provide transparency into model configuration and ensure
+    reproducibility of machine learning models. It accomplishes this by extracting and presenting all
+    relevant parameters that define the model's behavior, making it easier to audit, validate, and
+    reproduce model training.
+
+    ### Test Mechanism
+
+    The test leverages scikit-learn's API convention of get_params() to extract model parameters. It
+    produces a structured DataFrame containing parameter names and their corresponding values. For models
+    that follow scikit-learn's API (including XGBoost, RandomForest, and other estimators), all
+    parameters are automatically extracted and displayed.
+
+    ### Signs of High Risk
+
+    - Missing crucial parameters that should be explicitly set
+    - Extreme parameter values that could indicate overfitting (e.g., unlimited tree depth)
+    - Inconsistent parameters across different versions of the same model type
+    - Parameter combinations known to cause instability or poor performance
+    - Default values used for critical parameters that should be tuned
+
+    ### Strengths
+
+    - Universal compatibility with scikit-learn API-compliant models
+    - Ensures transparency in model configuration
+    - Facilitates model reproducibility and version control
+    - Enables systematic parameter auditing
+    - Supports both classification and regression models
+    - Helps identify potential configuration issues
+
+    ### Limitations
+
+    - Only works with models implementing scikit-learn's get_params() method
+    - Cannot capture dynamic parameters set during model training
+    - Does not validate parameter values for model-specific appropriateness
+    - Parameter meanings and impacts may vary across different model types
+    - Cannot detect indirect parameter interactions or their effects on model performance
+    """
+    # Check if model implements get_params()
+    if not hasattr(model.model, "get_params"):
+        return pd.DataFrame()
+
+    # Get all model parameters
+    params = model.model.get_params()
+
+    # If model_params is None, use all parameters from get_params()
+    if model_params is None:
+        model_params = sorted(params.keys())  # Sort for consistent ordering
+
+    # Create DataFrame with parameters and their values
+    param_df = pd.DataFrame(
+        [
+            {"Parameter": param, "Value": str(params.get(param, "Not specified"))}
+            for param in model_params
+            if params.get(param) is not None
+        ]
+    )
+
+    return param_df
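Because `ModelParameters` relies only on the scikit-learn `get_params()` convention, invoking it is straightforward. A rough sketch, where the `vm_model` placeholder and the `model_params` subset are assumptions for illustration:

```python
import validmind as vm

# Sketch: vm_model is a placeholder for a model wrapped with vm.init_model().
# With model_params omitted, every parameter returned by get_params() is listed;
# the explicit subset below is only an example.
vm.tests.run_test(
    "validmind.model_validation.sklearn.ModelParameters",
    inputs={"model": vm_model},
    params={"model_params": ["n_estimators", "max_depth", "learning_rate"]},
)
```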
validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py (new file)

@@ -0,0 +1,130 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+import plotly.graph_objects as go
+from validmind import tags, tasks
+from validmind.vm_models import VMModel, VMDataset
+
+
+@tags("visualization", "credit_risk", "calibration")
+@tasks("classification")
+def ScoreProbabilityAlignment(
+    model: VMModel, dataset: VMDataset, score_column: str = "score", n_bins: int = 10
+):
+    """
+    Analyzes the alignment between credit scores and predicted probabilities.
+
+    ### Purpose
+
+    The Score-Probability Alignment test evaluates how well credit scores align with
+    predicted default probabilities. This helps validate score scaling, identify potential
+    calibration issues, and ensure scores reflect risk appropriately.
+
+    ### Test Mechanism
+
+    The test:
+    1. Groups scores into bins
+    2. Calculates average predicted probability per bin
+    3. Tests monotonicity of relationship
+    4. Analyzes probability distribution within score bands
+
+    ### Signs of High Risk
+
+    - Non-monotonic relationship between scores and probabilities
+    - Large probability variations within score bands
+    - Unexpected probability jumps between adjacent bands
+    - Poor alignment with expected odds-to-score relationship
+    - Inconsistent probability patterns across score ranges
+    - Clustering of probabilities at extreme values
+    - Score bands with similar probability profiles
+    - Unstable probability estimates in key decision bands
+
+    ### Strengths
+
+    - Direct validation of score-to-probability relationship
+    - Identifies potential calibration issues
+    - Supports score band validation
+    - Helps understand model behavior
+    - Useful for policy setting
+    - Visual and numerical results
+    - Easy to interpret
+    - Supports regulatory documentation
+
+    ### Limitations
+
+    - Sensitive to bin selection
+    - Requires sufficient data per bin
+    - May mask within-bin variations
+    - Point-in-time analysis only
+    - Cannot detect all forms of miscalibration
+    - Assumes scores should align with probabilities
+    - May oversimplify complex relationships
+    - Limited to binary outcomes
+    """
+    if score_column not in dataset.df.columns:
+        raise ValueError(f"Score column '{score_column}' not found in dataset")
+
+    # Get predicted probabilities
+    y_prob = dataset.y_prob(model)
+
+    # Create score bins
+    df = dataset.df.copy()
+    df["probability"] = y_prob
+
+    # Create score bins with equal width
+    df["score_bin"] = pd.qcut(df[score_column], n_bins, duplicates="drop")
+
+    # Calculate statistics per bin
+    results = []
+    for bin_name, group in df.groupby("score_bin"):
+        bin_stats = {
+            "Score Range": f"{bin_name.left:.0f}-{bin_name.right:.0f}",
+            "Mean Score": group[score_column].mean(),
+            "Population Count": len(group),
+            "Population (%)": len(group) / len(df) * 100,
+            "Mean Probability (%)": group["probability"].mean() * 100,
+            "Min Probability (%)": group["probability"].min() * 100,
+            "Max Probability (%)": group["probability"].max() * 100,
+            "Probability Std": group["probability"].std() * 100,
+        }
+        results.append(bin_stats)
+
+    results_df = pd.DataFrame(results)
+
+    # Create visualization
+    fig = go.Figure()
+
+    # Add probability range
+    fig.add_trace(
+        go.Scatter(
+            x=results_df["Mean Score"],
+            y=results_df["Mean Probability (%)"],
+            mode="lines+markers",
+            name="Mean Probability",
+            line=dict(color="blue"),
+            error_y=dict(
+                type="data",
+                symmetric=False,
+                array=results_df["Max Probability (%)"]
+                - results_df["Mean Probability (%)"],
+                arrayminus=results_df["Mean Probability (%)"]
+                - results_df["Min Probability (%)"],
+                color="gray",
+            ),
+        )
+    )
+
+    # Update layout
+    fig.update_layout(
+        title="Score-Probability Alignment",
+        xaxis_title="Score",
+        yaxis_title="Default Probability (%)",
+        showlegend=True,
+        template="plotly_white",
+        width=800,
+        height=600,
+    )
+
+    return results_df, fig
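A possible way to exercise the new test from a notebook, assuming a scored dataset with assigned predictions (all identifiers below are illustrative placeholders, not part of the release):

```python
import validmind as vm

# Sketch: assumes vm_dataset (from vm.init_dataset()) contains a "score" column
# and that predictions were attached via vm_dataset.assign_predictions(vm_model).
vm.tests.run_test(
    "validmind.model_validation.sklearn.ScoreProbabilityAlignment",
    inputs={"model": vm_model, "dataset": vm_dataset},
    params={"score_column": "score", "n_bins": 10},
)
```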
validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py

@@ -9,22 +9,21 @@ from matplotlib import cm
 from validmind import tags, tasks
 
 
-@tags("visualization", "credit_risk"
+@tags("visualization", "credit_risk")
 @tasks("classification")
 def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
     """
-    Visualizes cumulative probabilities of positive and negative classes for both training and testing in
-    regression models.
+    Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.
 
     ### Purpose
 
     This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
-    in a
+    in a classification model. It provides a visual assessment of the model's behavior by plotting the cumulative
     probabilities for positive and negative classes across both the training and test datasets.
 
     ### Test Mechanism
 
-    The
+    The classification model is evaluated by first computing the predicted probabilities for each instance in both
     the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
     for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
     distributions of these probabilities are created for both positive and negative classes across both training and
@@ -51,7 +50,7 @@ def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabil
 
     ### Limitations
 
-    - Exclusive to classification tasks and specifically to
+    - Exclusive to classification tasks and specifically to classification models.
     - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
     detection.
     - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py

@@ -9,7 +9,7 @@ from matplotlib import cm
 from validmind import tags, tasks
 
 
-@tags("visualization", "credit_risk"
+@tags("visualization", "credit_risk")
 @tasks("classification")
 def PredictionProbabilitiesHistogram(
     dataset, model, title="Histogram of Predictive Probabilities"
@@ -22,7 +22,7 @@ def PredictionProbabilitiesHistogram(
 
     The Prediction Probabilities Histogram test is designed to generate histograms displaying the Probability of
     Default (PD) predictions for both positive and negative classes in training and testing datasets. This helps in
-    evaluating the performance of a
+    evaluating the performance of a classification model.
 
     ### Test Mechanism
 
@@ -52,7 +52,6 @@ def PredictionProbabilitiesHistogram(
     ### Limitations
 
     - Specifically tailored for binary classification scenarios and not suited for multi-class classification tasks.
-    - Mainly applicable to logistic regression models, and may not be effective for other model types.
     - Provides a robust visual representation but lacks a quantifiable measure to assess model performance.
     """
 