validmind 2.6.10__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. validmind/__init__.py +2 -0
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +20 -4
  4. validmind/ai/test_result_description/user.jinja +5 -0
  5. validmind/datasets/credit_risk/lending_club.py +444 -14
  6. validmind/tests/data_validation/MutualInformation.py +129 -0
  7. validmind/tests/data_validation/ScoreBandDefaultRates.py +139 -0
  8. validmind/tests/data_validation/TooManyZeroValues.py +6 -5
  9. validmind/tests/data_validation/UniqueRows.py +3 -1
  10. validmind/tests/decorator.py +18 -16
  11. validmind/tests/model_validation/sklearn/CalibrationCurve.py +116 -0
  12. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +261 -0
  13. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +1 -0
  14. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +144 -56
  15. validmind/tests/model_validation/sklearn/ModelParameters.py +74 -0
  16. validmind/tests/model_validation/sklearn/ROCCurve.py +26 -23
  17. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +130 -0
  18. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -6
  19. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -3
  20. validmind/tests/output.py +10 -1
  21. validmind/tests/run.py +52 -54
  22. validmind/utils.py +34 -7
  23. validmind/vm_models/figure.py +15 -0
  24. validmind/vm_models/result/__init__.py +2 -2
  25. validmind/vm_models/result/result.py +136 -23
  26. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/METADATA +1 -1
  27. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/RECORD +30 -24
  28. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/LICENSE +0 -0
  29. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/WHEEL +0 -0
  30. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/MutualInformation.py (new file)
@@ -0,0 +1,129 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import plotly.graph_objects as go
+ from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+ from validmind import tags, tasks
+ from validmind.vm_models import VMDataset
+ from validmind.vm_models.result import RawData
+
+
+ @tags("feature_selection", "data_analysis")
+ @tasks("classification", "regression")
+ def MutualInformation(
+     dataset: VMDataset, min_threshold: float = 0.01, task: str = "classification"
+ ):
+     """
+     Calculates mutual information scores between features and target variable to evaluate feature relevance.
+
+     ### Purpose
+
+     The Mutual Information test quantifies the predictive power of each feature by measuring its statistical
+     dependency with the target variable. This helps identify relevant features for model training and
+     detect potential redundant or irrelevant variables, supporting feature selection decisions and model
+     interpretability.
+
+     ### Test Mechanism
+
+     The test employs sklearn's mutual_info_classif/mutual_info_regression functions to compute mutual
+     information between each feature and the target. It produces a normalized score (0 to 1) for each
+     feature, where higher scores indicate stronger relationships. Results are presented in both tabular
+     format and visualized through a bar plot with a configurable threshold line.
+
+     ### Signs of High Risk
+
+     - Many features showing very low mutual information scores
+     - Key business features exhibiting unexpectedly low scores
+     - All features showing similar, low information content
+     - Large discrepancy between business importance and MI scores
+     - Highly skewed distribution of MI scores
+     - Critical features below the minimum threshold
+     - Unexpected zero or near-zero scores for known important features
+     - Inconsistent scores across different data samples
+
+     ### Strengths
+
+     - Captures non-linear relationships between features and target
+     - Scale-invariant measurement of feature relevance
+     - Works for both classification and regression tasks
+     - Provides interpretable scores (0 to 1 scale)
+     - Supports automated feature selection
+     - No assumptions about data distribution
+     - Handles numerical and categorical features
+     - Computationally efficient for most datasets
+
+     ### Limitations
+
+     - Requires sufficient data for reliable estimates
+     - May be computationally intensive for very large datasets
+     - Cannot detect redundant features (pairwise relationships)
+     - Sensitive to feature discretization for continuous variables
+     - Does not account for feature interactions
+     - May underestimate importance of rare but crucial events
+     - Cannot handle missing values directly
+     - May be affected by extreme class imbalance
+     """
+     if task not in ["classification", "regression"]:
+         raise ValueError("task must be either 'classification' or 'regression'")
+
+     X = dataset.x
+     y = dataset.y
+
+     # Select appropriate MI function based on task type
+     if task == "classification":
+         mi_scores = mutual_info_classif(X, y)
+     else:
+         mi_scores = mutual_info_regression(X, y)
+
+     # Create DataFrame for raw data
+     raw_data = RawData(
+         feature=dataset.feature_columns,
+         mutual_information_score=mi_scores.tolist(),
+         pass_fail=["Pass" if score >= min_threshold else "Fail" for score in mi_scores],
+     )
+
+     # Create Plotly figure
+     fig = go.Figure()
+
+     # Sort data for better visualization
+     sorted_indices = sorted(
+         range(len(mi_scores)), key=lambda k: mi_scores[k], reverse=True
+     )
+     sorted_features = [dataset.feature_columns[i] for i in sorted_indices]
+     sorted_scores = [mi_scores[i] for i in sorted_indices]
+
+     # Add bar plot
+     fig.add_trace(
+         go.Bar(
+             x=sorted_features,
+             y=sorted_scores,
+             marker_color=[
+                 "blue" if score >= min_threshold else "red" for score in sorted_scores
+             ],
+             name="Mutual Information Score",
+         )
+     )
+
+     # Add threshold line
+     fig.add_hline(
+         y=min_threshold,
+         line_dash="dash",
+         line_color="gray",
+         annotation_text=f"Threshold ({min_threshold})",
+         annotation_position="right",
+     )
+
+     # Update layout
+     fig.update_layout(
+         title="Mutual Information Scores by Feature",
+         xaxis_title="Features",
+         yaxis_title="Mutual Information Score",
+         xaxis_tickangle=-45,
+         showlegend=False,
+         width=1000,
+         height=600,
+         template="plotly_white",
+     )
+
+     return raw_data, fig
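For orientation, a minimal sketch of how this new test might be invoked once a dataset has been initialized; the input_id and parameter values are illustrative, and the test ID assumes ValidMind's usual validmind.data_validation.<TestName> convention.

import validmind as vm

# Assumes a dataset was registered earlier, e.g.:
# vm_ds = vm.init_dataset(dataset=df, target_column="default", input_id="train_ds")
result = vm.tests.run_test(
    "validmind.data_validation.MutualInformation",
    inputs={"dataset": "train_ds"},  # input_id of the initialized dataset
    params={"min_threshold": 0.02, "task": "classification"},
)
result.log()  # optionally push the result to the ValidMind platform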
validmind/tests/data_validation/ScoreBandDefaultRates.py (new file)
@@ -0,0 +1,139 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import pandas as pd
+ import numpy as np
+ from validmind import tags, tasks
+ from validmind.vm_models import VMDataset, VMModel
+
+
+ @tags("visualization", "credit_risk", "scorecard")
+ @tasks("classification")
+ def ScoreBandDefaultRates(
+     dataset: VMDataset,
+     model: VMModel,
+     score_column: str = "score",
+     score_bands: list = None,
+ ):
+     """
+     Analyzes default rates and population distribution across credit score bands.
+
+     ### Purpose
+
+     The Score Band Default Rates test evaluates the discriminatory power of credit scores by analyzing
+     default rates across different score bands. This helps validate score effectiveness, supports
+     policy decisions, and provides insights into portfolio risk distribution.
+
+     ### Test Mechanism
+
+     The test segments the score distribution into bands and calculates key metrics for each band:
+     1. Population count and percentage in each band
+     2. Default rate within each band
+     3. Cumulative statistics across bands
+     The results show how well the scores separate good and bad accounts.
+
+     ### Signs of High Risk
+
+     - Non-monotonic default rates across score bands
+     - Insufficient population in critical score bands
+     - Unexpected default rates for score ranges
+     - High concentration in specific score bands
+     - Similar default rates across adjacent bands
+     - Unstable default rates in key decision bands
+     - Extreme population skewness
+     - Poor risk separation between bands
+
+     ### Strengths
+
+     - Clear view of score effectiveness
+     - Supports policy threshold decisions
+     - Easy to interpret and communicate
+     - Directly links to business decisions
+     - Shows risk segmentation power
+     - Identifies potential score issues
+     - Helps validate scoring model
+     - Supports portfolio monitoring
+
+     ### Limitations
+
+     - Sensitive to band definition choices
+     - May mask within-band variations
+     - Requires sufficient data in each band
+     - Cannot capture non-linear patterns
+     - Point-in-time analysis only
+     - No temporal trend information
+     - Assumes band boundaries are appropriate
+     - May oversimplify risk patterns
+     """
+
+     if score_column not in dataset.df.columns:
+         raise ValueError(
+             f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
+         )
+
+     df = dataset._df.copy()
+
+     # Default score bands if none provided
+     if score_bands is None:
+         score_bands = [410, 440, 470]
+
+     # Create band labels
+     band_labels = [
+         f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1)
+     ]
+     band_labels.insert(0, f"<{score_bands[0]}")
+     band_labels.append(f">{score_bands[-1]}")
+
+     # Bin the scores with infinite upper bound
+     df["score_band"] = pd.cut(
+         df[score_column], bins=[-np.inf] + score_bands + [np.inf], labels=band_labels
+     )
+
+     # Calculate min and max scores for the total row
+     min_score = df[score_column].min()
+     max_score = df[score_column].max()
+
+     # Get predicted classes (0/1)
+     y_pred = dataset.y_pred(model)
+
+     # Calculate metrics by band using target_column name
+     results = []
+     for band in band_labels:
+         band_mask = df["score_band"] == band
+         population = band_mask.sum()
+         observed_defaults = df[band_mask][dataset.target_column].sum()
+         predicted_defaults = y_pred[
+             band_mask
+         ].sum()  # Sum of 1s gives number of predicted defaults
+
+         results.append(
+             {
+                 "Score Band": band,
+                 "Population Count": population,
+                 "Population (%)": population / len(df) * 100,
+                 "Predicted Default Rate (%)": (
+                     predicted_defaults / population * 100 if population > 0 else 0
+                 ),
+                 "Observed Default Rate (%)": (
+                     observed_defaults / population * 100 if population > 0 else 0
+                 ),
+             }
+         )
+
+     # Add total row
+     total_population = len(df)
+     total_observed = df[dataset.target_column].sum()
+     total_predicted = y_pred.sum()  # Total number of predicted defaults
+
+     results.append(
+         {
+             "Score Band": f"Total ({min_score:.0f}-{max_score:.0f})",
+             "Population Count": total_population,
+             "Population (%)": sum(r["Population (%)"] for r in results),
+             "Predicted Default Rate (%)": total_predicted / total_population * 100,
+             "Observed Default Rate (%)": total_observed / total_population * 100,
+         }
+     )
+
+     return pd.DataFrame(results)
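A similar hedged sketch for the new ScoreBandDefaultRates test, which needs both a dataset (carrying a score column and assigned predictions) and a model; the input IDs, score column, and bands below are illustrative.

import validmind as vm

result = vm.tests.run_test(
    "validmind.data_validation.ScoreBandDefaultRates",
    inputs={"dataset": "scored_test_ds", "model": "lr_model"},
    params={"score_column": "score", "score_bands": [410, 440, 470]},
)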
validmind/tests/data_validation/TooManyZeroValues.py
@@ -61,24 +61,25 @@ def TooManyZeroValues(dataset: VMDataset, max_percent_threshold: float = 0.03):
      issues.
      """
      df = dataset.df
-
      table = []

      for col in dataset.feature_columns_numeric:
          value_counts = df[col].value_counts()
+         row_count = df.shape[0]

          if 0 not in value_counts.index:
              continue

          n_zeros = value_counts[0]
-         p_zeros = n_zeros / df.shape[0]
+         p_zeros = (n_zeros / row_count) * 100

          table.append(
              {
-                 "Column": col,
+                 "Variable": col,
+                 "Row Count": row_count,
                  "Number of Zero Values": n_zeros,
-                 "Percentage of Zero Values (%)": p_zeros * 100,
-                 "Pass/Fail": "Pass" if p_zeros < max_percent_threshold else "Fail",
+                 "Percentage of Zero Values (%)": p_zeros,
+                 "Pass/Fail": ("Pass" if p_zeros < (max_percent_threshold) else "Fail"),
              }
          )

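As read from this hunk, p_zeros is now expressed in percent and compared directly against max_percent_threshold, so a caller wanting a 3% cutoff would pass 3 rather than 0.03. A minimal, hypothetical invocation (the input_id is illustrative):

import validmind as vm

result = vm.tests.run_test(
    "validmind.data_validation.TooManyZeroValues",
    inputs={"dataset": "train_ds"},       # illustrative input_id
    params={"max_percent_threshold": 3},  # 3 percent, per the new comparison
)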
validmind/tests/data_validation/UniqueRows.py
@@ -61,7 +61,9 @@ def UniqueRows(dataset: VMDataset, min_percent_threshold: float = 1):
              "Number of Unique Values": unique_rows[col],
              "Percentage of Unique Values (%)": unique_rows[col] / rows * 100,
              "Pass/Fail": (
-                 "Pass" if unique_rows[col] / rows >= min_percent_threshold else "Fail"
+                 "Pass"
+                 if (unique_rows[col] / rows * 100) >= min_percent_threshold
+                 else "Fail"
              ),
          }
          for col in unique_rows.index
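This change brings the comparison in line with the parameter name: the unique-row ratio is converted to a percentage before being checked against min_percent_threshold. A hypothetical invocation requiring at least 5% unique values per column (input_id illustrative):

import validmind as vm

result = vm.tests.run_test(
    "validmind.data_validation.UniqueRows",
    inputs={"dataset": "train_ds"},      # illustrative input_id
    params={"min_percent_threshold": 5},
)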
validmind/tests/decorator.py
@@ -24,6 +24,11 @@ def _get_save_func(func, test_id):
      test library.
      """

+     # get og source before its wrapped by the test decorator
+     source = inspect.getsource(func)
+     # remove decorator line
+     source = source.split("\n", 1)[1]
+
      def save(root_folder=".", imports=None):
          parts = test_id.split(".")

@@ -41,35 +46,32 @@

          full_path = os.path.join(path, f"{test_name}.py")

-         source = inspect.getsource(func)
-         # remove decorator line
-         source = source.split("\n", 1)[1]
+         _source = source.replace(f"def {func.__name__}", f"def {test_name}")
+
          if imports:
              imports = "\n".join(imports)
-             source = f"{imports}\n\n\n{source}"
+             _source = f"{imports}\n\n\n{_source}"
+
          # add comment to the top of the file
-         source = f"""
+         _source = f"""
  # Saved from {func.__module__}.{func.__name__}
  # Original Test ID: {test_id}
  # New Test ID: {new_test_id}

- {source}
+ {_source}
  """

-         # ensure that the function name matches the test name
-         source = source.replace(f"def {func.__name__}", f"def {test_name}")
-

          # use black to format the code
          try:
              import black
-             source = black.format_str(source, mode=black.FileMode())
+             _source = black.format_str(_source, mode=black.FileMode())
          except ImportError:
              # ignore if not available
              pass

          with open(full_path, "w") as file:
-             file.writelines(source)
+             file.writelines(_source)

          logger.info(
              f"Saved to {os.path.abspath(full_path)}!"
@@ -119,12 +121,12 @@ def test(func_or_id):
          test_func = load_test(test_id, func, reload=True)
          test_store.register_test(test_id, test_func)

-         @wraps(test_func)
-         def wrapper(*args, **kwargs):
-             return test_func(*args, **kwargs)
-
          # special function to allow the function to be saved to a file
-         wrapper.save = _get_save_func(test_func, test_id)
+         save_func = _get_save_func(func, test_id)
+
+         wrapper = wraps(func)(test_func)
+         wrapper.test_id = test_id
+         wrapper.save = save_func

          return wrapper

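The decorator changes above attach test_id and the save helper (which, per the first hunk, captures the undecorated source and accepts an optional imports list) directly to the registered test function instead of wrapping it in a pass-through function. A minimal sketch of how a custom test defined with @vm.test might use these attributes; the test ID, test body, and output folder are illustrative.

import validmind as vm


@vm.test("my_custom_tests.ColumnMeans")
def ColumnMeans(dataset):
    """Returns the mean of each numeric feature column."""
    return dataset.df[dataset.feature_columns_numeric].mean().to_frame("Mean")


print(ColumnMeans.test_id)  # "my_custom_tests.ColumnMeans", attached by the decorator

# Writes the test's source under ./my_tests/ (the exact layout follows the test ID parts)
ColumnMeans.save("my_tests")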
validmind/tests/model_validation/sklearn/CalibrationCurve.py (new file)
@@ -0,0 +1,116 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ from sklearn.calibration import calibration_curve
+ import plotly.graph_objects as go
+ from validmind import tags, tasks
+ from validmind.vm_models import VMModel, VMDataset
+ from validmind.vm_models.result import RawData
+
+
+ @tags("sklearn", "model_performance", "classification")
+ @tasks("classification")
+ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
+     """
+     Evaluates the calibration of probability estimates by comparing predicted probabilities against observed
+     frequencies.
+
+     ### Purpose
+
+     The Calibration Curve test assesses how well a model's predicted probabilities align with actual
+     observed frequencies. This is crucial for applications requiring accurate probability estimates,
+     such as risk assessment, decision-making systems, and cost-sensitive applications where probability
+     calibration directly impacts business decisions.
+
+     ### Test Mechanism
+
+     The test uses sklearn's calibration_curve function to:
+     1. Sort predictions into bins based on predicted probabilities
+     2. Calculate the mean predicted probability in each bin
+     3. Compare against the observed frequency of positive cases
+     4. Plot the results against the perfect calibration line (y=x)
+     The resulting curve shows how well the predicted probabilities match empirical probabilities.
+
+     ### Signs of High Risk
+
+     - Significant deviation from the perfect calibration line
+     - Systematic overconfidence (predictions too close to 0 or 1)
+     - Systematic underconfidence (predictions clustered around 0.5)
+     - Empty or sparse bins indicating poor probability coverage
+     - Sharp discontinuities in the calibration curve
+     - Different calibration patterns across different probability ranges
+     - Consistent over/under estimation in critical probability regions
+     - Large confidence intervals in certain probability ranges
+
+     ### Strengths
+
+     - Visual and intuitive interpretation of probability quality
+     - Identifies systematic biases in probability estimates
+     - Supports probability threshold selection
+     - Helps understand model confidence patterns
+     - Applicable across different classification models
+     - Enables comparison between different models
+     - Guides potential need for recalibration
+     - Critical for risk-sensitive applications
+
+     ### Limitations
+
+     - Sensitive to the number of bins chosen
+     - Requires sufficient samples in each bin for reliable estimates
+     - May mask local calibration issues within bins
+     - Does not account for feature-dependent calibration issues
+     - Limited to binary classification problems
+     - Cannot detect all forms of miscalibration
+     - Assumes bin boundaries are appropriate for the problem
+     - May be affected by class imbalance
+     """
+     prob_true, prob_pred = calibration_curve(
+         dataset.y, dataset.y_prob(model), n_bins=n_bins
+     )
+
+     # Create DataFrame for raw data
+     raw_data = RawData(
+         mean_predicted_probability=prob_pred, observed_frequency=prob_true
+     )
+
+     # Create Plotly figure
+     fig = go.Figure()
+
+     # Add perfect calibration line
+     fig.add_trace(
+         go.Scatter(
+             x=[0, 1],
+             y=[0, 1],
+             mode="lines",
+             name="Perfect Calibration",
+             line=dict(dash="dash", color="gray"),
+         )
+     )
+
+     # Add calibration curve
+     fig.add_trace(
+         go.Scatter(
+             x=prob_pred,
+             y=prob_true,
+             mode="lines+markers",
+             name="Model Calibration",
+             line=dict(color="blue"),
+             marker=dict(size=8),
+         )
+     )
+
+     # Update layout
+     fig.update_layout(
+         title="Calibration Curve",
+         xaxis_title="Mean Predicted Probability",
+         yaxis_title="Observed Frequency",
+         xaxis=dict(range=[0, 1]),
+         yaxis=dict(range=[0, 1]),
+         width=800,
+         height=600,
+         showlegend=True,
+         template="plotly_white",
+     )
+
+     return raw_data, fig
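Finally, a hedged usage sketch for the new CalibrationCurve test; it assumes a binary classifier and a dataset with probability predictions already assigned (for example via assign_predictions), and the input IDs are illustrative.

import validmind as vm

result = vm.tests.run_test(
    "validmind.model_validation.sklearn.CalibrationCurve",
    inputs={"model": "xgb_model", "dataset": "test_ds"},
    params={"n_bins": 10},
)
result.log()  # optionally push the result to the ValidMind platform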