validmind 2.6.10__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (30)
  1. validmind/__init__.py +2 -0
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +20 -4
  4. validmind/ai/test_result_description/user.jinja +5 -0
  5. validmind/datasets/credit_risk/lending_club.py +444 -14
  6. validmind/tests/data_validation/MutualInformation.py +129 -0
  7. validmind/tests/data_validation/ScoreBandDefaultRates.py +139 -0
  8. validmind/tests/data_validation/TooManyZeroValues.py +6 -5
  9. validmind/tests/data_validation/UniqueRows.py +3 -1
  10. validmind/tests/decorator.py +18 -16
  11. validmind/tests/model_validation/sklearn/CalibrationCurve.py +116 -0
  12. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +261 -0
  13. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +1 -0
  14. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +144 -56
  15. validmind/tests/model_validation/sklearn/ModelParameters.py +74 -0
  16. validmind/tests/model_validation/sklearn/ROCCurve.py +26 -23
  17. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +130 -0
  18. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -6
  19. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -3
  20. validmind/tests/output.py +10 -1
  21. validmind/tests/run.py +52 -54
  22. validmind/utils.py +34 -7
  23. validmind/vm_models/figure.py +15 -0
  24. validmind/vm_models/result/__init__.py +2 -2
  25. validmind/vm_models/result/result.py +136 -23
  26. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/METADATA +1 -1
  27. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/RECORD +30 -24
  28. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/LICENSE +0 -0
  29. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/WHEEL +0 -0
  30. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py ADDED
@@ -0,0 +1,130 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import pandas as pd
+ import plotly.graph_objects as go
+ from validmind import tags, tasks
+ from validmind.vm_models import VMModel, VMDataset
+
+
+ @tags("visualization", "credit_risk", "calibration")
+ @tasks("classification")
+ def ScoreProbabilityAlignment(
+     model: VMModel, dataset: VMDataset, score_column: str = "score", n_bins: int = 10
+ ):
+     """
+     Analyzes the alignment between credit scores and predicted probabilities.
+
+     ### Purpose
+
+     The Score-Probability Alignment test evaluates how well credit scores align with
+     predicted default probabilities. This helps validate score scaling, identify potential
+     calibration issues, and ensure scores reflect risk appropriately.
+
+     ### Test Mechanism
+
+     The test:
+     1. Groups scores into bins
+     2. Calculates average predicted probability per bin
+     3. Tests monotonicity of relationship
+     4. Analyzes probability distribution within score bands
+
+     ### Signs of High Risk
+
+     - Non-monotonic relationship between scores and probabilities
+     - Large probability variations within score bands
+     - Unexpected probability jumps between adjacent bands
+     - Poor alignment with expected odds-to-score relationship
+     - Inconsistent probability patterns across score ranges
+     - Clustering of probabilities at extreme values
+     - Score bands with similar probability profiles
+     - Unstable probability estimates in key decision bands
+
+     ### Strengths
+
+     - Direct validation of score-to-probability relationship
+     - Identifies potential calibration issues
+     - Supports score band validation
+     - Helps understand model behavior
+     - Useful for policy setting
+     - Visual and numerical results
+     - Easy to interpret
+     - Supports regulatory documentation
+
+     ### Limitations
+
+     - Sensitive to bin selection
+     - Requires sufficient data per bin
+     - May mask within-bin variations
+     - Point-in-time analysis only
+     - Cannot detect all forms of miscalibration
+     - Assumes scores should align with probabilities
+     - May oversimplify complex relationships
+     - Limited to binary outcomes
+     """
+     if score_column not in dataset.df.columns:
+         raise ValueError(f"Score column '{score_column}' not found in dataset")
+
+     # Get predicted probabilities
+     y_prob = dataset.y_prob(model)
+
+     # Create score bins
+     df = dataset.df.copy()
+     df["probability"] = y_prob
+
+     # Create score bins with equal width
+     df["score_bin"] = pd.qcut(df[score_column], n_bins, duplicates="drop")
+
+     # Calculate statistics per bin
+     results = []
+     for bin_name, group in df.groupby("score_bin"):
+         bin_stats = {
+             "Score Range": f"{bin_name.left:.0f}-{bin_name.right:.0f}",
+             "Mean Score": group[score_column].mean(),
+             "Population Count": len(group),
+             "Population (%)": len(group) / len(df) * 100,
+             "Mean Probability (%)": group["probability"].mean() * 100,
+             "Min Probability (%)": group["probability"].min() * 100,
+             "Max Probability (%)": group["probability"].max() * 100,
+             "Probability Std": group["probability"].std() * 100,
+         }
+         results.append(bin_stats)
+
+     results_df = pd.DataFrame(results)
+
+     # Create visualization
+     fig = go.Figure()
+
+     # Add probability range
+     fig.add_trace(
+         go.Scatter(
+             x=results_df["Mean Score"],
+             y=results_df["Mean Probability (%)"],
+             mode="lines+markers",
+             name="Mean Probability",
+             line=dict(color="blue"),
+             error_y=dict(
+                 type="data",
+                 symmetric=False,
+                 array=results_df["Max Probability (%)"]
+                 - results_df["Mean Probability (%)"],
+                 arrayminus=results_df["Mean Probability (%)"]
+                 - results_df["Min Probability (%)"],
+                 color="gray",
+             ),
+         )
+     )
+
+     # Update layout
+     fig.update_layout(
+         title="Score-Probability Alignment",
+         xaxis_title="Score",
+         yaxis_title="Default Probability (%)",
+         showlegend=True,
+         template="plotly_white",
+         width=800,
+         height=600,
+     )
+
+     return results_df, fig
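The new test is an ordinary ValidMind test function, so it can be invoked through run_test like any other built-in test. A minimal usage sketch, assuming the test is registered under its module path and that vm_model / vm_dataset are placeholder names for an already-initialized VMModel and VMDataset (the dataset needs a "score" column and assigned prediction probabilities):

import validmind as vm

# vm_model / vm_dataset are hypothetical, previously initialized objects
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.ScoreProbabilityAlignment",
    inputs={"model": vm_model, "dataset": vm_dataset},
    params={"score_column": "score", "n_bins": 10},
)

The returned TestResult carries the per-band table and the Plotly figure produced by the results_df, fig tuple above.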
validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py CHANGED
@@ -9,22 +9,21 @@ from matplotlib import cm
  from validmind import tags, tasks


- @tags("visualization", "credit_risk", "logistic_regression")
+ @tags("visualization", "credit_risk")
  @tasks("classification")
  def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
      """
-     Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
-     regression models.
+     Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.

      ### Purpose

      This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
-     in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+     in a classification model. It provides a visual assessment of the model's behavior by plotting the cumulative
      probabilities for positive and negative classes across both the training and test datasets.

      ### Test Mechanism

-     The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+     The classification model is evaluated by first computing the predicted probabilities for each instance in both
      the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
      for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
      distributions of these probabilities are created for both positive and negative classes across both training and
@@ -51,7 +50,7 @@ def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabil

      ### Limitations

-     - Exclusive to classification tasks and specifically to logistic regression models.
+     - Exclusive to classification tasks and specifically to classification models.
      - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
        detection.
      - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py CHANGED
@@ -9,7 +9,7 @@ from matplotlib import cm
  from validmind import tags, tasks


- @tags("visualization", "credit_risk", "logistic_regression")
+ @tags("visualization", "credit_risk")
  @tasks("classification")
  def PredictionProbabilitiesHistogram(
      dataset, model, title="Histogram of Predictive Probabilities"
@@ -22,7 +22,7 @@ def PredictionProbabilitiesHistogram(

      The Prediction Probabilities Histogram test is designed to generate histograms displaying the Probability of
      Default (PD) predictions for both positive and negative classes in training and testing datasets. This helps in
-     evaluating the performance of a logistic regression model, particularly for credit risk prediction.
+     evaluating the performance of a classification model.

      ### Test Mechanism

@@ -52,7 +52,6 @@ def PredictionProbabilitiesHistogram(
      ### Limitations

      - Specifically tailored for binary classification scenarios and not suited for multi-class classification tasks.
-     - Mainly applicable to logistic regression models, and may not be effective for other model types.
      - Provides a robust visual representation but lacks a quantifiable measure to assess model performance.
      """

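Because the logistic_regression tag is removed from the two tests above, any notebook that filtered the test registry on that tag will no longer find them. A small discovery sketch, assuming list_tests accepts a tags filter as in recent ValidMind releases:

import validmind as vm

# Filter by the tags the two tests still carry (the tags argument is an assumption here)
vm.tests.list_tests(tags=["credit_risk", "visualization"])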
validmind/tests/output.py CHANGED
@@ -15,7 +15,7 @@ from validmind.vm_models.figure import (
      is_plotly_figure,
      is_png_image,
  )
- from validmind.vm_models.result import ResultTable, TestResult
+ from validmind.vm_models.result import RawData, ResultTable, TestResult


  class OutputHandler(ABC):
@@ -103,6 +103,14 @@ class TableOutputHandler(OutputHandler):
          result.add_table(ResultTable(data=table_data, title=table_name or None))


+ class RawDataOutputHandler(OutputHandler):
+     def can_handle(self, item: Any) -> bool:
+         return isinstance(item, RawData)
+
+     def process(self, item: Any, result: TestResult) -> None:
+         result.raw_data = item
+
+
  def process_output(item: Any, result: TestResult) -> None:
      """Process a single test output item and update the TestResult."""
      handlers = [
@@ -110,6 +118,7 @@ def process_output(item: Any, result: TestResult) -> None:
          MetricOutputHandler(),
          FigureOutputHandler(),
          TableOutputHandler(),
+         RawDataOutputHandler(),
      ]

      for handler in handlers:
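With the new handler, a custom test can return raw intermediate data alongside its tables and figures and have it stored on TestResult.raw_data. A rough sketch, where the test ID is hypothetical and RawData is assumed to accept arbitrary keyword payloads (its constructor is not shown in this diff):

import validmind as vm
from validmind.vm_models.result import RawData


@vm.test("my_custom_tests.DescribeWithRawData")  # hypothetical custom test ID
def DescribeWithRawData(dataset):
    summary = dataset.df.describe()
    # The RawData item is routed to RawDataOutputHandler and attached to
    # TestResult.raw_data instead of being rejected as an unknown output type.
    return summary, RawData(summary=summary)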
validmind/tests/run.py CHANGED
@@ -7,7 +7,7 @@ import subprocess
  import time
  from datetime import datetime
  from inspect import getdoc
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  from uuid import uuid4

  from validmind import __version__
@@ -134,10 +134,9 @@ def _get_test_kwargs(
  def build_test_result(
      outputs: Union[Any, Tuple[Any, ...]],
      test_id: str,
+     test_doc: str,
      inputs: Dict[str, Union[VMInput, List[VMInput]]],
      params: Union[Dict[str, Any], None],
-     description: str,
-     generate_description: bool = True,
      title: Optional[str] = None,
  ):
      """Build a TestResult object from a set of raw test function outputs"""
@@ -149,6 +148,7 @@ def build_test_result(
          ref_id=ref_id,
          inputs=inputs,
          params=params if params else None,  # None if empty dict or None
+         doc=test_doc,
      )

      if not isinstance(outputs, tuple):
@@ -157,16 +157,6 @@ def build_test_result(
      for item in outputs:
          process_output(item, result)

-     result.description = get_result_description(
-         test_id=test_id,
-         test_description=description,
-         tables=result.tables,
-         figures=result.figures,
-         metric=result.metric,
-         should_generate=generate_description,
-         title=title,
-     )
-
      return result


@@ -177,7 +167,6 @@ def _run_composite_test(
      input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
      params: Union[Dict[str, Any], None],
      param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
-     generate_description: bool,
      title: Optional[str] = None,
  ):
      """Run a composite test i.e. a test made up of multiple metrics"""
@@ -199,6 +188,14 @@ def _run_composite_test(
      if not all(result.metric is not None for result in results):
          raise ValueError("All tests must return a metric when used as a composite test")

+     # Create composite docstring from all test results
+     composite_doc = "\n\n".join(
+         [
+             f"{test_id_to_name(result.result_id)}:\n{_test_description(result.doc)}"
+             for result in results
+         ]
+     )
+
      return build_test_result(
          outputs=[
              {
@@ -208,12 +205,9 @@ def _run_composite_test(
              for result in results
          ],  # pass in a single table with metric values as our 'outputs'
          test_id=test_id,
+         test_doc=composite_doc,
          inputs=results[0].inputs,
          params=results[0].params,
-         description="\n\n".join(
-             [_test_description(result.description, num_lines=1) for result in results]
-         ),  # join truncated (first line only) test descriptions
-         generate_description=generate_description,
          title=title,
      )


@@ -226,7 +220,6 @@ def _run_comparison_test(
      input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
      params: Union[Dict[str, Any], None],
      param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
-     generate_description: bool,
      title: Optional[str] = None,
  ):
      """Run a comparison test i.e. a test that compares multiple outputs of a test across
@@ -255,24 +248,43 @@ def _run_comparison_test(
      # composite tests have a test_id thats built from the name
      if not test_id:
          test_id = results[0].result_id
-         description = results[0].description
+         test_doc = results[0].doc
      else:
-         description = describe_test(test_id, raw=True)["Description"]
+         test_doc = describe_test(test_id, raw=True)["Description"]

      combined_outputs, combined_inputs, combined_params = combine_results(results)

      return build_test_result(
          outputs=tuple(combined_outputs),
          test_id=test_id,
+         test_doc=test_doc,
          inputs=combined_inputs,
          params=combined_params,
-         description=description,
-         generate_description=generate_description,
          title=title,
      )


- def run_test(
+ def _run_test(test_id: TestID, inputs: Dict[str, Any], params: Dict[str, Any]):
+     """Run a standard test and return a TestResult object"""
+     test_func = load_test(test_id)
+     input_kwargs, param_kwargs = _get_test_kwargs(
+         test_func=test_func,
+         inputs=inputs or {},
+         params=params or {},
+     )
+
+     raw_result = test_func(**input_kwargs, **param_kwargs)
+
+     return build_test_result(
+         outputs=raw_result,
+         test_id=test_id,
+         test_doc=getdoc(test_func),
+         inputs=input_kwargs,
+         params=param_kwargs,
+     )
+
+
+ def run_test(  # noqa: C901
      test_id: Union[TestID, None] = None,
      name: Union[str, None] = None,
      unit_metrics: Union[List[TestID], None] = None,
@@ -283,6 +295,7 @@ def run_test(
      show: bool = True,
      generate_description: bool = True,
      title: Optional[str] = None,
+     post_process_fn: Union[Callable[[TestResult], None], None] = None,
      **kwargs,
  ) -> TestResult:
      """Run a ValidMind or custom test
@@ -306,6 +319,7 @@ def run_test(
          show (bool, optional): Whether to display results. Defaults to True.
          generate_description (bool, optional): Whether to generate a description. Defaults to True.
          title (str, optional): Custom title for the test result
+         post_process_fn (Callable[[TestResult], None], optional): Function to post-process the test result

      Returns:
          TestResult: A TestResult object containing the test results
@@ -343,7 +357,6 @@ def run_test(
              input_grid=input_grid,
              params=params,
              param_grid=param_grid,
-             generate_description=generate_description,
          )

      elif unit_metrics:
@@ -357,43 +370,28 @@ def run_test(
              input_grid=input_grid,
              params=params,
              param_grid=param_grid,
-             generate_description=generate_description,
-             title=title,
-         )
-
-     elif input_grid or param_grid:
-         result = _run_comparison_test(
-             test_id=test_id,
-             inputs=inputs,
-             input_grid=input_grid,
-             params=params,
-             param_grid=param_grid,
-             generate_description=generate_description,
              title=title,
          )

      else:
-         test_func = load_test(test_id)
-
-         input_kwargs, param_kwargs = _get_test_kwargs(
-             test_func, inputs or {}, params or {}
-         )
-
-         raw_result = test_func(**input_kwargs, **param_kwargs)
-
-         result = build_test_result(
-             outputs=raw_result,
-             test_id=test_id,
-             inputs=input_kwargs,
-             params=param_kwargs,
-             description=getdoc(test_func),
-             generate_description=generate_description,
-             title=title,
-         )
+         result = _run_test(test_id, inputs, params)

      end_time = time.perf_counter()
      result.metadata = _get_run_metadata(duration_seconds=end_time - start_time)

+     if post_process_fn:
+         result = post_process_fn(result)
+
+     result.description = get_result_description(
+         test_id=test_id,
+         test_description=result.doc,
+         tables=result.tables,
+         figures=result.figures,
+         metric=result.metric,
+         should_generate=generate_description,
+         title=title,
+     )
+

      if show:
          result.show()
validmind/utils.py CHANGED
@@ -168,6 +168,17 @@ class NumpyEncoder(json.JSONEncoder):
          return super().iterencode(obj, _one_shot)


+ class HumanReadableEncoder(NumpyEncoder):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         # truncate ndarrays to 10 items
+         self.type_handlers[self.is_numpy_ndarray] = lambda obj: (
+             obj.tolist()[:5] + ["..."] + obj.tolist()[-5:]
+             if len(obj) > 10
+             else obj.tolist()
+         )
+
+
  def get_full_typename(o: Any) -> Any:
      """We determine types based on type names so we don't have to import
      (and therefore depend on) PyTorch, TensorFlow, etc.
@@ -448,18 +459,23 @@ def get_dataset_info(dataset):


  def preview_test_config(config):
-     formatted_json = json.dumps(config, indent=4)
+     """Preview test configuration in a collapsible HTML section.
+
+     Args:
+         config (dict): Test configuration dictionary
+     """
+
+     try:
+         formatted_json = json.dumps(serialize(config), indent=4)
+     except TypeError as e:
+         logger.error(f"JSON serialization failed: {e}")
+         return

-     # JavaScript + HTML for the collapsible section
      collapsible_html = f"""
      <script>
      function toggleOutput() {{
          var content = document.getElementById("collapsibleContent");
-         if (content.style.display === "none") {{
-             content.style.display = "block";
-         }} else {{
-             content.style.display = "none";
-         }}
+         content.style.display = content.style.display === "none" ? "block" : "none";
      }}
      </script>
      <button onclick="toggleOutput()">Preview Config</button>
@@ -545,3 +561,14 @@ def inspect_obj(obj):
      # Loop through the parameters and print detailed information
      for param_name, param in sig.parameters.items():
          print(f"{param_name} - ({param.default})")
+
+
+ def serialize(obj):
+     """Convert objects to JSON-serializable format with readable descriptions."""
+     if isinstance(obj, dict):
+         return {k: serialize(v) for k, v in obj.items()}
+     elif isinstance(obj, (list, tuple)):
+         return [serialize(x) for x in obj]
+     elif isinstance(obj, (pd.DataFrame, pd.Series)):
+         return ""  # Simple empty string for non-serializable objects
+     return obj
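A minimal sketch of what the new serialize helper does to a config before preview_test_config JSON-encodes it (the config contents below are made up):

import pandas as pd

from validmind.utils import serialize

config = {
    "params": {"n_bins": 10, "features": ["age", "income"]},
    "inputs": {"train_dataset": pd.DataFrame({"age": [25, 40]})},
}

# DataFrame / Series values are blanked out so json.dumps no longer raises
# TypeError on non-serializable objects; everything else passes through unchanged.
print(serialize(config))
# {'params': {'n_bins': 10, 'features': ['age', 'income']}, 'inputs': {'train_dataset': ''}}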
validmind/vm_models/figure.py CHANGED
@@ -33,6 +33,18 @@ def is_png_image(figure) -> bool:
      return isinstance(figure, bytes)


+ def create_figure(
+     figure: Union[matplotlib.figure.Figure, go.Figure, go.FigureWidget, bytes],
+     key: str,
+     ref_id: str,
+ ) -> "Figure":
+     """Create a VM Figure object from a raw figure object"""
+     if is_matplotlib_figure(figure) or is_plotly_figure(figure) or is_png_image(figure):
+         return Figure(key=key, figure=figure, ref_id=ref_id)
+
+     raise ValueError(f"Unsupported figure type: {type(figure)}")
+
+
  @dataclass
  class Figure:
      """
@@ -55,6 +67,9 @@ class Figure:
          ):
              self.figure = go.FigureWidget(self.figure)

+     def __repr__(self):
+         return f"Figure(key={self.key}, ref_id={self.ref_id})"
+
      def to_widget(self):
          """
          Returns the ipywidget compatible representation of the figure. Ideally
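A short sketch of the new create_figure helper together with the new __repr__; the key and ref_id values are arbitrary:

import plotly.graph_objects as go

from validmind.vm_models.figure import create_figure

fig = go.Figure(data=go.Scatter(x=[1, 2, 3], y=[4, 5, 6]))

# Wraps a supported figure type (matplotlib, Plotly, or PNG bytes) in a VM Figure;
# anything else raises ValueError.
vm_figure = create_figure(figure=fig, key="score_alignment", ref_id="example-ref")
print(vm_figure)  # Figure(key=score_alignment, ref_id=example-ref)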
validmind/vm_models/result/__init__.py CHANGED
@@ -2,6 +2,6 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from .result import ErrorResult, Result, ResultTable, TestResult
+ from .result import ErrorResult, RawData, Result, ResultTable, TestResult

- __all__ = ["ErrorResult", "Result", "ResultTable", "TestResult"]
+ __all__ = ["ErrorResult", "RawData", "Result", "ResultTable", "TestResult"]