validmind 2.6.10__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (30)
  1. validmind/__init__.py +2 -0
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +20 -4
  4. validmind/ai/test_result_description/user.jinja +5 -0
  5. validmind/datasets/credit_risk/lending_club.py +444 -14
  6. validmind/tests/data_validation/MutualInformation.py +129 -0
  7. validmind/tests/data_validation/ScoreBandDefaultRates.py +139 -0
  8. validmind/tests/data_validation/TooManyZeroValues.py +6 -5
  9. validmind/tests/data_validation/UniqueRows.py +3 -1
  10. validmind/tests/decorator.py +18 -16
  11. validmind/tests/model_validation/sklearn/CalibrationCurve.py +116 -0
  12. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +261 -0
  13. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +1 -0
  14. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +144 -56
  15. validmind/tests/model_validation/sklearn/ModelParameters.py +74 -0
  16. validmind/tests/model_validation/sklearn/ROCCurve.py +26 -23
  17. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +130 -0
  18. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -6
  19. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -3
  20. validmind/tests/output.py +10 -1
  21. validmind/tests/run.py +52 -54
  22. validmind/utils.py +34 -7
  23. validmind/vm_models/figure.py +15 -0
  24. validmind/vm_models/result/__init__.py +2 -2
  25. validmind/vm_models/result/result.py +136 -23
  26. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/METADATA +1 -1
  27. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/RECORD +30 -24
  28. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/LICENSE +0 -0
  29. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/WHEEL +0 -0
  30. {validmind-2.6.10.dist-info → validmind-2.7.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py ADDED
@@ -0,0 +1,130 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import pandas as pd
+ import plotly.graph_objects as go
+ from validmind import tags, tasks
+ from validmind.vm_models import VMModel, VMDataset
+
+
+ @tags("visualization", "credit_risk", "calibration")
+ @tasks("classification")
+ def ScoreProbabilityAlignment(
+     model: VMModel, dataset: VMDataset, score_column: str = "score", n_bins: int = 10
+ ):
+     """
+     Analyzes the alignment between credit scores and predicted probabilities.
+
+     ### Purpose
+
+     The Score-Probability Alignment test evaluates how well credit scores align with
+     predicted default probabilities. This helps validate score scaling, identify potential
+     calibration issues, and ensure scores reflect risk appropriately.
+
+     ### Test Mechanism
+
+     The test:
+     1. Groups scores into bins
+     2. Calculates average predicted probability per bin
+     3. Tests monotonicity of relationship
+     4. Analyzes probability distribution within score bands
+
+     ### Signs of High Risk
+
+     - Non-monotonic relationship between scores and probabilities
+     - Large probability variations within score bands
+     - Unexpected probability jumps between adjacent bands
+     - Poor alignment with expected odds-to-score relationship
+     - Inconsistent probability patterns across score ranges
+     - Clustering of probabilities at extreme values
+     - Score bands with similar probability profiles
+     - Unstable probability estimates in key decision bands
+
+     ### Strengths
+
+     - Direct validation of score-to-probability relationship
+     - Identifies potential calibration issues
+     - Supports score band validation
+     - Helps understand model behavior
+     - Useful for policy setting
+     - Visual and numerical results
+     - Easy to interpret
+     - Supports regulatory documentation
+
+     ### Limitations
+
+     - Sensitive to bin selection
+     - Requires sufficient data per bin
+     - May mask within-bin variations
+     - Point-in-time analysis only
+     - Cannot detect all forms of miscalibration
+     - Assumes scores should align with probabilities
+     - May oversimplify complex relationships
+     - Limited to binary outcomes
+     """
+     if score_column not in dataset.df.columns:
+         raise ValueError(f"Score column '{score_column}' not found in dataset")
+
+     # Get predicted probabilities
+     y_prob = dataset.y_prob(model)
+
+     # Create score bins
+     df = dataset.df.copy()
+     df["probability"] = y_prob
+
+     # Create score bins with equal width
+     df["score_bin"] = pd.qcut(df[score_column], n_bins, duplicates="drop")
+
+     # Calculate statistics per bin
+     results = []
+     for bin_name, group in df.groupby("score_bin"):
+         bin_stats = {
+             "Score Range": f"{bin_name.left:.0f}-{bin_name.right:.0f}",
+             "Mean Score": group[score_column].mean(),
+             "Population Count": len(group),
+             "Population (%)": len(group) / len(df) * 100,
+             "Mean Probability (%)": group["probability"].mean() * 100,
+             "Min Probability (%)": group["probability"].min() * 100,
+             "Max Probability (%)": group["probability"].max() * 100,
+             "Probability Std": group["probability"].std() * 100,
+         }
+         results.append(bin_stats)
+
+     results_df = pd.DataFrame(results)
+
+     # Create visualization
+     fig = go.Figure()
+
+     # Add probability range
+     fig.add_trace(
+         go.Scatter(
+             x=results_df["Mean Score"],
+             y=results_df["Mean Probability (%)"],
+             mode="lines+markers",
+             name="Mean Probability",
+             line=dict(color="blue"),
+             error_y=dict(
+                 type="data",
+                 symmetric=False,
+                 array=results_df["Max Probability (%)"]
+                 - results_df["Mean Probability (%)"],
+                 arrayminus=results_df["Mean Probability (%)"]
+                 - results_df["Min Probability (%)"],
+                 color="gray",
+             ),
+         )
+     )
+
+     # Update layout
+     fig.update_layout(
+         title="Score-Probability Alignment",
+         xaxis_title="Score",
+         yaxis_title="Default Probability (%)",
+         showlegend=True,
+         template="plotly_white",
+         width=800,
+         height=600,
+     )
+
+     return results_df, fig
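The new test is an ordinary ValidMind test function, so it can be invoked through run_test like any other built-in test. A minimal usage sketch, assuming the test is registered under its module path and that vm_model / vm_dataset are placeholder names for an already-initialized VMModel and VMDataset (the dataset needs a "score" column and assigned prediction probabilities):

import validmind as vm

# vm_model / vm_dataset are hypothetical, previously initialized objects
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.ScoreProbabilityAlignment",
    inputs={"model": vm_model, "dataset": vm_dataset},
    params={"score_column": "score", "n_bins": 10},
)

The returned TestResult carries the per-band table and the Plotly figure produced by the results_df, fig tuple above.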
validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py CHANGED
@@ -9,22 +9,21 @@ from matplotlib import cm
  from validmind import tags, tasks


- @tags("visualization", "credit_risk", "logistic_regression")
+ @tags("visualization", "credit_risk")
  @tasks("classification")
  def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
      """
-     Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
-     regression models.
+     Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.

      ### Purpose

      This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
-     in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+     in a classification model. It provides a visual assessment of the model's behavior by plotting the cumulative
      probabilities for positive and negative classes across both the training and test datasets.

      ### Test Mechanism

-     The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+     The classification model is evaluated by first computing the predicted probabilities for each instance in both
      the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
      for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
      distributions of these probabilities are created for both positive and negative classes across both training and
@@ -51,7 +50,7 @@ def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabil

      ### Limitations

-     - Exclusive to classification tasks and specifically to logistic regression models.
+     - Exclusive to classification tasks and specifically to classification models.
      - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
        detection.
      - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py CHANGED
@@ -9,7 +9,7 @@ from matplotlib import cm
  from validmind import tags, tasks


- @tags("visualization", "credit_risk", "logistic_regression")
+ @tags("visualization", "credit_risk")
  @tasks("classification")
  def PredictionProbabilitiesHistogram(
      dataset, model, title="Histogram of Predictive Probabilities"
@@ -22,7 +22,7 @@ def PredictionProbabilitiesHistogram(

      The Prediction Probabilities Histogram test is designed to generate histograms displaying the Probability of
      Default (PD) predictions for both positive and negative classes in training and testing datasets. This helps in
-     evaluating the performance of a logistic regression model, particularly for credit risk prediction.
+     evaluating the performance of a classification model.

      ### Test Mechanism

@@ -52,7 +52,6 @@ def PredictionProbabilitiesHistogram(
      ### Limitations

      - Specifically tailored for binary classification scenarios and not suited for multi-class classification tasks.
-     - Mainly applicable to logistic regression models, and may not be effective for other model types.
      - Provides a robust visual representation but lacks a quantifiable measure to assess model performance.
      """

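Because the logistic_regression tag is removed from the two tests above, any notebook that filtered the test registry on that tag will no longer find them. A small discovery sketch, assuming list_tests accepts a tags filter as in recent ValidMind releases:

import validmind as vm

# Filter by the tags the two tests still carry (the tags argument is an assumption here)
vm.tests.list_tests(tags=["credit_risk", "visualization"])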
validmind/tests/output.py CHANGED
@@ -15,7 +15,7 @@ from validmind.vm_models.figure import (
      is_plotly_figure,
      is_png_image,
  )
- from validmind.vm_models.result import ResultTable, TestResult
+ from validmind.vm_models.result import RawData, ResultTable, TestResult


  class OutputHandler(ABC):
@@ -103,6 +103,14 @@ class TableOutputHandler(OutputHandler):
          result.add_table(ResultTable(data=table_data, title=table_name or None))


+ class RawDataOutputHandler(OutputHandler):
+     def can_handle(self, item: Any) -> bool:
+         return isinstance(item, RawData)
+
+     def process(self, item: Any, result: TestResult) -> None:
+         result.raw_data = item
+
+
  def process_output(item: Any, result: TestResult) -> None:
      """Process a single test output item and update the TestResult."""
      handlers = [
@@ -110,6 +118,7 @@ def process_output(item: Any, result: TestResult) -> None:
          MetricOutputHandler(),
          FigureOutputHandler(),
          TableOutputHandler(),
+         RawDataOutputHandler(),
      ]

      for handler in handlers:
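With the new handler, a custom test can return raw intermediate data alongside its tables and figures and have it stored on TestResult.raw_data. A rough sketch, where the test ID is hypothetical and RawData is assumed to accept arbitrary keyword payloads (its constructor is not shown in this diff):

import validmind as vm
from validmind.vm_models.result import RawData


@vm.test("my_custom_tests.DescribeWithRawData")  # hypothetical custom test ID
def DescribeWithRawData(dataset):
    summary = dataset.df.describe()
    # The RawData item is routed to RawDataOutputHandler and attached to
    # TestResult.raw_data instead of being rejected as an unknown output type.
    return summary, RawData(summary=summary)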
validmind/tests/run.py CHANGED
@@ -7,7 +7,7 @@ import subprocess
  import time
  from datetime import datetime
  from inspect import getdoc
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  from uuid import uuid4

  from validmind import __version__
@@ -134,10 +134,9 @@ def _get_test_kwargs(
  def build_test_result(
      outputs: Union[Any, Tuple[Any, ...]],
      test_id: str,
+     test_doc: str,
      inputs: Dict[str, Union[VMInput, List[VMInput]]],
      params: Union[Dict[str, Any], None],
-     description: str,
-     generate_description: bool = True,
      title: Optional[str] = None,
  ):
      """Build a TestResult object from a set of raw test function outputs"""
@@ -149,6 +148,7 @@ def build_test_result(
          ref_id=ref_id,
          inputs=inputs,
          params=params if params else None,  # None if empty dict or None
+         doc=test_doc,
      )

      if not isinstance(outputs, tuple):
@@ -157,16 +157,6 @@ def build_test_result(
      for item in outputs:
          process_output(item, result)

-     result.description = get_result_description(
-         test_id=test_id,
-         test_description=description,
-         tables=result.tables,
-         figures=result.figures,
-         metric=result.metric,
-         should_generate=generate_description,
-         title=title,
-     )
-
      return result


@@ -177,7 +167,6 @@ def _run_composite_test(
      input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
      params: Union[Dict[str, Any], None],
      param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
-     generate_description: bool,
      title: Optional[str] = None,
  ):
      """Run a composite test i.e. a test made up of multiple metrics"""
@@ -199,6 +188,14 @@ def _run_composite_test(
      if not all(result.metric is not None for result in results):
          raise ValueError("All tests must return a metric when used as a composite test")

+     # Create composite docstring from all test results
+     composite_doc = "\n\n".join(
+         [
+             f"{test_id_to_name(result.result_id)}:\n{_test_description(result.doc)}"
+             for result in results
+         ]
+     )
+
      return build_test_result(
          outputs=[
              {
@@ -208,12 +205,9 @@ def _run_composite_test(
              for result in results
          ],  # pass in a single table with metric values as our 'outputs'
          test_id=test_id,
+         test_doc=composite_doc,
          inputs=results[0].inputs,
          params=results[0].params,
-         description="\n\n".join(
-             [_test_description(result.description, num_lines=1) for result in results]
-         ),  # join truncated (first line only) test descriptions
-         generate_description=generate_description,
          title=title,
      )


@@ -226,7 +220,6 @@ def _run_comparison_test(
      input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
      params: Union[Dict[str, Any], None],
      param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
-     generate_description: bool,
      title: Optional[str] = None,
  ):
      """Run a comparison test i.e. a test that compares multiple outputs of a test across
@@ -255,24 +248,43 @@ def _run_comparison_test(
      # composite tests have a test_id thats built from the name
      if not test_id:
          test_id = results[0].result_id
-         description = results[0].description
+         test_doc = results[0].doc
      else:
-         description = describe_test(test_id, raw=True)["Description"]
+         test_doc = describe_test(test_id, raw=True)["Description"]

      combined_outputs, combined_inputs, combined_params = combine_results(results)

      return build_test_result(
          outputs=tuple(combined_outputs),
          test_id=test_id,
+         test_doc=test_doc,
          inputs=combined_inputs,
          params=combined_params,
-         description=description,
-         generate_description=generate_description,
          title=title,
      )


- def run_test(
+ def _run_test(test_id: TestID, inputs: Dict[str, Any], params: Dict[str, Any]):
+     """Run a standard test and return a TestResult object"""
+     test_func = load_test(test_id)
+     input_kwargs, param_kwargs = _get_test_kwargs(
+         test_func=test_func,
+         inputs=inputs or {},
+         params=params or {},
+     )
+
+     raw_result = test_func(**input_kwargs, **param_kwargs)
+
+     return build_test_result(
+         outputs=raw_result,
+         test_id=test_id,
+         test_doc=getdoc(test_func),
+         inputs=input_kwargs,
+         params=param_kwargs,
+     )
+
+
+ def run_test(  # noqa: C901
      test_id: Union[TestID, None] = None,
      name: Union[str, None] = None,
      unit_metrics: Union[List[TestID], None] = None,
@@ -283,6 +295,7 @@ def run_test(
      show: bool = True,
      generate_description: bool = True,
      title: Optional[str] = None,
+     post_process_fn: Union[Callable[[TestResult], None], None] = None,
      **kwargs,
  ) -> TestResult:
      """Run a ValidMind or custom test
@@ -306,6 +319,7 @@ def run_test(
          show (bool, optional): Whether to display results. Defaults to True.
          generate_description (bool, optional): Whether to generate a description. Defaults to True.
          title (str, optional): Custom title for the test result
+         post_process_fn (Callable[[TestResult], None], optional): Function to post-process the test result

      Returns:
          TestResult: A TestResult object containing the test results
@@ -343,7 +357,6 @@ def run_test(
              input_grid=input_grid,
              params=params,
              param_grid=param_grid,
-             generate_description=generate_description,
          )

      elif unit_metrics:
@@ -357,43 +370,28 @@ def run_test(
              input_grid=input_grid,
              params=params,
              param_grid=param_grid,
-             generate_description=generate_description,
-             title=title,
-         )
-
-     elif input_grid or param_grid:
-         result = _run_comparison_test(
-             test_id=test_id,
-             inputs=inputs,
-             input_grid=input_grid,
-             params=params,
-             param_grid=param_grid,
-             generate_description=generate_description,
              title=title,
          )

      else:
-         test_func = load_test(test_id)
-
-         input_kwargs, param_kwargs = _get_test_kwargs(
-             test_func, inputs or {}, params or {}
-         )
-
-         raw_result = test_func(**input_kwargs, **param_kwargs)
-
-         result = build_test_result(
-             outputs=raw_result,
-             test_id=test_id,
-             inputs=input_kwargs,
-             params=param_kwargs,
-             description=getdoc(test_func),
-             generate_description=generate_description,
-             title=title,
-         )
+         result = _run_test(test_id, inputs, params)

      end_time = time.perf_counter()
      result.metadata = _get_run_metadata(duration_seconds=end_time - start_time)

+     if post_process_fn:
+         result = post_process_fn(result)
+
+     result.description = get_result_description(
+         test_id=test_id,
+         test_description=result.doc,
+         tables=result.tables,
+         figures=result.figures,
+         metric=result.metric,
+         should_generate=generate_description,
+         title=title,
+     )
+

      if show:
          result.show()
validmind/utils.py CHANGED
@@ -168,6 +168,17 @@ class NumpyEncoder(json.JSONEncoder):
          return super().iterencode(obj, _one_shot)


+ class HumanReadableEncoder(NumpyEncoder):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         # truncate ndarrays to 10 items
+         self.type_handlers[self.is_numpy_ndarray] = lambda obj: (
+             obj.tolist()[:5] + ["..."] + obj.tolist()[-5:]
+             if len(obj) > 10
+             else obj.tolist()
+         )
+
+
  def get_full_typename(o: Any) -> Any:
      """We determine types based on type names so we don't have to import
      (and therefore depend on) PyTorch, TensorFlow, etc.
@@ -448,18 +459,23 @@ def get_dataset_info(dataset):


  def preview_test_config(config):
-     formatted_json = json.dumps(config, indent=4)
+     """Preview test configuration in a collapsible HTML section.
+
+     Args:
+         config (dict): Test configuration dictionary
+     """
+
+     try:
+         formatted_json = json.dumps(serialize(config), indent=4)
+     except TypeError as e:
+         logger.error(f"JSON serialization failed: {e}")
+         return

-     # JavaScript + HTML for the collapsible section
      collapsible_html = f"""
      <script>
      function toggleOutput() {{
          var content = document.getElementById("collapsibleContent");
-         if (content.style.display === "none") {{
-             content.style.display = "block";
-         }} else {{
-             content.style.display = "none";
-         }}
+         content.style.display = content.style.display === "none" ? "block" : "none";
      }}
      </script>
      <button onclick="toggleOutput()">Preview Config</button>
@@ -545,3 +561,14 @@ def inspect_obj(obj):
      # Loop through the parameters and print detailed information
      for param_name, param in sig.parameters.items():
          print(f"{param_name} - ({param.default})")
+
+
+ def serialize(obj):
+     """Convert objects to JSON-serializable format with readable descriptions."""
+     if isinstance(obj, dict):
+         return {k: serialize(v) for k, v in obj.items()}
+     elif isinstance(obj, (list, tuple)):
+         return [serialize(x) for x in obj]
+     elif isinstance(obj, (pd.DataFrame, pd.Series)):
+         return ""  # Simple empty string for non-serializable objects
+     return obj
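A minimal sketch of what the new serialize helper does to a config before preview_test_config JSON-encodes it (the config contents below are made up):

import pandas as pd

from validmind.utils import serialize

config = {
    "params": {"n_bins": 10, "features": ["age", "income"]},
    "inputs": {"train_dataset": pd.DataFrame({"age": [25, 40]})},
}

# DataFrame / Series values are blanked out so json.dumps no longer raises
# TypeError on non-serializable objects; everything else passes through unchanged.
print(serialize(config))
# {'params': {'n_bins': 10, 'features': ['age', 'income']}, 'inputs': {'train_dataset': ''}}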
validmind/vm_models/figure.py CHANGED
@@ -33,6 +33,18 @@ def is_png_image(figure) -> bool:
      return isinstance(figure, bytes)


+ def create_figure(
+     figure: Union[matplotlib.figure.Figure, go.Figure, go.FigureWidget, bytes],
+     key: str,
+     ref_id: str,
+ ) -> "Figure":
+     """Create a VM Figure object from a raw figure object"""
+     if is_matplotlib_figure(figure) or is_plotly_figure(figure) or is_png_image(figure):
+         return Figure(key=key, figure=figure, ref_id=ref_id)
+
+     raise ValueError(f"Unsupported figure type: {type(figure)}")
+
+
  @dataclass
  class Figure:
      """
@@ -55,6 +67,9 @@ class Figure:
          ):
              self.figure = go.FigureWidget(self.figure)

+     def __repr__(self):
+         return f"Figure(key={self.key}, ref_id={self.ref_id})"
+
      def to_widget(self):
          """
          Returns the ipywidget compatible representation of the figure. Ideally
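A short sketch of the new create_figure helper together with the new __repr__; the key and ref_id values are arbitrary:

import plotly.graph_objects as go

from validmind.vm_models.figure import create_figure

fig = go.Figure(data=go.Scatter(x=[1, 2, 3], y=[4, 5, 6]))

# Wraps a supported figure type (matplotlib, Plotly, or PNG bytes) in a VM Figure;
# anything else raises ValueError.
vm_figure = create_figure(figure=fig, key="score_alignment", ref_id="example-ref")
print(vm_figure)  # Figure(key=score_alignment, ref_id=example-ref)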
validmind/vm_models/result/__init__.py CHANGED
@@ -2,6 +2,6 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from .result import ErrorResult, Result, ResultTable, TestResult
+ from .result import ErrorResult, RawData, Result, ResultTable, TestResult

- __all__ = ["ErrorResult", "Result", "ResultTable", "TestResult"]
+ __all__ = ["ErrorResult", "RawData", "Result", "ResultTable", "TestResult"]