validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +3 -3
- validmind/api_client.py +2 -3
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +2 -2
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +5 -1
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +56 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +558 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/result_wrapper.py +61 -24
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
CHANGED
@@ -109,12 +109,11 @@ class RobustnessDiagnosis(ThresholdTest):
 
         features_list = self.params["features_columns"]
         if features_list is None:
-            features_list = self.inputs.datasets[0].
+            features_list = self.inputs.datasets[0].feature_columns
 
         # Check if all elements from features_list are present in the numerical feature columns
         all_present = all(
-            elem in self.inputs.datasets[0].
-            for elem in features_list
+            elem in self.inputs.datasets[0].feature_columns for elem in features_list
        )
        if not all_present:
            raise ValueError(
validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py
CHANGED
@@ -11,6 +11,7 @@ import shap
 
 from validmind.errors import UnsupportedModelForSHAPError
 from validmind.logging import get_logger
+from validmind.models import CatBoostModel, SKlearnModel, StatsModelsModel
 from validmind.vm_models import Figure, Metric
 
 logger = get_logger(__name__)
@@ -76,6 +77,7 @@ class SHAPGlobalImportance(Metric):
     }
     default_params = {
         "kernel_explainer_samples": 10,
+        "tree_or_linear_explainer_samples": 200,
     }
 
     def _generate_shap_plot(self, type_, shap_values, x_test):
@@ -130,20 +132,14 @@ class SHAPGlobalImportance(Metric):
         )
 
     def run(self):
-
-
-
-            "
-            "catboost",
-            "transformers",
-            "FoundationModel",
-            "R",
-        ]:
-            logger.info(f"Skiping SHAP for {model_library} models")
+        if not isinstance(self.inputs.model, SKlearnModel) or isinstance(
+            self.inputs.model, (CatBoostModel, StatsModelsModel)
+        ):
+            logger.info(f"Skiping SHAP for {self.inputs.model.library} models")
             return
 
         trained_model = self.inputs.model.model
-        model_class = self.inputs.model.
+        model_class = self.inputs.model.class_
 
         # the shap library generates a bunch of annoying warnings that we don't care about
         warnings.filterwarnings("ignore", category=UserWarning)
@@ -175,6 +171,7 @@ class SHAPGlobalImportance(Metric):
                 ),
             )
         else:
+            model_class = "<ExternalModel>" if model_class is None else model_class
            raise UnsupportedModelForSHAPError(
                f"Model {model_class} not supported for SHAP importance."
            )
@@ -186,7 +183,12 @@ class SHAPGlobalImportance(Metric):
                 self.params["kernel_explainer_samples"],
             )
         else:
-            shap_sample = self.inputs.dataset.x_df()
+            shap_sample = self.inputs.dataset.x_df().sample(
+                min(
+                    self.params["tree_or_linear_explainer_samples"],
+                    self.inputs.dataset.x_df().shape[0],
+                )
+            )
 
        shap_values = explainer.shap_values(shap_sample)
 
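To make the new `tree_or_linear_explainer_samples` behavior concrete, here is a minimal, self-contained sketch of the sampling it introduces for tree/linear explainers; the function and variable names below are illustrative, not part of the package:

```python
import pandas as pd


def cap_shap_sample(x_df: pd.DataFrame, max_samples: int = 200) -> pd.DataFrame:
    # mirror the new behavior: feed SHAP at most `max_samples` rows,
    # or the whole dataframe when it is already smaller than the cap
    return x_df.sample(min(max_samples, x_df.shape[0]))


if __name__ == "__main__":
    df = pd.DataFrame({"a": range(1000), "b": range(1000)})
    print(cap_shap_sample(df).shape)  # (200, 2)
```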
validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py
CHANGED
@@ -113,7 +113,7 @@ class WeakspotsDiagnosis(ThresholdTest):
                 raise ValueError(f"Threshold for metric {metric} is missing")
 
         if self.params["features_columns"] is None:
-            features_list = self.inputs.datasets[0].
+            features_list = self.inputs.datasets[0].feature_columns
         else:
             features_list = self.params["features_columns"]
 
@@ -124,8 +124,7 @@ class WeakspotsDiagnosis(ThresholdTest):
 
         # Check if all elements from features_list are present in the feature columns
         all_present = all(
-            elem in self.inputs.datasets[0].
-            for elem in features_list
+            elem in self.inputs.datasets[0].feature_columns for elem in features_list
         )
         if not all_present:
             raise ValueError(
@@ -150,7 +149,7 @@ class WeakspotsDiagnosis(ThresholdTest):
         results_headers.extend(self.default_metrics.keys())
         for feature in features_list:
             bins = 10
-            if feature in self.inputs.datasets[0].
+            if feature in self.inputs.datasets[0].feature_columns_categorical:
                 bins = len(train_df[feature].unique())
             train_df["bin"] = pd.cut(train_df[feature], bins=bins)
 
validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py
CHANGED
@@ -89,7 +89,7 @@ class RegressionModelForecastPlot(Metric):
         figures = []
 
         for i, fitted_model in enumerate(model_list):
-            feature_columns = datasets[0].
+            feature_columns = datasets[0].feature_columns
 
             train_ds = datasets[0]
             test_ds = datasets[1]
validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py
CHANGED
@@ -98,7 +98,7 @@ class RegressionModelForecastPlotLevels(Metric):
         figures = []
 
         for i, fitted_model in enumerate(model_list):
-            feature_columns = datasets[0].
+            feature_columns = datasets[0].feature_columns
 
             train_ds = datasets[0]
             test_ds = datasets[1]
validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py
CHANGED
@@ -106,7 +106,7 @@ class RegressionModelInsampleComparison(Metric):
         evaluation_results = []
 
         for i, model in enumerate(models):
-            X_columns = dataset.
+            X_columns = dataset.feature_columns
             y_true = dataset.y
             y_pred = dataset.y_pred(model)
 
validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py
CHANGED
@@ -96,7 +96,7 @@ class RegressionModelOutsampleComparison(Metric):
 
         for fitted_model in model_list:
             # Extract the column names of the independent variables from the model
-            independent_vars = dataset.
+            independent_vars = dataset.feature_columns
 
             # Separate the target variable and features in the test dataset
             y_test = dataset.y
validmind/tests/model_validation/statsmodels/RegressionModelSummary.py
CHANGED
@@ -57,7 +57,7 @@ class RegressionModelSummary(Metric):
     }
 
     def run(self):
-        X_columns = self.inputs.dataset.
+        X_columns = self.inputs.dataset.feature_columns
 
         y_true = self.inputs.dataset.y
         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py
CHANGED
@@ -73,7 +73,7 @@ class RegressionModelsCoeffs(Metric):
             raise ValueError("List of models must be provided in the models parameter")
 
         for model in self.inputs.models:
-            if model.
+            if model.class_ != "statsmodels" and model.class_ != "R":
                 raise SkipTestError(
                     "Only statsmodels and R models are supported for this metric"
                 )
validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py
CHANGED
@@ -80,7 +80,7 @@ class RegressionModelsPerformance(Metric):
         evaluation_results = []
 
         for model, dataset in zip(models, datasets):
-            X_columns = dataset.
+            X_columns = dataset.feature_columns
             y_true = dataset.y
             y_pred = dataset.y_pred(model)
 
validmind/tests/model_validation/statsmodels/ScorecardHistogram.py
CHANGED
@@ -112,16 +112,15 @@ class ScorecardHistogram(Metric):
         dataframes = []
         metric_value = {"score_histogram": {}}
         for dataset in self.inputs.datasets:
-
-            # Check if the score_column exists in the DataFrame
-            if score_column not in df.columns:
+            if score_column not in dataset.df.columns:
                 raise ValueError(
                     f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
                 )
 
-
-
-
+            dataframes.append(dataset.df.copy())
+            metric_value["score_histogram"][dataset.input_id] = list(
+                dataset.df[score_column]
+            )
 
         figures = self.plot_score_histogram(
             dataframes, dataset_titles, score_column, target_column, title
validmind/unit_metrics/__init__.py
CHANGED
@@ -6,8 +6,6 @@ import hashlib
 import json
 from importlib import import_module
 
-import numpy as np
-
 from ..tests.decorator import _build_result, _inspect_signature
 from ..utils import get_model_info, test_id_to_name
 
@@ -58,7 +56,7 @@ def _serialize_model(model):
     return hash_object.hexdigest()
 
 
-def _serialize_dataset(dataset,
+def _serialize_dataset(dataset, model):
     """
     Serialize the description of the dataset input to a unique hash.
 
@@ -68,11 +66,11 @@ def _serialize_dataset(dataset, model_id):
 
     Args:
         dataset: The dataset object, which should have properties like _df (pandas DataFrame),
-            target_column (string), feature_columns (list of strings), and
-
+            target_column (string), feature_columns (list of strings), and extra_columns (dict).
+        model (VMModel): The model whose predictions will be included in the serialized dataset
 
     Returns:
-        str:
+        str: MD5 hash of the dataset
 
     Note:
         Including the model ID and prediction column name in the hash calculation ensures uniqueness,
@@ -80,57 +78,33 @@ def _serialize_dataset(dataset, model_id):
     This approach guarantees that the hash will distinguish between model-generated predictions
     and pre-computed prediction columns, addressing potential hash collisions.
     """
-
-
-
-
-
-
-
-
-    columns = (
-        [dataset._target_column] + dataset._feature_columns + [prediction_column_name]
+    return _fast_hash(
+        dataset.df[
+            [
+                *dataset.feature_columns,
+                dataset.target_column,
+                dataset.prediction_column(model),
+            ]
+        ]
     )
 
-    # Use _fast_hash function and include model_and_prediction_info in the hash calculation
-    hash_digest = _fast_hash(
-        dataset._df[columns], model_and_prediction_info=model_and_prediction_info
-    )
-
-    return hash_digest
-
 
-def _fast_hash(df, sample_size=1000
+def _fast_hash(df, sample_size=1000):
     """
-    Generates a hash
-    and optionally model and prediction information.
+    Generates a fast hash by sampling, converting to string and md5 hashing.
 
     Args:
         df (pd.DataFrame): The DataFrame to hash.
         sample_size (int): The maximum number of rows to include in the sample.
-        model_and_prediction_info (bytes, optional): Additional information to include in the hash.
 
     Returns:
-        str:
+        str: MD5 hash of the DataFrame.
     """
-
-    rows_bytes = str(len(df)).encode()
+    df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
 
-
-
-
-    else:
-        df_sample = df
-
-    # Convert the sampled DataFrame to a byte array. np.asarray ensures compatibility with various DataFrame contents.
-    byte_array = np.asarray(df_sample).data.tobytes()
-
-    # Initialize the hash object and update it with the row count, data bytes, and additional info
-    hash_obj = hashlib.sha256(
-        rows_bytes + byte_array + (model_and_prediction_info or b"")
-    )
-
-    return hash_obj.hexdigest()
+    return hashlib.md5(
+        df_sample.to_string(header=True, index=True).encode()
+    ).hexdigest()
 
 
 def get_metric_cache_key(metric_id, params, inputs):
@@ -150,9 +124,8 @@ def get_metric_cache_key(metric_id, params, inputs):
 
     dataset = inputs["dataset"]
     model = inputs["model"]
-    model_id = model.input_id
 
-    cache_elements.append(_serialize_dataset(dataset,
+    cache_elements.append(_serialize_dataset(dataset, model))
 
     cache_elements.append(_serialize_model(model))
 
@@ -197,7 +170,11 @@ def run_metric(metric_id, inputs=None, params=None, show=True, value_only=False)
            **{k: v for k, v in inputs.items() if k in _inputs.keys()},
            **{k: v for k, v in params.items() if k in _params.keys()},
        )
-        unit_metric_results_cache[cache_key] = (
+        unit_metric_results_cache[cache_key] = (
+            result,
+            # store the input ids that were used to calculate the result
+            [v.input_id for v in inputs.values()],
+        )
 
        value = unit_metric_results_cache[cache_key][0]
 
@@ -235,7 +212,7 @@ def run_metric(metric_id, inputs=None, params=None, show=True, value_only=False)
        )
 
        # in case the user tries to log the result object
-        def log(
+        def log():
            raise Exception(
                "Cannot log unit metrics directly..."
                "You can run this unit metric as part of a composite metric and log that"
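The new `_fast_hash` above replaces the numpy byte-array/SHA-256 approach with a deterministic sample plus an MD5 of the stringified rows. A standalone sketch of the same idea, outside the package, for reference:

```python
import hashlib

import pandas as pd


def fast_hash(df: pd.DataFrame, sample_size: int = 1000) -> str:
    # fixed random_state keeps the sample (and therefore the hash) stable
    # across runs for identical data
    df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
    return hashlib.md5(
        df_sample.to_string(header=True, index=True).encode()
    ).hexdigest()


if __name__ == "__main__":
    df = pd.DataFrame({"x": range(5), "y": range(5)})
    print(fast_hash(df))  # same digest every run for the same dataframe
```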
validmind/unit_metrics/composite.py
CHANGED
@@ -37,6 +37,7 @@ class CompositeMetric(Metric):
             metric_ids=self.unit_metrics,
             description=self.description(),
             inputs=self._get_input_dict(),
+            accessed_inputs=self.get_accessed_inputs(),
             params=self.params,
             output_template=self.output_template,
             show=False,
@@ -103,6 +104,7 @@ def run_metrics(
     description: str = None,
     output_template: str = None,
     inputs: dict = None,
+    accessed_inputs: List[str] = None,
     params: dict = None,
     test_id: str = None,
     show: bool = True,
@@ -128,6 +130,8 @@ def run_metrics(
         output_template (_type_, optional): Output template to customize the result
             table.
         inputs (_type_, optional): Inputs to pass to the unit metrics. Defaults to None
+        accessed_inputs (_type_, optional): Inputs that were accessed when running the
+            unit metrics - used for input tracking. Defaults to None.
         params (_type_, optional): Parameters to pass to the unit metrics. Defaults to
             None.
         test_id (str, optional): Test ID of the composite metric. Required if name is
@@ -212,7 +216,7 @@ def run_metrics(
                 "json": {"output_template": output_template},
             },
         ],
-        inputs=
+        inputs=accessed_inputs,
         output_template=output_template,
         metric=MetricResult(
             key=test_id,
validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py
CHANGED
@@ -16,6 +16,6 @@ def AdjustedRSquaredScore(model, dataset):
     )
 
     row_count = len(dataset.y)
-    feature_count = len(dataset.
+    feature_count = len(dataset.feature_columns)
 
     return 1 - (1 - r2_score) * (row_count - 1) / (row_count - feature_count)
validmind/utils.py
CHANGED
@@ -12,16 +12,21 @@ from platform import python_version
 from typing import Any
 
 import matplotlib.pylab as pylab
+import mistune
 import nest_asyncio
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from IPython.core import getipython
-from IPython.display import HTML
+from IPython.display import HTML
+from IPython.display import display as ipy_display
+from latex2mathml.converter import convert
 from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate
 
+from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
+
 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
 
@@ -97,6 +102,8 @@ class NumpyEncoder(json.JSONEncoder):
             return bool(obj)
         if isinstance(obj, pd.Timestamp):
             return str(obj)
+        if isinstance(obj, set):
+            return list(obj)
         return super().default(obj)
 
     def encode(self, obj):
@@ -345,10 +352,10 @@ def test_id_to_name(test_id: str) -> str:
 
 def get_model_info(model):
     """Attempts to extract all model info from a model object instance"""
-    architecture = model.
-    framework = model.
-    framework_version = model.
-    language = model.
+    architecture = model.name
+    framework = model.library
+    framework_version = model.library_version
+    language = model.language
 
     if language is None:
         language = f"Python {python_version()}"
@@ -402,4 +409,47 @@ def preview_test_config(config):
     <div id="collapsibleContent" style="display:none;"><pre>{formatted_json}</pre></div>
     """
 
-
+    ipy_display(HTML(collapsible_html))
+
+
+def display(widget_or_html, syntax_highlighting=True, mathjax=True):
+    """Display widgets with extra goodies (syntax highlighting, MathJax, etc.)"""
+    if isinstance(widget_or_html, str):
+        ipy_display(HTML(widget_or_html))
+        # if html we can auto-detect if we actually need syntax highlighting or MathJax
+        syntax_highlighting = 'class="language-' in widget_or_html
+        mathjax = "$$" in widget_or_html
+    else:
+        ipy_display(widget_or_html)
+
+    if syntax_highlighting:
+        ipy_display(HTML(python_syntax_highlighting))
+
+    if mathjax:
+        ipy_display(HTML(math_jax_snippet))
+
+
+def md_to_html(md: str, mathml=False) -> str:
+    """Converts Markdown to HTML using mistune with plugins"""
+    # use mistune with math plugin to convert to html
+    html = mistune.create_markdown(plugins=["math"])(md)
+
+    if not mathml:
+        # return the html as is (with latex that will be rendered by MathJax)
+        return html
+
+    # convert the latex to MathML which CKeditor can render
+    math_block_pattern = re.compile(r'<div class="math">\$\$([\s\S]*?)\$\$</div>')
+    html = math_block_pattern.sub(
+        lambda match: "<p>{}</p>".format(convert(match.group(1), display="block")), html
+    )
+
+    inline_math_pattern = re.compile(r'<span class="math">\\\((.*?)\\\)</span>')
+    html = inline_math_pattern.sub(
+        lambda match: "<span>{}</span>".format(
+            convert(match.group(1), display="inline")
+        ),
+        html,
+    )
+
+    return html
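A hedged usage sketch of the new `md_to_html` helper shown above, assuming validmind 2.2.2 is installed along with the `mistune` and `latex2mathml` dependencies its new imports rely on; the sample Markdown string is illustrative only:

```python
from validmind.utils import md_to_html

# block math on its own line so the mistune "math" plugin picks it up
md = "The coefficient of determination:\n\n$$R^2 = 1 - \\frac{SS_{res}}{SS_{tot}}$$\n"

html = md_to_html(md)                      # LaTeX left in place for MathJax to render
mathml_html = md_to_html(md, mathml=True)  # LaTeX converted to MathML (e.g. for CKEditor)
print(mathml_html)
```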
validmind/vm_models/dataset/__init__.py
ADDED
@@ -0,0 +1,7 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from .dataset import DataFrameDataset, PolarsDataset, TorchDataset, VMDataset
+
+__all__ = ["VMDataset", "DataFrameDataset", "PolarsDataset", "TorchDataset"]
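Many of the hunks above swap truncated method calls for the `VMDataset` accessors exported here (`feature_columns`, `target_column`, `df`). A rough usage sketch follows; the `init_dataset` keyword arguments are assumptions based on the SDK's conventions and are not verified against 2.2.2:

```python
import pandas as pd

import validmind as vm

raw_df = pd.DataFrame({"f1": [1, 2, 3], "f2": [4, 5, 6], "target": [0, 1, 0]})

# input_id and target_column are assumed keyword names; adjust to the installed SDK
vm_ds = vm.init_dataset(dataset=raw_df, target_column="target", input_id="demo_ds")

print(vm_ds.feature_columns)  # expected: ["f1", "f2"]
print(vm_ds.target_column)    # "target"
print(vm_ds.df.shape)         # (3, 3)
```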