validmind 2.0.7__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. validmind/__init__.py +3 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +7 -11
  4. validmind/api_client.py +29 -27
  5. validmind/client.py +10 -3
  6. validmind/datasets/credit_risk/__init__.py +11 -0
  7. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  8. validmind/datasets/credit_risk/lending_club.py +394 -0
  9. validmind/logging.py +9 -2
  10. validmind/template.py +2 -2
  11. validmind/test_suites/__init__.py +4 -2
  12. validmind/tests/__init__.py +97 -50
  13. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  14. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  15. validmind/tests/data_validation/ScatterPlot.py +8 -2
  16. validmind/tests/decorator.py +138 -14
  17. validmind/tests/model_validation/BertScore.py +1 -1
  18. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  19. validmind/tests/model_validation/BleuScore.py +1 -1
  20. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  21. validmind/tests/model_validation/ContextualRecall.py +1 -1
  22. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  23. validmind/tests/model_validation/MeteorScore.py +1 -1
  24. validmind/tests/model_validation/RegardHistogram.py +1 -1
  25. validmind/tests/model_validation/RegardScore.py +1 -1
  26. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  27. validmind/tests/model_validation/RougeMetrics.py +1 -1
  28. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  29. validmind/tests/model_validation/SelfCheckNLIScore.py +1 -1
  30. validmind/tests/model_validation/TokenDisparity.py +1 -1
  31. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  32. validmind/tests/model_validation/ToxicityScore.py +1 -1
  33. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  34. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  35. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +1 -1
  36. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  37. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +15 -18
  38. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  39. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  40. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  41. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  42. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  43. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  44. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  45. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  46. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  47. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  48. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  49. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  50. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +27 -3
  51. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  52. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +2 -2
  53. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  54. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  55. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  56. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  57. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  58. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  59. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  60. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  61. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  62. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  63. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  64. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  65. validmind/tests/test_providers.py +14 -124
  66. validmind/unit_metrics/__init__.py +76 -69
  67. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  68. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  69. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  70. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  71. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  72. validmind/unit_metrics/composite.py +24 -71
  73. validmind/unit_metrics/regression/GiniCoefficient.py +20 -26
  74. validmind/unit_metrics/regression/HuberLoss.py +12 -16
  75. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +18 -24
  76. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +7 -13
  77. validmind/unit_metrics/regression/MeanBiasDeviation.py +5 -14
  78. validmind/unit_metrics/regression/QuantileLoss.py +6 -16
  79. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +12 -18
  80. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +6 -15
  81. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +5 -14
  82. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +6 -15
  83. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +11 -14
  84. validmind/utils.py +18 -45
  85. validmind/vm_models/__init__.py +0 -2
  86. validmind/vm_models/dataset.py +255 -16
  87. validmind/vm_models/test/metric.py +1 -2
  88. validmind/vm_models/test/result_wrapper.py +12 -13
  89. validmind/vm_models/test/test.py +2 -1
  90. validmind/vm_models/test/threshold_test.py +1 -2
  91. validmind/vm_models/test_suite/summary.py +3 -3
  92. validmind/vm_models/test_suite/test_suite.py +2 -1
  93. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/METADATA +10 -6
  94. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/RECORD +97 -96
  95. validmind/tests/__types__.py +0 -62
  96. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  97. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  98. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  99. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  100. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -22
  101. validmind/unit_metrics/sklearn/classification/F1.py +0 -24
  102. validmind/unit_metrics/sklearn/classification/Precision.py +0 -24
  103. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -22
  104. validmind/unit_metrics/sklearn/classification/Recall.py +0 -22
  105. validmind/vm_models/test/unit_metric.py +0 -88
  106. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
  107. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
  108. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/entry_points.txt +0 -0
validmind/utils.py CHANGED
@@ -238,40 +238,6 @@ def summarize_data_quality_results(results):
     )


-def clean_docstring(docstring: str) -> str:
-    """
-    Clean up docstrings by removing leading and trailing whitespace and
-    replacing newlines with spaces.
-    """
-    description = (docstring or "").strip()
-    paragraphs = description.split("\n\n")  # Split into paragraphs
-    paragraphs = [
-        " ".join([line.strip() for line in paragraph.split("\n")])
-        for paragraph in paragraphs
-    ]
-    paragraphs = [
-        paragraph.replace(" - ", "\n- ") for paragraph in paragraphs
-    ]  # Add newline before list items
-    # Join paragraphs with double newlines for markdown
-    description = "\n\n".join(paragraphs)
-
-    lines = description.split("\n")
-    in_bullet_list = False
-    for i, line in enumerate([line for line in lines]):
-        if line.strip().startswith("-") and not in_bullet_list:
-            if lines[i - 1] != "":
-                lines[i] = "\n" + line
-
-            in_bullet_list = True
-            continue
-        elif line.strip().startswith("-") and in_bullet_list:
-            continue
-        elif line.strip() == "" and in_bullet_list:
-            in_bullet_list = False
-
-    return "\n".join(lines)
-
-
 def format_number(number):
     """
     Format a number for display purposes. If the number is a float, round it
@@ -354,20 +320,27 @@ def fuzzy_match(string: str, search_string: str, threshold=0.7):
     return score >= threshold


-def test_id_to_name(test_id: str):
-    """Convert a test ID to a human-readable name"""
-    # Extract the last part of the ID string
-    last_part = test_id.split(".")[-1]
+def test_id_to_name(test_id: str) -> str:
+    """Convert a test ID to a human-readable name.

-    # Use a regular expression to find words and acronyms in the CamelCase string
-    words = re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", last_part)
+    Args:
+        test_id (str): The test identifier, typically in CamelCase or snake_case.

-    # Join the words with spaces and capitalize the first letter of each word, keeping acronyms unchanged
-    title = " ".join(
-        [word.capitalize() if not word.isupper() else word for word in words]
-    )
+    Returns:
+        str: A human-readable name derived from the test ID.
+    """
+    last_part = test_id.split(".")[-1]
+    words = []
+
+    # Split on underscores and apply regex to each part to handle CamelCase and acronyms
+    for part in last_part.split("_"):
+        # Regex pattern to match uppercase acronyms, mixed-case words, or alphanumeric combinations
+        words.extend(
+            re.findall(r"[A-Z]+(?:_[A-Z]+)*(?=_|$|[A-Z][a-z])|[A-Z]?[a-z0-9]+", part)
+        )

-    return title
+    # Join the words with spaces, capitalize non-acronym words
+    return " ".join(word.capitalize() if not word.isupper() else word for word in words)


 def get_model_info(model):
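The rewritten test_id_to_name above now splits snake_case segments before applying the CamelCase/acronym regex, so IDs with underscores and digit-bearing acronyms convert cleanly. A minimal sketch of the expected behavior, assuming the function is imported from validmind.utils as shown in the diff; the example IDs and the printed outputs in the comments are illustrative, traced by hand through the new regex:

from validmind.utils import test_id_to_name

# CamelCase with a leading acronym: "ROC" stays upper-case, "Curve" is capitalized
print(test_id_to_name("validmind.tests.model_validation.sklearn.ROCCurve"))  # ROC Curve

# snake_case acronyms are split on underscores before the regex runs
print(test_id_to_name("validmind.unit_metrics.classification.sklearn.ROC_AUC"))  # ROC AUC

# Digits stay attached to their token via the [A-Z]?[a-z0-9]+ alternative
print(test_id_to_name("MinimumF1Score"))  # Minimum F1 Score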
validmind/vm_models/__init__.py CHANGED
@@ -15,7 +15,6 @@ from .test.result_summary import ResultSummary, ResultTable, ResultTableMetadata
 from .test.test import Test
 from .test.threshold_test import ThresholdTest
 from .test.threshold_test_result import ThresholdTestResult, ThresholdTestResults
-from .test.unit_metric import UnitMetric
 from .test_context import TestContext, TestInput
 from .test_suite.runner import TestSuiteRunner
 from .test_suite.test_suite import TestSuite
@@ -30,7 +29,6 @@ __all__ = [
     "ResultTable",
     "ResultTableMetadata",
     "Test",
-    "UnitMetric",
     "Metric",
     "MetricResult",
     "ThresholdTest",
validmind/vm_models/dataset.py CHANGED
@@ -6,6 +6,7 @@
 Dataset class wrapper
 """

+import warnings
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field

@@ -13,6 +14,7 @@ import numpy as np
 import pandas as pd
 import polars as pl

+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import VMModel

@@ -40,7 +42,9 @@ class VMDataset(ABC):
         self,
         model,
         prediction_values: list = None,
+        prediction_probabilities: list = None,
         prediction_column=None,
+        probability_column=None,
     ):
         """
         Assigns predictions to the dataset for a given model or prediction values.
@@ -151,15 +155,24 @@ class VMDataset(ABC):
         pass

     @abstractmethod
-    def y_pred(self, model_id) -> np.ndarray:
+    def y_pred(self, model) -> np.ndarray:
         """
-        Returns the prediction values (y_pred) of the dataset for a given model_id.
+        Returns the prediction values (y_pred) of the dataset for a given model.

         Returns:
             np.ndarray: The prediction values.
         """
         pass

+    def y_prob(self, model) -> np.ndarray:
+        """
+        Returns the prediction probabilities (y_prob) of the dataset for a given model.
+
+        Returns:
+            np.ndarray: The prediction probabilities.
+        """
+        pass
+
     @property
     @abstractmethod
     def df(self):
@@ -200,7 +213,17 @@ class VMDataset(ABC):
         pass

     @abstractmethod
-    def y_pred_df(self, model_id):
+    def y_pred_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_prob_df(self, model):
         """
         Returns the target columns (y) of the dataset.

@@ -210,7 +233,7 @@ class VMDataset(ABC):
         pass

     @abstractmethod
-    def prediction_column(self, model_id) -> str:
+    def prediction_column(self, model) -> str:
         """
         Returns the prediction column name of the dataset.

@@ -219,6 +242,15 @@ class VMDataset(ABC):
         """
         pass

+    def probability_column(self, model) -> str:
+        """
+        Returns the probability column name of the dataset.
+
+        Returns:
+            str: The probability column name.
+        """
+        pass
+
     @abstractmethod
     def get_features_columns(self):
         """
@@ -270,6 +302,7 @@ class NumpyDataset(VMDataset):
     _extra_columns: dict = field(
         default_factory=lambda: {
             "prediction_columns": {},
+            "probability_columns": {},
             "group_by_column": None,
         }
     )
@@ -356,6 +389,7 @@ class NumpyDataset(VMDataset):
         if extra_columns is None:
             extra_columns = {
                 "prediction_columns": {},
+                "probability_columns": {},
                 "group_by_column": None,
             }
         self._extra_columns = extra_columns
@@ -395,6 +429,9 @@ class NumpyDataset(VMDataset):

         return df

+    def __model_id_in_probability_columns(self, model, probability_column):
+        return model.input_id in self._extra_columns.get("probability_columns", {})
+
     def __model_id_in_prediction_columns(self, model, prediction_column):
         return model.input_id in self._extra_columns.get("prediction_columns", {})

@@ -423,17 +460,60 @@ class NumpyDataset(VMDataset):
         if pred_column not in self._columns:
             self._columns.append(pred_column)

+    def __assign_prediction_probabilities(
+        self, model, prob_column, prediction_probabilities
+    ):
+        # Link the prediction column with the model
+        self._extra_columns.setdefault("probability_columns", {})[
+            model.input_id
+        ] = prob_column
+
+        # Check if the predictions are multi-dimensional (e.g., embeddings)
+        is_multi_dimensional = (
+            isinstance(prediction_probabilities, np.ndarray)
+            and prediction_probabilities.ndim > 1
+        )
+
+        if is_multi_dimensional:
+            # For multi-dimensional outputs, convert to a list of lists to store in DataFrame
+            self._df[prob_column] = list(map(list, prediction_probabilities))
+        else:
+            # If not multi-dimensional or a standard numpy array, reshape for compatibility
+            self._raw_dataset = np.hstack(
+                (self._raw_dataset, np.array(prediction_probabilities).reshape(-1, 1))
+            )
+            self._df[prob_column] = prediction_probabilities
+
+        # Update the dataset columns list
+        if prob_column not in self._columns:
+            self._columns.append(prob_column)
+
     def assign_predictions(  # noqa: C901 - we need to simplify this method
         self,
         model,
         prediction_values: list = None,
+        prediction_probabilities: list = None,
         prediction_column=None,
+        probability_column=None,
     ):
+        def _is_probability(output):
+            """Check if the output from the predict method is probabilities."""
+            # This is a simple check that assumes output is probabilities if they lie between 0 and 1
+            if np.all((output >= 0) & (output <= 1)):
+                # Check if there is at least one element that is neither 0 nor 1
+                if np.any((output > 0) & (output < 1)):
+                    return True
+            return np.all((output >= 0) & (output <= 1)) and np.any(
+                (output > 0) & (output < 1)
+            )
+
+        # Step 1: Check for Model Presence
         if not model:
             raise ValueError(
                 "Model must be provided to link prediction column with the dataset"
             )

+        # Step 2: Prediction Column Provided
         if prediction_column:
             if prediction_column not in self.columns:
                 raise ValueError(
@@ -448,6 +528,8 @@ class NumpyDataset(VMDataset):
             self._extra_columns.setdefault("prediction_columns", {})[
                 model.input_id
             ] = prediction_column
+
+        # Step 4: Prediction Values Provided without Specific Column
         elif prediction_values is not None:
             if len(prediction_values) != self.df.shape[0]:
                 raise ValueError(
@@ -455,13 +537,58 @@ class NumpyDataset(VMDataset):
                 )
             pred_column = f"{model.input_id}_prediction"
             if pred_column in self.columns:
-                raise ValueError(
-                    f"Prediction column {pred_column} already exists in the dataset"
+                warnings.warn(
+                    f"Prediction column {pred_column} already exists in the dataset, overwriting the existing predictions",
+                    UserWarning,
                 )
+
+            logger.info(
+                f"Assigning prediction values to column '{pred_column}' and linked to model '{model.input_id}'"
+            )
             self.__assign_prediction_values(model, pred_column, prediction_values)
+
+        # Step 3: Probability Column Provided
+        if probability_column:
+            if probability_column not in self.columns:
+                raise ValueError(
+                    f"Probability column {probability_column} doesn't exist in the dataset"
+                )
+            if self.__model_id_in_probability_columns(
+                model=model, probability_column=probability_column
+            ):
+                raise ValueError(
+                    f"Probability column {probability_column} already linked to the VM model"
+                )
+            self._extra_columns.setdefault("probability_columns", {})[
+                model.input_id
+            ] = probability_column
+
+        # Step 5: Prediction Probabilities Provided without Specific Column
+        elif prediction_probabilities is not None:
+            if len(prediction_probabilities) != self.df.shape[0]:
+                raise ValueError(
+                    "Length of prediction probabilities doesn't match number of rows of the dataset"
+                )
+            prob_column = f"{model.input_id}_probabilities"
+            if prob_column in self.columns:
+                warnings.warn(
+                    f"Probability column {prob_column} already exists in the dataset, overwriting the existing probabilities",
+                    UserWarning,
+                )
+
+            logger.info(
+                f"Assigning prediction probabilities to column '{prob_column}' and linked to model '{model.input_id}'"
+            )
+            self.__assign_prediction_probabilities(
+                model, prob_column, prediction_probabilities
+            )
+
+        # Step 6: Neither Specific Column Nor Values Provided
         elif not self.__model_id_in_prediction_columns(
             model=model, prediction_column=prediction_column
         ):
+
+            # Compute prediction values directly from the VM model
             pred_column = f"{model.input_id}_prediction"
             if pred_column in self.columns:
                 logger.info(
@@ -479,7 +606,49 @@ class NumpyDataset(VMDataset):
             )

             prediction_values = np.array(model.predict(x_only))
-            self.__assign_prediction_values(model, pred_column, prediction_values)
+
+            # Check if the prediction values are probabilities
+            if _is_probability(prediction_values):
+
+                threshold = 0.5
+
+                logger.info(
+                    "Predict method returned probabilities instead of direct labels or regression values. "
+                    + "This implies the model is likely configured for a classification task with probability output."
+                )
+                prob_column = f"{model.input_id}_probabilities"
+                logger.info(
+                    f"Assigning probabilities to column '{prob_column}' and computing class labels using a threshold of {threshold}."
+                )
+                self.__assign_prediction_probabilities(
+                    model, prob_column, prediction_values
+                )
+
+                # Convert probabilities to class labels based on the threshold
+                prediction_classes = (prediction_values > threshold).astype(int)
+                self.__assign_prediction_values(model, pred_column, prediction_classes)
+
+            else:
+
+                # If not assign the prediction values directly
+                pred_column = f"{model.input_id}_prediction"
+                self.__assign_prediction_values(model, pred_column, prediction_values)
+
+                try:
+                    logger.info("Running predict_proba()... This may take a while")
+                    prediction_probabilities = np.array(model.predict_proba(x_only))
+                    prob_column = f"{model.input_id}_probabilities"
+                    self.__assign_prediction_probabilities(
+                        model, prob_column, prediction_probabilities
+                    )
+                except MissingOrInvalidModelPredictFnError:
+                    # Log that predict_proba is not available or failed
+                    logger.warn(
+                        f"Model class '{model.__class__}' does not have a compatible predict_proba implementation."
+                        + " Please assign predictions directly with vm_dataset.assign_predictions(model, prediction_values)"
+                    )
+
+        # Step 7: Prediction Column Already Linked
         else:
             logger.info(
                 f"Prediction column {self._extra_columns['prediction_columns'][model.input_id]} already linked to the {model.input_id}"
@@ -673,19 +842,19 @@ class NumpyDataset(VMDataset):
             ],
         ]

-    def y_pred(self, model_id) -> np.ndarray:
+    def y_pred(self, model) -> np.ndarray:
         """
-        Returns the prediction variables for a given model_id, accommodating
+        Returns the prediction variables for a given model, accommodating
         both scalar predictions and multi-dimensional outputs such as embeddings.

         Args:
-            model_id (str): The ID of the model whose predictions are sought.
+            model (VMModel): The model whose predictions are sought.

         Returns:
             np.ndarray: The prediction variables, either as a flattened array for
             scalar predictions or as an array of arrays for multi-dimensional outputs.
         """
-        pred_column = self.prediction_column(model_id)
+        pred_column = self.prediction_column(model)

         # First, attempt to retrieve the prediction data from the DataFrame
         if hasattr(self, "_df") and pred_column in self._df.columns:
@@ -712,6 +881,45 @@ class NumpyDataset(VMDataset):

         return predictions

+    def y_prob(self, model) -> np.ndarray:
+        """
+        Returns the prediction variables for a given model, accommodating
+        both scalar predictions and multi-dimensional outputs such as embeddings.
+
+        Args:
+            model (str): The ID of the model whose predictions are sought.
+
+        Returns:
+            np.ndarray: The prediction variables, either as a flattened array for
+            scalar predictions or as an array of arrays for multi-dimensional outputs.
+        """
+        prob_column = self.probability_column(model)
+
+        # First, attempt to retrieve the prediction data from the DataFrame
+        if hasattr(self, "_df") and prob_column in self._df.columns:
+            probabilities = self._df[prob_column].to_numpy()
+
+            # Check if the predictions are stored as objects (e.g., lists for embeddings)
+            if self._df[prob_column].dtype == object:
+                # Attempt to convert lists to a numpy array
+                try:
+                    probabilities = np.stack(probabilities)
+                except ValueError as e:
+                    # Handling cases where predictions cannot be directly stacked
+                    raise ValueError(f"Error stacking prediction arrays: {e}")
+        else:
+            # Fallback to using the raw numpy dataset if DataFrame is not available or suitable
+            try:
+                probabilities = self.raw_dataset[
+                    :, self.columns.index(prob_column)
+                ].flatten()
+            except IndexError as e:
+                raise ValueError(
+                    f"Prediction column '{prob_column}' not found in raw dataset: {e}"
+                )
+
+        return probabilities
+
     @property
     def type(self) -> str:
         """
@@ -757,22 +965,32 @@ class NumpyDataset(VMDataset):
         """
         return self._df[self.target_column]

-    def y_pred_df(self, model_id):
+    def y_pred_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        return self._df[self.prediction_column(model)]
+
+    def y_prob_df(self, model):
         """
         Returns the target columns (y) of the dataset.

         Returns:
             pd.DataFrame: The target columns.
         """
-        return self._df[self.prediction_column(model_id=model_id)]
+        return self._df[self.probability_column(model)]

-    def prediction_column(self, model_id) -> str:
+    def prediction_column(self, model) -> str:
         """
         Returns the prediction column name of the dataset.

         Returns:
             str: The prediction column name.
         """
+        model_id = model.input_id
         pred_column = self._extra_columns.get("prediction_columns", {}).get(model_id)
         if pred_column is None:
             raise ValueError(
@@ -780,6 +998,21 @@ class NumpyDataset(VMDataset):
             )
         return pred_column

+    def probability_column(self, model) -> str:
+        """
+        Returns the prediction column name of the dataset.
+
+        Returns:
+            str: The prediction column name.
+        """
+        model_id = model.input_id
+        prob_column = self._extra_columns.get("probability_columns", {}).get(model_id)
+        if prob_column is None:
+            raise ValueError(
+                f"Probability column is not linked with the given {model_id}"
+            )
+        return prob_column
+
     def serialize(self):
         """
         Serializes the dataset to a dictionary.
@@ -1023,12 +1256,16 @@ class TorchDataset(NumpyDataset):
             text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
         """
-        # if we can't import torch, then it's not a PyTorch model
+
         try:
             import torch
         except ImportError:
-            return False
+            raise ImportError(
+                "PyTorch is not installed, please run `pip install validmind[pytorch]`"
+            )
+
         columns = []
+
         for id, tens in zip(range(0, len(raw_dataset.tensors)), raw_dataset.tensors):
             if id == 0 and feature_columns is None:
                 n_cols = tens.shape[1]
@@ -1039,9 +1276,11 @@ class TorchDataset(NumpyDataset):
                     ).astype(str)
                 ]
                 columns.append(feature_columns)
+
             elif id == 1 and target_column is None:
                 target_column = "y"
                 columns.append(target_column)
+
             elif id == 2 and extra_columns is None:
                 extra_columns.prediction_column = "y_pred"
                 columns.append(extra_columns.prediction_column)
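Taken together, the dataset changes above add a probability track next to predictions: assign_predictions() now accepts prediction_probabilities and probability_column, and y_prob(), y_prob_df() and probability_column() expose what was stored. A minimal usage sketch, assuming vm_model and vm_dataset were created with vm.init_model()/vm.init_dataset() and that the wrapped model exposes an sklearn-style predict_proba(); variable names such as model and x_test are illustrative:

# Option 1: let the dataset call predict()/predict_proba() on the linked model.
# If predict() returns probabilities, class labels are derived with a 0.5 threshold
# and the probabilities land in a "<input_id>_probabilities" column.
vm_dataset.assign_predictions(model=vm_model)

# Option 2: supply precomputed values and probabilities explicitly.
probs = model.predict_proba(x_test)[:, 1]  # positive-class probabilities
vm_dataset.assign_predictions(
    model=vm_model,
    prediction_values=(probs > 0.5).astype(int),
    prediction_probabilities=probs,
)

# Retrieve what was stored for this model
y_pred = vm_dataset.y_pred(vm_model)  # class labels
y_prob = vm_dataset.y_prob(vm_model)  # probabilities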
validmind/vm_models/test/metric.py CHANGED
@@ -15,7 +15,6 @@ import pandas as pd

 from ...ai import generate_description
 from ...errors import MissingCacheResultsArgumentsError
-from ...utils import clean_docstring
 from ..figure import Figure
 from .metric_result import MetricResult
 from .result_wrapper import MetricResultWrapper
@@ -98,7 +97,7 @@ class Metric(Test):
             )
         else:
             revision_name = "Default Description"
-            description = clean_docstring(self.description())
+            description = self.description()

         description_metadata = {
             "content_id": f"metric_description:{self.test_id}::{revision_name}",
validmind/vm_models/test/result_wrapper.py CHANGED
@@ -13,7 +13,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Union

 import ipywidgets as widgets
-import markdown
+import mistune
 import pandas as pd
 from IPython.display import display

@@ -103,7 +103,7 @@ class ResultWrapper(ABC):
         """
         Convert a markdown string to html
         """
-        return markdown.markdown(description, extensions=["markdown.extensions.tables"])
+        return mistune.html(description)

     def _summary_tables_to_widget(self, summary: ResultSummary):
         """
@@ -120,21 +120,19 @@ class ResultWrapper(ABC):
                 [
                     {
                         "selector": "",
-                        "props": [
-                            ("width", "100%"),
-                        ],
+                        "props": [("width", "100%")],
+                    },
+                    {
+                        "selector": "th",
+                        "props": [("text-align", "left")],
                     },
                     {
                         "selector": "tbody tr:nth-child(even)",
-                        "props": [
-                            ("background-color", "#FFFFFF"),
-                        ],
+                        "props": [("background-color", "#FFFFFF")],
                     },
                     {
                         "selector": "tbody tr:nth-child(odd)",
-                        "props": [
-                            ("background-color", "#F5F5F5"),
-                        ],
+                        "props": [("background-color", "#F5F5F5")],
                     },
                     {
                         "selector": "td, th",
@@ -144,7 +142,8 @@
                         ],
                     },
                 ]
-            )  # add borders
+            )
+            .set_properties(**{"text-align": "left"})
             .to_html(escape=False)
         )  # table.data is an orient=records dump

@@ -217,7 +216,7 @@ class MetricResultWrapper(ResultWrapper):
             return ""

         vbox_children = [
-            widgets.HTML(value=f"<h1>{test_id_to_name(self.result_id)}</h1>")
+            widgets.HTML(value=f"<h1>{test_id_to_name(self.result_id)}</h1>"),
         ]

         if self.result_metadata:
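The markdown-to-HTML conversion now goes through mistune instead of the markdown package. A quick sketch of the new call, assuming mistune 2.x or later, where mistune.html is a prebuilt converter that appears to ship with the table plugin enabled, so summary tables keep rendering without the extensions= argument the markdown package needed:

import mistune

description = "| Metric | Value |\n| --- | --- |\n| AUC | 0.91 |\n"
print(mistune.html(description))  # renders the markdown table as an HTML <table>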
validmind/vm_models/test/test.py CHANGED
@@ -6,6 +6,7 @@

 from abc import abstractmethod
 from dataclasses import dataclass
+from inspect import getdoc
 from typing import ClassVar, List, TypedDict
 from uuid import uuid4

@@ -66,7 +67,7 @@ class Test(TestUtils):
         Return the test description. May be overridden by subclasses. Defaults
         to returning the class' docstring
         """
-        return self.__doc__.strip()
+        return getdoc(self).strip()

     @abstractmethod
     def summary(self, *args, **kwargs):
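Switching from self.__doc__ to inspect.getdoc() means descriptions come back already dedented, which is presumably what lets clean_docstring be dropped from utils.py earlier in this diff. A small illustration of the difference, independent of the validmind codebase; the class name is made up:

from inspect import getdoc

class ExampleTest:
    """
    First line of the description.

    - a bullet that previously needed manual re-indenting
    """

# __doc__ keeps the source indentation of every continuation line
print(repr(ExampleTest.__doc__.splitlines()[1]))  # '    First line of the description.'

# getdoc() strips the common leading whitespace, so the text is markdown-ready
print(repr(getdoc(ExampleTest).splitlines()[0]))  # 'First line of the description.'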
validmind/vm_models/test/threshold_test.py CHANGED
@@ -13,7 +13,6 @@ from dataclasses import dataclass
 from typing import ClassVar, List, Optional

 from ...ai import generate_description
-from ...utils import clean_docstring
 from ..figure import Figure
 from .result_summary import ResultSummary, ResultTable
 from .result_wrapper import ThresholdTestResultWrapper
@@ -94,7 +93,7 @@ class ThresholdTest(Test):
             )
         else:
             revision_name = "Default Description"
-            description = clean_docstring(self.description())
+            description = self.description()

         description_metadata = {
             "content_id": f"test_description:{self.test_id}::{revision_name}",