validmind 2.5.15__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +54 -112
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/tests/__types__.py +19 -10
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +20 -24
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +4 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/JarqueBera.py +22 -30
- validmind/tests/{model_validation/statsmodels → data_validation}/LJungBox.py +23 -27
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/{model_validation/statsmodels → data_validation}/RunsTest.py +17 -20
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +20 -22
- validmind/tests/data_validation/nlp/Hashtags.py +15 -20
- validmind/tests/data_validation/nlp/TextDescription.py +3 -1
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +5 -6
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/sklearn/FeatureImportance.py +3 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +59 -0
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +40 -20
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +0 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +50 -26
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/RECORD +43 -30
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/ContextUtilization.py
ADDED
@@ -0,0 +1,155 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextUtilization(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    answer_column: str = "answer",
+):  # noqa: B950
+    """
+    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
+    within the provided contexts.
+
+    ### Purpose
+
+    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
+    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
+    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
+
+    ### Test Mechanism
+
+    The test calculates Context Utilization using the formula:
+
+    $$
+    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
+
+
+    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
+    scores, generating a histogram and box plot for visualization.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
+      make sure they contain relevant info in the correct order.
+    - `answer` (str): The LLM-generated response for the input `question`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`,
+    and `answer_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info",
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    ### Signs of High Risk
+
+    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
+    - High standard deviation, suggesting inconsistent model performance.
+    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
+
+    ### Strengths
+
+    - Quantifies the rank of relevant context chunks in generating responses.
+    - Provides clear visualizations through histograms and box plots for ease of interpretation.
+    - Adapts to different dataset schemas by allowing configurable column names.
+
+    ### Limitations
+
+    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
+    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
+    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import context_utilization
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "answer": answer_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_utilization"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "context_utilization"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_utilization"].mean(),
+                    "Median Score": result_df["context_utilization"].median(),
+                    "Max Score": result_df["context_utilization"].max(),
+                    "Min Score": result_df["context_utilization"].min(),
+                    "Standard Deviation": result_df["context_utilization"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
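A minimal sketch of how this new test might be invoked once 2.5.18 is installed, assuming `vm.init()` has already been called and an LLM judge is configured for ragas; the DataFrame, `input_id`, and column names below are illustrative, and the test ID simply follows the module path shown above.

```python
import pandas as pd
import validmind as vm

# Illustrative RAG evaluation data; the column names are arbitrary and are
# mapped onto the test's expected fields via params below.
df = pd.DataFrame(
    {
        "user_question": ["What is the policy grace period?"],
        "retrieved_chunks": [["The grace period is 15 days.", "Unrelated clause."]],
        "model_answer": ["The grace period is 15 days."],
    }
)

vm_ds = vm.init_dataset(dataset=df, input_id="rag_eval_ds")

result = vm.tests.run_test(
    "validmind.model_validation.ragas.ContextUtilization",
    inputs={"dataset": vm_ds},
    params={
        "question_column": "user_question",
        "contexts_column": "retrieved_chunks",
        "answer_column": "model_answer",
    },
)
result.log()  # optionally upload the aggregate scores and plots
```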
validmind/tests/model_validation/ragas/NoiseSensitivity.py
ADDED
@@ -0,0 +1,152 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def NoiseSensitivity(
+    dataset,
+    answer_column="answer",
+    contexts_column="contexts",
+    ground_truth_column="ground_truth",
+):
+    """
+    Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
+    generates incorrect responses.
+
+    ### Purpose
+
+    The Noise Sensitivity test aims to measure how sensitive an LLM is to irrelevant or noisy information within the
+    contextual data used to generate its responses. A lower noise sensitivity score suggests better model robustness in
+    generating accurate answers from given contexts.
+
+    ### Test Mechanism
+
+    This test evaluates the model's answers by comparing the claims made in the generated response against the ground
+    truth and the retrieved context. The noise sensitivity score is calculated as:
+
+    $$
+    \\text{noise sensitivity} = {|\\text{Number of incorrect claims in answer}| \\over |\\text{Number of total claims in answer}|}
+    $$
+
+    The formula computes the fraction of incorrect claims to the total claims in the answer, using a dataset where
+    'answer', 'contexts', and 'ground_truth' columns are specified.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+      the answer.
+    - `answer` (str): The response generated by the model.
+    - `ground_truth` (str): The "correct" answer to the question.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column`, `answer_column`, and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info",
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": f"{pred_col}.answer",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+    }
+    ```
+    ### Signs of High Risk
+
+    - High noise sensitivity scores across multiple samples.
+    - Significant deviation between mean and median noise sensitivity scores.
+    - High standard deviation indicating inconsistency in the model's performance.
+
+    ### Strengths
+
+    - Provides a quantitative measure of how well the LLM handles noisy or irrelevant context.
+    - Easy integration and configuration using column parameters.
+    - Utilizes both histogram and box plot visualizations to analyze score distribution.
+
+    ### Limitations
+
+    - Requires accurate ground truth that aligns with the generated answers.
+    - Assumes the context provided is sufficiently granular to assess noise sensitivity.
+    - Primarily applicable to tasks like text QA, text generation, and text summarization where contextual relevance is
+      critical.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import noise_sensitivity_relevant
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[noise_sensitivity_relevant],
+        **get_ragas_config(),
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
+                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
+                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
+                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
+                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
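Similarly, a hedged sketch of running the new NoiseSensitivity test against a dataset that already holds answers, contexts, and ground truths under non-default column names; `vm_ds` and the column names are illustrative, and `vm.init()` plus a ragas LLM judge are assumed to be configured.

```python
import validmind as vm

# Assumes `vm_ds` is an initialized ValidMind dataset whose DataFrame contains
# the three required fields under these (illustrative) column names.
result = vm.tests.run_test(
    "validmind.model_validation.ragas.NoiseSensitivity",
    inputs={"dataset": vm_ds},
    params={
        "answer_column": "model_answer",
        "contexts_column": "retrieved_chunks",
        "ground_truth_column": "reference_answer",
    },
)
```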
validmind/tests/model_validation/sklearn/FeatureImportance.py
CHANGED
@@ -81,9 +81,9 @@ def FeatureImportance(dataset, model, num_features=3):
     # Dynamically add feature columns to the result
     for i in range(num_features):
         if i < len(top_features):
-            result[
-                f"
-
+            result[
+                f"Feature {i + 1}"
+            ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
         else:
             result[f"Feature {i + 1}"] = None
 
validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py
CHANGED
@@ -109,7 +109,7 @@ class PermutationFeatureImportance(Metric):
             )
         )
         fig.update_layout(
-            title_text="Permutation Importances
+            title_text="Permutation Importances",
            yaxis=dict(
                tickmode="linear",  # set tick mode to linear
                dtick=1,  # set interval between ticks
validmind/tests/model_validation/sklearn/RegressionR2Square.py
CHANGED
@@ -3,11 +3,10 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import pandas as pd
-
 from sklearn import metrics
 
-from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 from validmind import tags, tasks
+from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 
 
 @tags("sklearn", "model_performance")
validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py
CHANGED
@@ -78,6 +78,7 @@ class SHAPGlobalImportance(Metric):
     default_params = {
         "kernel_explainer_samples": 10,
         "tree_or_linear_explainer_samples": 200,
+        "class_of_interest": None,
     }
 
     def _generate_shap_plot(self, type_, shap_values, x_test):
@@ -107,6 +108,7 @@ class SHAPGlobalImportance(Metric):
                 shap_values / max_shap_value * 100
             )  # scaling factor to make the top feature 100%
             summary_plot_extra_args = {"plot_type": "bar"}
+
        shap.summary_plot(
            shap_values, x_test, show=False, **summary_plot_extra_args
        )
@@ -192,6 +194,10 @@ class SHAPGlobalImportance(Metric):
 
         shap_values = explainer.shap_values(shap_sample)
 
+        # Select the SHAP values for the specified class (classification) or for the regression output.
+        class_of_interest = self.params["class_of_interest"]
+        shap_values = _select_shap_values(shap_values, class_of_interest)
+
         figures = [
             self._generate_shap_plot("mean", shap_values, shap_sample),
             self._generate_shap_plot("summary", shap_values, shap_sample),
@@ -214,3 +220,56 @@ class SHAPGlobalImportance(Metric):
         for fig_num, type_ in enumerate(["mean", "summary"], start=1):
             assert isinstance(self.result.figures[fig_num - 1], Figure)
             assert self.result.figures[fig_num - 1].metadata["type"] == type_
+
+
+def _select_shap_values(shap_values, class_of_interest=None):
+    """
+    Selects SHAP values for binary or multiclass classification. For regression models,
+    returns the SHAP values directly as there are no classes.
+
+    Parameters:
+    -----------
+    shap_values : list or numpy.ndarray
+        The SHAP values returned by the SHAP explainer. For multiclass classification,
+        this will be a list where each element corresponds to a class. For regression,
+        this will be a single array of SHAP values.
+
+    class_of_interest : int, optional
+        The class index for which to retrieve SHAP values. If None (default), the function
+        will assume binary classification and use class 1 by default.
+
+    Returns:
+    --------
+    numpy.ndarray
+        The SHAP values for the specified class (classification) or for the regression output.
+
+    Raises:
+    -------
+    ValueError
+        If class_of_interest is specified and is out of bounds for the number of classes.
+    """
+    # Check if we are dealing with a multiclass classification
+    if isinstance(shap_values, list):
+        num_classes = len(shap_values)
+
+        # Default to class 1 for binary classification
+        if num_classes == 2 and class_of_interest is None:
+            logger.info(
+                "Binary classification detected: using SHAP values for class 1 (positive class)."
+            )
+            return shap_values[1]
+        else:
+            # Multiclass classification: use the specified class_of_interest
+            if class_of_interest is not None and 0 <= class_of_interest < num_classes:
+                logger.info(
+                    f"Multiclass classification: using SHAP values for class {class_of_interest}."
+                )
+                return shap_values[class_of_interest]
+            else:
+                raise ValueError(
+                    f"Invalid class_of_interest: {class_of_interest}. Must be between 0 and {num_classes - 1}."
+                )
+    else:
+        # For regression, return the SHAP values as they are
+        logger.info("Regression model detected: returning SHAP values as-is.")
+        return shap_values
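The new `class_of_interest` parameter lets callers pick which class's SHAP values to plot for multiclass models, with binary classifiers defaulting to class 1 and regression outputs passing through unchanged. A hedged sketch of overriding it when running the test; `vm_model` and `vm_test_ds` are assumed to be an already-initialized ValidMind model and test dataset for a multiclass classifier.

```python
import validmind as vm

# Plot global SHAP importance for class index 2 of a multiclass model.
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.SHAPGlobalImportance",
    inputs={"model": vm_model, "dataset": vm_test_ds},
    params={"class_of_interest": 2},
)
```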
validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py
CHANGED
@@ -2,15 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-
-
+import pandas as pd
 from statsmodels.stats.stattools import durbin_watson
 
-from validmind
+from validmind import tags, tasks
 
 
-@
-
+@tasks("regression")
+@tags("time_series_data", "forecasting", "statistical_test", "statsmodels")
+def DurbinWatsonTest(dataset, model, threshold=[1.5, 2.5]):
     """
     Assesses autocorrelation in time series data features using the Durbin-Watson statistic.
 
@@ -49,18 +49,38 @@ class DurbinWatsonTest(Metric):
     to detect higher-order autocorrelation.
     """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Validate threshold values
+    if not (0 < threshold[0] < threshold[1] < 4):
+        raise ValueError(
+            "Invalid threshold. It should be in the form [a, b] where 0 < a < b < 4."
+        )
+
+    # Check if threshold values are around 2
+    if abs(2 - threshold[0]) > 1 or abs(2 - threshold[1]) > 1:
+        raise ValueError(
+            "Threshold values should be around 2 for meaningful Durbin-Watson test results."
+        )
+
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    residuals = y_true - y_pred
+
+    dw_statistic = durbin_watson(residuals)
+
+    def get_autocorrelation(dw_value, threshold):
+        if dw_value < threshold[0]:
+            return "Positive autocorrelation"
+        elif dw_value > threshold[1]:
+            return "Negative autocorrelation"
+        else:
+            return "No autocorrelation"
+
+    results = pd.DataFrame(
+        {
+            "dw_statistic": [dw_statistic],
+            "threshold": [str(threshold)],
+            "autocorrelation": [get_autocorrelation(dw_statistic, threshold)],
+        }
+    )
+
+    return results
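The rewritten test reduces to the statistic-plus-threshold rule shown in the hunk. A self-contained sketch of that interpretation logic on synthetic residuals (the values and seed are arbitrary):

```python
import numpy as np
from statsmodels.stats.stattools import durbin_watson

rng = np.random.default_rng(0)
residuals = rng.normal(size=200)  # white-noise residuals, so the statistic should sit near 2

dw_statistic = durbin_watson(residuals)

threshold = [1.5, 2.5]
if dw_statistic < threshold[0]:
    autocorrelation = "Positive autocorrelation"
elif dw_statistic > threshold[1]:
    autocorrelation = "Negative autocorrelation"
else:
    autocorrelation = "No autocorrelation"

print(round(dw_statistic, 3), autocorrelation)
```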
validmind/tests/model_validation/statsmodels/RegressionCoeffs.py
CHANGED
@@ -7,8 +7,8 @@ import pandas as pd
 import plotly.graph_objects as go
 from scipy import stats
 
-from validmind.errors import SkipTestError
 from validmind import tags, tasks
+from validmind.errors import SkipTestError
 
 
 @tags("tabular_data", "visualization", "model_training")
validmind/utils.py
CHANGED
@@ -175,6 +175,10 @@ def format_records(df):
             continue
         not_zero = df[col][df[col] != 0]
         min_number = not_zero.min()
+        if math.isnan(min_number) or math.isinf(min_number):
+            df[col] = df[col].round(DEFAULT_SMALL_NUMBER_DECIMALS)
+            continue
+
         _, min_scale = precision_and_scale(min_number)
 
         if min_number >= 10:
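The new guard covers columns with no finite non-zero values, where `.min()` returns NaN and the downstream scale calculation would otherwise misbehave. A small illustration of that edge case (the DataFrame is made up):

```python
import math

import pandas as pd

df = pd.DataFrame({"all_zeros": [0.0, 0.0, 0.0], "mixed": [0.0, 0.25, 3.0]})

for col in df.columns:
    min_number = df[col][df[col] != 0].min()
    # For "all_zeros" the filtered series is empty, so min() is NaN,
    # which is exactly the case the new isnan/isinf check short-circuits.
    print(col, min_number, math.isnan(min_number))
```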
validmind/vm_models/test/result_wrapper.py
CHANGED
@@ -128,6 +128,8 @@ class ResultWrapper(ABC):
     # id of the result, can be set by the subclass. This helps
     # looking up results later on
     result_id: str = None
+    # Text description from test or metric (docstring usually)
+    result_description: str = None
     # Text metadata about the result, can include description, etc.
     result_metadata: List[dict] = None
     # Output template to use for rendering the result
@@ -300,38 +302,60 @@ class MetricResultWrapper(ResultWrapper):
         return VBox(vbox_children)
 
     def _get_filtered_summary(self):
-        """Check if the metric summary has columns from input datasets"""
-        dataset_columns =
-
-
-
-
-
-        dataset_columns
-
-        for table in [*self.metric.summary.results]:
-            columns = set()
+        """Check if the metric summary has columns from input datasets with matching row counts."""
+        dataset_columns = self._get_dataset_columns()
+        filtered_results = []
+
+        for table in self.metric.summary.results:
+            table_columns = self._get_table_columns(table)
+            sensitive_columns = self._find_sensitive_columns(
+                dataset_columns, table_columns
+            )
 
-            if
-
-            elif isinstance(table.data, list):
-                columns.update(table.data[0].keys())
+            if sensitive_columns:
+                self._log_sensitive_data_warning(sensitive_columns)
             else:
-
+                filtered_results.append(table)
 
-
-
-
-
-
-
-
+        self.metric.summary.results = filtered_results
+        return self.metric.summary
+
+    def _get_dataset_columns(self):
+        dataset_columns = {}
+        for input_item in self.inputs:
+            input_id = (
+                input_item if isinstance(input_item, str) else input_item.input_id
+            )
+            input_obj = input_registry.get(input_id)
+            if isinstance(input_obj, VMDataset):
+                dataset_columns.update(
+                    {col: len(input_obj.df) for col in input_obj.columns}
                 )
+        return dataset_columns
 
-
+    def _get_table_columns(self, table):
+        if isinstance(table.data, pd.DataFrame):
+            return {col: len(table.data) for col in table.data.columns}
+        elif isinstance(table.data, list) and table.data:
+            return {col: len(table.data) for col in table.data[0].keys()}
+        else:
+            raise ValueError("Invalid data type in summary table")
 
-
+    def _find_sensitive_columns(self, dataset_columns, table_columns):
+        return [
+            col
+            for col, row_count in table_columns.items()
+            if col in dataset_columns and row_count == dataset_columns[col]
+        ]
+
+    def _log_sensitive_data_warning(self, sensitive_columns):
+        logger.warning(
+            "Sensitive data in metric summary table. Not logging to API automatically. "
+            "Pass `unsafe=True` to result.log() method to override manually."
+        )
+        logger.warning(
+            f"The following columns are present in the table with matching row counts: {sensitive_columns}"
+        )
 
     async def log_async(
         self, section_id: str = None, position: int = None, unsafe=False