validmind 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (127)
  1. validmind/__init__.py +6 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +193 -0
  4. validmind/api_client.py +45 -31
  5. validmind/client.py +33 -6
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/credit_risk/__init__.py +11 -0
  8. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  9. validmind/datasets/credit_risk/lending_club.py +394 -0
  10. validmind/datasets/nlp/__init__.py +5 -0
  11. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  12. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  13. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  14. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  15. validmind/errors.py +11 -1
  16. validmind/logging.py +9 -2
  17. validmind/models/huggingface.py +2 -2
  18. validmind/models/pytorch.py +3 -3
  19. validmind/models/sklearn.py +4 -4
  20. validmind/template.py +2 -2
  21. validmind/test_suites/__init__.py +4 -2
  22. validmind/tests/__init__.py +130 -45
  23. validmind/tests/data_validation/DatasetDescription.py +0 -1
  24. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  25. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  26. validmind/tests/data_validation/ScatterPlot.py +8 -2
  27. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  28. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  29. validmind/tests/decorator.py +313 -0
  30. validmind/tests/model_validation/BertScore.py +1 -1
  31. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  32. validmind/tests/model_validation/BleuScore.py +1 -1
  33. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  34. validmind/tests/model_validation/ContextualRecall.py +1 -1
  35. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  36. validmind/tests/model_validation/MeteorScore.py +92 -0
  37. validmind/tests/model_validation/RegardHistogram.py +6 -7
  38. validmind/tests/model_validation/RegardScore.py +4 -6
  39. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  40. validmind/tests/model_validation/RougeMetrics.py +7 -5
  41. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  42. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  43. validmind/tests/model_validation/TokenDisparity.py +1 -1
  44. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  45. validmind/tests/model_validation/ToxicityScore.py +1 -1
  46. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  47. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  48. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  49. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  50. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +16 -17
  51. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  52. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  53. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  54. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  55. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  56. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  57. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  58. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  59. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  60. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  61. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  62. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  63. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +55 -5
  64. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  65. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +11 -5
  66. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  67. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  68. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  69. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  70. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  71. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  72. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  73. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  74. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  75. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +2 -2
  76. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  77. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  78. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  79. validmind/tests/test_providers.py +14 -124
  80. validmind/unit_metrics/__init__.py +75 -70
  81. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  82. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  83. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  84. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  85. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  86. validmind/unit_metrics/composite.py +228 -0
  87. validmind/unit_metrics/regression/GiniCoefficient.py +33 -0
  88. validmind/unit_metrics/regression/HuberLoss.py +23 -0
  89. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +30 -0
  90. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +16 -0
  91. validmind/unit_metrics/regression/MeanBiasDeviation.py +13 -0
  92. validmind/unit_metrics/regression/QuantileLoss.py +15 -0
  93. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +21 -0
  94. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +13 -0
  95. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +13 -0
  96. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +13 -0
  97. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +20 -0
  98. validmind/utils.py +20 -31
  99. validmind/vm_models/__init__.py +0 -2
  100. validmind/vm_models/dataset.py +623 -29
  101. validmind/vm_models/figure.py +52 -17
  102. validmind/vm_models/test/metric.py +33 -31
  103. validmind/vm_models/test/output_template.py +0 -27
  104. validmind/vm_models/test/result_wrapper.py +68 -36
  105. validmind/vm_models/test/test.py +4 -2
  106. validmind/vm_models/test/threshold_test.py +24 -14
  107. validmind/vm_models/test_context.py +7 -0
  108. validmind/vm_models/test_suite/runner.py +1 -1
  109. validmind/vm_models/test_suite/summary.py +3 -3
  110. validmind/vm_models/test_suite/test.py +1 -1
  111. validmind/vm_models/test_suite/test_suite.py +2 -1
  112. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/METADATA +18 -18
  113. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/RECORD +116 -94
  114. validmind-2.1.0.dist-info/entry_points.txt +3 -0
  115. validmind/tests/__types__.py +0 -62
  116. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  117. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  118. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  119. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  120. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -20
  121. validmind/unit_metrics/sklearn/classification/F1.py +0 -22
  122. validmind/unit_metrics/sklearn/classification/Precision.py +0 -22
  123. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -20
  124. validmind/unit_metrics/sklearn/classification/Recall.py +0 -20
  125. validmind/vm_models/test/unit_metric.py +0 -88
  126. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
  127. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
validmind/tests/decorator.py (new file)
@@ -0,0 +1,313 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Decorators for creating and registering metrics with the ValidMind framework."""
+
+# TODO: as we move entirely to a functional approach a lot of this logic
+# should be moved into the __init__ to replace the old class-based stuff
+
+import inspect
+import os
+from uuid import uuid4
+
+import pandas as pd
+
+from validmind.errors import MissingRequiredTestInputError
+from validmind.logging import get_logger
+from validmind.vm_models import (
+    Metric,
+    MetricResult,
+    ResultSummary,
+    ResultTable,
+    ResultTableMetadata,
+)
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.test.result_wrapper import MetricResultWrapper
+
+logger = get_logger(__name__)
+
+
+def _inspect_signature(test_func: callable):
+    input_keys = ["dataset", "datasets", "model", "models"]
+
+    inputs = {}
+    params = {}
+
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in input_keys:
+            target_dict = inputs
+        else:
+            target_dict = params
+
+        target_dict[name] = {
+            "type": arg.annotation,
+            "default": (
+                arg.default if arg.default is not inspect.Parameter.empty else None
+            ),
+        }
+
+    return inputs, params
+
+
+def _build_result(results, test_id, description, output_template, inputs):  # noqa: C901
+    ref_id = str(uuid4())
+    figure_metadata = {
+        "_type": "metric",
+        "_name": test_id,
+        "_ref_id": ref_id,
+    }
+
+    tables = []
+    figures = []
+
+    def process_item(item):
+        # TODO: build out a more robust/extensible system for this
+        # TODO: custom type handlers would be really cool
+
+        # unit metrics (scalar values) - show in a simple table for now
+        if isinstance(item, int) or isinstance(item, float) or isinstance(item, str):
+            tables.append(ResultTable(data=[{test_id.split(".")[-1]: item}]))
+
+        # plots
+        elif isinstance(item, Figure):
+            figures.append(item)
+        elif is_matplotlib_figure(item) or is_plotly_figure(item) or is_png_image(item):
+            figures.append(
+                Figure(
+                    key=f"{test_id}:{len(figures) + 1}",
+                    figure=item,
+                    metadata=figure_metadata,
+                )
+            )
+
+        # tables
+        elif isinstance(item, list) or isinstance(item, pd.DataFrame):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, dict):
+            for table_name, table in item.items():
+                if not isinstance(table, list) and not isinstance(table, pd.DataFrame):
+                    raise ValueError(
+                        f"Invalid table format: {table_name} must be a list or DataFrame"
+                    )
+
+                tables.append(
+                    ResultTable(
+                        data=table,
+                        metadata=ResultTableMetadata(title=table_name),
+                    )
+                )
+
+        else:
+            raise ValueError(f"Invalid return type: {type(item)}")
+
+    # if the results are a tuple, process each item as a separate result
+    if isinstance(results, tuple):
+        for item in results:
+            process_item(item)
+    else:
+        process_item(results)
+
+    return MetricResultWrapper(
+        result_id=test_id,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=ref_id,
+            value="Empty",
+            summary=ResultSummary(results=tables),
+        ),
+        figures=figures,
+        result_metadata=[
+            {
+                "content_id": f"metric_description:{test_id}",
+                "text": description,
+            }
+        ],
+        inputs=inputs,
+        output_template=output_template,
+    )
+
+
+def _get_run_method(func, inputs, params):
+    def run(self: Metric):
+        input_kwargs = {}
+        for k in inputs.keys():
+            try:
+                input_kwargs[k] = getattr(self.inputs, k)
+            except AttributeError:
+                raise MissingRequiredTestInputError(f"Missing required input: {k}.")
+
+        param_kwargs = {
+            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+        }
+
+        raw_results = func(**input_kwargs, **param_kwargs)
+
+        self.result = _build_result(
+            results=raw_results,
+            test_id=self.test_id,
+            description=inspect.getdoc(self),
+            output_template=self.output_template,
+            inputs=list(inputs.keys()),
+        )
+
+        return self.result
+
+    return run
+
+
+def _get_save_func(func, test_id):
+    def save(root_folder=".", imports=None):
+        parts = test_id.split(".")
+
+        if len(parts) > 1:
+            path = os.path.join(root_folder, *parts[1:-1])
+            test_name = parts[-1]
+            new_test_id = f"<test_provider_namespace>.{'.'.join(parts[1:])}"
+        else:
+            path = root_folder
+            test_name = parts[0]
+            new_test_id = f"<test_provider_namespace>.{test_name}"
+
+        if not os.path.exists(path):
+            os.makedirs(path, exist_ok=True)
+
+        full_path = os.path.join(path, f"{test_name}.py")
+
+        source = inspect.getsource(func)
+        # remove decorator line
+        source = source.split("\n", 1)[1]
+        if imports:
+            imports = "\n".join(imports)
+            source = f"{imports}\n\n\n{source}"
+        # add comment to the top of the file
+        source = f"""
+# Saved from {func.__module__}.{func.__name__}
+# Original Test ID: {test_id}
+# New Test ID: {new_test_id}
+
+{source}
+"""
+
+        # ensure that the function name matches the test name
+        source = source.replace(f"def {func.__name__}", f"def {test_name}")
+
+        # use black to format the code
+        try:
+            import black
+
+            source = black.format_str(source, mode=black.FileMode())
+        except ImportError:
+            # ignore if not available
+            pass
+
+        with open(full_path, "w") as file:
+            file.writelines(source)
+
+        logger.info(
+            f"Saved to {os.path.abspath(full_path)}!"
+            "Be sure to add any necessary imports to the top of the file."
+        )
+        logger.info(
+            f"This metric can be run with the ID: {new_test_id}",
+        )
+
+    return save
+
+
+def metric(func_or_id):
+    """Decorator for creating and registering metrics with the ValidMind framework.
+
+    Creates a metric object and registers it with ValidMind under the provided ID. If
+    no ID is provided, the function name will be used to build one. So if the
+    function name is `my_metric`, the metric will be registered under the ID
+    `validmind.custom_metrics.my_metric`.
+
+    This decorator works by creating a new `Metric` class whose `run`
+    method calls the decorated function. This function should take as arguments the
+    inputs it requires (`dataset`, `datasets`, `model`, `models`) followed by any
+    parameters. It can return any number of the following types:
+
+    - Table: Either a list of dictionaries or a pandas DataFrame
+    - Plot: Either a matplotlib figure or a plotly figure
+    - Scalar: A single number or string
+
+    The function may also include a docstring. This docstring will be used and logged
+    as the metric's description.
+
+    Args:
+        func: The function to decorate
+        test_id: The identifier for the metric. If not provided, the function name is used.
+
+    Returns:
+        The decorated function.
+    """
+
+    from . import _register_custom_test
+
+    def decorator(func):
+        test_id = func_or_id or f"validmind.custom_metrics.{func.__name__}"
+
+        inputs, params = _inspect_signature(func)
+        description = inspect.getdoc(func)
+        tasks = getattr(func, "__tasks__", [])
+        tags = getattr(func, "__tags__", [])
+
+        metric_class = type(
+            func.__name__,
+            (Metric,),
+            {
+                "run": _get_run_method(func, inputs, params),
+                "required_inputs": list(inputs.keys()),
+                "default_parameters": params,
+                "__doc__": description,
+                "metadata": {
+                    "task_types": tasks,
+                    "tags": tags,
+                },
+            },
+        )
+        _register_custom_test(test_id, metric_class)
+
+        # special function to allow the function to be saved to a file
+        func.save = _get_save_func(func, test_id)
+
+        return func
+
+    if callable(func_or_id):
+        return decorator(func_or_id)
+
+    return decorator
+
+
+def tasks(*tasks):
+    """Decorator for specifying the task types that a metric is designed for.
+
+    Args:
+        *tasks: The task types that the metric is designed for.
+    """
+
+    def decorator(func):
+        func.__tasks__ = list(tasks)
+        return func
+
+    return decorator
+
+
+def tags(*tags):
+    """Decorator for specifying tags for a metric.
+
+    Args:
+        *tags: The tags to apply to the metric.
+    """
+
+    def decorator(func):
+        func.__tags__ = list(tags)
+        return func
+
+    return decorator
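
For orientation, here is a minimal usage sketch of the decorators defined in this new module. The metric name, its logic, and the `dataset.df` / `dataset.target_column` attributes used below are illustrative assumptions, not code from the release; the registration and return-type behavior is taken from the `metric()` docstring above.

# Hedged sketch: registering a custom metric with the decorators from validmind/tests/decorator.py.
# The function body and the dataset attributes it touches are assumptions for illustration only.
from validmind.tests.decorator import metric, tags, tasks


@metric("validmind.custom_metrics.class_balance")
@tags("tabular_data", "example")
@tasks("classification")
def class_balance(dataset, normalize: bool = True):
    """Distribution of the target column, returned as a table."""
    counts = dataset.df[dataset.target_column].value_counts(normalize=normalize)
    return counts.rename("proportion").reset_index()

Note the ordering: `@tags` and `@tasks` are applied first (they only set `__tags__` / `__tasks__` on the function), so `@metric` sits on top and picks them up when it builds and registers the `Metric` subclass. Per `_get_save_func`, calling `class_balance.save()` would then write the function out as a standalone test file under a test-provider namespace.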
validmind/tests/model_validation/BertScore.py
@@ -57,7 +57,7 @@ class BertScore(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         # Load the bert evaluation metric
         bert = evaluate.load("bertscore")
validmind/tests/model_validation/BertScoreAggregate.py
@@ -50,7 +50,7 @@ class BertScoreAggregate(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         bert = evaluate.load("bertscore")
         bert_s = bert.compute(predictions=y_pred, references=y_true, lang="en")
validmind/tests/model_validation/BleuScore.py
@@ -55,7 +55,7 @@ class BleuScore(Metric):
 
         # Compute the BLEU score
         bleu = bleu.compute(
-            predictions=self.inputs.dataset.y_pred(self.inputs.model.input_id),
+            predictions=self.inputs.dataset.y_pred(self.inputs.model),
             references=self.inputs.dataset.y,
         )
         return self.cache_results(metric_value={"blue_score_metric": bleu})
validmind/tests/model_validation/ClusterSizeDistribution.py
@@ -61,7 +61,7 @@ class ClusterSizeDistribution(Metric):
 
     def run(self):
        y_true_train = self.inputs.dataset.y
-        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
         y_true_train = y_true_train.astype(y_pred_train.dtype)
         df = pd.DataFrame(
             {"Actual": y_true_train.ravel(), "Prediction": y_pred_train.ravel()}
validmind/tests/model_validation/ContextualRecall.py
@@ -66,7 +66,7 @@ class ContextualRecall(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         score_list = []
         for y_t, y_p in zip(y_true, y_pred):
validmind/tests/model_validation/FeaturesAUC.py (new file)
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from sklearn.metrics import roc_auc_score
+
+from validmind.errors import SkipTestError
+from validmind.logging import get_logger
+from validmind.vm_models import Figure, Metric
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class FeaturesAUC(Metric):
+    """
+    Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.
+
+    **Purpose**: The central objective of this metric is to quantify how well each feature on its own can differentiate between the two classes in a binary classification problem. It serves as a univariate analysis tool that can help in pre-modeling feature selection or post-modeling interpretation.
+
+    **Test Mechanism**: For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's univariate classification strength.
+
+    **Signs of High Risk**:
+    - A feature with a low AUC score may not be contributing significantly to the differentiation between the two classes, which could be a concern if it is expected to be predictive.
+    - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or other issues with the data.
+
+    **Strengths**:
+    - By isolating each feature, it highlights the individual contribution of features to the classification task without the influence of other variables.
+    - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual features after model training.
+
+    **Limitations**:
+    - Does not reflect the combined effects of features or any interaction between them, which can be critical in certain models.
+    - The AUC values are calculated without considering the model's use of the features, which could lead to different interpretations of feature importance when considering the model holistically.
+    - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass classification or regression without modifications.
+    """
+
+    name = "features_auc"
+    required_inputs = ["model", "dataset"]
+    default_params = {
+        "fontsize": 12,
+        "figure_height": 500,
+    }
+    metadata = {
+        "task_types": ["classification"],
+        "tags": [
+            "feature_importance",
+            "AUC",
+            "visualization",
+        ],
+    }
+
+    def run(self):
+        x = self.inputs.dataset.x_df()
+        y = self.inputs.dataset.y_df()
+
+        if y.nunique() != 2:
+            raise SkipTestError("FeaturesAUC metric requires a binary target variable.")
+
+        aucs = pd.DataFrame(index=x.columns, columns=["AUC"])
+
+        for column in x.columns:
+            feature_values = x[column]
+            if feature_values.nunique() > 1:
+                auc_score = roc_auc_score(y, feature_values)
+                aucs.loc[column, "AUC"] = auc_score
+            else:
+                aucs.loc[
+                    column, "AUC"
+                ] = np.nan  # Not enough unique values to calculate AUC
+
+        # Sorting the AUC scores in descending order
+        sorted_indices = aucs["AUC"].dropna().sort_values(ascending=False).index
+
+        # Plotting the results
+        fig = go.Figure()
+        fig.add_trace(
+            go.Bar(
+                y=[column for column in sorted_indices],
+                x=[aucs.loc[column, "AUC"] for column in sorted_indices],
+                orientation="h",
+            )
+        )
+        fig.update_layout(
+            title_text="Feature AUC Scores",
+            yaxis=dict(
+                tickmode="linear",
+                dtick=1,
+                tickfont=dict(size=self.params["fontsize"]),
+                title="Features",
+                autorange="reversed",  # Ensure that the highest AUC is at the top
+            ),
+            xaxis=dict(title="AUC"),
+            height=self.params["figure_height"],
+        )
+
+        return self.cache_results(
+            metric_value=aucs.to_dict(),
+            figures=[
+                Figure(
+                    for_object=self,
+                    key="features_auc",
+                    figure=fig,
+                ),
+            ],
+        )
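
The test mechanism described in the FeaturesAUC docstring (each feature's raw values scored directly against the binary target) can be reproduced outside the framework with scikit-learn alone. A minimal sketch follows; the toy data is invented for illustration and is not part of the package:

# Standalone sketch of the per-feature AUC idea used by FeaturesAUC above (toy data).
import pandas as pd
from sklearn.metrics import roc_auc_score

X = pd.DataFrame({"income": [20, 35, 50, 80, 95], "age": [25, 40, 31, 52, 47]})
y = pd.Series([0, 0, 1, 1, 1])

# One AUC per column: the feature values act as the "scores", the binary target as ground truth.
feature_aucs = {
    col: roc_auc_score(y, X[col]) for col in X.columns if X[col].nunique() > 1
}
print(feature_aucs)  # {'income': 1.0, 'age': 0.833...}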
validmind/tests/model_validation/MeteorScore.py (new file)
@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import evaluate
+import pandas as pd
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class MeteorScore(Metric):
+    """
+    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
+
+    **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
+    of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
+    of translations, incorporating precision, recall, and word order into its assessment.
+
+    **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
+    corresponding human-produced reference. This is done by considering unigram matches between the translations, including
+    matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
+    adjusted for word order through a fragmentation penalty.
+
+    **Signs of High Risk**:
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths**:
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
+    content coverage in translations.
+    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
+    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
+
+    **Limitations**:
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
+    large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
+    quality and relevance to the specific translation task.
+    """
+
+    name = "meteor_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Load the METEOR metric
+        meteor = evaluate.load("meteor")
+
+        # Initialize a list to hold METEOR scores
+        meteor_scores = []
+
+        for prediction, reference in zip(
+            self.inputs.dataset.y_pred(self.inputs.model),
+            self.inputs.dataset.y,
+        ):
+            # Compute the METEOR score for the current prediction-reference pair
+            result = meteor.compute(predictions=[prediction], references=[reference])
+            meteor_scores.append(result["meteor"])
+
+        # Visualization of METEOR scores
+        figures = self.visualize_scores(meteor_scores)
+
+        return self.cache_results(figures=figures)
+
+    def visualize_scores(self, scores):
+        # Convert the scores list to a DataFrame for plotting
+        scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
+
+        # Create a line plot of the METEOR scores
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["METEOR Score"],
+                mode="lines+markers",
+                name="METEOR Score",
+            )
+        )
+        fig.update_layout(
+            title="METEOR Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="METEOR Score",
+        )
+
+        # Wrap the Plotly figure for compatibility with your framework
+        figures = [Figure(for_object=self, key=self.key, figure=fig)]
+
+        return figures
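
The per-pair scoring loop in MeteorScore.run can also be exercised directly with the Hugging Face evaluate library, as the same load/compute calls appear in the hunk above. A minimal sketch with invented sentences:

# Standalone sketch of per-pair METEOR scoring, mirroring MeteorScore.run above (toy sentences).
import evaluate

meteor = evaluate.load("meteor")

predictions = ["The cat sat on the mat.", "Rain is expected later today."]
references = ["A cat was sitting on the mat.", "Showers are expected later in the day."]

# One METEOR score per prediction-reference pair, as in the test's run() method.
scores = [
    meteor.compute(predictions=[p], references=[r])["meteor"]
    for p, r in zip(predictions, references)
]
print(scores)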
validmind/tests/model_validation/RegardHistogram.py
@@ -57,22 +57,20 @@ class RegardHistogram(Metric):
             raise AttributeError("The 'model' attribute is missing.")
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
-                "Inconsistent lengths among input text, true summaries, and predicted summaries."
+                "Inconsistent lengths among true summaries and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_histogram(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -101,6 +99,7 @@ class RegardHistogram(Metric):
         )
 
         row_offset = 0
+
        for column_name, column_data in dataframes.items():
            results = regard_tool.compute(data=column_data)["regard"]
            regard_dicts = [
validmind/tests/model_validation/RegardScore.py
@@ -58,22 +58,20 @@ class RegardScore(Metric):
             raise AttributeError("The 'model' attribute is missing.")
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }