validmind 2.0.1__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +4 -1
- validmind/__version__.py +1 -1
- validmind/ai.py +197 -0
- validmind/api_client.py +16 -4
- validmind/client.py +23 -3
- validmind/datasets/classification/customer_churn.py +2 -2
- validmind/datasets/nlp/__init__.py +5 -0
- validmind/datasets/nlp/cnn_dailymail.py +98 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
- validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
- validmind/errors.py +11 -1
- validmind/models/huggingface.py +2 -2
- validmind/models/pytorch.py +3 -3
- validmind/models/sklearn.py +4 -4
- validmind/tests/__init__.py +47 -9
- validmind/tests/data_validation/DatasetDescription.py +0 -1
- validmind/tests/data_validation/nlp/StopWords.py +1 -6
- validmind/tests/data_validation/nlp/TextDescription.py +20 -9
- validmind/tests/decorator.py +189 -0
- validmind/tests/model_validation/MeteorScore.py +92 -0
- validmind/tests/model_validation/RegardHistogram.py +5 -6
- validmind/tests/model_validation/RegardScore.py +3 -5
- validmind/tests/model_validation/RougeMetrics.py +6 -4
- validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +3 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +30 -4
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -3
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/prompt_validation/ai_powered_test.py +2 -0
- validmind/unit_metrics/__init__.py +0 -2
- validmind/unit_metrics/composite.py +275 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +39 -0
- validmind/unit_metrics/regression/HuberLoss.py +27 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +36 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +22 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +22 -0
- validmind/unit_metrics/regression/QuantileLoss.py +25 -0
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +27 -0
- validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +22 -0
- validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +22 -0
- validmind/unit_metrics/regression/sklearn/RSquaredScore.py +22 -0
- validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +23 -0
- validmind/unit_metrics/sklearn/classification/Accuracy.py +2 -0
- validmind/unit_metrics/sklearn/classification/F1.py +2 -0
- validmind/unit_metrics/sklearn/classification/Precision.py +2 -0
- validmind/unit_metrics/sklearn/classification/ROC_AUC.py +2 -0
- validmind/unit_metrics/sklearn/classification/Recall.py +2 -0
- validmind/utils.py +17 -1
- validmind/vm_models/dataset.py +376 -21
- validmind/vm_models/figure.py +52 -17
- validmind/vm_models/test/metric.py +33 -30
- validmind/vm_models/test/output_template.py +0 -27
- validmind/vm_models/test/result_wrapper.py +57 -24
- validmind/vm_models/test/test.py +2 -1
- validmind/vm_models/test/threshold_test.py +24 -13
- validmind/vm_models/test_context.py +7 -0
- validmind/vm_models/test_suite/runner.py +1 -1
- validmind/vm_models/test_suite/test.py +1 -1
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/METADATA +9 -13
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/RECORD +65 -44
- validmind-2.0.7.dist-info/entry_points.txt +3 -0
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/LICENSE +0 -0
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/WHEEL +0 -0
validmind/errors.py
CHANGED
@@ -48,7 +48,7 @@ class MissingCacheResultsArgumentsError(BaseError):
     pass
 
 
-class
+class MissingOrInvalidModelPredictFnError(BaseError):
     """
     When the pytorch model is missing a predict function or its predict
     method does not have the expected arguments.
@@ -315,6 +315,14 @@ class UnsupportedModelError(BaseError):
     pass
 
 
+class UnsupportedModelForSHAPError(BaseError):
+    """
+    When an unsupported model is used for SHAP importance.
+    """
+
+    pass
+
+
 class SkipTestError(BaseError):
     """
     Useful error to throw when a test cannot be executed.
@@ -361,6 +369,8 @@ def should_raise_on_fail_fast(error) -> bool:
     """
     error_class = error.__class__.__name__
    return error_class not in [
+        "MissingOrInvalidModelPredictFnError",
         "MissingRequiredTestInputError",
         "SkipTestError",
+        "UnsupportedModelForSHAPError",
     ]
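Both new entries in `should_raise_on_fail_fast` mean a fail-fast run now logs and moves past these errors instead of aborting. A minimal sketch of how calling code might treat the new SHAP error; the try/except wrapper below is illustrative and not part of the package:

    # Hedged sketch: reacting to the UnsupportedModelForSHAPError added in 2.0.7.
    from validmind.errors import UnsupportedModelForSHAPError

    def run_shap_or_skip(run_shap_fn, model):
        """Run a SHAP-based check, treating unsupported models as a soft skip."""
        try:
            return run_shap_fn(model)
        except UnsupportedModelForSHAPError as exc:
            # Mirrors should_raise_on_fail_fast(): this error class is excluded
            # from the fail-fast list, so logging and continuing is the intent.
            print(f"Skipping SHAP importance: {exc}")
            return None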
validmind/models/huggingface.py
CHANGED
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 
 import pandas as pd
 
-from validmind.errors import
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -44,7 +44,7 @@ class HFModel(VMModel):
         Invoke predict_proba from underline model
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implementation of predict_proba method with 1 argument"
                 + " that is tensor features matrix"
             )
validmind/models/pytorch.py
CHANGED
@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from validmind.errors import
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -41,7 +41,7 @@ class PyTorchModel(VMModel):
         Invoke predict_proba from underline model
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implemention of predict_proba method with 1 argument"
                 + " that is tensor features matrix"
             )
@@ -54,7 +54,7 @@ class PyTorchModel(VMModel):
         Predict method for the model. This is a wrapper around the model's
         """
         if not has_method_with_arguments(self.model, "predict", 1):
-            raise
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implemention of predict method with 1 argument"
                 + " that is tensor features matrix"
             )
validmind/models/sklearn.py
CHANGED
@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from validmind.errors import
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -40,9 +40,9 @@ class SKlearnModel(VMModel):
         predict_proba (for classification) or predict (for regression) method
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise
-                "
-                + "
+            raise MissingOrInvalidModelPredictFnError(
+                f"SKlearn model {self.model.__class__} Model does not have a compatible predict_proba implementation."
+                + " Please assign predictions directly with vm_dataset.assign_predictions(model, prediction_values)"
             )
         if callable(getattr(self.model, "predict_proba", None)):
             return self.model.predict_proba(*args, **kwargs)[:, 1]
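The new sklearn error message points users at assigning predictions directly on the dataset when the wrapped model has no compatible `predict_proba`. A hedged sketch of that workflow; the `init_model`/`init_dataset` setup and column names are assumptions, and only `assign_predictions(model, prediction_values)` comes from the message itself:

    # Illustrative only: exact keyword arguments may differ between releases.
    import validmind as vm

    vm_model = vm.init_model(model, input_id="my_model")
    vm_ds = vm.init_dataset(dataset=df, input_id="test_ds", target_column="target")

    # Compute predictions yourself and attach them, as the error message suggests:
    prediction_values = model.predict(df.drop(columns=["target"]))
    vm_ds.assign_predictions(vm_model, prediction_values)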
validmind/tests/__init__.py
CHANGED
@@ -18,6 +18,7 @@ from markdown import markdown
 from ..errors import LoadTestError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
+from ..unit_metrics.composite import load_composite_metric
 from ..utils import clean_docstring, format_dataframe, fuzzy_match, test_id_to_name
 from ..vm_models import TestContext, TestInput
 from .__types__ import ExternalTestProvider
@@ -43,6 +44,7 @@ __tests = None
 __test_classes = None
 
 __test_providers: Dict[str, ExternalTestProvider] = {}
+__custom_tests: Dict[str, object] = {}
 
 
 def _test_description(test_class, truncate=True):
@@ -260,13 +262,13 @@ def load_test(test_id, reload=False):  # noqa: C901
     error = None
     namespace = parts[0]
 
-    if
-
-            f"Unable to load test {test_id}. "
-            f"No Test Provider found for the namespace: {namespace}."
-        )
+    if test_id.split(":")[0] in __custom_tests:
+        test = __custom_tests[test_id.split(":")[0]]
 
-
+    elif test_id.startswith("validmind.composite_metric"):
+        test = load_composite_metric(test_id)
+
+    elif namespace == "validmind":
         test_module = ".".join(parts[1:-1])
         test_class = parts[-1]
 
@@ -284,6 +286,12 @@ def load_test(test_id, reload=False):  # noqa: C901
         except AttributeError:
             error = f"Unable to load test {test_id}. Class not in module: {test_class}"
 
+    elif namespace != "validmind" and namespace not in __test_providers:
+        error = (
+            f"Unable to load test {test_id}. "
+            f"No Test Provider found for the namespace: {namespace}."
+        )
+
     elif namespace in __test_providers:
         try:
             test = __test_providers[namespace].load_test(test_id.split(".", 1)[1])
@@ -346,11 +354,24 @@ def describe_test(test_id: str = None, raw: bool = False):
     )
 
 
-def run_test(
+def run_test(
+    test_id: str = None,
+    name: str = None,
+    unit_metrics: list = None,
+    params: dict = None,
+    inputs=None,
+    output_template=None,
+    **kwargs,
+):
     """Run a test by test ID
 
     Args:
-        test_id (str): The test ID
+        test_id (str, option): The test ID to run - required when running a single test
+            i.e. when not running multiple unit metrics
+        name (str, optional): The name of the test (used to create a composite metric
+            out of multiple unit metrics) - required when running multiple unit metrics
+        unit_metrics (list, optional): A list of unit metric IDs to run as a composite
+            metric - required when running multiple unit metrics
         params (dict, optional): A dictionary of params to override the default params
         inputs: A dictionary of test inputs to pass to the Test
         output_template (str, optional): A template to use for customizing the output
@@ -360,7 +381,20 @@ def run_test(test_id, params: dict = None, inputs=None, output_template=None, **
         - models: A list of models to use for the test
         other inputs can be accessed inside the test via `self.inputs["input_name"]`
     """
-
+    if not test_id and not name and not unit_metrics:
+        raise ValueError(
+            "`test_id` or `name` and `unit_metrics` must be provided to run a test"
+        )
+
+    if (unit_metrics and not name) or (name and not unit_metrics):
+        raise ValueError("`name` and `unit_metrics` must be provided together")
+
+    if unit_metrics:
+        TestClass = load_composite_metric(unit_metrics=unit_metrics, metric_name=name)
+        test_id = f"validmind.composite_metric.{name}"
+    else:
+        TestClass = load_test(test_id, reload=True)
+
     test = TestClass(
         test_id=test_id,
         context=TestContext(),
@@ -383,3 +417,7 @@ def register_test_provider(namespace: str, test_provider: ExternalTestProvider)
         test_provider (ExternalTestProvider): The test provider
     """
     __test_providers[namespace] = test_provider
+
+
+def _register_custom_test(test_id: str, test_class: object):
+    __custom_tests[test_id] = test_class
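Together, these `run_test` changes allow building a composite metric from individual unit metrics. A hedged usage sketch based on the new signature; the metric IDs are mapped from the file paths in the summary list above, while the composite name, the `vm_model`/`vm_dataset` inputs, and the `result.log()` call are assumed setup from prior releases, not shown in this diff:

    from validmind.tests import run_test

    result = run_test(
        name="RegressionErrors",  # hypothetical composite-metric name
        unit_metrics=[
            "validmind.unit_metrics.regression.sklearn.MeanAbsoluteError",
            "validmind.unit_metrics.regression.sklearn.MeanSquaredError",
        ],
        inputs={"model": vm_model, "dataset": vm_dataset},
    )
    result.log()  # assumed: result wrappers expose log() as in earlier versions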
validmind/tests/data_validation/nlp/StopWords.py
CHANGED
@@ -22,7 +22,6 @@ from validmind.vm_models import (
     ResultTableMetadata,
     ThresholdTest,
     ThresholdTestResult,
-    VMDataset,
 )
 
 
@@ -86,17 +85,13 @@ class StopWords(ThresholdTest):
                 ResultTable(
                     data=df,
                     metadata=ResultTableMetadata(
-                        title=f"
+                        title=f"Stop words results for column '{self.inputs.dataset.target_column}'"
                     ),
                 )
             ]
         )
 
     def run(self):
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("ClassImbalance requires a validmind Dataset object")
-
         text_column = self.inputs.dataset.text_column
 
         def create_corpus(df, text_column):
validmind/tests/data_validation/nlp/TextDescription.py
CHANGED
@@ -92,9 +92,12 @@ class TextDescription(Metric):
         total_words = len(words)
         total_sentences = len(sentences)
         avg_sentence_length = round(
-
-
-
+            (
+                sum(len(sentence.split()) for sentence in sentences)
+                / total_sentences
+                if total_sentences
+                else 0
+            ),
             1,
         )
         total_paragraphs = len(paragraphs)
@@ -161,9 +164,13 @@ class TextDescription(Metric):
         return combined_df
 
     def run(self):
+        # Enforce that text_column must be provided as part of the params
+        if self.inputs.dataset.text_column is None:
+            raise ValueError("A 'text_column' must be provided to run this test.")
+
         # Can only run this test if we have a Dataset object
         if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("
+            raise ValueError("TextDescription requires a validmind Dataset object")
 
         df_text_description = self.text_description_table(
             self.inputs.dataset.df, self.params
@@ -177,27 +184,31 @@ class TextDescription(Metric):
             ("Total Unique Words", "Lexical Diversity"),
         ]
         params = {"combinations_to_plot": combinations_to_plot}
-        figures = self.
+        figures = self.text_description_plots(df_text_description, params)
 
         return self.cache_results(
             figures=figures,
         )
 
     # Function to plot scatter plots for specified combinations using Plotly
-    def
+    def text_description_plots(self, df, params):
         combinations_to_plot = params["combinations_to_plot"]
         figures = []
         # Create hist plots for each column
         for i, column in enumerate(df.columns):
             fig = px.histogram(df, x=column)
             fig.update_layout(bargap=0.2)
-
+            # Generate a unique key for each histogram using the column name and index
+            histogram_key = f"{self.name}_histogram_{column}_{i}"
+            figures.append(Figure(for_object=self, key=histogram_key, figure=fig))
 
-        for metric1, metric2 in combinations_to_plot:
+        for j, (metric1, metric2) in enumerate(combinations_to_plot):
             fig = px.scatter(
                 df, x=metric1, y=metric2, title=f"Scatter Plot: {metric1} vs {metric2}"
             )
-
+            # Generate a unique key for each scatter plot using the metric names and index
+            scatter_key = f"{self.name}_scatter_{metric1}_vs_{metric2}_{j}"
+            figures.append(Figure(for_object=self, key=scatter_key, figure=fig))
         plt.close("all")
 
         return figures
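Because `TextDescription.run` now fails fast when `dataset.text_column` is unset, the text column has to be declared when the dataset is initialized. A hedged reminder sketch; the `init_dataset` keyword arguments and test ID shown here are assumptions based on the file path, not taken verbatim from the diff:

    import validmind as vm

    vm_ds = vm.init_dataset(dataset=df, input_id="nlp_ds", text_column="text")
    result = vm.tests.run_test(
        "validmind.data_validation.nlp.TextDescription",
        inputs={"dataset": vm_ds},
    )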
validmind/tests/decorator.py
ADDED
@@ -0,0 +1,189 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Decorators for creating and registering metrics with the ValidMind framework."""
+
+import inspect
+from uuid import uuid4
+
+import pandas as pd
+
+from validmind.logging import get_logger
+from validmind.utils import clean_docstring
+from validmind.vm_models import (
+    Metric,
+    MetricResult,
+    ResultSummary,
+    ResultTable,
+    ResultTableMetadata,
+)
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.test.result_wrapper import MetricResultWrapper
+
+from . import _register_custom_test
+
+logger = get_logger(__name__)
+
+
+def _inspect_signature(test_func: callable):
+    input_keys = ["dataset", "datasets", "model", "models"]
+
+    inputs = {}
+    params = {}
+
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in input_keys:
+            target_dict = inputs
+        else:
+            target_dict = params
+
+        target_dict[name] = {
+            "type": arg.annotation,
+            "default": (
+                arg.default if arg.default is not inspect.Parameter.empty else None
+            ),
+        }
+
+    return inputs, params
+
+
+def _build_result(results, test_id, description, output_template):
+    ref_id = str(uuid4())
+    figure_metadata = {
+        "_type": "metric",
+        "_name": test_id,
+        "_ref_id": ref_id,
+    }
+
+    tables = []
+    figures = []
+
+    def process_item(item):
+        if is_matplotlib_figure(item) or is_plotly_figure(item) or is_png_image(item):
+            figures.append(
+                Figure(
+                    key=f"{test_id}:{len(figures) + 1}",
+                    figure=item,
+                    metadata=figure_metadata,
+                )
+            )
+        elif isinstance(item, list):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, pd.DataFrame):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, dict):
+            for table_name, table in item.items():
+                tables.append(
+                    ResultTable(
+                        data=table,
+                        metadata=ResultTableMetadata(title=table_name),
+                    )
+                )
+        else:
+            raise ValueError(f"Invalid return type: {type(item)}")
+
+    # if the results are a tuple, process each item as a separate result
+    if isinstance(results, tuple):
+        for item in results:
+            process_item(item)
+    else:
+        process_item(results)
+
+    return MetricResultWrapper(
+        result_id=test_id,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=ref_id,
+            value="Empty",
+            summary=ResultSummary(results=tables),
+        ),
+        figures=figures,
+        result_metadata=[
+            {
+                "content_id": f"metric_description:{test_id}",
+                "text": clean_docstring(description),
+            }
+        ],
+        inputs=[],
+        output_template=output_template,
+    )
+
+
+def get_run_method(func, inputs, params):
+    def run(self: Metric):
+        input_kwargs = {k: getattr(self.inputs, k) for k in inputs.keys()}
+        param_kwargs = {
+            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+        }
+
+        raw_results = func(**input_kwargs, **param_kwargs)
+
+        self.result = _build_result(
+            results=raw_results,
+            test_id=self.test_id,
+            description=self.__doc__,
+            output_template=self.output_template,
+        )
+
+        return self.result
+
+    return run
+
+
+def metric(func_or_id):
+    """Decorator for creating and registering metrics with the ValidMind framework.
+
+    Creates a metric object and registers it with ValidMind under the provided ID. If
+    no ID is provided, the function name will be used as to build one. So if the
+    function name is `my_metric`, the metric will be registered under the ID
+    `validmind.custom_metrics.my_metric`.
+
+    This decorator works by creating a new `Metric` class will be created whose `run`
+    method calls the decorated function. This function should take as arguments the
+    inputs it requires (`dataset`, `datasets`, `model`, `models`) followed by any
+    parameters. It can return any number of the following types:
+
+    - Table: Either a list of dictionaries or a pandas DataFrame
+    - Plot: Either a matplotlib figure or a plotly figure
+
+    The function may also include a docstring. This docstring will be used and logged
+    as the metric's description.
+
+    Args:
+        func: The function to decorate
+        test_id: The identifier for the metric. If not provided, the function name is used.
+
+    Returns:
+        The decorated function.
+    """
+
+    def decorator(func):
+        test_id = func_or_id or f"validmind.custom_metrics.{func.__name__}"
+
+        inputs, params = _inspect_signature(func)
+        description = inspect.getdoc(func)
+
+        metric_class = type(
+            func.__name__,
+            (Metric,),
+            {
+                "run": get_run_method(func, inputs, params),
+                "required_inputs": list(inputs.keys()),
+                "default_parameters": params,
+                "__doc__": description,
+            },
+        )
+        _register_custom_test(test_id, metric_class)
+
+        return func
+
+    if callable(func_or_id):
+        return decorator(func_or_id)
+
+    return decorator
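Since `decorator.py` is new in this release, a short usage sketch may help. It follows the decorator's own docstring above; the example metric, its dataset columns, and the follow-up `run_test` call are illustrative assumptions, not part of the package:

    import pandas as pd

    from validmind.tests.decorator import metric


    @metric("validmind.custom_metrics.missing_values")
    def missing_values(dataset, top_n: int = 10):
        """Counts missing values per column of the dataset (illustrative custom metric)."""
        counts = dataset.df.isna().sum().sort_values(ascending=False).head(top_n)
        # Returning a DataFrame is turned into a ResultTable by _build_result above
        return pd.DataFrame({"column": counts.index, "missing": counts.values})

Per the `load_test` changes earlier in this diff, the registered ID can then be executed with `run_test("validmind.custom_metrics.missing_values", inputs={"dataset": vm_dataset})`, assuming an initialized `vm_dataset` input.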
validmind/tests/model_validation/MeteorScore.py
ADDED
@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import evaluate
+import pandas as pd
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class MeteorScore(Metric):
+    """
+    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
+
+    **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
+    of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
+    of translations, incorporating precision, recall, and word order into its assessment.
+
+    **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
+    corresponding human-produced reference. This is done by considering unigram matches between the translations, including
+    matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
+    adjusted for word order through a fragmentation penalty.
+
+    **Signs of High Risk**:
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths**:
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
+    content coverage in translations.
+    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
+    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
+
+    **Limitations**:
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
+    large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
+    quality and relevance to the specific translation task.
+    """
+
+    name = "meteor_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Load the METEOR metric
+        meteor = evaluate.load("meteor")
+
+        # Initialize a list to hold METEOR scores
+        meteor_scores = []
+
+        for prediction, reference in zip(
+            self.inputs.dataset.y_pred(self.inputs.model.input_id),
+            self.inputs.dataset.y,
+        ):
+            # Compute the METEOR score for the current prediction-reference pair
+            result = meteor.compute(predictions=[prediction], references=[reference])
+            meteor_scores.append(result["meteor"])
+
+        # Visualization of METEOR scores
+        figures = self.visualize_scores(meteor_scores)
+
+        return self.cache_results(figures=figures)
+
+    def visualize_scores(self, scores):
+        # Convert the scores list to a DataFrame for plotting
+        scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
+
+        # Create a line plot of the METEOR scores
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["METEOR Score"],
+                mode="lines+markers",
+                name="METEOR Score",
+            )
+        )
+        fig.update_layout(
+            title="METEOR Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="METEOR Score",
+        )
+
+        # Wrap the Plotly figure for compatibility with your framework
+        figures = [Figure(for_object=self, key=self.key, figure=fig)]
+
+        return figures
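A hedged sketch of invoking the new MeteorScore test; the test ID follows the file path above, while the `vm_model`/`vm_test_ds` inputs (with predictions already assigned, since the test reads `dataset.y_pred(model.input_id)` and `dataset.y`) and the `result.log()` call are assumed setup, not part of this diff:

    import validmind as vm

    result = vm.tests.run_test(
        "validmind.model_validation.MeteorScore",
        inputs={"model": vm_model, "dataset": vm_test_ds},
    )
    result.log()  # assumed: logs the result to the ValidMind platform as in prior releases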
validmind/tests/model_validation/RegardHistogram.py
CHANGED
@@ -58,21 +58,19 @@ class RegardHistogram(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred)
+        if not len(y_true) == len(y_pred):
             raise ValueError(
-                "Inconsistent lengths among
+                "Inconsistent lengths among true summaries and predicted summaries."
             )
 
-        return
+        return y_true, y_pred
 
     def regard_histogram(self):
         regard_tool = evaluate.load("regard")
-
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -101,6 +99,7 @@ class RegardHistogram(Metric):
         )
 
         row_offset = 0
+
         for column_name, column_data in dataframes.items():
             results = regard_tool.compute(data=column_data)["regard"]
             regard_dicts = [
validmind/tests/model_validation/RegardScore.py
CHANGED
@@ -59,21 +59,19 @@ class RegardScore(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred)
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
|