validmind 2.3.5__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +8 -1
  3. validmind/ai/utils.py +2 -1
  4. validmind/client.py +1 -0
  5. validmind/tests/__init__.py +14 -468
  6. validmind/tests/_store.py +102 -0
  7. validmind/tests/data_validation/ACFandPACFPlot.py +7 -9
  8. validmind/tests/data_validation/ADF.py +8 -10
  9. validmind/tests/data_validation/ANOVAOneWayTable.py +8 -10
  10. validmind/tests/data_validation/AutoAR.py +2 -4
  11. validmind/tests/data_validation/AutoMA.py +2 -4
  12. validmind/tests/data_validation/AutoSeasonality.py +8 -10
  13. validmind/tests/data_validation/AutoStationarity.py +8 -10
  14. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +8 -10
  15. validmind/tests/data_validation/BivariateHistograms.py +8 -10
  16. validmind/tests/data_validation/BivariateScatterPlots.py +8 -10
  17. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +8 -10
  18. validmind/tests/data_validation/ClassImbalance.py +2 -4
  19. validmind/tests/data_validation/DFGLSArch.py +2 -4
  20. validmind/tests/data_validation/DatasetDescription.py +7 -9
  21. validmind/tests/data_validation/DatasetSplit.py +8 -9
  22. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  23. validmind/tests/data_validation/Duplicates.py +2 -4
  24. validmind/tests/data_validation/EngleGrangerCoint.py +2 -4
  25. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +2 -4
  26. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +2 -4
  27. validmind/tests/data_validation/HighCardinality.py +2 -4
  28. validmind/tests/data_validation/HighPearsonCorrelation.py +2 -4
  29. validmind/tests/data_validation/IQROutliersBarPlot.py +2 -4
  30. validmind/tests/data_validation/IQROutliersTable.py +2 -4
  31. validmind/tests/data_validation/IsolationForestOutliers.py +2 -4
  32. validmind/tests/data_validation/KPSS.py +8 -10
  33. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +2 -4
  34. validmind/tests/data_validation/MissingValues.py +2 -4
  35. validmind/tests/data_validation/MissingValuesBarPlot.py +2 -4
  36. validmind/tests/data_validation/MissingValuesRisk.py +2 -4
  37. validmind/tests/data_validation/PearsonCorrelationMatrix.py +2 -4
  38. validmind/tests/data_validation/PhillipsPerronArch.py +7 -9
  39. validmind/tests/data_validation/RollingStatsPlot.py +2 -4
  40. validmind/tests/data_validation/ScatterPlot.py +2 -4
  41. validmind/tests/data_validation/SeasonalDecompose.py +2 -4
  42. validmind/tests/data_validation/Skewness.py +2 -4
  43. validmind/tests/data_validation/SpreadPlot.py +2 -4
  44. validmind/tests/data_validation/TabularCategoricalBarPlots.py +2 -4
  45. validmind/tests/data_validation/TabularDateTimeHistograms.py +2 -4
  46. validmind/tests/data_validation/TabularDescriptionTables.py +2 -4
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +2 -4
  48. validmind/tests/data_validation/TargetRateBarPlots.py +2 -4
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +2 -4
  50. validmind/tests/data_validation/TimeSeriesLinePlot.py +2 -4
  51. validmind/tests/data_validation/TimeSeriesMissingValues.py +2 -4
  52. validmind/tests/data_validation/TimeSeriesOutliers.py +2 -4
  53. validmind/tests/data_validation/TooManyZeroValues.py +2 -4
  54. validmind/tests/data_validation/UniqueRows.py +2 -4
  55. validmind/tests/data_validation/WOEBinPlots.py +2 -4
  56. validmind/tests/data_validation/WOEBinTable.py +2 -4
  57. validmind/tests/data_validation/ZivotAndrewsArch.py +2 -4
  58. validmind/tests/data_validation/nlp/CommonWords.py +2 -4
  59. validmind/tests/data_validation/nlp/Hashtags.py +2 -4
  60. validmind/tests/data_validation/nlp/Mentions.py +2 -4
  61. validmind/tests/data_validation/nlp/Punctuations.py +2 -4
  62. validmind/tests/data_validation/nlp/StopWords.py +2 -4
  63. validmind/tests/data_validation/nlp/TextDescription.py +2 -4
  64. validmind/tests/decorator.py +10 -8
  65. validmind/tests/load.py +264 -0
  66. validmind/tests/metadata.py +59 -0
  67. validmind/tests/model_validation/ClusterSizeDistribution.py +5 -7
  68. validmind/tests/model_validation/FeaturesAUC.py +6 -8
  69. validmind/tests/model_validation/ModelMetadata.py +8 -9
  70. validmind/tests/model_validation/RegressionResidualsPlot.py +2 -6
  71. validmind/tests/model_validation/embeddings/ClusterDistribution.py +2 -4
  72. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +2 -4
  73. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -4
  74. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +2 -4
  75. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +2 -4
  76. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +5 -7
  77. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +5 -7
  78. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +7 -9
  79. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -7
  80. validmind/tests/model_validation/sklearn/ClusterPerformance.py +5 -7
  81. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +2 -7
  82. validmind/tests/model_validation/sklearn/CompletenessScore.py +5 -7
  83. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +19 -10
  84. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +5 -7
  85. validmind/tests/model_validation/sklearn/HomogeneityScore.py +5 -7
  86. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +2 -7
  87. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +4 -7
  88. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +7 -9
  89. validmind/tests/model_validation/sklearn/MinimumF1Score.py +7 -9
  90. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +7 -9
  91. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +8 -10
  92. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +7 -9
  93. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -10
  94. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +7 -9
  95. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +8 -10
  96. validmind/tests/model_validation/sklearn/ROCCurve.py +10 -11
  97. validmind/tests/model_validation/sklearn/RegressionErrors.py +5 -7
  98. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +5 -7
  99. validmind/tests/model_validation/sklearn/RegressionR2Square.py +5 -7
  100. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +10 -14
  101. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +8 -10
  102. validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -7
  103. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +8 -10
  104. validmind/tests/model_validation/sklearn/VMeasure.py +5 -7
  105. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +8 -10
  106. validmind/tests/model_validation/statsmodels/AutoARIMA.py +2 -4
  107. validmind/tests/model_validation/statsmodels/BoxPierce.py +2 -4
  108. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +3 -4
  109. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +2 -4
  110. validmind/tests/model_validation/statsmodels/GINITable.py +2 -4
  111. validmind/tests/model_validation/statsmodels/JarqueBera.py +7 -9
  112. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +7 -9
  113. validmind/tests/model_validation/statsmodels/LJungBox.py +2 -4
  114. validmind/tests/model_validation/statsmodels/Lilliefors.py +7 -9
  115. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -4
  116. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +2 -4
  117. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +7 -9
  118. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -4
  119. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -4
  120. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -4
  121. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +2 -4
  122. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -4
  123. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +6 -8
  124. validmind/tests/model_validation/statsmodels/RunsTest.py +2 -4
  125. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +3 -4
  126. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +2 -4
  127. validmind/tests/prompt_validation/Bias.py +2 -4
  128. validmind/tests/prompt_validation/Clarity.py +2 -4
  129. validmind/tests/prompt_validation/Conciseness.py +2 -4
  130. validmind/tests/prompt_validation/Delimitation.py +2 -4
  131. validmind/tests/prompt_validation/NegativeInstruction.py +2 -4
  132. validmind/tests/prompt_validation/Robustness.py +2 -4
  133. validmind/tests/prompt_validation/Specificity.py +2 -4
  134. validmind/tests/run.py +394 -0
  135. validmind/tests/test_providers.py +12 -0
  136. validmind/tests/utils.py +16 -0
  137. validmind/unit_metrics/__init__.py +12 -4
  138. validmind/unit_metrics/composite.py +3 -0
  139. validmind/vm_models/test/metric.py +8 -5
  140. validmind/vm_models/test/result_wrapper.py +2 -1
  141. validmind/vm_models/test/test.py +14 -11
  142. validmind/vm_models/test/threshold_test.py +1 -0
  143. validmind/vm_models/test_suite/runner.py +1 -0
  144. {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/METADATA +1 -1
  145. {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/RECORD +148 -143
  146. {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/LICENSE +0 -0
  147. {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/WHEEL +0 -0
  148. {validmind-2.3.5.dist-info → validmind-2.4.0.dist-info}/entry_points.txt +0 -0
validmind/tests/prompt_validation/Bias.py CHANGED
@@ -75,10 +75,8 @@ class Bias(ThresholdTest):
     name = "bias"
     required_inputs = ["model.prompt"]
     default_params = {"min_threshold": 7}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "few_shot"]
 
     system_prompt = """
 You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different best practices. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
validmind/tests/prompt_validation/Clarity.py CHANGED
@@ -64,10 +64,8 @@ class Clarity(ThresholdTest):
     name = "clarity"
     required_inputs = ["model.prompt"]
     default_params = {"min_threshold": 7}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "zero_shot", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "zero_shot", "few_shot"]
 
     system_prompt = """
 You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
validmind/tests/prompt_validation/Conciseness.py CHANGED
@@ -64,10 +64,8 @@ class Conciseness(ThresholdTest):
     name = "conciseness"
     required_inputs = ["model.prompt"]
     default_params = {"min_threshold": 7}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "zero_shot", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "zero_shot", "few_shot"]
 
     system_prompt = """
 You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
validmind/tests/prompt_validation/Delimitation.py CHANGED
@@ -66,10 +66,8 @@ class Delimitation(ThresholdTest):
     name = "delimitation"
     required_inputs = ["model.prompt"]
     default_params = {"min_threshold": 7}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "zero_shot", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "zero_shot", "few_shot"]
 
     system_prompt = """
 You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
validmind/tests/prompt_validation/NegativeInstruction.py CHANGED
@@ -70,10 +70,8 @@ class NegativeInstruction(ThresholdTest):
     name = "negative_instruction"
     required_inputs = ["model.prompt"]
     default_params = {"min_threshold": 7}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "zero_shot", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "zero_shot", "few_shot"]
 
     system_prompt = """
 You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
validmind/tests/prompt_validation/Robustness.py CHANGED
@@ -60,10 +60,8 @@ class Robustness(ThresholdTest):
     name = "robustness"
     required_inputs = ["model"]
     default_params = {"num_tests": 10}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "zero_shot", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "zero_shot", "few_shot"]
 
     system_prompt = '''
 You are a prompt evaluation researcher AI who is tasked with testing the robustness of LLM prompts.
validmind/tests/prompt_validation/Specificity.py CHANGED
@@ -66,10 +66,8 @@ class Specificity(ThresholdTest):
     name = "specificity"
     required_inputs = ["model.prompt"]
     default_params = {"min_threshold": 7}
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["llm", "zero_shot", "few_shot"],
-    }
+    tasks = ["text_classification", "text_summarization"]
+    tags = ["llm", "zero_shot", "few_shot"]
 
     system_prompt = """
 You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
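Editor's note: the pattern above repeats across nearly every test module in this release (the +2 -4 counts in the file list), replacing the class-level metadata dict with flat tasks and tags attributes. A minimal sketch of how a custom test would declare these under 2.4.0; the class name and values here are illustrative, not taken from the package:

from validmind.vm_models import ThresholdTest

class MyPromptCheck(ThresholdTest):
    """Hypothetical custom test showing the 2.4.0 attribute style."""

    name = "my_prompt_check"
    required_inputs = ["model.prompt"]
    default_params = {"min_threshold": 7}

    # 2.3.x declared this information in a single dict:
    #   metadata = {"task_types": [...], "tags": [...]}
    # 2.4.0 uses flat class attributes instead:
    tasks = ["text_classification", "text_summarization"]
    tags = ["llm", "few_shot"]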
validmind/tests/run.py ADDED
@@ -0,0 +1,394 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from itertools import product
+from typing import Any, Dict, List, Union
+from uuid import uuid4
+
+import pandas as pd
+
+from validmind.ai.test_descriptions import get_description_metadata
+from validmind.errors import LoadTestError
+from validmind.logging import get_logger
+from validmind.unit_metrics import run_metric
+from validmind.unit_metrics.composite import load_composite_metric
+from validmind.vm_models import (
+    MetricResult,
+    ResultSummary,
+    ResultTable,
+    TestContext,
+    TestInput,
+    ThresholdTestResults,
+)
+from validmind.vm_models.figure import is_matplotlib_figure, is_plotly_figure
+from validmind.vm_models.test.result_wrapper import (
+    MetricResultWrapper,
+    ThresholdTestResultWrapper,
+)
+
+from .__types__ import TestID
+from .load import load_test
+
+logger = get_logger(__name__)
+
+
+def _cartesian_product(input_grid: Dict[str, List[Any]]):
+    """Get all possible combinations for a set of inputs"""
+    return [dict(zip(input_grid, values)) for values in product(*input_grid.values())]
+
+
+def _combine_summaries(summaries: List[Dict[str, Any]]):
+    """Combine the summaries from multiple results
+
+    Args:
+        summaries (List[Dict[str, Any]]): A list of dictionaries where each dictionary
+            has two keys: "inputs" and "summary". The "inputs" key should contain the
+            inputs used for the test and the "summary" key should contain the actual
+            summary object.
+
+    Constraint: The summaries must all have the same structure meaning that each has
+        the same number of tables in the same order with the same columns etc. This
+        should always be the case for comparison tests since its the same test run
+        multiple times with different inputs.
+    """
+    if not summaries[0]["summary"]:
+        return None
+
+    def combine_tables(table_index):
+        combined_df = pd.DataFrame()
+
+        for summary_obj in summaries:
+            serialized = summary_obj["summary"].results[table_index].serialize()
+            summary_df = pd.DataFrame(serialized["data"])
+            summary_df = pd.concat(
+                [
+                    pd.DataFrame(summary_obj["inputs"], index=summary_df.index),
+                    summary_df,
+                ],
+                axis=1,
+            )
+            combined_df = pd.concat([combined_df, summary_df], ignore_index=True)
+
+        return ResultTable(
+            data=combined_df.to_dict(orient="records"),
+            metadata=summaries[0]["summary"].results[table_index].metadata,
+        )
+
+    return ResultSummary(
+        results=[
+            combine_tables(table_index)
+            for table_index in range(len(summaries[0]["summary"].results))
+        ]
+    )
+
+
+def _update_plotly_titles(figures, input_groups, title_template):
+    current_title = figures[0].figure.layout.title.text
+
+    for i, figure in enumerate(figures):
+        figure.figure.layout.title.text = title_template.format(
+            current_title=f"{current_title} " if current_title else "",
+            input_description=", ".join(
+                f"{k}={v if isinstance(v, str) else v.input_id}"
+                for k, v in input_groups[i].items()
+            ),
+        )
+
+
+def _update_matplotlib_titles(figures, input_groups, title_template):
+    current_title = figures[0].figure.get_title()
+
+    for i, figure in enumerate(figures):
+        figure.figure.suptitle(
+            title_template.format(
+                current_title=f"{current_title} " if current_title else "",
+                input_description=" and ".join(
+                    f"{k}: {v if isinstance(v, str) else v.input_id}"
+                    for k, v in input_groups[i].items()
+                ),
+            )
+        )
+
+
+def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str, Any]]):
+    """Combine the figures from multiple results"""
+    if not figure_lists[0]:
+        return None
+
+    title_template = "{current_title}({input_description})"
+
+    for i, figures in enumerate(list(zip(*figure_lists))):
+        if is_plotly_figure(figures[0].figure):
+            _update_plotly_titles(figures, input_groups, title_template)
+        elif is_matplotlib_figure(figures[0].figure):
+            _update_matplotlib_titles(figures, input_groups, title_template)
+        else:
+            logger.warning("Cannot properly annotate png figures")
+
+    return [figure for figures in figure_lists for figure in figures]
+
+
+def metric_comparison(
+    results: List[MetricResultWrapper],
+    test_id: TestID,
+    input_groups: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
+    output_template: str = None,
+    generate_description: bool = True,
+):
+    """Build a comparison result for multiple metric results"""
+    ref_id = str(uuid4())
+
+    input_group_strings = [
+        {k: v if isinstance(v, str) else v.input_id for k, v in group.items()}
+        for group in input_groups
+    ]
+
+    merged_summary = _combine_summaries(
+        [
+            {"inputs": input_group_strings[i], "summary": result.metric.summary}
+            for i, result in enumerate(results)
+        ]
+    )
+    merged_figures = _combine_figures(
+        [result.figures for result in results], input_groups
+    )
+
+    # Patch figure metadata so they are connected to the comparison result
+    if merged_figures and len(merged_figures):
+        for i, figure in enumerate(merged_figures):
+            figure.key = f"{figure.key}-{i}"
+            figure.metadata["_name"] = test_id
+            figure.metadata["_ref_id"] = ref_id
+
+    return MetricResultWrapper(
+        result_id=test_id,
+        result_metadata=[
+            get_description_metadata(
+                test_id=test_id,
+                default_description=f"Comparison test result for {test_id}",
+                summary=merged_summary.serialize() if merged_summary else None,
+                figures=merged_figures,
+                should_generate=generate_description,
+            ),
+        ],
+        inputs=[
+            input if isinstance(input, str) else input.input_id
+            for group in input_groups
+            for input in group.values()
+        ],
+        output_template=output_template,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=ref_id,
+            value=[],
+            summary=merged_summary,
+        ),
+        figures=merged_figures,
+    )
+
+
+def threshold_test_comparison(
+    results: List[ThresholdTestResultWrapper],
+    test_id: TestID,
+    input_groups: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
+    output_template: str = None,
+    generate_description: bool = True,
+):
+    """Build a comparison result for multiple threshold test results"""
+    ref_id = str(uuid4())
+
+    input_group_strings = [
+        {k: v if isinstance(v, str) else v.input_id for k, v in group.items()}
+        for group in input_groups
+    ]
+
+    merged_summary = _combine_summaries(
+        [
+            {"inputs": input_group_strings[i], "summary": result.test_results.summary}
+            for i, result in enumerate(results)
+        ]
+    )
+    merged_figures = _combine_figures(
+        [result.figures for result in results], input_groups
+    )
+
+    # Patch figure metadata so they are connected to the comparison result
+    if merged_figures and len(merged_figures):
+        for i, figure in enumerate(merged_figures):
+            figure.key = f"{figure.key}-{i}"
+            figure.metadata["_name"] = test_id
+            figure.metadata["_ref_id"] = ref_id
+
+    return ThresholdTestResultWrapper(
+        result_id=test_id,
+        result_metadata=[
+            get_description_metadata(
+                test_id=test_id,
+                default_description=f"Comparison test result for {test_id}",
+                summary=merged_summary.serialize() if merged_summary else None,
+                figures=merged_figures,
+                prefix="test_description",
+                should_generate=generate_description,
+            )
+        ],
+        inputs=[
+            input if isinstance(input, str) else input.input_id
+            for group in input_groups
+            for input in group.values()
+        ],
+        output_template=output_template,
+        test_results=ThresholdTestResults(
+            test_name=test_id,
+            ref_id=ref_id,
+            # TODO: when we have param_grid support, this will need to be updated
+            params=results[0].test_results.params,
+            passed=all(result.test_results.passed for result in results),
+            results=[],
+            summary=merged_summary,
+        ),
+        figures=merged_figures,
+    )
+
+
+def run_comparison_test(
+    test_id: TestID,
+    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
+    params: Dict[str, Any] = None,
+    show: bool = True,
+    output_template: str = None,
+    generate_description: bool = True,
+):
+    """Run a comparison test"""
+    if isinstance(input_grid, dict):
+        input_groups = _cartesian_product(input_grid)
+    else:
+        input_groups = input_grid
+
+    results = [
+        run_test(
+            test_id,
+            inputs=inputs,
+            show=False,
+            params=params,
+            __generate_description=False,
+        )
+        for inputs in input_groups
+    ]
+
+    if isinstance(results[0], MetricResultWrapper):
+        func = metric_comparison
+    else:
+        func = threshold_test_comparison
+
+    result = func(results, test_id, input_groups, output_template, generate_description)
+
+    if show:
+        result.show()
+
+    return result
+
+
+def run_test(
+    test_id: TestID = None,
+    params: Dict[str, Any] = None,
+    inputs: Dict[str, Any] = None,
+    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
+    name: str = None,
+    unit_metrics: List[TestID] = None,
+    output_template: str = None,
+    show: bool = True,
+    __generate_description: bool = True,
+    **kwargs,
+) -> Union[MetricResultWrapper, ThresholdTestResultWrapper]:
+    """Run a test by test ID
+
+    Args:
+        test_id (TestID, optional): The test ID to run. Not required if `unit_metrics` is provided.
+        params (dict, optional): A dictionary of parameters to pass into the test. Params
+            are used to customize the test behavior and are specific to each test. See the
+            test details for more information on the available parameters. Defaults to None.
+        inputs (Dict[str, Any], optional): A dictionary of test inputs to pass into the
+            test. Inputs are either models or datasets that have been initialized using
+            vm.init_model() or vm.init_dataset(). Defaults to None.
+        input_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
+            a comparison test, provide either a dictionary of inputs where the keys are
+            the input names and the values are lists of different inputs, or a list of
+            dictionaries where each dictionary is a set of inputs to run the test with.
+            This will run the test multiple times with different sets of inputs and then
+            combine the results into a single output. When passing a dictionary, the grid
+            will be created by taking the Cartesian product of the input lists. Its simply
+            a more convenient way of forming the input grid as opposed to passing a list of
+            all possible combinations. Defaults to None.
+        name (str, optional): The name of the test (used to create a composite metric
+            out of multiple unit metrics) - required when running multiple unit metrics
+        unit_metrics (list, optional): A list of unit metric IDs to run as a composite
+            metric - required when running multiple unit metrics
+        output_template (str, optional): A jinja2 html template to customize the output
+            of the test. Defaults to None.
+        show (bool, optional): Whether to display the results. Defaults to True.
+        **kwargs: Keyword inputs to pass into the test (same as `inputs` but as keyword
+            args instead of a dictionary):
+            - dataset: A validmind Dataset object or a Pandas DataFrame
+            - model: A model to use for the test
+            - models: A list of models to use for the test
+            - dataset: A validmind Dataset object or a Pandas DataFrame
+    """
+    if not test_id and not name and not unit_metrics:
+        raise ValueError(
+            "`test_id` or `name` and `unit_metrics` must be provided to run a test"
+        )
+
+    if (unit_metrics and not name) or (name and not unit_metrics):
+        raise ValueError("`name` and `unit_metrics` must be provided together")
+
+    if (input_grid and kwargs) or (input_grid and inputs):
+        raise ValueError(
+            "When providing an `input_grid`, you cannot also provide `inputs` or `kwargs`"
+        )
+
+    if input_grid:
+        return run_comparison_test(
+            test_id,
+            input_grid,
+            params=params,
+            output_template=output_template,
+            show=show,
+            generate_description=__generate_description,
+        )
+
+    if test_id and test_id.startswith("validmind.unit_metrics"):
+        # TODO: as we move towards a more unified approach to metrics
+        # we will want to make everything functional and remove the
+        # separation between unit metrics and "normal" metrics
+        return run_metric(test_id, inputs=inputs, params=params, show=show)
+
+    if unit_metrics:
+        metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
+        test_id = f"validmind.composite_test.{metric_id_name}"
+
+        error, TestClass = load_composite_metric(
+            unit_metrics=unit_metrics, metric_name=metric_id_name
+        )
+
+        if error:
+            raise LoadTestError(error)
+
+    else:
+        TestClass = load_test(test_id, reload=True)
+
+    test = TestClass(
+        test_id=test_id,
+        context=TestContext(),
+        inputs=TestInput({**kwargs, **(inputs or {})}),
+        output_template=output_template,
+        params=params,
+        generate_description=__generate_description,
+    )
+
+    test.run()
+
+    if show:
+        test.result.show()
+
+    return test.result
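Editor's note: the new run_test entry point supports plain runs, comparison runs via input_grid, and composite metrics built from unit metrics. A hedged usage sketch, assuming run_test remains exported from validmind.tests as in earlier releases and that the model and datasets below were initialized beforehand; all IDs are illustrative:

import validmind as vm

# Assumed setup (not shown here):
#   vm_model = vm.init_model(model, input_id="my_model")
#   vm_train_ds = vm.init_dataset(train_df, input_id="train_dataset", ...)
#   vm_test_ds = vm.init_dataset(test_df, input_id="test_dataset", ...)

# Single run with explicit inputs
vm.tests.run_test(
    "validmind.model_validation.sklearn.ConfusionMatrix",
    inputs={"model": vm_model, "dataset": vm_test_ds},
)

# Comparison run: a dict input_grid is expanded by Cartesian product, so the
# test runs once per (model, dataset) combination and the results are merged
vm.tests.run_test(
    "validmind.model_validation.sklearn.ROCCurve",
    input_grid={
        "model": [vm_model],
        "dataset": [vm_train_ds, vm_test_ds],
    },
)

# Composite metric built from unit metrics (metric IDs are illustrative)
vm.tests.run_test(
    name="Model Performance",
    unit_metrics=[
        "validmind.unit_metrics.classification.F1",
        "validmind.unit_metrics.classification.Precision",
    ],
    inputs={"model": vm_model, "dataset": vm_test_ds},
)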
validmind/tests/test_providers.py CHANGED
@@ -9,6 +9,8 @@ from typing import Protocol
 
 from validmind.logging import get_logger
 
+from ._store import test_provider_store
+
 logger = get_logger(__name__)
 
 
@@ -145,3 +147,13 @@ class LocalTestProvider:
             raise LocalTestProviderLoadTestError(
                 f"Failed to find the test class in the module. Error: {str(e)}"
             )
+
+
+def register_test_provider(namespace: str, test_provider: "TestProvider") -> None:
+    """Register an external test provider
+
+    Args:
+        namespace (str): The namespace of the test provider
+        test_provider (TestProvider): The test provider
+    """
+    test_provider_store.register_test_provider(namespace, test_provider)
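Editor's note: together with the new validmind/tests/_store.py module, this gives external test providers a public registration hook. A sketch of how a local folder of custom tests might be wired in, assuming LocalTestProvider still takes a root folder path as in earlier releases; the namespace and path are illustrative:

from validmind.tests.test_providers import LocalTestProvider, register_test_provider

# "my_tests" and the folder path are placeholders for a user's own test library
register_test_provider(
    namespace="my_tests",
    test_provider=LocalTestProvider("/path/to/my_custom_tests"),
)

# Tests in that folder can then be run by ID, e.g. "my_tests.MyCustomTest"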
validmind/tests/utils.py ADDED
@@ -0,0 +1,16 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Test Module Utils"""
+
+import inspect
+
+
+def test_description(test_class, truncate=True):
+    description = inspect.getdoc(test_class).strip()
+
+    if truncate and len(description.split("\n")) > 5:
+        return description.strip().split("\n")[0] + "..."
+
+    return description
+
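Editor's note: for illustration, a hedged sketch of the truncation behavior of this helper; the class below is made up:

from validmind.tests.utils import test_description

class ExampleTest:
    """One-line summary of what the test checks

    A longer body spanning enough lines that the helper
    will truncate it down to just the summary line
    when truncate=True (the default).
    More detail here.
    And here.
    """

print(test_description(ExampleTest))                  # "One-line summary of what the test checks..."
print(test_description(ExampleTest, truncate=False))  # full docstring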
validmind/unit_metrics/__init__.py CHANGED
@@ -6,8 +6,9 @@ import hashlib
 import json
 from importlib import import_module
 
-from ..tests.decorator import _build_result, _inspect_signature
-from ..utils import get_model_info, test_id_to_name
+from validmind.input_registry import input_registry
+from validmind.tests.decorator import _build_result, _inspect_signature
+from validmind.utils import get_model_info, test_id_to_name
 
 unit_metric_results_cache = {}
 
@@ -157,7 +158,10 @@ def run_metric(metric_id, inputs=None, params=None, show=True, value_only=False)
         show (bool): Whether to display the results
         value_only (bool): Whether to return only the value
     """
-    inputs = inputs or {}
+    inputs = {
+        k: input_registry.get(v) if isinstance(v, str) else v
+        for k, v in (inputs or {}).items()
+    }
     params = params or {}
 
     cache_key = get_metric_cache_key(metric_id, params, inputs)
@@ -168,7 +172,11 @@ def run_metric(metric_id, inputs=None, params=None, show=True, value_only=False)
 
     result = metric(
         **{k: v for k, v in inputs.items() if k in _inputs.keys()},
-        **{k: v for k, v in params.items() if k in _params.keys()},
+        **{
+            k: v
+            for k, v in params.items()
+            if k in _params.keys() or "kwargs" in _params.keys()
+        },
     )
     unit_metric_results_cache[cache_key] = (
         result,
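Editor's note: because inputs are now resolved through input_registry, unit metrics accept input_id strings as well as the initialized objects. A hedged sketch; the metric ID and input IDs are illustrative and assume a model and dataset were registered earlier with vm.init_model / vm.init_dataset:

from validmind.unit_metrics import run_metric

result = run_metric(
    "validmind.unit_metrics.classification.F1",  # illustrative unit metric ID
    inputs={
        "model": "my_model",        # string IDs are looked up in input_registry
        "dataset": "test_dataset",  # equivalent to passing the VMDataset object
    },
)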
validmind/unit_metrics/composite.py CHANGED
@@ -42,6 +42,7 @@ class CompositeMetric(Metric):
             params=self.params,
             output_template=self.output_template,
             show=False,
+            generate_description=self.generate_description,
         )
 
         return self.result
@@ -109,6 +110,7 @@ def run_metrics(
     params: dict = None,
     test_id: str = None,
     show: bool = True,
+    generate_description: bool = True,
 ) -> MetricResultWrapper:
     """Run a composite metric
 
@@ -209,6 +211,7 @@ def run_metrics(
             test_id=test_id,
             default_description=description,
             summary=result_summary.serialize(),
+            should_generate=generate_description,
         ),
         {
             "content_id": f"composite_metric_def:{test_id}:unit_metrics",
validmind/vm_models/test/metric.py CHANGED
@@ -78,11 +78,14 @@ class Metric(Test):
         self.result = MetricResultWrapper(
             result_id=self.test_id,
             result_metadata=[
-                get_description_metadata(
-                    test_id=self.test_id,
-                    default_description=self.description(),
-                    summary=metric.serialize()["summary"],
-                    figures=figures,
+                (
+                    get_description_metadata(
+                        test_id=self.test_id,
+                        default_description=self.description(),
+                        summary=metric.serialize()["summary"],
+                        figures=figures,
+                        should_generate=self.generate_description,
+                    )
                 )
             ],
             metric=metric,
validmind/vm_models/test/result_wrapper.py CHANGED
@@ -344,7 +344,8 @@ class MetricResultWrapper(ResultWrapper):
         """Check if the metric summary has columns from input datasets"""
         dataset_columns = set()
 
-        for input_id in self.inputs:
+        for input in self.inputs:
+            input_id = input if isinstance(input, str) else input.input_id
             input_obj = input_registry.get(input_id)
             if isinstance(input_obj, VMDataset):
                 dataset_columns.update(input_obj.columns)