validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +127 -69
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +70 -31
- validmind/client.py +5 -5
- validmind/logging.py +38 -32
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -49
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +27 -34
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +12 -6
- validmind/vm_models/test_suite/summary.py +18 -7
- validmind/vm_models/test_suite/test.py +13 -20
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/tests/prompt_validation/Bias.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Bias(ThresholdTest
+class Bias(ThresholdTest):
     """
     Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.

@@ -103,12 +109,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -132,14 +132,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Clarity.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Clarity(ThresholdTest
+class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.

@@ -93,12 +99,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -122,14 +122,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Conciseness.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Conciseness(ThresholdTest
+class Conciseness(ThresholdTest):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.

@@ -95,12 +101,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -124,14 +124,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Delimitation.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Delimitation(ThresholdTest
+class Delimitation(ThresholdTest):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.

@@ -85,12 +91,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -114,14 +114,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/NegativeInstruction.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class NegativeInstruction(ThresholdTest
+class NegativeInstruction(ThresholdTest):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.

@@ -96,12 +102,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -125,14 +125,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Robustness.py CHANGED
@@ -7,7 +7,7 @@ from typing import List

 import pandas as pd

-from validmind.errors import SkipTestError
+from validmind.errors import MissingRequiredTestInputError, SkipTestError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -16,11 +16,11 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import call_model, missing_prompt_message


 @dataclass
-class Robustness(ThresholdTest
+class Robustness(ThresholdTest):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.

@@ -94,12 +94,6 @@ Prompt:
 Input:
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         results_table = [
             {
@@ -122,8 +116,14 @@ Input:
         )

     def run(self):
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
         # TODO: add support for multi-variable prompts
-        if
+        if (
+            not self.inputs.model.prompt.variables
+            or len(self.inputs.model.prompt.variables) > 1
+        ):
             raise SkipTestError(
                 "Robustness only supports single-variable prompts for now"
             )
@@ -138,7 +138,7 @@ Input:
         results = []

         for _ in range(self.params["num_tests"]):
-            response =
+            response = call_model(
                 system_prompt=self.system_prompt,
                 user_prompt=self.user_prompt.format(
                     variables="\n".join(self.inputs.model.prompt.variables),
validmind/tests/prompt_validation/Specificity.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Specificity(ThresholdTest
+class Specificity(ThresholdTest):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,
     detail, and relevance.
@@ -91,12 +97,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -120,14 +120,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/ai_powered_test.py CHANGED
@@ -2,90 +2,68 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import os
 import re

-from
+from validmind.ai.utils import get_client_and_model
+
+missing_prompt_message = """
+Cannot run prompt validation tests on a model with no prompt.
+You can set a prompt when creating a vm_model object like this:
+my_vm_model = vm.init_model(
+    predict_fn=call_model,
+    prompt=Prompt(
+        template="<your-prompt-here>",
+        variables=[],
+    ),
+    input_id="my_llm_model",
+)
+"""
+
+
+def call_model(
+    system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
+):
+    """Call LLM with the given prompts and return the response"""
+    client, model = get_client_and_model()
+
+    return (
+        client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=temperature,
+            seed=seed,
+        )
+        .choices[0]
+        .message.content
+    )


-
-"""
-
-"""
+def get_score(response: str):
+    """Get just the score from the response string
+    TODO: use json response mode instead of this

-
-
-
-    model_name = None
-
-    def __init__(self, *args, **kwargs):
-        if "OPENAI_API_KEY" in os.environ:
-            self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-            self.model_name = os.environ.get("VM_OPENAI_MODEL", "gpt-3.5-turbo")
-
-        elif "AZURE_OPENAI_KEY" in os.environ:
-            if "AZURE_OPENAI_ENDPOINT" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_ENDPOINT must be set to run LLM tests with Azure"
-                )
-
-            if "AZURE_OPENAI_MODEL" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_MODEL must be set to run LLM tests with Azure"
-                )
-
-            self.client = AzureOpenAI(
-                azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
-                api_key=os.environ.get("AZURE_OPENAI_KEY"),
-                api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-05-15"),
-            )
-            self.model_name = os.environ.get("AZURE_OPENAI_MODEL")
-
-        else:
-            raise ValueError(
-                "OPENAI_API_KEY or AZURE_OPENAI_KEY must be set to run LLM tests"
-            )
-
-    def call_model(self, user_prompt: str, system_prompt: str = None):
-        """
-        Call an LLM with the passed prompts and return the response. We're using GPT4 for now.
-        """
-        return (
-            self.client.chat.completions.create(
-                model=self.model_name,
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt},
-                ],
-                temperature=0.0,
-                seed=42,
-            )
-            .choices[0]
-            .message.content
-        )
-
-    def get_score(self, response: str):
-        """
-        Get just the numeric data in the response string and convert it to an int
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> 8
+    """
+    score = re.search(r"Score: (\d+)", response)

-
-""
-        score = re.search(r"Score: (\d+)", response)
+    if not score:
+        raise ValueError("Could not find score in response")

-
-        raise ValueError("Could not find score in response")
+    return int(score.group(1))

-        return int(score.group(1))

-
-
-
+def get_explanation(response: str):
+    """Get just the explanation from the response string
+    TODO: use json response mode instead of this

-
-
-
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> "<some-explanation>"
+    """
+    explanation = re.search(r"Explanation: (.+)", response, re.DOTALL)

-
-
+    if not explanation:
+        raise ValueError("Could not find explanation in response")

-
+    return explanation.group(1).strip().strip("`")
validmind/unit_metrics/composite.py CHANGED
@@ -6,9 +6,10 @@ from dataclasses import dataclass
 from typing import List, Tuple, Union
 from uuid import uuid4

+from ..ai.test_descriptions import get_description_metadata
 from ..logging import get_logger
 from ..tests.decorator import _inspect_signature
-from ..utils import
+from ..utils import run_async, test_id_to_name
 from ..vm_models.test.metric import Metric
 from ..vm_models.test.metric_result import MetricResult
 from ..vm_models.test.result_summary import ResultSummary, ResultTable
validmind/utils.py CHANGED
@@ -6,7 +6,6 @@ import asyncio
 import difflib
 import json
 import math
-import os
 import re
 import sys
 from platform import python_version
@@ -26,8 +25,8 @@ from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate

-from .ai import generate_description
 from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
+from .logging import get_logger

 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
@@ -50,6 +49,8 @@ params = {
 pylab.rcParams.update(params)
 #################################

+logger = get_logger(__name__)
+

 def is_notebook() -> bool:
     """
@@ -307,7 +308,7 @@ def run_async_check(func, *args, **kwargs):
             if task.get_name() == name:
                 return task

-        return run_async(func, name=name, *args, **kwargs)
+        return run_async(func, name=name, *args, **kwargs)  # noqa B026

     except RuntimeError:
         pass
@@ -457,49 +458,3 @@ def md_to_html(md: str, mathml=False) -> str:
     )

     return html
-
-
-def get_description_metadata(
-    test_id,
-    default_description,
-    summary=None,
-    figures=None,
-    prefix="metric_description",
-):
-    """Get Metadata Dictionary for a Test or Metric Result
-
-    Generates an LLM interpretation of the test results or uses the default
-    description and returns a metadata object that can be logged with the test results.
-
-    To enable LLM-generated descriptions, set the VALIDMIND_LLM_DESCRIPTIONS_ENABLED
-    environment variable to "true". The default description will be used if LLM
-    descriptions are disabled.
-
-    Note: Either the summary or figures must be provided to generate the description.
-
-    Args:
-        test_id (str): The test ID
-        default_description (str): The default description for the test
-        summary (Any): The test summary or results to interpret
-        figures (List[Figure]): The figures to attach to the test suite result
-        prefix (str): The prefix to use for the content ID (Default: "metric_description")
-
-    Returns:
-        dict: The metadata object to be logged with the test results
-    """
-    if os.environ.get("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "false").lower() == "true":
-        revision_name = "Generated by ValidMind AI"
-        description = generate_description(
-            test_name=test_id,
-            test_description=default_description,
-            test_summary=summary,
-            figures=figures,
-        )
-    else:
-        revision_name = "Default Description"
-        description = default_description
-
-    return {
-        "content_id": f"{prefix}:{test_id}::{revision_name}",
-        "text": description,
-    }