validmind 2.2.6__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +74 -82
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +58 -19
- validmind/client.py +5 -5
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -63
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +2 -2
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +7 -4
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/RECORD +92 -101
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
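Beyond the line-count summary above, note that the stationarity tests ADF, DFGLSArch, KPSS, PhillipsPerronArch, and ZivotAndrewsArch moved from validmind/tests/model_validation/statsmodels/ to validmind/tests/data_validation/, so their test IDs now sit under the data_validation namespace. A minimal sketch of running one relocated test by its new ID; the toy dataframe and the assumption that vm.init(...) has already been configured are illustrative, not part of this diff:

import pandas as pd
import validmind as vm

# toy univariate series; assumes vm.init(...) has already been called
df = pd.DataFrame({"y": [1.0, 1.2, 0.9, 1.1, 1.3, 1.0, 1.2, 1.4]})
vm_dataset = vm.init_dataset(dataset=df, input_id="toy_ts", target_column="y")

result = vm.tests.run_test(
    "validmind.data_validation.ADF",  # previously validmind.model_validation.statsmodels.ADF
    inputs={"dataset": vm_dataset},
)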
validmind/tests/prompt_validation/NegativeInstruction.py
CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class NegativeInstruction(ThresholdTest
+class NegativeInstruction(ThresholdTest):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.
 
@@ -96,12 +102,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -125,14 +125,17 @@ Prompt:
         )
 
     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
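With this change the prompt validation tests bail out early with MissingRequiredTestInputError unless the model under test carries a prompt. A minimal sketch of attaching one, mirroring the guidance embedded in missing_prompt_message from ai_powered_test.py further down; the predict function is a stub and the Prompt import path is an assumption, not something shown in this diff:

import validmind as vm
from validmind.models import Prompt  # assumed import path for the Prompt dataclass

def generate(input_text):
    # stub predict_fn; a real model would call an LLM here
    return "stub response"

my_vm_model = vm.init_model(
    predict_fn=generate,
    prompt=Prompt(
        template="Summarize the following text:\n{text}",
        variables=["text"],
    ),
    input_id="my_llm_model",
)

Any prompt validation test run against my_vm_model will then find model.prompt and proceed instead of raising.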
validmind/tests/prompt_validation/Robustness.py
CHANGED
@@ -7,7 +7,7 @@ from typing import List
 
 import pandas as pd
 
-from validmind.errors import SkipTestError
+from validmind.errors import MissingRequiredTestInputError, SkipTestError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -16,11 +16,11 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import call_model, missing_prompt_message
 
 
 @dataclass
-class Robustness(ThresholdTest
+class Robustness(ThresholdTest):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.
 
@@ -94,12 +94,6 @@ Prompt:
 Input:
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         results_table = [
             {
@@ -122,8 +116,14 @@ Input:
         )
 
     def run(self):
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
         # TODO: add support for multi-variable prompts
-        if
+        if (
+            not self.inputs.model.prompt.variables
+            or len(self.inputs.model.prompt.variables) > 1
+        ):
             raise SkipTestError(
                 "Robustness only supports single-variable prompts for now"
             )
@@ -138,7 +138,7 @@ Input:
         results = []
 
         for _ in range(self.params["num_tests"]):
-            response =
+            response = call_model(
                 system_prompt=self.system_prompt,
                 user_prompt=self.user_prompt.format(
                     variables="\n".join(self.inputs.model.prompt.variables),
validmind/tests/prompt_validation/Specificity.py
CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Specificity(ThresholdTest
+class Specificity(ThresholdTest):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,
     detail, and relevance.
@@ -91,12 +97,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -120,14 +120,17 @@ Prompt:
         )
 
     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
        )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/ai_powered_test.py
CHANGED
@@ -2,90 +2,68 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import os
 import re
 
-from
+from validmind.ai.utils import get_client_and_model
+
+missing_prompt_message = """
+Cannot run prompt validation tests on a model with no prompt.
+You can set a prompt when creating a vm_model object like this:
+my_vm_model = vm.init_model(
+    predict_fn=call_model,
+    prompt=Prompt(
+        template="<your-prompt-here>",
+        variables=[],
+    ),
+    input_id="my_llm_model",
+)
+"""
+
+
+def call_model(
+    system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
+):
+    """Call LLM with the given prompts and return the response"""
+    client, model = get_client_and_model()
+
+    return (
+        client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=temperature,
+            seed=seed,
+        )
+        .choices[0]
+        .message.content
+    )
 
 
-
-    """
-
-    """
+def get_score(response: str):
+    """Get just the score from the response string
+    TODO: use json response mode instead of this
 
-
-
-
-    model_name = None
-
-    def __init__(self, *args, **kwargs):
-        if "OPENAI_API_KEY" in os.environ:
-            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-            self.model_name = os.getenv("VM_OPENAI_MODEL", "gpt-3.5-turbo")
-
-        elif "AZURE_OPENAI_KEY" in os.environ:
-            if "AZURE_OPENAI_ENDPOINT" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_ENDPOINT must be set to run LLM tests with Azure"
-                )
-
-            if "AZURE_OPENAI_MODEL" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_MODEL must be set to run LLM tests with Azure"
-                )
-
-            self.client = AzureOpenAI(
-                azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
-                api_key=os.getenv("AZURE_OPENAI_KEY"),
-                api_version=os.getenv("AZURE_OPENAI_VERSION", "2023-05-15"),
-            )
-            self.model_name = os.getenv("AZURE_OPENAI_MODEL")
-
-        else:
-            raise ValueError(
-                "OPENAI_API_KEY or AZURE_OPENAI_KEY must be set to run LLM tests"
-            )
-
-    def call_model(self, user_prompt: str, system_prompt: str = None):
-        """
-        Call an LLM with the passed prompts and return the response. We're using GPT4 for now.
-        """
-        return (
-            self.client.chat.completions.create(
-                model=self.model_name,
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt},
-                ],
-                temperature=0.0,
-                seed=42,
-            )
-            .choices[0]
-            .message.content
-        )
-
-    def get_score(self, response: str):
-        """
-        Get just the numeric data in the response string and convert it to an int
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> 8
+    """
+    score = re.search(r"Score: (\d+)", response)
 
-
-        ""
-        score = re.search(r"Score: (\d+)", response)
+    if not score:
+        raise ValueError("Could not find score in response")
 
-
-        raise ValueError("Could not find score in response")
+    return int(score.group(1))
 
-        return int(score.group(1))
 
-
-
+def get_explanation(response: str):
+    """Get just the explanation from the response string
+    TODO: use json response mode instead of this
 
-
-
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> "<some-explanation>"
+    """
+    explanation = re.search(r"Explanation: (.+)", response, re.DOTALL)
 
-
-
+    if not explanation:
+        raise ValueError("Could not find explanation in response")
 
-
+    return explanation.group(1).strip().strip("`")
validmind/unit_metrics/composite.py
CHANGED
@@ -6,9 +6,10 @@ from dataclasses import dataclass
 from typing import List, Tuple, Union
 from uuid import uuid4
 
+from ..ai.test_descriptions import get_description_metadata
 from ..logging import get_logger
 from ..tests.decorator import _inspect_signature
-from ..utils import
+from ..utils import run_async, test_id_to_name
 from ..vm_models.test.metric import Metric
 from ..vm_models.test.metric_result import MetricResult
 from ..vm_models.test.result_summary import ResultSummary, ResultTable
validmind/utils.py
CHANGED
@@ -6,7 +6,6 @@ import asyncio
 import difflib
 import json
 import math
-import os
 import re
 import sys
 from platform import python_version
@@ -26,11 +25,8 @@ from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate
 
-from .ai import background_generate_description, is_configured
 from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
-
-AI_REVISION_NAME = "Generated by ValidMind AI"
-DEFAULT_REVISION_NAME = "Default Description"
+from .logging import get_logger
 
 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
@@ -53,6 +49,8 @@ params = {
 pylab.rcParams.update(params)
 #################################
 
+logger = get_logger(__name__)
+
 
 def is_notebook() -> bool:
     """
@@ -310,7 +308,7 @@ def run_async_check(func, *args, **kwargs):
             if task.get_name() == name:
                 return task
 
-        return run_async(func, name=name, *args, **kwargs)
+        return run_async(func, name=name, *args, **kwargs)  # noqa B026
 
     except RuntimeError:
         pass
@@ -460,60 +458,3 @@ def md_to_html(md: str, mathml=False) -> str:
     )
 
     return html
-
-
-def get_description_metadata(
-    test_id,
-    default_description,
-    summary=None,
-    figures=None,
-    prefix="metric_description",
-):
-    """Get Metadata Dictionary for a Test or Metric Result
-
-    Generates an LLM interpretation of the test results or uses the default
-    description and returns a metadata object that can be logged with the test results.
-
-    By default, the description is generated by an LLM that will interpret the test
-    results and provide a human-readable description. If the summary or figures are
-    not provided, or the `VALIDMIND_LLM_DESCRIPTIONS_ENABLED` environment variable is
-    set to `0` or `false` or no LLM has been configured, the default description will
-    be used as the test result description.
-
-    Note: Either the summary or figures must be provided to generate the description.
-
-    Args:
-        test_id (str): The test ID
-        default_description (str): The default description for the test
-        summary (Any): The test summary or results to interpret
-        figures (List[Figure]): The figures to attach to the test suite result
-        prefix (str): The prefix to use for the content ID (Default: "metric_description")
-
-    Returns:
-        dict: The metadata object to be logged with the test results
-    """
-    env_disabled = os.getenv("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "1") in [
-        "0",
-        "false",
-    ]
-
-    if (summary or figures) and not env_disabled and is_configured():
-        revision_name = AI_REVISION_NAME
-
-        # get description future and set it as the description in the metadata
-        # this will lazily retrieved so it can run in the background in parallel
-        description = background_generate_description(
-            test_id=test_id,
-            test_description=default_description,
-            test_summary=summary,
-            figures=figures,
-        )
-
-    else:
-        revision_name = DEFAULT_REVISION_NAME
-        description = md_to_html(default_description, mathml=True)
-
-    return {
-        "content_id": f"{prefix}:{test_id}::{revision_name}",
-        "text": description,
-    }
validmind/vm_models/dataset/dataset.py
CHANGED
@@ -195,7 +195,19 @@ class VMDataset:
         probability_column: str = None,
         probability_values: list = None,
         prediction_probabilities: list = None,  # DEPRECATED: use probability_values
+        **kwargs,
     ):
+        """Assign predictions and probabilities to the dataset.
+
+        Args:
+            model (VMModel): The model used to generate the predictions.
+            prediction_column (str, optional): The name of the column containing the predictions. Defaults to None.
+            prediction_values (list, optional): The values of the predictions. Defaults to None.
+            probability_column (str, optional): The name of the column containing the probabilities. Defaults to None.
+            probability_values (list, optional): The values of the probabilities. Defaults to None.
+            prediction_probabilities (list, optional): DEPRECATED: The values of the probabilities. Defaults to None.
+            kwargs: Additional keyword arguments that will get passed through to the model's `predict` method.
+        """
         if prediction_probabilities is not None:
             warnings.warn(
                 "The `prediction_probabilities` argument is deprecated. Use `probability_values` instead.",
@@ -226,7 +238,9 @@ class VMDataset:
 
         if prediction_values is None:
             X = self.df if isinstance(model, (FunctionModel, PipelineModel)) else self.x
-            probability_values, prediction_values = compute_predictions(
+            probability_values, prediction_values = compute_predictions(
+                model, X, **kwargs
+            )
 
         prediction_column = prediction_column or f"{model.input_id}_prediction"
         self._add_column(prediction_column, prediction_values)
@@ -356,8 +370,8 @@ class VMDataset:
         return as_df(self.df[self.probability_column(model)])
 
     def target_classes(self):
-        """Returns the
-        return [str(i) for i in np.unique(self.y)]
+        """Returns the target class labels or unique values of the target column."""
+        return self.target_class_labels or [str(i) for i in np.unique(self.y)]
 
     def __str__(self):
         return (
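assign_predictions now accepts arbitrary keyword arguments and forwards them, via compute_predictions in the next file, to the model's predict() call. A sketch of the pass-through; the extra keyword shown is illustrative and only has to be accepted by your own predict function, and vm_dataset / vm_model are assumed to be previously initialized ValidMind inputs:

# Forwarded as vm_model.predict(X, batch_size=32) when prediction_values
# are not supplied directly (batch_size is a made-up example kwarg).
vm_dataset.assign_predictions(
    model=vm_model,
    batch_size=32,
)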
validmind/vm_models/dataset/utils.py
CHANGED
@@ -94,7 +94,7 @@ def _is_probabilties(output):
     return np.all((output >= 0) & (output <= 1)) and np.any((output > 0) & (output < 1))
 
 
-def compute_predictions(model, X) -> tuple:
+def compute_predictions(model, X, **kwargs) -> tuple:
     probability_values = None
 
     try:
@@ -108,7 +108,7 @@ def compute_predictions(model, X) -> tuple:
 
     try:
         logger.info("Running predict()... This may take a while")
-        prediction_values = model.predict(X)
+        prediction_values = model.predict(X, **kwargs)
         logger.info("Done running predict()")
     except MissingOrInvalidModelPredictFnError:
         raise MissingOrInvalidModelPredictFnError(
validmind/vm_models/test/metric.py
CHANGED
@@ -12,8 +12,8 @@ from typing import ClassVar, List, Optional, Union
 
 import pandas as pd
 
+from ...ai.test_descriptions import get_description_metadata
 from ...errors import MissingCacheResultsArgumentsError
-from ...utils import get_description_metadata
 from ..figure import Figure
 from .metric_result import MetricResult
 from .result_wrapper import MetricResultWrapper
@@ -36,13 +36,6 @@ class Metric(Test):
     # Instance Variables
     result: MetricResultWrapper = None  # populated by cache_results() method
 
-    @property
-    def key(self):
-        """
-        Keep the key for compatibility reasons
-        """
-        return self._key if hasattr(self, "_key") else self.name
-
     @abstractmethod
     def summary(self, metric_value: Optional[Union[dict, list, pd.DataFrame]] = None):
         """
validmind/vm_models/test/result_wrapper.py
CHANGED
@@ -15,10 +15,10 @@ import pandas as pd
 from ipywidgets import HTML, GridBox, Layout, VBox
 
 from ... import api_client
-from ...ai import DescriptionFuture
+from ...ai.test_descriptions import AI_REVISION_NAME, DescriptionFuture
 from ...input_registry import input_registry
 from ...logging import get_logger
-from ...utils import
+from ...utils import NumpyEncoder, display, run_async, test_id_to_name
 from ..dataset import VMDataset
 from ..figure import Figure
 from .metric_result import MetricResult
validmind/vm_models/test/test.py
CHANGED
@@ -52,6 +52,9 @@ class Test(TestUtils):
                 "test_id is missing. It must be passed when initializing the test"
             )
         self._ref_id = str(uuid4())
+        self.key = (
+            self.test_id
+        )  # for backwards compatibility - figures really should get keyed automatically
 
         # TODO: add validation for required inputs
         if self.default_params is None:
validmind/vm_models/test/threshold_test.py
CHANGED
@@ -11,7 +11,7 @@ avoid confusion with the "tests" in the general data science/modeling sense.
 from dataclasses import dataclass
 from typing import ClassVar, List, Optional
 
-from ...
+from ...ai.test_descriptions import get_description_metadata
 from ..figure import Figure
 from .result_summary import ResultSummary, ResultTable
 from .result_wrapper import ThresholdTestResultWrapper
validmind/vm_models/test_suite/runner.py
CHANGED
@@ -83,11 +83,14 @@ class TestSuiteRunner:
                 test_configs = test_configs.get("params", {})
             else:
                 if (test_configs) and ("params" not in test_configs):
-
-
-                    "Setting test parameters directly in the 'config' parameter
-
+                    # [DEPRECATED] This is the old way of setting test parameters
+                    msg = (
+                        "Setting test parameters directly in the 'config' parameter"
+                        " of the run_documentation_tests() method is deprecated. "
+                        "Instead, use the new format of the config: "
+                        'config = {"test_id": {"params": {...}, "inputs": {...}}}'
                     )
+                    logger.warning(msg)
 
             test.load(inputs=inputs, context=self.context, config=test_configs)
 
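The warning added above spells out the new config shape: each test ID maps to its own "params" and "inputs" rather than parameters being set directly on the config entry. A sketch of the new format; the test ID, parameter name, and input ID below are placeholders rather than values taken from this diff:

import validmind as vm

config = {
    "validmind.data_validation.HighPearsonCorrelation": {
        "params": {"max_threshold": 0.5},       # placeholder parameter
        "inputs": {"dataset": "raw_dataset"},   # placeholder dataset input_id
    },
}

vm.run_documentation_tests(config=config)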