validmind 2.2.4__py3-none-any.whl → 2.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +98 -34
- validmind/api_client.py +12 -12
- validmind/logging.py +38 -32
- validmind/tests/prompt_validation/ai_powered_test.py +6 -6
- validmind/utils.py +33 -12
- validmind/vm_models/test/result_wrapper.py +26 -33
- validmind/vm_models/test/threshold_test.py +1 -0
- validmind/vm_models/test_suite/runner.py +5 -2
- validmind/vm_models/test_suite/summary.py +18 -7
- validmind/vm_models/test_suite/test.py +13 -20
- {validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/METADATA +1 -1
- {validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/RECORD +16 -16
- {validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/LICENSE +0 -0
- {validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/WHEEL +0 -0
- {validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/entry_points.txt +0 -0
validmind/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.2.4"
+__version__ = "2.2.6"
validmind/ai.py
CHANGED
@@ -7,6 +7,11 @@ import os
 
 from openai import AzureOpenAI, OpenAI
 
+from .logging import get_logger
+
+logger = get_logger(__name__)
+
+
 SYSTEM_PROMPT = """
 You are an expert data scientist and MRM specialist.
 You are tasked with analyzing the results of a quantitative test run on some model or dataset.
@@ -19,6 +24,7 @@ This will act as the description and interpretation of the result in the model d
 It will be displayed alongside the test results table and figures.
 
 Avoid long sentences and complex vocabulary.
+Avoid overly verbose explanations - the goal is to explain to a user what they are seeing in the results.
 Structure the response clearly and logically.
 Use valid Markdown syntax to format the response.
 Respond only with your analysis and insights, not the verbatim test results.
@@ -28,10 +34,12 @@ Use the Test ID that is provided to form the Test Name e.g. "ClassImbalance" ->
 Explain the test, its purpose, its mechanism/formula etc and why it is useful.
 If relevant, provide a very brief description of the way this test is used in model/dataset evaluation and how it is interpreted.
 Highlight the key insights from the test results. The key insights should be concise and easily understood.
+An insight should only be included if it is something not entirely obvious from the test results.
 End the response with any closing remarks, summary or additional useful information.
 
-Use the following format for the response (feel free to
-
+Use the following format for the response (feel free to stray from it if necessary - this is a suggested starting point):
+
+<ResponseFormat>
 **<Test Name>** calculates the xyz <continue to explain what it does in detail>...
 
 This test is useful for <explain why and for what this test is useful>...
@@ -42,8 +50,7 @@ The following key insights can be identified in the test results:
 
 - **<key insight 1 - title>**: <concise explanation of key insight 1>
 - ...<continue with any other key insights using the same format>
-
-It is very important that the text is nicely formatted and contains enough information to be useful to the user as documentation.
+</ResponseFormat>
 """.strip()
 
 
@@ -73,12 +80,17 @@ The attached plots show the results of the test.
 __client = None
 __model = None
 
+# can be None, True or False (ternary to represent initial state, ack and failed ack)
+__ack = None
+
 __executor = concurrent.futures.ThreadPoolExecutor()
 
 
 def __get_client_and_model():
-    """
-
+    """Get model and client to use for generating interpretations
+
+    On first call, it will look in the environment for the API key endpoint, model etc.
+    and store them in a global variable to avoid loading them up again.
     """
     global __client, __model
 
@@ -86,8 +98,10 @@ def __get_client_and_model():
         return __client, __model
 
     if "OPENAI_API_KEY" in os.environ:
-        __client = OpenAI(api_key=os.
-        __model = os.
+        __client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+        __model = os.getenv("VM_OPENAI_MODEL", "gpt-4o")
+
+        logger.debug(f"Using OpenAI {__model} for generating descriptions")
 
     elif "AZURE_OPENAI_KEY" in os.environ:
         if "AZURE_OPENAI_ENDPOINT" not in os.environ:
@@ -101,11 +115,13 @@ def __get_client_and_model():
             )
 
         __client = AzureOpenAI(
-            azure_endpoint=os.
-            api_key=os.
-            api_version=os.
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_key=os.getenv("AZURE_OPENAI_KEY"),
+            api_version=os.getenv("AZURE_OPENAI_VERSION", "2023-05-15"),
         )
-        __model = os.
+        __model = os.getenv("AZURE_OPENAI_MODEL")
+
+        logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")
 
     else:
         raise ValueError("OPENAI_API_KEY or AZURE_OPENAI_KEY must be set")
@@ -126,12 +142,19 @@ class DescriptionFuture:
         self._future = future
 
     def get_description(self):
-
-
+        from .utils import md_to_html
+
+        if isinstance(self._future, str):
+            description = self._future
+        else:
+            # This will block until the future is completed
+            description = self._future.result()
 
+        return md_to_html(description, mathml=True)
 
-
-
+
+def generate_description(
+    test_id: str,
     test_description: str,
     test_summary: str,
     figures: list = None,
@@ -140,14 +163,25 @@ def generate_description_async(
     if not test_summary and not figures:
         raise ValueError("No summary or figures provided - cannot generate description")
 
-    client,
+    client, model = __get_client_and_model()
     # get last part of test id
-    test_name =
+    test_name = test_id.split(".")[-1]
+    # truncate the test description to save time
+    test_description = (
+        f"{test_description[:500]}..."
+        if len(test_description) > 500
+        else test_description
+    )
 
     if test_summary:
+        logger.debug(
+            f"Generating description for test {test_name} with stringified summary"
+        )
        return (
             client.chat.completions.create(
-                model=
+                model=model,
+                temperature=0,
+                seed=42,
                 messages=[
                     {"role": "system", "content": SYSTEM_PROMPT},
                     {
@@ -161,13 +195,17 @@ def generate_description_async(
                 ],
             )
             .choices[0]
-            .message.content.strip(
-            .strip()
+            .message.content.strip()
         )
 
+    logger.debug(
+        f"Generating description for test {test_name} with {len(figures)} figures"
+    )
     return (
         client.chat.completions.create(
-            model=
+            model=model,
+            temperature=0,
+            seed=42,
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {
@@ -194,23 +232,49 @@ def generate_description_async(
             ],
         )
         .choices[0]
-        .message.content.strip(
-        .strip()
+        .message.content.strip()
     )
 
 
-def
-
+def background_generate_description(
+    test_id: str,
     test_description: str,
     test_summary: str,
     figures: list = None,
 ):
-
-
-
-
-
-
-
+    def wrapped():
+        try:
+            return generate_description(
+                test_id, test_description, test_summary, figures
+            )
+        except Exception as e:
+            logger.error(f"Failed to generate description: {e}")
+
+            return test_description
+
+    return DescriptionFuture(__executor.submit(wrapped))
+
+
+def is_configured():
+    global __ack
+
+    if __ack:
+        return True
+
+    try:
+        client, model = __get_client_and_model()
+        # send an empty message with max_tokens=1 to "ping" the API
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": ""}],
+            max_tokens=1,
+        )
+        logger.debug(
+            f"Received response from OpenAI: {response.choices[0].message.content}"
+        )
+        __ack = True
+    except Exception as e:
+        logger.debug(f"Failed to connect to OpenAI: {e}")
+        __ack = False
 
-    return
+    return __ack
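The new `background_generate_description` and `DescriptionFuture` pair added above follow a submit-now, read-later pattern: the LLM call is handed to the module-level `ThreadPoolExecutor`, and the result only blocks when the description is actually rendered. A minimal standalone sketch of that pattern (the names below are illustrative, not the package's API):

```python
import concurrent.futures
import time

_executor = concurrent.futures.ThreadPoolExecutor()


class LazyResult:
    """Holds either a plain string or a Future and resolves it on demand."""

    def __init__(self, value_or_future):
        self._value = value_or_future

    def get(self):
        if isinstance(self._value, str):
            return self._value
        # blocks only at read time, not at submission time
        return self._value.result()


def run_in_background(func, *args):
    return LazyResult(_executor.submit(func, *args))


def slow_describe(name):
    time.sleep(0.1)  # stand-in for an LLM call
    return f"Description for {name}"


lazy = run_in_background(slow_describe, "ClassImbalance")
print(lazy.get())  # blocks here until the worker thread finishes
```

Note that the wrapped call in the diff also swallows exceptions and falls back to the default test description, so a misconfigured LLM degrades gracefully instead of failing the test run.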
validmind/api_client.py
CHANGED
@@ -22,19 +22,19 @@ from aiohttp import FormData
 from .client_config import client_config
 from .errors import MissingAPICredentialsError, MissingProjectIdError, raise_api_error
 from .logging import get_logger, init_sentry, send_single_error
-from .utils import NumpyEncoder,
+from .utils import NumpyEncoder, run_async
 from .vm_models import Figure, MetricResult, ThresholdTestResults
 
 # TODO: can't import types from vm_models because of circular dependency
 
 logger = get_logger(__name__)
 
-_api_key = os.
-_api_secret = os.
-_api_host = os.
+_api_key = os.getenv("VM_API_KEY")
+_api_secret = os.getenv("VM_API_SECRET")
+_api_host = os.getenv("VM_API_HOST")
 
-_project = os.
-_run_cuid = os.
+_project = os.getenv("VM_API_PROJECT")
+_run_cuid = os.getenv("VM_RUN_CUID")
 
 __api_session: aiohttp.ClientSession = None
 
@@ -102,21 +102,21 @@ def init(
     api_secret = None
     project = None
 
-    _project = project or os.
+    _project = project or os.getenv("VM_API_PROJECT")
 
     if _project is None:
         raise MissingProjectIdError()
 
-    _api_key = api_key or os.
-    _api_secret = api_secret or os.
+    _api_key = api_key or os.getenv("VM_API_KEY")
+    _api_secret = api_secret or os.getenv("VM_API_SECRET")
 
     if _api_key is None or _api_secret is None:
         raise MissingAPICredentialsError()
 
-    _api_host = api_host or os.
+    _api_host = api_host or os.getenv(
         "VM_API_HOST", "http://127.0.0.1:5000/api/v1/tracking"
     )
-    _run_cuid = os.
+    _run_cuid = os.getenv("VM_RUN_CUID", None)
 
     try:
         __ping()
@@ -349,7 +349,7 @@ async def log_metadata(
     """
     metadata_dict = {"content_id": content_id}
     if text is not None:
-        metadata_dict["text"] =
+        metadata_dict["text"] = text
     if _json is not None:
         metadata_dict["json"] = _json
 
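Most of the api_client.py changes swap the configuration lookups over to `os.getenv`, which returns `None` (or a supplied default) when a variable is unset instead of raising. A quick illustration of the behavior the new module-level defaults rely on:

```python
import os

# unset variables come back as None rather than raising KeyError
assert os.getenv("SOME_UNSET_VM_VARIABLE") is None

# a default can be supplied, as in the VM_API_HOST lookup inside init()
host = os.getenv("VM_API_HOST", "http://127.0.0.1:5000/api/v1/tracking")
print(host)
```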
validmind/logging.py
CHANGED
@@ -13,22 +13,45 @@ from sentry_sdk.utils import event_from_exception, exc_info_from_error
 
 from .__version__ import __version__
 
-__log_level = None
 __dsn = "https://48f446843657444aa1e2c0d716ef864b@o1241367.ingest.sentry.io/4505239625465856"
 
 
 def _get_log_level():
-    """Get the log level from the environment variable
-
-    return __log_level
+    """Get the log level from the environment variable"""
+    log_level_str = os.getenv("LOG_LEVEL", "INFO").upper()
 
-    log_level_str = os.environ.get("LOG_LEVEL", "INFO").upper()
     if log_level_str not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
         raise ValueError(f"Invalid log level: {log_level_str}")
 
     return logging.getLevelName(log_level_str)
 
 
+def get_logger(name="validmind", log_level=None):
+    """Get a logger for the given module name"""
+    formatter = logging.Formatter(
+        fmt="%(asctime)s - %(levelname)s(%(name)s): %(message)s"
+    )
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+
+    logger = logging.getLogger(name)
+    logger.setLevel(log_level or _get_log_level())
+
+    # Clear existing handlers if any (or refine the existing logic as necessary)
+    # TODO: move this to a yaml config and only configure once
+    if not any(
+        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
+        for h in logger.handlers
+    ):
+        logger.addHandler(handler)
+
+    # Prevent logger from propagating to root logger
+    logger.propagate = False
+
+    return logger
+
+
 def init_sentry(server_config):
     """Initialize Sentry SDK for sending logs back to ValidMind
 
@@ -42,7 +65,10 @@ def init_sentry(server_config):
     - dsn (str): The Sentry DSN
     ...: Other config options for Sentry
     """
-    if
+    if os.getenv("VM_NO_TELEMETRY", False):
+        return
+
+    if not server_config.get("send_logs", False):
         return
 
     config = {
@@ -53,33 +79,13 @@ def init_sentry(server_config):
         "environment": "production",
     }
     config.update({k: v for k, v in server_config.items() if k != "send_logs"})
-    sentry_sdk.init(**config)
-
-
-def get_logger(name="validmind", log_level=None):
-    """Get a logger for the given name"""
-    formatter = logging.Formatter(
-        fmt="%(asctime)s - %(levelname)s(%(name)s): %(message)s"
-    )
-
-    handler = logging.StreamHandler()
-    handler.setFormatter(formatter)
-
-    logger = logging.getLogger(name)
-    logger.setLevel(log_level or _get_log_level())
-
-    # Clear existing handlers if any (or refine the existing logic as necessary)
-    # TODO: lets add some better handler management
-    if not any(
-        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
-        for h in logger.handlers
-    ):
-        logger.addHandler(handler)
-
-    # Prevent logger from propagating to root logger
-    logger.propagate = False
 
-
+    try:
+        sentry_sdk.init(**config)
+    except Exception as e:
+        logger = get_logger(__name__)
+        logger.info("Sentry failed to initialize - ignoring...")
+        logger.debug(f"Sentry error: {str(e)}")
 
 
 def log_performance(func, name=None, logger=None, force=False):
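logging.py now defines `get_logger` ahead of `init_sentry` (so a failed Sentry init can itself be logged), adds a `VM_NO_TELEMETRY` opt-out, and keeps reading the level from `LOG_LEVEL`. A small usage sketch, assuming the validmind package is importable:

```python
import os

os.environ["LOG_LEVEL"] = "DEBUG"  # read by _get_log_level() when the logger is built

from validmind.logging import get_logger

logger = get_logger(__name__)
logger.debug("the handler is attached once per formatter, so repeated calls stay clean")

# logging.getLogger() caches loggers by name, and the duplicate-handler check
# prevents a second StreamHandler from being added on this repeat call
assert get_logger(__name__) is logger
```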
validmind/tests/prompt_validation/ai_powered_test.py
CHANGED
@@ -20,8 +20,8 @@ class AIPoweredTest:
 
     def __init__(self, *args, **kwargs):
         if "OPENAI_API_KEY" in os.environ:
-            self.client = OpenAI(api_key=os.
-            self.model_name = os.
+            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+            self.model_name = os.getenv("VM_OPENAI_MODEL", "gpt-3.5-turbo")
 
         elif "AZURE_OPENAI_KEY" in os.environ:
             if "AZURE_OPENAI_ENDPOINT" not in os.environ:
@@ -35,11 +35,11 @@ class AIPoweredTest:
                 )
 
             self.client = AzureOpenAI(
-                azure_endpoint=os.
-                api_key=os.
-                api_version=os.
+                azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+                api_key=os.getenv("AZURE_OPENAI_KEY"),
+                api_version=os.getenv("AZURE_OPENAI_VERSION", "2023-05-15"),
             )
-            self.model_name = os.
+            self.model_name = os.getenv("AZURE_OPENAI_MODEL")
 
         else:
             raise ValueError(
validmind/utils.py
CHANGED
@@ -26,9 +26,12 @@ from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate
 
-from .ai import
+from .ai import background_generate_description, is_configured
 from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
 
+AI_REVISION_NAME = "Generated by ValidMind AI"
+DEFAULT_REVISION_NAME = "Default Description"
+
 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
 
@@ -459,15 +462,23 @@ def md_to_html(md: str, mathml=False) -> str:
     return html
 
 
-def get_description_metadata(
+def get_description_metadata(
+    test_id,
+    default_description,
+    summary=None,
+    figures=None,
+    prefix="metric_description",
+):
     """Get Metadata Dictionary for a Test or Metric Result
 
     Generates an LLM interpretation of the test results or uses the default
     description and returns a metadata object that can be logged with the test results.
 
-
-
-
+    By default, the description is generated by an LLM that will interpret the test
+    results and provide a human-readable description. If the summary or figures are
+    not provided, or the `VALIDMIND_LLM_DESCRIPTIONS_ENABLED` environment variable is
+    set to `0` or `false` or no LLM has been configured, the default description will
+    be used as the test result description.
 
     Note: Either the summary or figures must be provided to generate the description.
 
@@ -476,23 +487,33 @@ def get_description_metadata(test_id, default_description, summary=None, figures
         default_description (str): The default description for the test
         summary (Any): The test summary or results to interpret
         figures (List[Figure]): The figures to attach to the test suite result
+        prefix (str): The prefix to use for the content ID (Default: "metric_description")
 
     Returns:
         dict: The metadata object to be logged with the test results
     """
-
-
-
-
+    env_disabled = os.getenv("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "1") in [
+        "0",
+        "false",
+    ]
+
+    if (summary or figures) and not env_disabled and is_configured():
+        revision_name = AI_REVISION_NAME
+
+        # get description future and set it as the description in the metadata
+        # this will lazily retrieved so it can run in the background in parallel
+        description = background_generate_description(
+            test_id=test_id,
             test_description=default_description,
             test_summary=summary,
             figures=figures,
         )
+
     else:
-        revision_name =
-        description = default_description
+        revision_name = DEFAULT_REVISION_NAME
+        description = md_to_html(default_description, mathml=True)
 
     return {
-        "content_id": f"
+        "content_id": f"{prefix}:{test_id}::{revision_name}",
         "text": description,
     }
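The reworked `get_description_metadata` chooses between the AI-generated and default description with three checks: there must be a summary or figures to interpret, the `VALIDMIND_LLM_DESCRIPTIONS_ENABLED` flag must not be `0`/`false`, and `is_configured()` must have successfully pinged an LLM. A simplified standalone sketch of that gating and of the resulting `content_id` layout (it mirrors the diff rather than importing the package):

```python
import os

AI_REVISION_NAME = "Generated by ValidMind AI"
DEFAULT_REVISION_NAME = "Default Description"


def pick_revision(summary, figures, llm_available):
    """Mirror of the gating above: use the AI description only when there is
    something to interpret, the env flag is not disabled, and an LLM responds."""
    env_disabled = os.getenv("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "1") in ["0", "false"]
    if (summary or figures) and not env_disabled and llm_available:
        return AI_REVISION_NAME
    return DEFAULT_REVISION_NAME


revision = pick_revision(summary={"rows": 10}, figures=None, llm_available=False)
content_id = f"metric_description:validmind.data_validation.ClassImbalance::{revision}"
print(content_id)
# -> metric_description:validmind.data_validation.ClassImbalance::Default Description
```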
validmind/vm_models/test/result_wrapper.py
CHANGED
@@ -7,7 +7,6 @@ Result Wrappers for test and metric results
 """
 import asyncio
 import json
-import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
@@ -19,7 +18,7 @@ from ... import api_client
 from ...ai import DescriptionFuture
 from ...input_registry import input_registry
 from ...logging import get_logger
-from ...utils import NumpyEncoder, display,
+from ...utils import AI_REVISION_NAME, NumpyEncoder, display, run_async, test_id_to_name
 from ..dataset import VMDataset
 from ..figure import Figure
 from .metric_result import MetricResult
@@ -31,31 +30,35 @@ logger = get_logger(__name__)
 
 
 async def update_metadata(content_id: str, text: str, _json: Union[Dict, List] = None):
-    """
-
-
-
-    """
-    should_update = False
-
-    # check if the env variable is set to force overwriting metadata
-    if os.environ.get("VM_OVERRIDE_METADATA", "false").lower() == "true":
-        should_update = True
+    """Create or Update a Metadata Object"""
+    parts = content_id.split("::")
+    content_id = parts[0]
+    revision_name = parts[1] if len(parts) > 1 else None
 
-    #
-
-    # we always want composite metric definitions to be updated
-        should_update = True
+    # we always want composite metric definitions to be updated
+    should_update = content_id.startswith("composite_metric_def:")
 
-    # if
-    if
+    # if we are updating a metric or test description, we check if the text
+    # has changed from the last time it was logged, and only update if it has
+    if content_id.split(":", 1)[0] in ["metric_description", "test_description"]:
         try:
-            await api_client.get_metadata(content_id)
-
-            #
+            md = await api_client.get_metadata(content_id)
+            # if there is an existing description, only update it if the new one
+            # is different and is an AI-generated description
+            should_update = (
+                md["text"] != text if revision_name == AI_REVISION_NAME else False
+            )
+            logger.debug(f"Check if description has changed: {should_update}")
+        except Exception:
+            # if exception, assume its not created yet TODO: don't catch all
            should_update = True
 
     if should_update:
+        if revision_name:
+            content_id = f"{content_id}::{revision_name}"
+
+        logger.debug(f"Updating metadata for `{content_id}`")
+
         await api_client.log_metadata(content_id, text, _json)
 
 
@@ -102,12 +105,6 @@ class ResultWrapper(ABC):
 
         return self.to_widget()
 
-    def _markdown_description_to_html(self, description: str):
-        """
-        Convert a markdown string to html
-        """
-        return md_to_html(description)
-
     def _summary_tables_to_widget(self, summary: ResultSummary):
         """
         Create an ipywdiget representation of the summary tables
@@ -277,9 +274,7 @@ class MetricResultWrapper(ResultWrapper):
             metric_description = metric_description.get_description()
             self.result_metadata[0]["text"] = metric_description
 
-        vbox_children.append(
-            HTML(value=self._markdown_description_to_html(metric_description))
-        )
+        vbox_children.append(HTML(value=metric_description))
 
         if self.metric:
             if self.output_template:
@@ -464,9 +459,7 @@ class ThresholdTestResultWrapper(ResultWrapper):
             metric_description = metric_description.get_description()
             self.result_metadata[0]["text"] = metric_description
 
-        description_html.append(
-            self._markdown_description_to_html(metric_description)
-        )
+        description_html.append(metric_description)
 
         description_html.append(
             f"""
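`update_metadata` now carries the revision name on the end of the content ID and strips it off before querying the API, so only AI-generated descriptions whose text actually changed get re-logged. A tiny sketch of just the ID parsing, matching the split at the top of the function:

```python
def split_content_id(content_id: str):
    """Separate the revision name (after '::') from the base content ID."""
    parts = content_id.split("::")
    return parts[0], parts[1] if len(parts) > 1 else None


base, revision = split_content_id(
    "metric_description:validmind.model_validation.Accuracy::Generated by ValidMind AI"
)
print(base)      # metric_description:validmind.model_validation.Accuracy
print(revision)  # Generated by ValidMind AI
```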
validmind/vm_models/test_suite/runner.py
CHANGED
@@ -145,14 +145,17 @@ class TestSuiteRunner:
 
         await asyncio.sleep(0.5)
 
-    def summarize(self):
+    def summarize(self, show_link: bool = True):
         if not is_notebook():
             return logger.info("Test suite done...")
 
+        self.pbar_description.value = "Collecting test results..."
+
         summary = TestSuiteSummary(
             title=self.suite.title,
             description=self.suite.description,
             sections=self.suite.sections,
+            show_link=show_link,
         )
         summary.display()
 
@@ -181,6 +184,6 @@ class TestSuiteRunner:
             run_async(self.log_results)
             run_async_check(self._check_progress)
 
-        self.summarize()
+        self.summarize(show_link=send)
 
         self._stop_progress_bar()
validmind/vm_models/test_suite/summary.py
CHANGED
@@ -35,8 +35,14 @@ class TestSuiteSectionSummary:
         self._build_summary()
 
     def _add_description(self):
-
-
+        if not self.description:
+            return
+
+        self._widgets.append(
+            widgets.HTML(
+                value=f'<div class="result">{md_to_html(self.description)}</div>'
+            )
+        )
 
     def _add_tests_summary(self):
         children = []
@@ -45,9 +51,9 @@ class TestSuiteSectionSummary:
         for test in self.tests:
             children.append(test.result.to_widget())
             titles.append(
-                f"❌ {test.result.name}: {test.
+                f"❌ {test.result.name}: {test.name} ({test.test_id})"
                 if isinstance(test.result, FailedResultWrapper)
-                else f"{test.result.name}: {test.
+                else f"{test.result.name}: {test.name} ({test.test_id})"
             )
 
         self._widgets.append(widgets.Accordion(children=children, titles=titles))
@@ -71,6 +77,7 @@ class TestSuiteSummary:
     title: str
     description: str
     sections: List[TestSuiteSection]
+    show_link: bool = True
 
     _widgets: List[widgets.Widget] = None
 
@@ -100,8 +107,11 @@ class TestSuiteSummary:
         self._widgets.append(widgets.HTML(value=results_link))
 
     def _add_description(self):
-
-
+        self._widgets.append(
+            widgets.HTML(
+                value=f'<div class="result">{md_to_html(self.description)}</div>'
+            )
+        )
 
     def _add_sections_summary(self):
         children = []
@@ -145,7 +155,8 @@ class TestSuiteSummary:
         self._widgets = []
 
         self._add_title()
-        self.
+        if self.show_link:
+            self._add_results_link()
         self._add_description()
         if len(self.sections) == 1:
             self._add_top_level_section_summary()
validmind/vm_models/test_suite/test.py
CHANGED
@@ -21,6 +21,7 @@ class TestSuiteTest:
 
     test_id: str
     output_template: str = None
+    name: str = None
 
     _test_class: Test = None
     _test_instance: Test = None
@@ -39,6 +40,8 @@ class TestSuiteTest:
             self.test_id = test_id_or_obj["id"]
             self.output_template = test_id_or_obj.get("output_template")
 
+        self.name = test_id_to_name(self.test_id)
+
         try:
             self._test_class = load_test_class(self.test_id)
         except LoadTestError as e:
@@ -52,14 +55,6 @@ class TestSuiteTest:
             # since _test_class is None
             logger.error(f"Failed to load test '{self.test_id}': {e}")
 
-    @property
-    def title(self):
-        return test_id_to_name(self.test_id)
-
-    @property
-    def name(self):
-        return self._test_class.name
-
     @property
     def test_type(self):
         return self._test_class.test_type
@@ -86,12 +81,12 @@ class TestSuiteTest:
             )
         except Exception as e:
             logger.error(
-                f"Failed to load test '{self.
+                f"Failed to load test '{self.test_id}': "
                 f"({e.__class__.__name__}) {e}"
             )
             self.result = FailedResultWrapper(
                 error=e,
-                message=f"Failed to load test '{self.
+                message=f"Failed to load test '{self.name}'",
                 result_id=self.test_id,
             )
 
@@ -107,7 +102,7 @@ class TestSuiteTest:
             # run the test and log the performance if LOG_LEVEL is set to DEBUG
             log_performance(
                 func=self._test_instance.run,
-                name=self.
+                name=self.test_id,
                 logger=logger,
             )()  # this is a decorator so we need to call it
 
@@ -116,14 +111,13 @@ class TestSuiteTest:
                 raise e  # Re-raise the exception if we are in fail fast mode
 
             logger.error(
-                f"Failed to run test '{self.
-                f"({e.__class__.__name__}) {e}"
+                f"Failed to run test '{self.test_id}': " f"({e.__class__.__name__}) {e}"
             )
             self.result = FailedResultWrapper(
                 name=f"Failed {self._test_instance.test_type}",
                 error=e,
-                message=f"Failed to run '{self.
-                result_id=self.
+                message=f"Failed to run '{self.name}'",
+                result_id=self.test_id,
             )
 
             return
@@ -132,8 +126,8 @@ class TestSuiteTest:
             self.result = FailedResultWrapper(
                 name=f"Failed {self._test_instance.test_type}",
                 error=None,
-                message=f"'{self.
-                result_id=self.
+                message=f"'{self.name}' did not return a result",
+                result_id=self.test_id,
             )
 
             return
@@ -142,9 +136,8 @@ class TestSuiteTest:
             self.result = FailedResultWrapper(
                 name=f"Failed {self._test_instance.test_type}",
                 error=None,
-                message=f"
-
-                result_id=self._test_instance.name,
+                message=f"{self.name} returned an invalid result: {self._test_instance.result}",
+                result_id=self.test_id,
             )
 
             return
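TestSuiteTest now resolves a human-readable `name` once in `__init__` via `test_id_to_name` instead of exposing `title`/`name` properties, so failure messages keep working even when the test class cannot be loaded. The real converter lives in validmind.utils; the standalone stand-in below is illustrative only and may differ from the actual implementation:

```python
import re


def test_id_to_name(test_id: str) -> str:
    # illustrative only: take the last segment of the dotted ID and
    # split the CamelCase class name into words
    last = test_id.split(".")[-1]
    return " ".join(re.findall(r"[A-Z][a-z0-9]*|[a-z0-9]+", last))


print(test_id_to_name("validmind.data_validation.ClassImbalance"))
# -> Class Imbalance
```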
{validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 validmind/__init__.py,sha256=XqPjCbFMvEYl0cIT42EZKP7DFMYDC7KDW6syo8MGkDg,3682
-validmind/__version__.py,sha256=
-validmind/ai.py,sha256=
-validmind/api_client.py,sha256=
+validmind/__version__.py,sha256=qzqxcwWCwWgKw_eJA2nZPycPzwfpaSjAKO3MwNvDqgw,22
+validmind/ai.py,sha256=Uc09ulMZhu0VgbdZtHlRuzRg1QeCHVXJMXmZd6dbyEQ,9071
+validmind/api_client.py,sha256=kIEO515kp_l5LA_QyRgHOumYaOIMSrCnl9Nj4Rm5TK8,15948
 validmind/client.py,sha256=S_FozHlMJBgF8IQJES27LeFoYcoCcGZ6dkxE8adyIRQ,18607
 validmind/client_config.py,sha256=58L6s6-9vFWC9vkSs_98CjV1YWmlksdhblJtPQxQsAk,1611
 validmind/datasets/__init__.py,sha256=oYfcvW7BAyUgpghBOnTeGbQF6tpFAWg38rRirdLr8m8,262
@@ -59,7 +59,7 @@ validmind/errors.py,sha256=qy7Gp6Uom5J6WmLw-CpE5zaTN96SiN7kJjDGBaJdoxY,8023
 validmind/html_templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 validmind/html_templates/content_blocks.py,sha256=AHQ5MlhR1JYldel7soo5ztpTJJ5-kYtyKPBmh-vwxuI,3997
 validmind/input_registry.py,sha256=zexO3x-vncaoWvQ6VfkvgDLn6x72e2BNel_jCbrVHSE,793
-validmind/logging.py,sha256=
+validmind/logging.py,sha256=J1Y1dYCH1dtkoYCHoXMOQH_B7EO4fJytWRDrDqZZz8U,5204
 validmind/models/__init__.py,sha256=lraTbNwoKckXNP3Dbyj-euI78UTkZ_w5wpUOb8l5nWs,729
 validmind/models/foundation.py,sha256=LSUdpnBYlPiOUVrTyofStPdoR6y0_nqJoM9TiYT1MRo,1758
 validmind/models/function.py,sha256=loZoheqGyTvHze1XROEX1aqXgM08kPMr67X1nutaaeU,1629
@@ -259,7 +259,7 @@ validmind/tests/prompt_validation/NegativeInstruction.py,sha256=1aqNV_vB5oM2_8UX
 validmind/tests/prompt_validation/Robustness.py,sha256=VIQotugWQ32Q1kr1kacBuqk-q1EPTRi9NZAIYrTDsY0,6826
 validmind/tests/prompt_validation/Specificity.py,sha256=v823rZAr9a810Q_RlgH7FqPPxXZ00hDJApkFaJJ8mgk,6116
 validmind/tests/prompt_validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-validmind/tests/prompt_validation/ai_powered_test.py,sha256=
+validmind/tests/prompt_validation/ai_powered_test.py,sha256=qE3OXU0Db3z7KNqHMWZE8e78BeGg6pB0IozSzDNXIdc,2945
 validmind/tests/test_providers.py,sha256=1tYn_sWNqifFpOp8eNvcVyJzxBjhHV5Py4FxO8opPZA,4944
 validmind/unit_metrics/__init__.py,sha256=a7oV8YRC-O6dF7ePz4E8Fqrh4ax6AWT26Y996VPView,7084
 validmind/unit_metrics/classification/sklearn/Accuracy.py,sha256=2Ra_OpKceY01h1dAFCqRFAwe--K2oVbCUiYjM5AH_nQ,480
@@ -279,7 +279,7 @@ validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py,sha256=LCNgpDw6FB
 validmind/unit_metrics/regression/sklearn/MeanSquaredError.py,sha256=7UQnDTTO7yRRyMe3Zac9ZyjEbbD8pW_8WnZwHdVB_8U,463
 validmind/unit_metrics/regression/sklearn/RSquaredScore.py,sha256=h9U5ndtnJfNNtKPZIo5n3KRp-m4akQcEo0t1iSwjVzY,420
 validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py,sha256=_5IQIU9jNfmTE4NLJvaRWXbudRGV2PS7nYF5e4fkSMY,556
-validmind/utils.py,sha256=
+validmind/utils.py,sha256=ZQ016Cbgc_hrQb2HZ7s9KH80fDncnQZXFwa9oi8JO8g,16931
 validmind/vm_models/__init__.py,sha256=lmWCD2u4tW6_AH39UnJ24sCcMUcsHbUttz7SaZfrh3s,1168
 validmind/vm_models/dataset/__init__.py,sha256=U4CxZjdoc0dd9u2AqBl5PJh1UVbzXWNrmundmjLF-qE,346
 validmind/vm_models/dataset/dataset.py,sha256=VlR5Wp5pCoXY3U0C8AbevaySFGf0KJ3QIK3go5OEbog,21843
@@ -290,17 +290,17 @@ validmind/vm_models/test/metric.py,sha256=R7Y-_fzBcIrkJw7-BeifQHMuHTV3HLDc8T3nS_
 validmind/vm_models/test/metric_result.py,sha256=Bak4GDrMlNq5NtgP5exwlPsKZgz3tWgtC6jZqtHjvqM,1987
 validmind/vm_models/test/output_template.py,sha256=njqCAMyLxwadkCWhACVskyL9-psTgmUysaeeirTVAX4,1500
 validmind/vm_models/test/result_summary.py,sha256=QJcIKJUeBf5wW3lyue6ctsi1jKSyoiAIfmjudGJiJtc,2028
-validmind/vm_models/test/result_wrapper.py,sha256=
+validmind/vm_models/test/result_wrapper.py,sha256=an310hWJpVvWDrVSFvjTDUBDSE4XJ0aDliSVnKsgZaQ,17611
 validmind/vm_models/test/test.py,sha256=434PqhPcbwfCmNjYVwHGMG-rViIatb9-1nmxkdZF8Xo,3104
-validmind/vm_models/test/threshold_test.py,sha256=
+validmind/vm_models/test/threshold_test.py,sha256=7d46Z5N_U1hTr6LGa2A0_ZuaIFl54xZ_eRzgf-KUGjk,3662
 validmind/vm_models/test/threshold_test_result.py,sha256=EXP-g_e3NsnpkvNgYew030qVUoY6ZTHyuuFUXaq-BuM,1954
 validmind/vm_models/test_context.py,sha256=AN7-atBgOcD04MLVitCFJYooxF6_iNmvI2H4nkv32iw,9035
-validmind/vm_models/test_suite/runner.py,sha256=
-validmind/vm_models/test_suite/summary.py,sha256=
-validmind/vm_models/test_suite/test.py,sha256=
+validmind/vm_models/test_suite/runner.py,sha256=uDt1eo3sHUXXV-ZN_gJUKR-0Hp5RNtUcDgKHQXtLf7s,6893
+validmind/vm_models/test_suite/summary.py,sha256=co-xJJMUYGb7cOiVmw0i8vpZlfiMqrWjaCOmHKMAbcE,4686
+validmind/vm_models/test_suite/test.py,sha256=_GfbK36l98SjzgVcucmp0OKBJKqMW3neO7SqJ3EWeps,5049
 validmind/vm_models/test_suite/test_suite.py,sha256=Cns2wL54v0T5Mv5_HJb3kMeaa4rtycdqT8KxK9_rWEU,6279
-validmind-2.2.
-validmind-2.2.
-validmind-2.2.
-validmind-2.2.
-validmind-2.2.
+validmind-2.2.6.dist-info/LICENSE,sha256=XonPUfwjvrC5Ombl3y-ko0Wubb1xdG_7nzvIbkZRKHw,35772
+validmind-2.2.6.dist-info/METADATA,sha256=x00vy4OCVq0TNkM2jjt2Jzl-1FwH-I96zzIuoHfXRHU,3911
+validmind-2.2.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+validmind-2.2.6.dist-info/entry_points.txt,sha256=HuW7YyOv9u_OEWpViQXtv0nfoI67uieJHawKWA4Hv9A,76
+validmind-2.2.6.dist-info/RECORD,,
{validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/LICENSE
File without changes
{validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/WHEEL
File without changes
{validmind-2.2.4.dist-info → validmind-2.2.6.dist-info}/entry_points.txt
File without changes