validmind 2.2.5__py3-none-any.whl → 2.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +93 -27
- validmind/api_client.py +12 -12
- validmind/logging.py +38 -32
- validmind/tests/prompt_validation/ai_powered_test.py +6 -6
- validmind/utils.py +24 -10
- validmind/vm_models/test/result_wrapper.py +26 -33
- validmind/vm_models/test_suite/runner.py +5 -2
- validmind/vm_models/test_suite/summary.py +18 -7
- validmind/vm_models/test_suite/test.py +13 -20
- {validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/METADATA +1 -1
- {validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/RECORD +15 -15
- {validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/LICENSE +0 -0
- {validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/WHEEL +0 -0
- {validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/entry_points.txt +0 -0
validmind/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.2.5"
+__version__ = "2.2.6"
validmind/ai.py
CHANGED
@@ -7,6 +7,11 @@ import os
 
 from openai import AzureOpenAI, OpenAI
 
+from .logging import get_logger
+
+logger = get_logger(__name__)
+
+
 SYSTEM_PROMPT = """
 You are an expert data scientist and MRM specialist.
 You are tasked with analyzing the results of a quantitative test run on some model or dataset.
@@ -19,6 +24,7 @@ This will act as the description and interpretation of the result in the model d
 It will be displayed alongside the test results table and figures.
 
 Avoid long sentences and complex vocabulary.
+Avoid overly verbose explanations - the goal is to explain to a user what they are seeing in the results.
 Structure the response clearly and logically.
 Use valid Markdown syntax to format the response.
 Respond only with your analysis and insights, not the verbatim test results.
@@ -28,9 +34,10 @@ Use the Test ID that is provided to form the Test Name e.g. "ClassImbalance" ->
 Explain the test, its purpose, its mechanism/formula etc and why it is useful.
 If relevant, provide a very brief description of the way this test is used in model/dataset evaluation and how it is interpreted.
 Highlight the key insights from the test results. The key insights should be concise and easily understood.
+An insight should only be included if it is something not entirely obvious from the test results.
 End the response with any closing remarks, summary or additional useful information.
 
-Use the following format for the response (feel free to
+Use the following format for the response (feel free to stray from it if necessary - this is a suggested starting point):
 
 <ResponseFormat>
 **<Test Name>** calculates the xyz <continue to explain what it does in detail>...
@@ -73,12 +80,17 @@ The attached plots show the results of the test.
 __client = None
 __model = None
 
+# can be None, True or False (ternary to represent initial state, ack and failed ack)
+__ack = None
+
 __executor = concurrent.futures.ThreadPoolExecutor()
 
 
 def __get_client_and_model():
-    """
-
+    """Get model and client to use for generating interpretations
+
+    On first call, it will look in the environment for the API key endpoint, model etc.
+    and store them in a global variable to avoid loading them up again.
     """
     global __client, __model
 
@@ -86,8 +98,10 @@ def __get_client_and_model():
         return __client, __model
 
     if "OPENAI_API_KEY" in os.environ:
-        __client = OpenAI(api_key=os.
-        __model = os.
+        __client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+        __model = os.getenv("VM_OPENAI_MODEL", "gpt-4o")
+
+        logger.debug(f"Using OpenAI {__model} for generating descriptions")
 
     elif "AZURE_OPENAI_KEY" in os.environ:
         if "AZURE_OPENAI_ENDPOINT" not in os.environ:
@@ -101,11 +115,13 @@ def __get_client_and_model():
             )
 
         __client = AzureOpenAI(
-            azure_endpoint=os.
-            api_key=os.
-            api_version=os.
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_key=os.getenv("AZURE_OPENAI_KEY"),
+            api_version=os.getenv("AZURE_OPENAI_VERSION", "2023-05-15"),
         )
-        __model = os.
+        __model = os.getenv("AZURE_OPENAI_MODEL")
+
+        logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")
 
     else:
         raise ValueError("OPENAI_API_KEY or AZURE_OPENAI_KEY must be set")
@@ -126,12 +142,19 @@ class DescriptionFuture:
         self._future = future
 
     def get_description(self):
-
-
+        from .utils import md_to_html
+
+        if isinstance(self._future, str):
+            description = self._future
+        else:
+            # This will block until the future is completed
+            description = self._future.result()
+
+        return md_to_html(description, mathml=True)
 
 
-def
-
+def generate_description(
+    test_id: str,
     test_description: str,
     test_summary: str,
     figures: list = None,
@@ -140,14 +163,25 @@ def generate_description_async(
     if not test_summary and not figures:
         raise ValueError("No summary or figures provided - cannot generate description")
 
-    client,
+    client, model = __get_client_and_model()
     # get last part of test id
-    test_name =
+    test_name = test_id.split(".")[-1]
+    # truncate the test description to save time
+    test_description = (
+        f"{test_description[:500]}..."
+        if len(test_description) > 500
+        else test_description
+    )
 
     if test_summary:
+        logger.debug(
+            f"Generating description for test {test_name} with stringified summary"
+        )
         return (
             client.chat.completions.create(
-                model=
+                model=model,
+                temperature=0,
+                seed=42,
                 messages=[
                     {"role": "system", "content": SYSTEM_PROMPT},
                     {
@@ -164,9 +198,14 @@ def generate_description_async(
             .message.content.strip()
        )
 
+    logger.debug(
+        f"Generating description for test {test_name} with {len(figures)} figures"
+    )
     return (
         client.chat.completions.create(
-            model=
+            model=model,
+            temperature=0,
+            seed=42,
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {
@@ -197,18 +236,45 @@ def generate_description_async(
     )
 
 
-def
-
+def background_generate_description(
+    test_id: str,
     test_description: str,
     test_summary: str,
     figures: list = None,
 ):
-
-
-
-
-
-
-
+    def wrapped():
+        try:
+            return generate_description(
+                test_id, test_description, test_summary, figures
+            )
+        except Exception as e:
+            logger.error(f"Failed to generate description: {e}")
+
+            return test_description
+
+    return DescriptionFuture(__executor.submit(wrapped))
+
+
+def is_configured():
+    global __ack
+
+    if __ack:
+        return True
+
+    try:
+        client, model = __get_client_and_model()
+        # send an empty message with max_tokens=1 to "ping" the API
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": ""}],
+            max_tokens=1,
+        )
+        logger.debug(
+            f"Received response from OpenAI: {response.choices[0].message.content}"
+        )
+        __ack = True
+    except Exception as e:
+        logger.debug(f"Failed to connect to OpenAI: {e}")
+        __ack = False
 
-    return
+    return __ack
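Note on the new helpers above: `background_generate_description()` submits the OpenAI call to the module-level thread pool and returns a `DescriptionFuture`, while `is_configured()` performs a one-off ping and caches the result. Below is a minimal sketch of how these pieces fit together, assuming `OPENAI_API_KEY` is set; the test ID and inputs are illustrative values only, not part of this release.

```python
# Illustrative sketch only - function names come from the validmind/ai.py diff
# above; the test ID, description and summary are made-up example values.
from validmind.ai import background_generate_description, is_configured

if is_configured():  # pings the configured OpenAI/Azure endpoint once and caches the ack
    future = background_generate_description(
        test_id="validmind.data_validation.ClassImbalance",  # example test ID
        test_description="Checks the balance of classes in the target column.",
        test_summary='[{"class": "0", "count": 900}, {"class": "1", "count": 100}]',
    )
    # get_description() blocks on the thread-pool future and converts the
    # Markdown answer to HTML via md_to_html(..., mathml=True)
    html_description = future.get_description()
```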
validmind/api_client.py
CHANGED
@@ -22,19 +22,19 @@ from aiohttp import FormData
 from .client_config import client_config
 from .errors import MissingAPICredentialsError, MissingProjectIdError, raise_api_error
 from .logging import get_logger, init_sentry, send_single_error
-from .utils import NumpyEncoder,
+from .utils import NumpyEncoder, run_async
 from .vm_models import Figure, MetricResult, ThresholdTestResults
 
 # TODO: can't import types from vm_models because of circular dependency
 
 logger = get_logger(__name__)
 
-_api_key = os.
-_api_secret = os.
-_api_host = os.
+_api_key = os.getenv("VM_API_KEY")
+_api_secret = os.getenv("VM_API_SECRET")
+_api_host = os.getenv("VM_API_HOST")
 
-_project = os.
-_run_cuid = os.
+_project = os.getenv("VM_API_PROJECT")
+_run_cuid = os.getenv("VM_RUN_CUID")
 
 __api_session: aiohttp.ClientSession = None
 
@@ -102,21 +102,21 @@ def init(
         api_secret = None
         project = None
 
-    _project = project or os.
+    _project = project or os.getenv("VM_API_PROJECT")
 
     if _project is None:
         raise MissingProjectIdError()
 
-    _api_key = api_key or os.
-    _api_secret = api_secret or os.
+    _api_key = api_key or os.getenv("VM_API_KEY")
+    _api_secret = api_secret or os.getenv("VM_API_SECRET")
 
     if _api_key is None or _api_secret is None:
         raise MissingAPICredentialsError()
 
-    _api_host = api_host or os.
+    _api_host = api_host or os.getenv(
         "VM_API_HOST", "http://127.0.0.1:5000/api/v1/tracking"
     )
-    _run_cuid = os.
+    _run_cuid = os.getenv("VM_RUN_CUID", None)
 
     try:
         __ping()
@@ -349,7 +349,7 @@ async def log_metadata(
     """
     metadata_dict = {"content_id": content_id}
     if text is not None:
-        metadata_dict["text"] =
+        metadata_dict["text"] = text
     if _json is not None:
         metadata_dict["json"] = _json
 
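The switch to `os.getenv()` above does not change the variable names the client reads. A hedged example of configuring the connection purely through the environment follows; the values are placeholders, and `validmind.init()` is assumed here to be the public wrapper around the `init()` shown in this diff.

```python
# Placeholder values only - substitute real credentials from your ValidMind project.
import os

os.environ["VM_API_KEY"] = "<api key>"
os.environ["VM_API_SECRET"] = "<api secret>"
os.environ["VM_API_PROJECT"] = "<project identifier>"
# VM_API_HOST falls back to "http://127.0.0.1:5000/api/v1/tracking" when unset

import validmind as vm

vm.init()  # unset arguments are read from the environment via os.getenv()
```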
validmind/logging.py
CHANGED
@@ -13,22 +13,45 @@ from sentry_sdk.utils import event_from_exception, exc_info_from_error
 
 from .__version__ import __version__
 
-__log_level = None
 __dsn = "https://48f446843657444aa1e2c0d716ef864b@o1241367.ingest.sentry.io/4505239625465856"
 
 
 def _get_log_level():
-    """Get the log level from the environment variable
-
-    return __log_level
+    """Get the log level from the environment variable"""
+    log_level_str = os.getenv("LOG_LEVEL", "INFO").upper()
 
-    log_level_str = os.environ.get("LOG_LEVEL", "INFO").upper()
     if log_level_str not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
         raise ValueError(f"Invalid log level: {log_level_str}")
 
     return logging.getLevelName(log_level_str)
 
 
+def get_logger(name="validmind", log_level=None):
+    """Get a logger for the given module name"""
+    formatter = logging.Formatter(
+        fmt="%(asctime)s - %(levelname)s(%(name)s): %(message)s"
+    )
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+
+    logger = logging.getLogger(name)
+    logger.setLevel(log_level or _get_log_level())
+
+    # Clear existing handlers if any (or refine the existing logic as necessary)
+    # TODO: move this to a yaml config and only configure once
+    if not any(
+        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
+        for h in logger.handlers
+    ):
+        logger.addHandler(handler)
+
+    # Prevent logger from propagating to root logger
+    logger.propagate = False
+
+    return logger
+
+
 def init_sentry(server_config):
     """Initialize Sentry SDK for sending logs back to ValidMind
 
@@ -42,7 +65,10 @@ def init_sentry(server_config):
         - dsn (str): The Sentry DSN
         ...: Other config options for Sentry
     """
-    if
+    if os.getenv("VM_NO_TELEMETRY", False):
+        return
+
+    if not server_config.get("send_logs", False):
         return
 
     config = {
@@ -53,33 +79,13 @@ def init_sentry(server_config):
         "environment": "production",
     }
     config.update({k: v for k, v in server_config.items() if k != "send_logs"})
-    sentry_sdk.init(**config)
-
-
-def get_logger(name="validmind", log_level=None):
-    """Get a logger for the given name"""
-    formatter = logging.Formatter(
-        fmt="%(asctime)s - %(levelname)s(%(name)s): %(message)s"
-    )
-
-    handler = logging.StreamHandler()
-    handler.setFormatter(formatter)
-
-    logger = logging.getLogger(name)
-    logger.setLevel(log_level or _get_log_level())
-
-    # Clear existing handlers if any (or refine the existing logic as necessary)
-    # TODO: lets add some better handler management
-    if not any(
-        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
-        for h in logger.handlers
-    ):
-        logger.addHandler(handler)
-
-    # Prevent logger from propagating to root logger
-    logger.propagate = False
 
-
+    try:
+        sentry_sdk.init(**config)
+    except Exception as e:
+        logger = get_logger(__name__)
+        logger.info("Sentry failed to initialize - ignoring...")
+        logger.debug(f"Sentry error: {str(e)}")
 
 
 def log_performance(func, name=None, logger=None, force=False):

validmind/tests/prompt_validation/ai_powered_test.py
CHANGED
@@ -20,8 +20,8 @@ class AIPoweredTest:
 
     def __init__(self, *args, **kwargs):
         if "OPENAI_API_KEY" in os.environ:
-            self.client = OpenAI(api_key=os.
-            self.model_name = os.
+            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+            self.model_name = os.getenv("VM_OPENAI_MODEL", "gpt-3.5-turbo")
 
         elif "AZURE_OPENAI_KEY" in os.environ:
             if "AZURE_OPENAI_ENDPOINT" not in os.environ:
@@ -35,11 +35,11 @@ class AIPoweredTest:
                 )
 
             self.client = AzureOpenAI(
-                azure_endpoint=os.
-                api_key=os.
-                api_version=os.
+                azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+                api_key=os.getenv("AZURE_OPENAI_KEY"),
+                api_version=os.getenv("AZURE_OPENAI_VERSION", "2023-05-15"),
             )
-            self.model_name = os.
+            self.model_name = os.getenv("AZURE_OPENAI_MODEL")
 
         else:
             raise ValueError(
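Referring back to the validmind/logging.py changes above: log verbosity and the new telemetry opt-out are both controlled by environment variables that are read lazily. A small usage sketch (the values shown are examples):

```python
# Example values - LOG_LEVEL is validated in _get_log_level() and VM_NO_TELEMETRY
# is the new opt-out checked by init_sentry() in the logging.py diff above.
import os

os.environ["LOG_LEVEL"] = "DEBUG"    # must be DEBUG/INFO/WARNING/ERROR/CRITICAL
os.environ["VM_NO_TELEMETRY"] = "1"  # any non-empty value skips sentry_sdk.init()

from validmind.logging import get_logger

logger = get_logger(__name__)  # attaches the formatter/handler once; no propagation
logger.debug("validmind debug logging enabled")
```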
validmind/utils.py
CHANGED
@@ -26,9 +26,12 @@ from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate
 
-from .ai import
+from .ai import background_generate_description, is_configured
 from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
 
+AI_REVISION_NAME = "Generated by ValidMind AI"
+DEFAULT_REVISION_NAME = "Default Description"
+
 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
 
@@ -471,9 +474,11 @@ def get_description_metadata(
     Generates an LLM interpretation of the test results or uses the default
     description and returns a metadata object that can be logged with the test results.
 
-
-
-
+    By default, the description is generated by an LLM that will interpret the test
+    results and provide a human-readable description. If the summary or figures are
+    not provided, or the `VALIDMIND_LLM_DESCRIPTIONS_ENABLED` environment variable is
+    set to `0` or `false` or no LLM has been configured, the default description will
+    be used as the test result description.
 
     Note: Either the summary or figures must be provided to generate the description.
 
@@ -487,17 +492,26 @@ def get_description_metadata(
     Returns:
         dict: The metadata object to be logged with the test results
     """
-
-
-
-
+    env_disabled = os.getenv("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "1") in [
+        "0",
+        "false",
+    ]
+
+    if (summary or figures) and not env_disabled and is_configured():
+        revision_name = AI_REVISION_NAME
+
+        # get description future and set it as the description in the metadata
+        # this will lazily retrieved so it can run in the background in parallel
+        description = background_generate_description(
+            test_id=test_id,
            test_description=default_description,
            test_summary=summary,
            figures=figures,
        )
+
     else:
-        revision_name =
-        description = default_description
+        revision_name = DEFAULT_REVISION_NAME
+        description = md_to_html(default_description, mathml=True)
 
     return {
         "content_id": f"{prefix}:{test_id}::{revision_name}",

validmind/vm_models/test/result_wrapper.py
CHANGED
@@ -7,7 +7,6 @@ Result Wrappers for test and metric results
 """
 import asyncio
 import json
-import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
@@ -19,7 +18,7 @@ from ... import api_client
 from ...ai import DescriptionFuture
 from ...input_registry import input_registry
 from ...logging import get_logger
-from ...utils import NumpyEncoder, display,
+from ...utils import AI_REVISION_NAME, NumpyEncoder, display, run_async, test_id_to_name
 from ..dataset import VMDataset
 from ..figure import Figure
 from .metric_result import MetricResult
@@ -31,31 +30,35 @@ logger = get_logger(__name__)
 
 
 async def update_metadata(content_id: str, text: str, _json: Union[Dict, List] = None):
-    """
-
-
-
-    """
-    should_update = False
-
-    # check if the env variable is set to force overwriting metadata
-    if os.environ.get("VM_OVERRIDE_METADATA", "false").lower() == "true":
-        should_update = True
+    """Create or Update a Metadata Object"""
+    parts = content_id.split("::")
+    content_id = parts[0]
+    revision_name = parts[1] if len(parts) > 1 else None
 
-    #
-
-    # we always want composite metric definitions to be updated
-        should_update = True
+    # we always want composite metric definitions to be updated
+    should_update = content_id.startswith("composite_metric_def:")
 
-    # if
-    if
+    # if we are updating a metric or test description, we check if the text
+    # has changed from the last time it was logged, and only update if it has
+    if content_id.split(":", 1)[0] in ["metric_description", "test_description"]:
         try:
-            await api_client.get_metadata(content_id)
-
-            #
+            md = await api_client.get_metadata(content_id)
+            # if there is an existing description, only update it if the new one
+            # is different and is an AI-generated description
+            should_update = (
+                md["text"] != text if revision_name == AI_REVISION_NAME else False
+            )
+            logger.debug(f"Check if description has changed: {should_update}")
+        except Exception:
+            # if exception, assume its not created yet TODO: don't catch all
             should_update = True
 
     if should_update:
+        if revision_name:
+            content_id = f"{content_id}::{revision_name}"
+
+        logger.debug(f"Updating metadata for `{content_id}`")
+
         await api_client.log_metadata(content_id, text, _json)
 
 
@@ -102,12 +105,6 @@ class ResultWrapper(ABC):
 
         return self.to_widget()
 
-    def _markdown_description_to_html(self, description: str):
-        """
-        Convert a markdown string to html
-        """
-        return md_to_html(description)
-
     def _summary_tables_to_widget(self, summary: ResultSummary):
         """
         Create an ipywdiget representation of the summary tables
@@ -277,9 +274,7 @@ class MetricResultWrapper(ResultWrapper):
             metric_description = metric_description.get_description()
             self.result_metadata[0]["text"] = metric_description
 
-            vbox_children.append(
-                HTML(value=self._markdown_description_to_html(metric_description))
-            )
+            vbox_children.append(HTML(value=metric_description))
 
         if self.metric:
             if self.output_template:
@@ -464,9 +459,7 @@ class ThresholdTestResultWrapper(ResultWrapper):
             metric_description = metric_description.get_description()
             self.result_metadata[0]["text"] = metric_description
 
-            description_html.append(
-                self._markdown_description_to_html(metric_description)
-            )
+            description_html.append(metric_description)
 
         description_html.append(
             f"""

validmind/vm_models/test_suite/runner.py
CHANGED
@@ -145,14 +145,17 @@ class TestSuiteRunner:
 
             await asyncio.sleep(0.5)
 
-    def summarize(self):
+    def summarize(self, show_link: bool = True):
         if not is_notebook():
             return logger.info("Test suite done...")
 
+        self.pbar_description.value = "Collecting test results..."
+
         summary = TestSuiteSummary(
             title=self.suite.title,
             description=self.suite.description,
             sections=self.suite.sections,
+            show_link=show_link,
         )
         summary.display()
 
@@ -181,6 +184,6 @@ class TestSuiteRunner:
         run_async(self.log_results)
         run_async_check(self._check_progress)
 
-        self.summarize()
+        self.summarize(show_link=send)
 
         self._stop_progress_bar()

validmind/vm_models/test_suite/summary.py
CHANGED
@@ -35,8 +35,14 @@ class TestSuiteSectionSummary:
         self._build_summary()
 
     def _add_description(self):
-
-
+        if not self.description:
+            return
+
+        self._widgets.append(
+            widgets.HTML(
+                value=f'<div class="result">{md_to_html(self.description)}</div>'
+            )
+        )
 
     def _add_tests_summary(self):
         children = []
@@ -45,9 +51,9 @@ class TestSuiteSectionSummary:
         for test in self.tests:
             children.append(test.result.to_widget())
             titles.append(
-                f"❌ {test.result.name}: {test.
+                f"❌ {test.result.name}: {test.name} ({test.test_id})"
                 if isinstance(test.result, FailedResultWrapper)
-                else f"{test.result.name}: {test.
+                else f"{test.result.name}: {test.name} ({test.test_id})"
             )
 
         self._widgets.append(widgets.Accordion(children=children, titles=titles))
@@ -71,6 +77,7 @@ class TestSuiteSummary:
     title: str
     description: str
     sections: List[TestSuiteSection]
+    show_link: bool = True
 
     _widgets: List[widgets.Widget] = None
 
@@ -100,8 +107,11 @@ class TestSuiteSummary:
         self._widgets.append(widgets.HTML(value=results_link))
 
     def _add_description(self):
-
-
+        self._widgets.append(
+            widgets.HTML(
+                value=f'<div class="result">{md_to_html(self.description)}</div>'
+            )
+        )
 
     def _add_sections_summary(self):
         children = []
@@ -145,7 +155,8 @@ class TestSuiteSummary:
         self._widgets = []
 
         self._add_title()
-        self.
+        if self.show_link:
+            self._add_results_link()
         self._add_description()
         if len(self.sections) == 1:
             self._add_top_level_section_summary()

validmind/vm_models/test_suite/test.py
CHANGED
@@ -21,6 +21,7 @@ class TestSuiteTest:
 
     test_id: str
     output_template: str = None
+    name: str = None
 
     _test_class: Test = None
     _test_instance: Test = None
@@ -39,6 +40,8 @@ class TestSuiteTest:
             self.test_id = test_id_or_obj["id"]
             self.output_template = test_id_or_obj.get("output_template")
 
+        self.name = test_id_to_name(self.test_id)
+
         try:
             self._test_class = load_test_class(self.test_id)
         except LoadTestError as e:
@@ -52,14 +55,6 @@ class TestSuiteTest:
             # since _test_class is None
             logger.error(f"Failed to load test '{self.test_id}': {e}")
 
-    @property
-    def title(self):
-        return test_id_to_name(self.test_id)
-
-    @property
-    def name(self):
-        return self._test_class.name
-
     @property
     def test_type(self):
         return self._test_class.test_type
@@ -86,12 +81,12 @@ class TestSuiteTest:
             )
         except Exception as e:
             logger.error(
-                f"Failed to load test '{self.
+                f"Failed to load test '{self.test_id}': "
                 f"({e.__class__.__name__}) {e}"
             )
             self.result = FailedResultWrapper(
                 error=e,
-                message=f"Failed to load test '{self.
+                message=f"Failed to load test '{self.name}'",
                 result_id=self.test_id,
             )
 
@@ -107,7 +102,7 @@ class TestSuiteTest:
             # run the test and log the performance if LOG_LEVEL is set to DEBUG
             log_performance(
                 func=self._test_instance.run,
-                name=self.
+                name=self.test_id,
                 logger=logger,
             )()  # this is a decorator so we need to call it
 
@@ -116,14 +111,13 @@ class TestSuiteTest:
                 raise e  # Re-raise the exception if we are in fail fast mode
 
             logger.error(
-                f"Failed to run test '{self.
-                f"({e.__class__.__name__}) {e}"
+                f"Failed to run test '{self.test_id}': " f"({e.__class__.__name__}) {e}"
             )
             self.result = FailedResultWrapper(
                 name=f"Failed {self._test_instance.test_type}",
                 error=e,
-                message=f"Failed to run '{self.
-                result_id=self.
+                message=f"Failed to run '{self.name}'",
+                result_id=self.test_id,
             )
 
             return
@@ -132,8 +126,8 @@ class TestSuiteTest:
             self.result = FailedResultWrapper(
                 name=f"Failed {self._test_instance.test_type}",
                 error=None,
-                message=f"'{self.
-                result_id=self.
+                message=f"'{self.name}' did not return a result",
+                result_id=self.test_id,
             )
 
             return
@@ -142,9 +136,8 @@ class TestSuiteTest:
             self.result = FailedResultWrapper(
                 name=f"Failed {self._test_instance.test_type}",
                 error=None,
-                message=f"
-
-                result_id=self._test_instance.name,
+                message=f"{self.name} returned an invalid result: {self._test_instance.result}",
+                result_id=self.test_id,
             )
 
             return

{validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 validmind/__init__.py,sha256=XqPjCbFMvEYl0cIT42EZKP7DFMYDC7KDW6syo8MGkDg,3682
-validmind/__version__.py,sha256=
-validmind/ai.py,sha256=
-validmind/api_client.py,sha256=
+validmind/__version__.py,sha256=qzqxcwWCwWgKw_eJA2nZPycPzwfpaSjAKO3MwNvDqgw,22
+validmind/ai.py,sha256=Uc09ulMZhu0VgbdZtHlRuzRg1QeCHVXJMXmZd6dbyEQ,9071
+validmind/api_client.py,sha256=kIEO515kp_l5LA_QyRgHOumYaOIMSrCnl9Nj4Rm5TK8,15948
 validmind/client.py,sha256=S_FozHlMJBgF8IQJES27LeFoYcoCcGZ6dkxE8adyIRQ,18607
 validmind/client_config.py,sha256=58L6s6-9vFWC9vkSs_98CjV1YWmlksdhblJtPQxQsAk,1611
 validmind/datasets/__init__.py,sha256=oYfcvW7BAyUgpghBOnTeGbQF6tpFAWg38rRirdLr8m8,262
@@ -59,7 +59,7 @@ validmind/errors.py,sha256=qy7Gp6Uom5J6WmLw-CpE5zaTN96SiN7kJjDGBaJdoxY,8023
 validmind/html_templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 validmind/html_templates/content_blocks.py,sha256=AHQ5MlhR1JYldel7soo5ztpTJJ5-kYtyKPBmh-vwxuI,3997
 validmind/input_registry.py,sha256=zexO3x-vncaoWvQ6VfkvgDLn6x72e2BNel_jCbrVHSE,793
-validmind/logging.py,sha256=
+validmind/logging.py,sha256=J1Y1dYCH1dtkoYCHoXMOQH_B7EO4fJytWRDrDqZZz8U,5204
 validmind/models/__init__.py,sha256=lraTbNwoKckXNP3Dbyj-euI78UTkZ_w5wpUOb8l5nWs,729
 validmind/models/foundation.py,sha256=LSUdpnBYlPiOUVrTyofStPdoR6y0_nqJoM9TiYT1MRo,1758
 validmind/models/function.py,sha256=loZoheqGyTvHze1XROEX1aqXgM08kPMr67X1nutaaeU,1629
@@ -259,7 +259,7 @@ validmind/tests/prompt_validation/NegativeInstruction.py,sha256=1aqNV_vB5oM2_8UX
 validmind/tests/prompt_validation/Robustness.py,sha256=VIQotugWQ32Q1kr1kacBuqk-q1EPTRi9NZAIYrTDsY0,6826
 validmind/tests/prompt_validation/Specificity.py,sha256=v823rZAr9a810Q_RlgH7FqPPxXZ00hDJApkFaJJ8mgk,6116
 validmind/tests/prompt_validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-validmind/tests/prompt_validation/ai_powered_test.py,sha256=
+validmind/tests/prompt_validation/ai_powered_test.py,sha256=qE3OXU0Db3z7KNqHMWZE8e78BeGg6pB0IozSzDNXIdc,2945
 validmind/tests/test_providers.py,sha256=1tYn_sWNqifFpOp8eNvcVyJzxBjhHV5Py4FxO8opPZA,4944
 validmind/unit_metrics/__init__.py,sha256=a7oV8YRC-O6dF7ePz4E8Fqrh4ax6AWT26Y996VPView,7084
 validmind/unit_metrics/classification/sklearn/Accuracy.py,sha256=2Ra_OpKceY01h1dAFCqRFAwe--K2oVbCUiYjM5AH_nQ,480
@@ -279,7 +279,7 @@ validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py,sha256=LCNgpDw6FB
 validmind/unit_metrics/regression/sklearn/MeanSquaredError.py,sha256=7UQnDTTO7yRRyMe3Zac9ZyjEbbD8pW_8WnZwHdVB_8U,463
 validmind/unit_metrics/regression/sklearn/RSquaredScore.py,sha256=h9U5ndtnJfNNtKPZIo5n3KRp-m4akQcEo0t1iSwjVzY,420
 validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py,sha256=_5IQIU9jNfmTE4NLJvaRWXbudRGV2PS7nYF5e4fkSMY,556
-validmind/utils.py,sha256=
+validmind/utils.py,sha256=ZQ016Cbgc_hrQb2HZ7s9KH80fDncnQZXFwa9oi8JO8g,16931
 validmind/vm_models/__init__.py,sha256=lmWCD2u4tW6_AH39UnJ24sCcMUcsHbUttz7SaZfrh3s,1168
 validmind/vm_models/dataset/__init__.py,sha256=U4CxZjdoc0dd9u2AqBl5PJh1UVbzXWNrmundmjLF-qE,346
 validmind/vm_models/dataset/dataset.py,sha256=VlR5Wp5pCoXY3U0C8AbevaySFGf0KJ3QIK3go5OEbog,21843
@@ -290,17 +290,17 @@ validmind/vm_models/test/metric.py,sha256=R7Y-_fzBcIrkJw7-BeifQHMuHTV3HLDc8T3nS_
 validmind/vm_models/test/metric_result.py,sha256=Bak4GDrMlNq5NtgP5exwlPsKZgz3tWgtC6jZqtHjvqM,1987
 validmind/vm_models/test/output_template.py,sha256=njqCAMyLxwadkCWhACVskyL9-psTgmUysaeeirTVAX4,1500
 validmind/vm_models/test/result_summary.py,sha256=QJcIKJUeBf5wW3lyue6ctsi1jKSyoiAIfmjudGJiJtc,2028
-validmind/vm_models/test/result_wrapper.py,sha256=
+validmind/vm_models/test/result_wrapper.py,sha256=an310hWJpVvWDrVSFvjTDUBDSE4XJ0aDliSVnKsgZaQ,17611
 validmind/vm_models/test/test.py,sha256=434PqhPcbwfCmNjYVwHGMG-rViIatb9-1nmxkdZF8Xo,3104
 validmind/vm_models/test/threshold_test.py,sha256=7d46Z5N_U1hTr6LGa2A0_ZuaIFl54xZ_eRzgf-KUGjk,3662
 validmind/vm_models/test/threshold_test_result.py,sha256=EXP-g_e3NsnpkvNgYew030qVUoY6ZTHyuuFUXaq-BuM,1954
 validmind/vm_models/test_context.py,sha256=AN7-atBgOcD04MLVitCFJYooxF6_iNmvI2H4nkv32iw,9035
-validmind/vm_models/test_suite/runner.py,sha256=
-validmind/vm_models/test_suite/summary.py,sha256=
-validmind/vm_models/test_suite/test.py,sha256=
+validmind/vm_models/test_suite/runner.py,sha256=uDt1eo3sHUXXV-ZN_gJUKR-0Hp5RNtUcDgKHQXtLf7s,6893
+validmind/vm_models/test_suite/summary.py,sha256=co-xJJMUYGb7cOiVmw0i8vpZlfiMqrWjaCOmHKMAbcE,4686
+validmind/vm_models/test_suite/test.py,sha256=_GfbK36l98SjzgVcucmp0OKBJKqMW3neO7SqJ3EWeps,5049
 validmind/vm_models/test_suite/test_suite.py,sha256=Cns2wL54v0T5Mv5_HJb3kMeaa4rtycdqT8KxK9_rWEU,6279
-validmind-2.2.
-validmind-2.2.
-validmind-2.2.
-validmind-2.2.
-validmind-2.2.
+validmind-2.2.6.dist-info/LICENSE,sha256=XonPUfwjvrC5Ombl3y-ko0Wubb1xdG_7nzvIbkZRKHw,35772
+validmind-2.2.6.dist-info/METADATA,sha256=x00vy4OCVq0TNkM2jjt2Jzl-1FwH-I96zzIuoHfXRHU,3911
+validmind-2.2.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+validmind-2.2.6.dist-info/entry_points.txt,sha256=HuW7YyOv9u_OEWpViQXtv0nfoI67uieJHawKWA4Hv9A,76
+validmind-2.2.6.dist-info/RECORD,,
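Referring back to the `get_description_metadata()` change in validmind/utils.py above: opting out of AI-generated result descriptions comes down to one environment variable. A short sketch, using the values named in that diff:

```python
# Sketch: disable LLM-generated descriptions before running tests.
# Per the utils.py diff, the default is "1" (enabled); "0" or "false" makes the
# default description (rendered with md_to_html) be logged instead.
import os

os.environ["VALIDMIND_LLM_DESCRIPTIONS_ENABLED"] = "0"
```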
{validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/LICENSE
File without changes
{validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/WHEEL
File without changes
{validmind-2.2.5.dist-info → validmind-2.2.6.dist-info}/entry_points.txt
File without changes